25
11/2015
面试题:分词统计
题目:给定一个目录,统计该目录及其子目录下所有文件内容中的词频,并输出最多的词频。假设所有的文件都是txt格式,且所有的内容都是英文。
分析:
1、可以用Dictionary来存储每个词的出现次数
2、时间复杂度至少为O(n)
似乎无法再进行优化了。那么我们通过多线程来并行处理。C#多线程非常方便。代码如下:
using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading; using System.Threading.Tasks; using System.IO; namespace MyTest { class Program { static object _sync = new object(); static int _unreadFileCount; static AutoResetEvent _event = new AutoResetEvent(false); static Dictionary<string, int> wordsCount = new Dictionary<string, int>(); static void Main(string[] args) { string path = @"D:\Users\ruiyli\testData"; List<FileInfo> files = getFiles(path); _unreadFileCount = files.Count; foreach (var file in files) { CountWordsAsync(file); } _event.WaitOne(); int i = 0; foreach (var kv in wordsCount.OrderByDescending(o => o.Value)) { Console.WriteLine(kv.Key + ":" + kv.Value); i++; if (i > 10) { break; } } Console.ReadKey(); } static async void CountWordsAsync(FileInfo file) { try { //Console.WriteLine(file.FullName); using (StreamReader sr = new StreamReader(file.FullName)) { StringBuilder word = new StringBuilder(); string line; while ((line = await sr.ReadLineAsync()) != null) { for (int i = 0; i < line.Length; i++ ) { if (check(line[i])) { word.Append(line[i]); } else { if (word.Length > 0) { string words = word.ToString(); lock (_sync) { if (!wordsCount.ContainsKey(words)) { wordsCount[words] = 1; } else { wordsCount[words]++; } } word.Clear(); } } } } } } catch (Exception e) { } lock (_sync) { _unreadFileCount--; if (_unreadFileCount == 0) { _event.Set(); } } } static bool check(char c) { return char.IsLetterOrDigit(c); } static List<FileInfo> getFiles(string path) { List<FileInfo> files = new List<FileInfo>(); DirectoryInfo di = new DirectoryInfo(path); files.AddRange(di.GetFiles()); foreach (var d in di.GetDirectories()) { files.AddRange(getFiles(path + @"\" + d.Name)); } return files; } } }
0 条评论