面试题:分词统计

题目:给定一个目录,统计该目录及其子目录下所有文件内容中的词频,并输出最多的词频。假设所有的文件都是txt格式,且所有的内容都是英文。


分析:

1、可以用Dictionary来存储每个词的出现次数

2、时间复杂度至少为O(n)

似乎无法再进行优化了。那么我们通过多线程来并行处理。C#多线程非常方便。代码如下:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.IO;

namespace MyTest
{
    class Program
    {
        static object _sync = new object();
        static int _unreadFileCount;
        static AutoResetEvent _event = new AutoResetEvent(false);
        static Dictionary<string, int> wordsCount = new Dictionary<string, int>();

        static void Main(string[] args)
        {
            string path = @"D:\Users\ruiyli\testData";

            List<FileInfo> files = getFiles(path);
            _unreadFileCount = files.Count;
            foreach (var file in files)
            {
                CountWordsAsync(file);
            }

            _event.WaitOne();

            int i = 0;
            foreach (var kv in wordsCount.OrderByDescending(o => o.Value))
            {
                Console.WriteLine(kv.Key + ":" + kv.Value);
                i++;
                if (i > 10)
                {
                    break;
                }
            }
            

            Console.ReadKey();
        }

        static async void CountWordsAsync(FileInfo file)
        {
            try
            {
                //Console.WriteLine(file.FullName);
                using (StreamReader sr = new StreamReader(file.FullName))
                {
                    StringBuilder word = new StringBuilder();
                    string line;
                    while ((line = await sr.ReadLineAsync()) != null)
                    {
                        for (int i = 0; i < line.Length; i++ )
                        {
                            if (check(line[i]))
                            {
                                word.Append(line[i]);
                            }
                            else
                            {
                                if (word.Length > 0)
                                {
                                    string words = word.ToString();
                                    lock (_sync)
                                    {
                                        if (!wordsCount.ContainsKey(words))
                                        {
                                            wordsCount[words] = 1;
                                        }
                                        else
                                        {
                                            wordsCount[words]++;
                                        }
                                    }
                                    word.Clear();
                                }
                            }
                        }
                    }
                }
            }
            catch (Exception e)
            {

            }
            lock (_sync)
            {
                _unreadFileCount--;
                if (_unreadFileCount == 0)
                {
                    _event.Set();
                }
            }
        }

        static bool check(char c)
        {
            return char.IsLetterOrDigit(c);
        }

        static List<FileInfo> getFiles(string path)
        {
            List<FileInfo> files = new List<FileInfo>();
            DirectoryInfo di = new DirectoryInfo(path);
            files.AddRange(di.GetFiles());
            foreach (var d in di.GetDirectories())
            {
                files.AddRange(getFiles(path + @"\" + d.Name));
            }
            return files;
        }

    }
}


0 条评论

    发表评论

    电子邮件地址不会被公开。 必填项已用 * 标注