C# 分词算法,ChineseAnalyzer,源代码分析,其他地方的代码都是稀烂。。。。
2017-02-27 21:15
465 查看
1.引用文件下载地址:
http://www.piaoyi.org/upimg/file071127_08/02/ChineseAnalyzer.rar
2.引用一个Lucene.Net.dll文件
3.添加新类库文件 WordTree.cs
4.添加cs文件 ChineseTokenizer.cs
5.添加cs 文件 SplitAdapter.cs
6.实现类库
http://www.piaoyi.org/upimg/file071127_08/02/ChineseAnalyzer.rar
2.引用一个Lucene.Net.dll文件
3.添加新类库文件 WordTree.cs
using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace A.SplitString
{
    /// <summary>
    /// Dictionary trie used by the tokenizer. Every node is a <see cref="Hashtable"/> whose keys are
    /// single-character strings mapping to child nodes; the special key "word" (with a null value)
    /// marks a node at which a complete dictionary entry ends.
    /// </summary>
    public class WordTree
    {
        // Dictionary file sDict.txt, resolved relative to the web application root.
        // NOTE(review): this initializer dereferences HttpContext.Current, so the type can only be
        // initialized while an HTTP request context is available.
        private static string DictPath = System.Web.HttpContext.Current.Server.MapPath("~/sDict.txt");

        /// <summary>Root node of the trie, shared by all instances.</summary>
        public static Hashtable chartable = new Hashtable();

        /// <summary>True once <see cref="LoadDict"/> has finished building the trie.</summary>
        public static bool DictLoaded = false;

        /// <summary>Never assigned by this class; kept for backward compatibility.</summary>
        public static double DictLoad_Span = 0.0;

        /// <summary>Regex pattern matching one CJK ideograph in the range U+4E00..U+9FA5.</summary>
        public string strChinese = "[一-龥]";

        /// <summary>Regex pattern matching one ASCII digit.</summary>
        public string strNumber = "[0-9]";

        /// <summary>Regex pattern matching one ASCII letter.</summary>
        public string strEnglish = "[a-zA-Z]";

        /// <summary>
        /// Classifies a string by the first pattern that matches anywhere inside it.
        /// </summary>
        /// <param name="Char">Text to classify (callers in this file pass a single character).</param>
        /// <returns>0 for Chinese, 1 for English, 2 for a digit, -1 for anything else.</returns>
        public int GetCharType(string Char)
        {
            // The static Regex.IsMatch overload reuses cached patterns, so no Regex object is
            // constructed per call, while still honouring changes to the public pattern fields.
            if (Regex.IsMatch(Char, this.strChinese))
            {
                return 0;
            }

            if (Regex.IsMatch(Char, this.strEnglish))
            {
                return 1;
            }

            if (Regex.IsMatch(Char, this.strNumber))
            {
                return 2;
            }

            return -1;
        }

        /// <summary>
        /// Builds the shared trie from the dictionary file the first time it is called;
        /// later calls are no-ops.
        /// </summary>
        public void LoadDict()
        {
            if (WordTree.DictLoaded)
            {
                return;
            }

            this.BuildDictTree();
            WordTree.DictLoaded = true;
        }

        /// <summary>
        /// Reads the dictionary file (UTF-8, one entry per line) and inserts every entry into
        /// <see cref="chartable"/>. Reading stops at the first empty line or at end of file.
        /// </summary>
        private void BuildDictTree()
        {
            // The using statement closes the file even when reading or casting throws.
            using (StreamReader streamReader = new StreamReader(WordTree.DictPath, Encoding.UTF8))
            {
                string line = streamReader.ReadLine();

                // The root itself carries the "word" marker; the tokenizer checks for it when the
                // current character does not start any dictionary entry.
                if (!WordTree.chartable.Contains("word"))
                {
                    WordTree.chartable.Add("word", null);
                }

                while (!string.IsNullOrEmpty(line))
                {
                    Hashtable node = WordTree.chartable;
                    for (int i = 0; i < line.Length; i++)
                    {
                        string key = line.Substring(i, 1);

                        // Single lookup: child keys are one character long and always map to a
                        // non-null Hashtable, so a null result means the key is absent.
                        Hashtable child = (Hashtable)node[key];
                        if (child == null)
                        {
                            child = new Hashtable();
                            node.Add(key, child);
                        }

                        node = child;
                    }

                    if (!node.Contains("word"))
                    {
                        node.Add("word", null);
                    }

                    line = streamReader.ReadLine();
                }
            }
        }
    }
}
4.添加cs文件 ChineseTokenizer.cs
using Lucene.Net.Analysis;
using System;
using System.Collections;
using System.IO;

namespace A.SplitString
{
    /// <summary>
    /// Tokenizer that reads the whole input up front and then emits one token per call to
    /// <see cref="Next"/>, preferring the longest entry found in the <see cref="WordTree"/> trie.
    /// Characters that start no dictionary entry are emitted as a single Chinese character, a run
    /// of ASCII letters (lower-cased) or a run of ASCII digits; other characters are skipped.
    /// </summary>
    internal class ChineseTokenizer : Tokenizer
    {
        // Offset at which the token currently being built begins.
        private int bufferIndex = 0;

        // Length of the buffered input text.
        private int dataLen = 0;

        // Offset of the next character to examine.
        private int start;

        // Entire input, read once in the constructor.
        private string text;

        /// <summary>Buffers the complete content of <paramref name="reader"/>.</summary>
        /// <param name="reader">Source text; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        public ChineseTokenizer(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            this.input = reader;
            this.text = this.input.ReadToEnd();
            this.dataLen = this.text.Length;
        }

        /// <summary>Returns the next token, or null when the input is exhausted.</summary>
        /// <remarks>
        /// NOTE(review): whitespace met while a dictionary match is in progress is skipped without
        /// ending the match, and bufferIndex is moved past it; a word can therefore match across
        /// whitespace and the reported offsets may shift. Left unchanged here; confirm intent.
        /// </remarks>
        public override Token Next()
        {
            WordTree wordTree = new WordTree();
            wordTree.LoadDict();

            Hashtable node = WordTree.chartable;      // current position in the trie
            string candidate = string.Empty;          // characters matched so far
            this.bufferIndex = this.start;
            int lastWordEnd = this.start;             // offset of the last char of the longest word seen
            int lastWordStart = this.bufferIndex;     // start offset recorded with that word
            string lastWord = string.Empty;           // longest complete word seen so far

            while (this.start < this.dataLen)
            {
                string ch = this.text.Substring(this.start, 1);

                // Whitespace: step over it and restart the token offset.
                if (string.IsNullOrEmpty(ch.Trim()))
                {
                    this.start++;
                    this.bufferIndex = this.start;
                    continue;
                }

                if (node.Contains(ch))
                {
                    // The character extends the current path in the trie.
                    candidate += ch;
                    node = (Hashtable)node[ch];

                    // A single character always counts as a fallback word.
                    bool isWord = node.Contains("word") || candidate.Length == 1;
                    if (isWord)
                    {
                        lastWord = candidate;
                        lastWordEnd = this.start;
                        lastWordStart = this.bufferIndex;
                    }

                    this.start++;
                    if (this.start != this.dataLen)
                    {
                        continue;
                    }

                    // End of input reached while matching.
                    if (isWord)
                    {
                        return new Token(candidate, this.bufferIndex, this.bufferIndex + candidate.Length);
                    }

                    // Fall back to the longest complete word and resume right after it.
                    this.start = lastWordEnd + 1;
                    return new Token(lastWord, lastWordStart, lastWordStart + lastWord.Length);
                }

                // The character does not continue the current trie path.
                if (candidate == string.Empty)
                {
                    // Nothing matched yet: emit a token based on the character class.
                    int end = this.start + 1;
                    switch (wordTree.GetCharType(ch))
                    {
                        case 0:
                            candidate += ch;
                            break;
                        case 1:
                            end = this.ScanRun(wordTree, end, 1);
                            // Invariant lower-casing keeps index terms stable regardless of the
                            // thread culture (e.g. Turkish would map "I" to a dotless "ı").
                            candidate += this.text.Substring(this.start, end - this.start).ToLowerInvariant();
                            break;
                        case 2:
                            end = this.ScanRun(wordTree, end, 2);
                            candidate += this.text.Substring(this.start, end - this.start);
                            break;
                        default:
                            // Unclassified character: skip it and keep scanning.
                            this.start++;
                            this.bufferIndex = this.start;
                            continue;
                    }

                    this.start = end;
                }
                else if (wordTree.GetCharType(ch) == -1)
                {
                    // A match was in progress and was interrupted by an unclassified character.
                    this.start++;
                }

                if (node.Contains("word"))
                {
                    return new Token(candidate, this.bufferIndex, this.bufferIndex + candidate.Length);
                }

                // The partial match is not a word: fall back to the longest complete one.
                this.start = lastWordEnd + 1;
                return new Token(lastWord, lastWordStart, lastWordStart + lastWord.Length);
            }

            return null;
        }

        // Returns the first offset at or after 'from' whose character is not of the given type
        // (or dataLen when the run reaches the end of the input).
        private int ScanRun(WordTree wordTree, int from, int charType)
        {
            int i = from;
            while (i < this.dataLen && wordTree.GetCharType(this.text.Substring(i, 1)) == charType)
            {
                i++;
            }

            return i;
        }
    }
}
5.添加cs 文件 SplitAdapter.cs
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace A.SplitString
{
    /// <summary>
    /// Analyzer that chains <see cref="ChineseTokenizer"/>, <c>StandardFilter</c> and a
    /// <c>StopFilter</c> whose stop words are loaded from a text file.
    /// </summary>
    public class SplitAdapter : Analyzer
    {
        /// <summary>Never assigned by this class; kept for backward compatibility.</summary>
        public static string[] CHINESE_ENGLISH_STOP_WORDS;

        /// <summary>
        /// Legacy shared buffer. The constructor still copies the first <c>Filter.Length</c> stop
        /// words into it for callers that read it, but the analyzer itself no longer depends on it.
        /// </summary>
        public static readonly string[] Filter = new string[321];

        // Stop words loaded for this instance; sized exactly, so it never contains null entries.
        private readonly string[] stopWords;

        /// <summary>
        /// Loads the stop-word list from a UTF-8 text file with one word per line.
        /// Reading stops at the first empty line or at end of file.
        /// </summary>
        /// <param name="path">Path of the stop-word file.</param>
        public SplitAdapter(string path)
        {
            List<string> words = new List<string>();

            // The using statement closes the file, which the original code never did.
            using (StreamReader streamReader = new StreamReader(path, Encoding.UTF8))
            {
                string line = streamReader.ReadLine();
                while (!string.IsNullOrEmpty(line))
                {
                    // Keep the legacy static array populated, without overrunning its fixed size.
                    if (words.Count < SplitAdapter.Filter.Length)
                    {
                        SplitAdapter.Filter[words.Count] = line;
                    }

                    words.Add(line);
                    line = streamReader.ReadLine();
                }
            }

            this.stopWords = words.ToArray();
        }

        /// <summary>Builds the token stream for <paramref name="reader"/>.</summary>
        /// <param name="fieldName">Field name; not used by this analyzer.</param>
        /// <param name="reader">Text to tokenize.</param>
        /// <returns>The tokenizer output filtered through StandardFilter and StopFilter.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream tokenStream = new ChineseTokenizer(reader);
            tokenStream = new StandardFilter(tokenStream);
            return new StopFilter(tokenStream, this.stopWords);
        }
    }
}
6.实现类库
using A.SplitString;
using Lucene.Net.Analysis;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace A.Helper
{
    /// <summary>Convenience entry point for splitting text into words.</summary>
    public class MatchingHelper
    {
        /// <summary>
        /// Tokenizes <paramref name="inputString"/> with <see cref="SplitAdapter"/>, using the
        /// stop-word file "~/sNoise.config" resolved against the current web application.
        /// </summary>
        /// <param name="inputString">Text to split; null or empty yields an empty list.</param>
        /// <returns>The term text of every token produced, in order.</returns>
        public static List<string> GetMatchingList(string inputString)
        {
            List<string> resultList = new List<string>();
            if (string.IsNullOrEmpty(inputString))
            {
                // Nothing to tokenize; also avoids StringReader throwing on null.
                return resultList;
            }

            string snoisePath = System.Web.HttpContext.Current.Server.MapPath("~/sNoise.config");
            SplitAdapter analyzer = new SplitAdapter(snoisePath);

            using (StringReader reader = new StringReader(inputString))
            {
                TokenStream tokenStream = analyzer.TokenStream(null, reader);
                Token token = tokenStream.Next();
                while (token != null)
                {
                    resultList.Add(token.TermText());
                    token = tokenStream.Next();
                }
            }

            // This list holds the words the input was split into.
            return resultList;
        }
    }
}
相关文章推荐
- C#中文分词算法:ChineseAnalyzer
- C# 一个简单分词程序的思路和代码(六) 源代码 ,测试程序,词库下载地址
- 各种算法的C#实现系列1 - 合并排序的原理及代码分析
- 群蚁算法理论与实践全攻略——旅行商等路径优化问题的新方法【附C#群蚁算法完整项目代码】
- 多种排序算法性能分析代码 C++
- FFmpeg源代码结构图 - 雷神经典代码分析
- 高手收集整理的baidu分词算法分析之一 查询处理以及分词技术(1)
- 快排 和 堆排序算法的细节代码分析
- 编写高质量代码改善C#程序的157个建议——建议68:从System.Exception或其他常见的基本异常中派生异常
- C#生成CHM文件(应用篇)之代码库编辑器(5)【总结、程序、源代码】
- HTML代码转JS|C#字符串工具(附源代码)
- [CLR via C#]1.1 将源代码编译成托管代码
- [转]在C#代码中应用Log4Net系列教程(附源代码)
- [导入]C#分析数据库结构,使用XSL模板自动生成代码
- 一个C#算法分析求解
- code_analyzer(代码分析助手)
- 在C#代码中应用Log4Net系列教程(附源代码)【转载】
- 对Weka中DBSCAN算法的分析以及在C#中的实现
- Git 远程仓库(Github) Git 并不像 SVN 那样有个中心服务器。 目前我们使用到的 Git 命令都是在本地执行,如果你想通过 Git 分享你的代码或者与其他开发人员合作。 你就需要
- 修道士与野人问题——C++源代码,伪代码,详细分析