您的位置:首页 > 编程语言 > C#

C# 分词算法:ChineseAnalyzer 源代码分析(其他地方的代码都是稀烂……)

2017-02-27 21:15 465 查看
1.引用文件下载地址:

http://www.piaoyi.org/upimg/file071127_08/02/ChineseAnalyzer.rar

2.在项目中添加对 Lucene.Net.dll 的引用

3.添加新类库文件 WordTree.cs

using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace A.SplitString
{
public class WordTree
{
//需要添加的对照文件 sdict.txt文件
private static string DictPath = System.Web.HttpContext.Current.Server.MapPath("~/sDict.txt");
public static Hashtable chartable = new Hashtable();

public static bool DictLoaded = false;

public static double DictLoad_Span = 0.0;

public string strChinese = "[一-龥]";

public string strNumber = "[0-9]";

public string strEnglish = "[a-zA-Z]";

public int GetCharType(string Char)
{
int result;
if (new Regex(this.strChinese).IsMatch(Char))
{
result = 0;
}
else if (new Regex(this.strEnglish).IsMatch(Char))
{
result = 1;
}
else if (new Regex(this.strNumber).IsMatch(Char))
{
result = 2;
}
else
{
result = -1;
}
return result;
}

public void LoadDict()
{
if (!WordTree.DictLoaded)
{
this.BuidDictTree();
WordTree.DictLoaded = true;
}
}

private void BuidDictTree()
{
long ticks = DateTime.Now.Ticks;

StreamReader streamReader = new StreamReader(WordTree.DictPath, Encoding.UTF8);
string text = streamReader.ReadLine();
if (!chartable.Contains("word"))
{
WordTree.chartable.Add("word", null);
}
while (!string.IsNullOrEmpty(text))
{
Hashtable hashtable = WordTree.chartable;
for (int i = 0; i < text.Length; i++)
{
string key = text.Substring(i, 1);
if (!hashtable.Contains(key))
{
hashtable.Add(key, new Hashtable());
}
hashtable = (Hashtable)hashtable[key];
}
if (!hashtable.Contains("word"))
{
hashtable.Add("word", null);
}
text = streamReader.ReadLine();
}
streamReader.Close();
}
}
}


4.添加cs文件 ChineseTokenizer.cs

using Lucene.Net.Analysis;
using System;
using System.Collections;
using System.IO;

namespace A.SplitString
{
internal class ChineseTokenizer : Tokenizer
{
private int bufferIndex = 0;

private int dataLen = 0;

private int start;

private string text;

public ChineseTokenizer(TextReader reader)
{
this.input = reader;
this.text = this.input.ReadToEnd();
this.dataLen = this.text.Length;
}

public override Token Next()
{
WordTree wordTree = new WordTree();
wordTree.LoadDict();
Hashtable hashtable = WordTree.chartable;
string text = string.Empty;
this.bufferIndex = this.start;
int num = this.start;
int num2 = this.bufferIndex;
string text2 = string.Empty;
Token result;
while (this.start < this.dataLen)
{
string text3 = this.text.Substring(this.start, 1);
if (!string.IsNullOrEmpty(text3.Trim()))
{
if (!hashtable.Contains(text3))
{
if (text == string.Empty)
{
int i = this.start + 1;
switch (wordTree.GetCharType(text3))
{
case 0:
text += text3;
break;
case 1:
while (i < this.dataLen)
{
if (wordTree.GetCharType(this.text.Substring(i, 1)) != 1)
{
break;
}
i++;
}
text += this.text.Substring(this.start, i - this.start).ToLower();
break;
case 2:
while (i < this.dataLen)
{
if (wordTree.GetCharType(this.text.Substring(i, 1)) != 2)
{
break;
}
i++;
}
text += this.text.Substring(this.start, i - this.start);
break;
default:
this.start++;
this.bufferIndex = this.start;
continue;
}
this.start = i;
}
else if (wordTree.GetCharType(text3) == -1)
{
this.start++;
}
if (hashtable.Contains("word"))
{
result = new Token(text, this.bufferIndex, this.bufferIndex + text.Length);
}
else
{
this.start = num + 1;
result = new Token(text2, num2, num2 + text2.Length);
}
}
else
{
text += text3;
hashtable = (Hashtable)hashtable[text3];
if (hashtable.Contains("word") || text.Length == 1)
{
text2 = text;
num = this.start;
num2 = this.bufferIndex;
}
this.start++;
if (this.start != this.dataLen)
{
continue;
}
if (hashtable.Contains("word") || text.Length == 1)
{
result = new Token(text, this.bufferIndex, this.bufferIndex + text.Length);
}
else
{
this.start = num + 1;
result = new Token(text2, num2, num2 + text2.Length);
}
}
return result;
}
this.start++;
this.bufferIndex = this.start;
}
result = null;
return result;
}
}
}


5.添加cs 文件 SplitAdapter.cs

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace A.SplitString
{
public class SplitAdapter : Analyzer
{

public static string[] CHINESE_ENGLISH_STOP_WORDS;

public static readonly string[] Filter = new string[321];

public SplitAdapter(string path)
{

StreamReader streamReader = new StreamReader(path, Encoding.UTF8);
string text = streamReader.ReadLine();
int num = 0;
while (!string.IsNullOrEmpty(text))
{
SplitAdapter.Filter[num] = text;
text = streamReader.ReadLine();
num++;
}
}

public override TokenStream TokenStream(string fieldName, TextReader reader)
{
TokenStream tokenStream = new ChineseTokenizer(reader);
tokenStream = new StandardFilter(tokenStream);
return new StopFilter(tokenStream, SplitAdapter.Filter);
}
}
}


6.实现类库

using Lucene.Net.Analysis;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace A.Helper
{
public class MatchingHelper
{
public static List<string> GetMatchingList(string inputString)
{

string snoisePath = System.Web.HttpContext.Current.Server.MapPath("~/sNoise.config");
List<string> resultList = new List<string>();

SplitAdapter analyzer = new SplitAdapter(snoisePath);

StringReader reader = n
b63f
ew StringReader(inputString);
TokenStream tokenStream = analyzer.TokenStream(null, reader);
Token token = tokenStream.Next();
while (token != null)
{
resultList.Add(token.TermText());
token = tokenStream.Next();
}
return resultList;
//这个 list,就是拆分后的 词汇

}
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐