您的位置：首页 > 其它

Lucene.Net 开发介绍 —— 二、分词（二）

2008-10-23 12:53 337 查看

1.2、分词的过程

1.2.1、分词器工作的过程

内置的分词器效果都不好，那怎么办？只能自己写了！在写之前，当然要先看看内置的分词器是怎么实现的。从1.1节对分词效果的分析可以看出，KeywordAnalyzer这个分词器最懒惰，基本什么事情也没做。并不是它不会做，而是我们没找到使用它的方法，就像手上拿着个盒子，不知道里面是什么，也就不知道它是干嘛的、有什么用。打开盒子，那就是要查看源代码了！

代码 1.2.1.1

using System;
2

namespace Lucene.Net.Analysis
4

{
5

/// <summary> "Tokenizes" the entire stream as a single token. This is useful
7

/// for data like zip codes, ids, and some product names.
8

/// </summary>
9

public class KeywordAnalyzer : Analyzer
10

{
11

/// <summary>Returns a brand-new KeywordTokenizer over <paramref name="reader"/> on every call.
/// <paramref name="fieldName"/> is not used by this method.</summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
12

{
13

return new KeywordTokenizer(reader);
14

}
15

/// <summary>Returns a tokenizer over <paramref name="reader"/>, reusing an earlier instance when one is available.
/// If GetPreviousTokenStream() yields a non-null Tokenizer, it is pointed at the new reader via Reset(reader);
/// otherwise a KeywordTokenizer is created and handed to SetPreviousTokenStream().
/// <paramref name="fieldName"/> is not used by this method.</summary>
public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
17

{
18

// NOTE(review): presumably this returns whatever SetPreviousTokenStream stored earlier -- confirm in Analyzer.
Tokenizer tokenizer = (Tokenizer)GetPreviousTokenStream();
19

if (tokenizer == null)
20

{
21

// Nothing to reuse yet: create the tokenizer and store it for later calls.
tokenizer = new KeywordTokenizer(reader);
22

SetPreviousTokenStream(tokenizer);
23

}
24

else
25

// Reuse the existing instance on the new reader.
tokenizer.Reset(reader);
26

return tokenizer;
27

}
28

}
29

}

代码1.2.1.1 就是传说中的源码了。先看看注释，意思大体是“把整个流‘Tokenizes’成一个单独的词（token）。这对邮编、ID和某些商品名称这类数据很有用。”Tokenizes应该是拆分（切分成词）的意思，字典上查不到这个词。这段代码比较简单，只有两个方法，而第二个方法就是我们先前分析结果的时候用的（见段落1.1）。关键点就在于调用了KeywordTokenizer类。切到KeywordTokenizer类查看一下。

代码1.2.1.2

using System;
2

namespace Lucene.Net.Analysis
4

{
5

/// <summary> Emits the entire input as a single token.</summary>
7

public class KeywordTokenizer : Tokenizer
8

{
9

// Buffer size passed along by the single-argument constructor.
private const int DEFAULT_BUFFER_SIZE = 256;
11

// True once Next(Token) has produced its token; set back to false by the constructor and by Reset(TextReader).
private bool done;
13

/// <summary>Creates a tokenizer over <paramref name="input"/>, forwarding DEFAULT_BUFFER_SIZE to the two-argument constructor.</summary>
public KeywordTokenizer(System.IO.TextReader input) : this(input, DEFAULT_BUFFER_SIZE)
15

{
16

}
17

/// <summary>Creates a tokenizer over <paramref name="input"/>.</summary>
/// <remarks>NOTE(review): <paramref name="bufferSize"/> is accepted but not read anywhere in this constructor as shown.</remarks>
public KeywordTokenizer(System.IO.TextReader input, int bufferSize) : base(input)
19

{
20

this.done = false;
21

}
22

/// <summary>On the first call, reads everything available from the input into the term buffer of
/// <paramref name="result"/> and returns it. Every later call returns null until Reset(TextReader) is called.</summary>
/// <param name="result">Token to fill; it is cleared before being written to.</param>
/// <returns><paramref name="result"/> holding the whole input, or null once the single token has been emitted.</returns>
public override Token Next(Token result)
24

{
25

if (!done)
26

{
27

// Mark as finished up front so the next call falls through to "return null".
done = true;
28

// Number of characters written into the term buffer so far.
int upto = 0;
29

result.Clear();
30

char[] buffer = result.TermBuffer();
31

while (true)
32

{
33

// Fill whatever room is left in the buffer, starting right after the characters already read.
int length = input.Read(buffer, upto, buffer.Length - upto);
34

// Nothing was read: stop reading.
if (length <= 0)
35

break;
36

upto += length;
37

// Buffer is full: ask the token for a larger one and keep reading into it.
if (upto == buffer.Length)
38

buffer = result.ResizeTermBuffer(1 + buffer.Length);
39

}
40

// Only the term length is assigned here; this method does not set start/end offsets.
result.termLength = upto;
41

return result;
42

}
43

return null;
44

}
45

/// <summary>Switches to a new <paramref name="input"/> via the base class and re-arms the tokenizer
/// so that the next call to Next(Token) emits a token again.</summary>
public override void Reset(System.IO.TextReader input)
47

{
48

base.Reset(input);
49

this.done = false;
50

}
51

}
52

}

代码 1.2.1.2 就是KeywordTokenizer的源码。代码量很小，却没有完成全部工作，而是将部分工作交给了父类。关注Lucene的人都知道，新版本中分词这部分换掉了，现在多了一个重载的Next方法。这里不讨论为什么要加这个重载，这篇文章主要是讲应用的。因为取词是通过Next方法进行的，所以只需要关注Next方法就可以了。KeywordTokenizer的父类是Tokenizer，但在Tokenizer里找不到我们想要的关系；不过Tokenizer又继承自TokenStream，于是查看TokenStream类。

代码 1.2.1.3

using System;
3

using Payload = Lucene.Net.Index.Payload;
5

namespace Lucene.Net.Analysis
7

{
8

/// <summary>A TokenStream enumerates the sequence of tokens, either from
10

/// fields of a document or from query text.
11

/// <p/>
12

/// This is an abstract class. Concrete subclasses are:
13

/// <ul>
14

/// <li><see cref="Tokenizer"/>, a TokenStream
15

/// whose input is a Reader; and</li>
16

/// <li><see cref="TokenFilter"/>, a TokenStream
17

/// whose input is another TokenStream.</li>
18

/// </ul>
19

/// NOTE: subclasses must override at least one of <see
20

/// cref="Next()"/> or <see cref="Next(Token)"/>; the default implementations below call each other.
21

/// </summary>
22

public abstract class TokenStream
24

{
25

/// <summary>Returns the next token in the stream, or null at EOS.
27

/// The returned Token is a "full private copy" (not
28

/// re-used across calls to Next()) but will be slower
29

/// than calling <see cref="Next(Token)"/> instead.
30

/// </summary>
31

public virtual Token Next()
32

{
33

// Hand a fresh Token to Next(Token) so the caller never receives a shared instance from here.
Token result = Next(new Token());
34

if (result != null)
36

{
37

Payload p = result.GetPayload();
38

if (p != null)
39

{
40

// Replace the payload with a clone of itself.
result.SetPayload((Payload) p.Clone());
41

}
42

}
43

return result;
45

}
46

/// <summary>Returns the next token in the stream, or null at EOS.
48

/// When possible, the input Token should be used as the
49

/// returned Token (this gives fastest tokenization
50

/// performance), but this is not required and a new Token
51

/// may be returned. Callers may re-use a single Token
52

/// instance for successive calls to this method.
53

/// <p/>
54

/// This implicitly defines a "contract" between
55

/// consumers (callers of this method) and
56

/// producers (implementations of this method
57

/// that are the source for tokens):
58

/// <ul>
59

/// <li>A consumer must fully consume the previously
60

/// returned Token before calling this method again.</li>
61

/// <li>A producer must call Token.Clear()
62

/// before setting the fields in it and returning it</li>
63

/// </ul>
64

/// Note that a <see cref="TokenFilter"/> is considered a consumer.
65

/// </summary>
66

/// <param name="result">a Token that may or may not be used to return
67

/// </param>
68

/// <returns> next token in the stream or null if end-of-stream was hit
69

/// </returns>
70

public virtual Token Next(Token result)
71

{
72

// Default implementation ignores 'result' and delegates to Next(), which in turn calls Next(Token):
// a subclass has to override at least one of the two for the calls to terminate.
return Next();
73

}
74

/// <summary>Resets this stream to the beginning. This is an
76

/// optional operation, so subclasses may or may not
77

/// implement this method. Reset() is not needed for
78

/// the standard indexing process. However, if the Tokens
79

/// of a TokenStream are intended to be consumed more than
80

/// once, it is necessary to implement Reset().
81

/// The implementation in this class is empty.
/// </summary>
82

public virtual void Reset()
83

{
84

}
85

/// <summary>Releases resources associated with this stream. The implementation in this class is empty.</summary>
87

public virtual void Close()
88

{
89

}
90

}
91

}

代码 1.2.1.3 就是TokenStream类的源码。Next(Token)方法和Next()是相互调用的关系。但是因为Next(Token)方法在KeywordTokenizer里被重写掉了，因此，这里就可以忽略TokenStream的Next(Token)方法了。从上面代码可以看出，调用Next()方法，实际上是传递给Next(Token)方法一个新Token实例。即使直接调用KeywordTokenizer的Next(Token)，传递一个带有数据的Token，这个Token也会先被清除（result.Clear()）。在KeywordTokenizer.Next的循环中，会把构造函数传入的流缓冲进Token类的缓冲区。ResizeTermBuffer方法是自动扩容用的，就像.Net Framework里的一些类能够自动扩容一样，比如List<T>、Hashtable或StringBuilder等。这个过程里看不到分词的动作，不过这样就大致明白了分词器工作的流程。

1.2.2 如何让分词器分词

知道分词器如何工作了，但是现在还不明白分词器如何分词。再回到1.1.2节，看到WhitespaceAnalyzer分词器似乎是学习的好选择，因为这个分词器只有遇到空格才会进行分词操作。根据1.2.1的经验，直接查看WhitespaceTokenizer类。

代码1.2.2.1

using System;
2

namespace Lucene.Net.Analysis
4

{
5

/// <summary>A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
7

/// Adjacent sequences of non-Whitespace characters form tokens.
8

/// </summary>
9

public class WhitespaceTokenizer : CharTokenizer
11

{
12

/// <summary>Construct a new WhitespaceTokenizer that reads from <paramref name="in_Renamed"/>. </summary>
13

public WhitespaceTokenizer(System.IO.TextReader in_Renamed) : base(in_Renamed)
14

{
15

}
16

/// <summary>Collects only characters which do not satisfy
18

/// <see cref="System.Char.IsWhiteSpace(char)"/>.
19

/// </summary>
20

/// <returns>true when <paramref name="c"/> is not whitespace, false otherwise.</returns>
protected internal override bool IsTokenChar(char c)
21

{
22

return !System.Char.IsWhiteSpace(c);
23

}
24

}
25

}

很好，这段代码很短，可是没有看到我们想要的东西。继续看父类。

代码1.2.2.2

using System;
2

namespace Lucene.Net.Analysis
4

{
5

/// <summary>An abstract base class for simple, character-oriented tokenizers.
/// Subclasses decide which characters belong to a token by implementing IsTokenChar.</summary>
7

public abstract class CharTokenizer : Tokenizer
8

{
9

/// <summary>Creates a tokenizer that reads from <paramref name="input"/>.</summary>
public CharTokenizer(System.IO.TextReader input) : base(input)
10

{
11

}
12

// offset:      total number of characters delivered by earlier fills of ioBuffer.
// bufferIndex: index of the next unread character in ioBuffer.
// dataLen:     number of characters the most recent read placed in ioBuffer.
private int offset = 0, bufferIndex = 0, dataLen = 0;
14

// A token is cut off as soon as it reaches this many characters (see Next).
private const int MAX_WORD_LEN = 255;
15

// Size of the intermediate read buffer below.
private const int IO_BUFFER_SIZE = 1024;
16

// Characters are read from the input into this buffer in chunks and then scanned one by one.
private char[] ioBuffer = new char[IO_BUFFER_SIZE];
17

/// <summary>Returns true iff a character should be included in a token. This
19

/// tokenizer generates as tokens adjacent sequences of characters which
20

/// satisfy this predicate. Characters for which this is false are used to
21

/// define token boundaries and are not included in tokens.
22

/// </summary>
23

protected internal abstract bool IsTokenChar(char c);
24

/// <summary>Called on each token character to normalize it before it is added to the
26

/// token. The default implementation does nothing. Subclasses may use this
27

/// to, e.g., lowercase tokens.
28

/// </summary>
29

protected internal virtual char Normalize(char c)
30

{
31

return c;
32

}
33

/// <summary>Scans the input for the next run of characters accepted by IsTokenChar, copies them
/// (each passed through Normalize) into the term buffer of <paramref name="token"/>, and sets the
/// token's termLength, startOffset and endOffset. A run is ended early once it reaches MAX_WORD_LEN characters.</summary>
/// <param name="token">Token to fill; it is cleared first.</param>
/// <returns><paramref name="token"/>, or null when the input is exhausted and no token characters were collected.</returns>
public override Token Next(Token token)
35

{
36

token.Clear();
37

// Number of characters collected for the current token.
int length = 0;
38

int start = bufferIndex;
39

char[] buffer = token.TermBuffer();
40

while (true)
41

{
42

// ioBuffer is used up: account for it in 'offset' and read the next chunk.
if (bufferIndex >= dataLen)
44

{
45

offset += dataLen;
46

// A ReusableStringReader is read through its own Read(char[]) overload; any other reader through Read(char[], int, int).
dataLen = input is Lucene.Net.Index.DocumentsWriter.ReusableStringReader ? ((Lucene.Net.Index.DocumentsWriter.ReusableStringReader) input).Read(ioBuffer) : input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
47

// Nothing more was read: emit the token collected so far, or signal the end with null.
if (dataLen <= 0)
48

{
49

if (length > 0)
50

break;
51

else
52

return null;
53

}
54

bufferIndex = 0;
55

}
56

char c = ioBuffer[bufferIndex++];
58

if (IsTokenChar(c))
60

{
61

// if it's a token char
62

if (length == 0)
64

// start of token
65

// bufferIndex was already advanced past c, hence the -1; 'offset' adds the characters of earlier chunks.
start = offset + bufferIndex - 1;
66

else if (length == buffer.Length)
67

// term buffer is full: ask the token for a larger one
buffer = token.ResizeTermBuffer(1 + length);
68

buffer[length++] = Normalize(c); // buffer it, normalized
70

if (length == MAX_WORD_LEN)
72

// buffer overflow!
73

break;
74

}
75

else if (length > 0)
76

// at non-Letter w/ chars
77

break; // return 'em
78

}
79

token.termLength = length;
81

token.startOffset = start;
82

token.endOffset = start + length;
83

return token;
84

}
85

/// <summary>Switches to a new <paramref name="input"/> via the base class and clears the
/// buffer position, the running offset and the buffered-data length.</summary>
public override void Reset(System.IO.TextReader input)
87

{
88

base.Reset(input);
89

bufferIndex = 0;
90

offset = 0;
91

dataLen = 0;
92

}
93

}
94

}

天公不作美，刚看到简单的，就来了个长的。无奈中。不过为什么要多一重继承呢？那就是有其他分词器也用到CharTokenizer了。而WhitespaceTokenizer中没有重写Next方法，只是重写了IsTokenChar方法，几乎可以肯定，这个IsTokenChar才是重点。IsTokenChar顾名思义，一看注释，果然！这个方法用来判断一个字符是否属于词的一部分：返回false的字符就是分词的点，不会被收进词里。这个其实和string类的Split方法相似。注意到Next方法关于IsTokenChar逻辑那一段，嗯，果然是这样分词的。实际上就是拆分字符串嘛。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航