.NET Chinese Word Segmentation
Below I am posting two code segments. The first implements two-character (bigram) splitting through a bisection-style tokenizer class I wrote; the second implements dictionary-based segmentation by calling Xiao Bo (肖波)'s segmenter through a .NET interface I wrote myself. Both are offered for everyone to study...
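To make the two approaches concrete, here is roughly what each one produces for a short input (a sketch only: the bigram output is mechanical, while the dictionary-based output depends entirely on which dictionary files are loaded):
Input: 我们都是好孩子
Bigram splitting (code segment one): 我们 / 们都 / 都是 / 是好 / 好孩 / 孩子
Dictionary-based (code segment two): 我们 / 都 / 是 / 好孩子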
Code segment one
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis;
using System.IO;
using System.Net;
using System.Collections;
namespace erfenfa
{
class Filter
{
public static void Main(string[] args)
{
string str="我们都是好孩子,即使你不同意,但又能如何,是不?";
TextReader tr = new StringReader(str);
CJKAnalyzer wa = new CJKAnalyzer();
TokenStream ts = wa.TokenStream("*", tr);
Token t = null;
while ((t = ts.Next()) != null)
{
Console.WriteLine(t.TermText()); // print each token as the loop advances
}
}
}
public class CJKAnalyzer:Analyzer
{
public static string[] STOP_WORDS = {
"a", "and", "are", "as", "at", "be",
"but", "by", "for", "if", "in",
"into", "is", "it", "no", "not",
"of", "on", "or", "s", "such", "t",
"that", "the", "their", "then",
"there", "these", "they", "this",
"to", "was", "will", "with", "",
"www"
};
private Hashtable stopTable;
public CJKAnalyzer()
{
stopTable = StopFilter.MakeStopSet(STOP_WORDS);
}
public CJKAnalyzer(string[] stopWords)
{
stopTable = StopFilter.MakeStopSet(stopWords);
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
TokenStream ts = new CJKTokenizer(reader);
return new StopFilter(ts, stopTable);
}
}
public class CJKTokenizer:Tokenizer
{
private static int MAX_WORD_LEN = 255; // maximum length of a single token
private static int IO_BUFFER_SIZE = 256; // size of the raw read buffer
private int offset = 0; // current offset into the input
private int bufferIndex = 0; // current read position inside ioBuffer
private int dataLen = 0; // number of chars fetched by the last Read
private char[] buffer = new char[MAX_WORD_LEN]; // the token being assembled
private char[] ioBuffer = new char[IO_BUFFER_SIZE];
private string tokenType = "word"; // "single" for Latin tokens, "double" for CJK bigrams
private bool preIsTokened = false; // true when the previous character already closed a bigram
public CJKTokenizer(TextReader reader)
{
input = reader;
}
public override Token Next()
{
int length = 0;
int start = offset;
while (true)
{
char c;
offset++;
if (bufferIndex >= dataLen )
{
if (dataLen == 0 || dataLen >= ioBuffer.Length) // in Java, read() past the end is harmless, but in .NET it is not, so this guards against the exception
{
dataLen = input.Read(ioBuffer,0,ioBuffer.Length);
bufferIndex = 0;
}
else
{
dataLen=0;
}
}
if (dataLen ==0)
{
if (length > 0)
{
if (preIsTokened == true)
{
length = 0;
preIsTokened = false;
}
break;
}
else
{
return null;
}
}
else
{
c = ioBuffer[bufferIndex++];
}
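// ASCII or full-width Latin character: build up a "single" (letter/digit) token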
if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
{
if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
{
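// map a full-width form back to its half-width ASCII equivalent by subtracting 0xFEE0 (65248)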
int i = (int) c;
i = i - 65248;
c = (char) i;
}
#region if the current character is a letter or "_" "+" "#"
if (char.IsLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#')))
{
if (length == 0)
{
start = offset - 1;
}
else if (tokenType == "double")
{
offset--;
bufferIndex--;
tokenType = "single";
if (preIsTokened == true)
{
length = 0;
preIsTokened = false;
break;
}
else
{
break;
}
}
buffer[length++] = char.ToLower(c);
tokenType = "single";
if (length == MAX_WORD_LEN)
{
break;
}
}
else if (length > 0)
{
if (preIsTokened == true)
{
length = 0;
preIsTokened = false;
}
else
{
break;
}
}
#endregion
}
else
{
#region // non-ASCII letter, eg."C1C2C3C4"
if (char.IsLetter(c))
{
if (length == 0)
{
start = offset - 1;
buffer[length++] = c;
tokenType = "double";
}
else
{
if (tokenType == "single")
{
offset--;
bufferIndex--;
break;
}
else
{
buffer[length++] = c;
tokenType = "double";
if (length == 2)
{
offset--;
bufferIndex--;
preIsTokened = true;
break;
}
}
}
}
else if (length > 0)
{
if (preIsTokened == true)
{
length = 0;
preIsTokened = false;
}
else
{
break;
}
}
#endregion
}
}
return new Token(new String(buffer, 0, length), start, start + length, tokenType);
}
public bool IsAscii(char c)
{
return c < 256; // char is unsigned, so no lower-bound check is needed
}
public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
{
return c >= 0xFF00 && c <= 0xFFEF; // the Unicode "Halfwidth and Fullwidth Forms" block
}
}
}
Note: this code is actually quite easy to understand, and the calling pattern is the same for all Chinese segmenters. In a Java environment none of this would need to be written out, but in the .NET environment the last two classes in my program are hard to find anywhere, so I hope this helps your study.
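As a quick usage sketch (my own addition, not part of the original program), here is how CJKAnalyzer could be plugged into indexing, assuming the classic Lucene.Net 1.9/2.x API; the index path c:\index is only a placeholder, and the demo class is assumed to live in the same namespace as CJKAnalyzer:
using Lucene.Net.Documents;
using Lucene.Net.Index;
public class IndexDemo
{
public static void BuildIndex()
{
// create a fresh index in a placeholder directory, tokenized by the CJKAnalyzer above
IndexWriter writer = new IndexWriter("c:\\index", new CJKAnalyzer(), true);
Document doc = new Document();
// store the raw text and index it through the bigram tokenizer
doc.Add(new Field("content", "我们都是好孩子", Field.Store.YES, Field.Index.TOKENIZED));
writer.AddDocument(doc);
writer.Optimize();
writer.Close();
}
}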
Code segment two
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis;
using System.Collections;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using KTDictSeg;
namespace xiaobo
{
class Filter
{
public static void Main(string[] args)
{
string str = "我们都是好孩子,即使你不同意又如何,是不?";
TextReader tr = new StringReader(str);
KTDictSegAnalyzer wa = new KTDictSegAnalyzer();
TokenStream ts = wa.TokenStream("*", tr);
Token t = null;
while ((t = ts.Next()) != null)
{
MessageBox.Show(t.TermText());
}
}
}
public class KTDictSegAnalyzer : Analyzer // this class is the glue between Lucene.Net and Xiao Bo's segmenter; there are many segmenters around, of uneven quality, and Xiao Bo's is one of the better ones
{
public KTDictSegAnalyzer()
{
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
TokenStream result = new KTDictSegTokenizer(reader);
result = new LowerCaseFilter(result);
return result;
}
}
public class KTDictSegTokenizer : Tokenizer
{
public static CSimpleDictSeg m_SimpleDictSeg;
private List<string> ioBuffer = new List<string>(); // segmented words (angle brackets restored; the element type is assumed from how the buffer is used below)
private int offSet = 0; // offset into the input
private int position = -1; // index of the current word in the buffer
private int length = 0; // length of the current word
private int start = 0; // start offset of the current word
public KTDictSegTokenizer(System.IO.TextReader input)
: base(input)
{
// a third-party Chinese word segmentation component is used here
//ioBuffer = Sj110.Com.Chinese.Tokenizer.Tokenize(input.ReadToEnd());
if (m_SimpleDictSeg == null)
{
try
{
m_SimpleDictSeg = new CSimpleDictSeg();
m_SimpleDictSeg.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
m_SimpleDictSeg.LoadDict();
}
catch (Exception)
{
m_SimpleDictSeg = null;
throw; // rethrow without discarding the original stack trace
}
}
m_SimpleDictSeg.FilterStopWords = true;
m_SimpleDictSeg.MatchName = true;
ioBuffer = m_SimpleDictSeg.Segment(input.ReadToEnd());
}
// Simply put, a DotLucene tokenizer implements Tokenizer's Next method, wrapping each extracted word in a Token, because the Token is DotLucene's basic unit of analysis.
public override Token Next()
{
position++;
if (position < ioBuffer.Count)
{
length = ioBuffer[position].ToString().Length;
start = offSet;
offSet += length;
return new Token(ioBuffer[position].ToString(), start, start + length);
}
return null;
}
}
}
Note: Xiao Bo's segmentation classes are not provided here; just reference his assembly in your project. I am only supplying the key .NET interface code, and anyone interested is welcome to dig into it.
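To round things off, here is a matching search sketch (again my own addition, not from the original post), written against the classic Lucene.Net 1.9/2.x API; the field name "content" and the index path are the same placeholders as in the indexing sketch above:
using System;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
public class SearchDemo
{
public static void Run()
{
IndexSearcher searcher = new IndexSearcher("c:\\index");
// analyze the query string with the same dictionary-based analyzer used at index time
QueryParser parser = new QueryParser("content", new KTDictSegAnalyzer());
Query query = parser.Parse("好孩子");
Hits hits = searcher.Search(query);
for (int i = 0; i < hits.Length(); i++)
{
Console.WriteLine(hits.Doc(i).Get("content"));
}
searcher.Close();
}
}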
If anything in the code above is unclear, feel free to contact me directly.