.NET Chinese Word Segmentation

Published: 2013-02-20 16:38:09
I spent the last couple of days looking into Chinese word segmentation under .NET, and found that the latest Lucene.Net release, 2.3.1, handles Chinese poorly. The analyzers it ships with are of little use for Chinese: at best they split the text into isolated single characters, there is not even a CJK bigram (two-character) tokenizer, let alone anything dictionary-based.
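You can see the problem directly: run a Chinese sentence through the bundled StandardAnalyzer and it comes back one character at a time. A minimal sketch, assuming the Lucene.Net 2.3.1 API used throughout this post:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;

class StandardDemo
{
    static void Main()
    {
        TextReader tr = new StringReader("我们都是好孩子");
        TokenStream ts = new StandardAnalyzer().TokenStream("*", tr);
        Token t;
        while ((t = ts.Next()) != null)
        {
            // Prints 我 / 们 / 都 / 是 ... one character per token.
            Console.WriteLine(t.TermText());
        }
    }
}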

Below are two code segments. In the first I wrote a bigram class of my own to implement two-character splitting; in the second I call Xiao Bo's (肖波) segmenter, KTDictSeg, through a .NET interface class I wrote, to get dictionary-based segmentation. Both are offered for study.

Code sample 1

using System;
using System.Collections;
using System.IO;
using Lucene.Net.Analysis;

namespace erfenfa
{
    class Program
    {
        public static void Main(string[] args)
        {
            string str = "我们都是好孩子,即使你不同意,但又能如何,是不?";
            TextReader tr = new StringReader(str);
            CJKAnalyzer wa = new CJKAnalyzer();
            TokenStream ts = wa.TokenStream("*", tr);
            Token t;
            while ((t = ts.Next()) != null)
            {
                Console.WriteLine(t.TermText()); // print each token in turn
            }
        }
    }

    // Analyzer that chains the bigram tokenizer with an English stop-word filter.
    public class CJKAnalyzer : Analyzer
    {
        public static string[] STOP_WORDS = {
            "a", "and", "are", "as", "at", "be",
            "but", "by", "for", "if", "in",
            "into", "is", "it", "no", "not",
            "of", "on", "or", "s", "such", "t",
            "that", "the", "their", "then",
            "there", "these", "they", "this",
            "to", "was", "will", "with", "",
            "www"
        };

        private Hashtable stopTable;

        public CJKAnalyzer()
        {
            stopTable = StopFilter.MakeStopSet(STOP_WORDS);
        }

        public CJKAnalyzer(string[] stopWords)
        {
            stopTable = StopFilter.MakeStopSet(stopWords);
        }

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream ts = new CJKTokenizer(reader);
            return new StopFilter(ts, stopTable);
        }
    }
    public class CJKTokenizer : Tokenizer
    {
        private static int MAX_WORD_LEN = 255;      // maximum token length
        private static int IO_BUFFER_SIZE = 256;    // size of the read-ahead buffer

        private int offset = 0;       // current character offset in the input
        private int bufferIndex = 0;  // next position to consume from ioBuffer
        private int dataLen = 0;      // number of valid characters in ioBuffer
        private char[] buffer = new char[MAX_WORD_LEN];      // token under construction
        private char[] ioBuffer = new char[IO_BUFFER_SIZE];  // raw input buffer
        private string tokenType = "word";  // "single" = ASCII word, "double" = CJK bigram
        private bool preIsTokened = false;  // the previous character already completed a bigram

        public CJKTokenizer(TextReader reader)
        {
            input = reader;
        }
        public override Token Next()
        {
            int length = 0;
            int start = offset;

            while (true)
            {
                char c;
                offset++;

                if (bufferIndex >= dataLen)
                {
                    // In Java, read() can be called again at end of stream without harm,
                    // but the equivalent .NET call cannot, so guard the refill here.
                    if (dataLen == 0 || dataLen >= ioBuffer.Length)
                    {
                        dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                        bufferIndex = 0;
                    }
                    else
                    {
                        dataLen = 0;
                    }
                }

                if (dataLen == 0)
                {
                    // End of input: emit whatever is buffered, unless it was already emitted.
                    if (length > 0)
                    {
                        if (preIsTokened == true)
                        {
                            length = 0;
                            preIsTokened = false;
                        }
                        break;
                    }
                    else
                    {
                        return null;
                    }
                }
                else
                {
                    c = ioBuffer[bufferIndex++];
                }

                if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
                {
                    if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
                    {
                        // Normalize fullwidth forms to ASCII, e.g. 'A' (U+FF21) - 65248 = 'A'.
                        int i = (int) c;
                        i = i - 65248;
                        c = (char) i;
                    }

                    #region if the current character is a letter, digit, "_", "+" or "#"
                    if (char.IsLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#')))
                    {
                        if (length == 0)
                        {
                            start = offset - 1;
                        }
                        else if (tokenType == "double")
                        {
                            // Switching from CJK to ASCII: push the character back and
                            // emit the pending CJK token first.
                            offset--;
                            bufferIndex--;
                            tokenType = "single";

                            if (preIsTokened == true)
                            {
                                length = 0;
                                preIsTokened = false;
                                break;
                            }
                            else
                            {
                                break;
                            }
                        }

                        buffer[length++] = char.ToLower(c);
                        tokenType = "single";

                        if (length == MAX_WORD_LEN)
                        {
                            break;
                        }
                    }
                    else if (length > 0)
                    {
                        if (preIsTokened == true)
                        {
                            length = 0;
                            preIsTokened = false;
                        }
                        else
                        {
                            break;
                        }
                    }
                    #endregion
                }
                else
                {
                    #region non-ASCII letter, e.g. "C1C2C3C4" becomes "C1C2" "C2C3" "C3C4"
                    if (char.IsLetter(c))
                    {
                        if (length == 0)
                        {
                            start = offset - 1;
                            buffer[length++] = c;
                            tokenType = "double";
                        }
                        else
                        {
                            if (tokenType == "single")
                            {
                                // Switching from ASCII to CJK: push back and emit the ASCII token.
                                offset--;
                                bufferIndex--;
                                break;
                            }
                            else
                            {
                                buffer[length++] = c;
                                tokenType = "double";

                                if (length == 2)
                                {
                                    // A bigram is complete; step back one character so the
                                    // next call starts the overlapping bigram.
                                    offset--;
                                    bufferIndex--;
                                    preIsTokened = true;
                                    break;
                                }
                            }
                        }
                    }
                    else if (length > 0)
                    {
                        if (preIsTokened == true)
                        {
                            length = 0;
                            preIsTokened = false;
                        }
                        else
                        {
                            break;
                        }
                    }
                    #endregion
                }
            }

            return new Token(new string(buffer, 0, length), start, start + length, tokenType);
        }

        public bool IsAscii(char c)
        {
            return c < 256; // char is unsigned, so no lower-bound check is needed
        }

        public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
        {
            // Unicode block U+FF00-U+FFEF: fullwidth ASCII variants and halfwidth kana.
            return c >= 0xFF00 && c <= 0xFFEF;
        }
    }
}

Note: the code itself is easy to follow, and the calling pattern is the same for every Chinese analyzer. For the sample sentence, the tokenizer emits the overlapping bigrams 我们 / 们都 / 都是 / 是好 / 好孩 / 孩子, and so on for each punctuation-delimited run. In a Java environment the last two classes would not need to be written at all, but ready-made .NET versions are hard to come by, which is why I post them here; I hope they help.
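To see that the calling pattern really is uniform, here is a minimal indexing sketch, again assuming the Lucene.Net 2.3.1 API; the index path "index" and field name "content" are just placeholders:

using Lucene.Net.Documents;
using Lucene.Net.Index;

class IndexDemo
{
    static void Main()
    {
        // Any Analyzer can be dropped in here; nothing else changes.
        IndexWriter writer = new IndexWriter("index", new erfenfa.CJKAnalyzer(), true);
        Document doc = new Document();
        doc.Add(new Field("content", "我们都是好孩子", Field.Store.YES, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
        writer.Close();
    }
}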

Code sample 2


using System;
using System.Collections.Generic;
using System.IO;
using Lucene.Net.Analysis;
using KTDictSeg;

namespace xiaobo
{
    class Program
    {
        public static void Main(string[] args)
        {
            string str = "我们都是好孩子,即使你不同意又如何,是不?";
            TextReader tr = new StringReader(str);
            KTDictSegAnalyzer wa = new KTDictSegAnalyzer();
            TokenStream ts = wa.TokenStream("*", tr);
            Token t;
            while ((t = ts.Next()) != null)
            {
                Console.WriteLine(t.TermText()); // print each dictionary word
            }
        }
    }

    // Adapter class that connects Lucene.Net to Xiao Bo's (肖波) KTDictSeg segmenter.
    // There are many segmenters around, of uneven quality; KTDictSeg is one of the better ones.
    public class KTDictSegAnalyzer : Analyzer
    {
        public KTDictSegAnalyzer()
        {
        }

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new KTDictSegTokenizer(reader);
            result = new LowerCaseFilter(result);
            return result;
        }
    }

    public class KTDictSegTokenizer : Tokenizer
    {
        public static CSimpleDictSeg m_SimpleDictSeg;
        private List<string> ioBuffer = new List<string>(); // segmented words (element type assumed; the generic argument was lost in posting)
        private int offSet = 0;    // running offset into the input
        private int position = -1; // index of the current word in ioBuffer
        private int length = 0;    // length of the current word
        private int start = 0;     // start offset of the current word

        public KTDictSegTokenizer(System.IO.TextReader input)
            : base(input)
        {
            // A third-party Chinese segmentation component does the real work.
            //ioBuffer = Sj110.Com.Chinese.Tokenizer.Tokenize(input.ReadToEnd());
            if (m_SimpleDictSeg == null)
            {
                try
                {
                    m_SimpleDictSeg = new CSimpleDictSeg();
                    m_SimpleDictSeg.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                    m_SimpleDictSeg.LoadDict();
                }
                catch
                {
                    m_SimpleDictSeg = null;
                    throw; // rethrow without clobbering the stack trace
                }
            }

            m_SimpleDictSeg.FilterStopWords = true;
            m_SimpleDictSeg.MatchName = true;
            ioBuffer = m_SimpleDictSeg.Segment(input.ReadToEnd());
        }

        // A DotLucene tokenizer, in short, just implements Next(): each call wraps the
        // next segmented word in a Token, the basic unit of DotLucene analysis.
        public override Token Next()
        {
            position++;
            if (position < ioBuffer.Count)
            {
                length = ioBuffer[position].ToString().Length;
                start = offSet;
                offSet += length;
                return new Token(ioBuffer[position].ToString(), start, start + length);
            }

            return null;
        }
    }
}

Note: Xiao Bo's (肖波) segmentation classes themselves are not included here; just add a reference to the KTDictSeg assembly. What I provide is only the key .NET interface code; it is worth a look if you are interested.
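One practical point: the same analyzer has to be used at search time as at index time, or the dictionary tokens in the query will never match those in the index. A minimal sketch, again assuming the Lucene.Net 2.3.1 API, with the same placeholder index path and field name as above:

using Lucene.Net.QueryParsers;
using Lucene.Net.Search;

class SearchDemo
{
    static void Main()
    {
        // The query text is segmented by KTDictSegAnalyzer, producing the same
        // dictionary words that were written into the index.
        QueryParser parser = new QueryParser("content", new xiaobo.KTDictSegAnalyzer());
        Query query = parser.Parse("好孩子");
        IndexSearcher searcher = new IndexSearcher("index");
        Hits hits = searcher.Search(query);
        System.Console.WriteLine(hits.Length() + " hits");
        searcher.Close();
    }
}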

If anything in the code above is unclear, feel free to contact me directly.