I wrote an implementation of a 1-gram (unigram) word segmentation algorithm:
The split function is adapted from the n-gram algorithm posted earlier on this blog; the rest I wrote myself.
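In a unigram model, a candidate segmentation w1 w2 ... wn of a sentence is scored by the product of the word probabilities, P(w1) * P(w2) * ... * P(wn), with P(w) = count(w) / N estimated from a word-count dictionary. The maxP method below recursively searches for the highest-scoring split and caches intermediate results in valueMap.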
Dictionary.py:
class Dictionary:
    'Dictionary Loading and Management'
    # Python 2 code: relies on the print statement, dict.has_key and byte-string decoding
    def __init__(self, dicname):
        self.dictMap = {}
        self.N = 0
        dictfile = open(dicname, 'r')
        for eachLine in dictfile:
            # each line is "word<TAB>count", encoded in cp936
            dictstr = eachLine.decode("cp936")
            strlist = dictstr.split("\t", 2)
            self.dictMap[strlist[0]] = strlist[1].split("\n", 1)[0]
            self.N += int(self.dictMap[strlist[0]])
        dictfile.close()
        print self.N

    def getCount(self, wordname):
        if self.dictMap.has_key(wordname):
            return int(self.dictMap[wordname])
        else:
            return 0.5  # if the word is not in the dictionary, its count is taken to be 0.5

    def getPvalue(self, wordname):
        return float(self.getCount(wordname)) / self.N

    def isAWord(self, word):
        return self.dictMap.has_key(word)

if __name__ == '__main__':
    dict1 = Dictionary("dict.txt")
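Judging from the loading code, dict.txt holds one entry per line in cp936 encoding, with the word and its count separated by a tab. The words and counts below are made-up placeholders, just to illustrate the expected format:

中国	398
人民	211
中	105
国	89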
ngram.py:
class Ngram:
    def __init__(self, dictionary):
        self.mDict = dictionary
        self.wordList = ()
        self.valueMap = {}  # memo: substring -> best probability found so far
        self.segMap = {}    # substring -> best split pair, or the substring itself if kept whole

    def splitsentence(self, sentence):
        # enumerate every (prefix, suffix) cut of the sentence
        wordlist = []
        for eachNum in range(len(sentence)):
            wordlist.append((sentence[:eachNum+1], sentence[eachNum+1:]))
        return wordlist

    def maxP(self, sentence):
        if len(sentence) <= 1:
            return self.mDict.getPvalue(sentence)
        SenSplitList = self.splitsentence(sentence)
        maxPvalue = 0
        wordPair = []
        wordP = 0
        for eachPair in SenSplitList:
            if len(eachPair[0]) > 0 and len(eachPair[1]) > 0:
                p1 = 0
                p2 = 0
                if self.valueMap.has_key(eachPair[0]):
                    p1 = self.valueMap[eachPair[0]]
                else:
                    p1 = self.maxP(eachPair[0])
                if self.valueMap.has_key(eachPair[1]):
                    p2 = self.valueMap[eachPair[1]]
                else:
                    p2 = self.maxP(eachPair[1])
                wordP = p1 * p2
                if maxPvalue < wordP:
                    maxPvalue = wordP
                    wordPair = eachPair
        # keeping the whole string as a single dictionary word may beat the best split
        v = self.mDict.getPvalue(sentence)
        if v > maxPvalue and self.mDict.isAWord(sentence):
            self.valueMap[sentence] = v
            self.segMap[sentence] = sentence
            return v
        else:
            self.valueMap[sentence] = maxPvalue
            self.segMap[sentence] = wordPair
            return maxPvalue

    def getSeg(self):
        return self.segMap

if __name__ == "__main__":
    # splitsentence does not touch the dictionary, so a placeholder argument suffices here
    ngram1 = Ngram("dict1")
    print ngram1.splitsentence("ABC")
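For reference, the self-test at the bottom enumerates every prefix/suffix cut of "ABC", including the trivial cut with an empty suffix, which maxP later filters out:

[('A', 'BC'), ('AB', 'C'), ('ABC', '')]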
The main program:
from Dictionary import Dictionary
from ngram import Ngram

def printSeg(segMap, sentence):
    # walk segMap recursively, printing one word per line
    if segMap.has_key(sentence):
        pair = segMap[sentence]
        if isinstance(pair, tuple):
            printSeg(segMap, pair[0])
            printSeg(segMap, pair[1])
        else:
            if sentence == pair:
                print sentence
            else:
                printSeg(segMap, pair)
    else:
        print sentence

dict1 = Dictionary("dict.txt")
while True:
    ngram1 = Ngram(dict1)
    sentence = raw_input("please input a Chinese Sentence:").decode("cp936")
    print ngram1.maxP(sentence)
    segmap = ngram1.getSeg()
    # debug dump of the whole segMap:
    #for eachkey in segmap:
    #    if isinstance(segmap[eachkey], tuple):
    #        print (eachkey + ":" + segmap[eachkey][0] + ',' + segmap[eachkey][1])
    #    else:
    #        print (eachkey + ":" + segmap[eachkey])
    printSeg(segmap, sentence)
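A sketch of an interactive session, assuming a suitable dict.txt is in place; the input sentence, the probability, and the resulting segmentation shown here are illustrative placeholders, not real output:

please input a Chinese Sentence:南京市长江大桥
2.3e-11
南京市
长江大桥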
Thanks for sharing! Looking forward to more!
Hello! I've learned a lot from your blog; it has really been a great help. I'd like to ask: have you looked into the field of event extraction? Do you know of any usable open-source platforms? Thanks in advance! [grin]
52nlp replied:
July 25, 2011 at 08:07
Sorry, I haven't looked into that area.
playcoin replied:
July 25, 2011 at 12:43
OK, thanks anyway!
Nice; the sentence segmentation part in particular is worth studying.
I happen to have a Chinese word segmentation assignment due. I'm also a beginner and really couldn't figure out how to implement "full segmentation", which gave me quite a headache. The versions written by some senior students felt extremely long-winded. The recursive segmentation method here is very elegant; I'm learning from it.
I wonder if there is a more efficient approach?
PS: props to Python.
Testing whether useless information can be posted here.
52nlp replied:
December 12, 2012 at 09:56
I didn't flag this message as spam, so consider it approved.
Nice, thumbs up!