您的位置:首页 > 其它

Trie树实现词频统计与查找

2017-01-15 14:56 288 查看
#encoding:utf-8
from collections import defaultdict
import sys
# Python 2 idiom: reload the sys module, then switch the interpreter-wide
# default string encoding to UTF-8 so implicit str/unicode conversions later
# in this script use UTF-8.
# NOTE(review): `reload` is a builtin only on Python 2 and
# `sys.setdefaultencoding` does not exist on Python 3 -- confirm the target
# interpreter before running this file.
reload(sys)
sys.setdefaultencoding('utf8')
class LBTrie:
    """Simple dict-based trie (prefix tree).

    Every node is a plain ``dict`` that maps a single character to its child
    node.  A node that also holds the empty-string key ``''`` marks the end of
    a stored word.

    Attributes:
        trie: the root node.
        size: number of distinct words stored so far.
    """

    def __init__(self):
        self.trie = {}
        self.size = 0

    def add(self, word):
        """Insert *word* (surrounding whitespace removed) into the trie.

        Returns:
            True if the word was not stored before and has now been added;
            False if it was already present or is empty after stripping.
        """
        word = word.strip()
        if not word:
            # Nothing to store; never mark the root itself as a word end.
            return False

        node = self.trie
        for ch in word:
            # Walk down, creating missing child nodes on the way.
            node = node.setdefault(ch, {})

        if '' in node:
            # End marker already present: the word was added earlier.
            return False

        # The key '' can never collide with a real character, so it is a
        # safe end-of-word marker.
        node[''] = ''
        self.size += 1
        return True

    def search(self, word):
        """Return True if *word* was stored as a complete word, else False.

        The query is stripped the same way ``add`` strips its input, so
        leading or trailing whitespace does not cause a miss.  A mere prefix
        of a stored word does not count as a match.
        """
        node = self.trie
        for ch in word.strip():
            if ch not in node:
                return False
            node = node[ch]
        # Only a node carrying the end marker terminates a stored word.
        return '' in node

    def output(self):
        """Print the whole trie to stdout as nested, tab-indented braces."""
        self.__print_item(self.trie)

    def __print_item(self, p, indent=0):
        """Recursively print node *p*; *indent* is the current nesting depth."""
        if p:
            ind = '\t' * indent
            for key in p:
                label = "'%s' : " % key
                print(ind + label + '{')
                # The end marker's value is '' (falsy), so the recursion
                # prints nothing for it and simply closes the brace.
                self.__print_item(p[key], indent + 1)
                print(ind + ' ' * len(label) + '}')

def codeutil(strs):
    """Return *strs* as text limited to characters representable in GBK.

    Byte strings are decoded as UTF-8 (undecodable bytes are dropped);
    already-decoded text is used as is.  The text is then round-tripped
    through GBK with errors ignored, which silently removes every character
    GBK cannot encode.
    """
    if isinstance(strs, bytes):
        text = strs.decode('utf8', 'ignore')
    else:
        # Already text: decoding again would fail (Python 3) or trigger an
        # implicit default-encoding encode first (Python 2).
        text = strs
    return text.encode('GBK', 'ignore').decode('GBK', 'ignore')

if __name__ == '__main__':
    trie_obj = LBTrie()
    countdic = defaultdict(int)

    # Build the trie and the frequency table from the space-separated corpus.
    # The file is streamed line by line and closed deterministically.
    with open('content.txt', 'r') as corpus:
        for record in corpus:
            for word in record.split(' '):
                # Drop the trailing newline / stray whitespace so the same
                # word is not counted under two different keys.
                word = word.strip()
                if not word:
                    continue
                trie_obj.add(codeutil(word))
                # Count every occurrence, independent of whether the trie
                # already contained the word.
                countdic[word] += 1

    # Most frequent words first.
    resortedcountdic = sorted(countdic.items(), key=lambda item: item[1], reverse=True)

    # NOTE(review): records are separated by a tab, not a newline, so the
    # whole table ends up on one line -- confirm this is the wanted format.
    with open('tree.txt', 'w+') as tree:
        for tup in resortedcountdic:
            tree.write(''.join(codeutil(tup[0])) + '\t' + str(tup[1]) + '\t')

    # Look up a sample word.
    if trie_obj.search(codeutil('氨基酸')):
        print('Yes')
    else:
        print('No')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐