您的位置:首页 > 编程语言 > Python开发

Python从同一文件进行数据不落地的取高频处理

2014-03-12 14:35 846 查看
数据格式:(词,拼音,词频)

的 de 148709248

的 di 1193135

了 le 62873377

了 liao 3199200

是 shi 62432861

一 yi 58994539

不 bu 57479625

不 fou 1136895

将文件中多音字的高频词汇提取并返回

def chose_high_freq_word(filename=None):
    """Return the highest-frequency pinyin for every multi-reading word.

    The dictionary file holds one record per line with three tab-separated
    fields: word, pinyin, frequency. Lines starting with ";" are skipped,
    and so are lines that contain only whitespace.

    Args:
        filename: Path of the dictionary file. When omitted, "dict.txt" in
            the directory of this module is used.

    Returns:
        A dict mapping each word that appears with more than one record to
        a one-element list holding the pinyin of its highest-frequency
        record, with every digit removed from that pinyin. Words that
        appear only once are not included.

    Raises:
        ValueError: If the file does not exist, if a record does not have
            exactly three tab-separated fields, or if a frequency that has
            to be compared is not an integer.
    """
    if filename is None:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        filename = os.path.join(module_dir, "dict.txt")
    if not os.path.isfile(filename):
        raise ValueError("No such file:{}".format(filename))

    # word -> [(pinyin, freq), ...] in file order.
    readings_by_word = {}
    with codecs.open(filename, encoding="utf-8") as f:
        # Iterate the handle directly instead of loading every line at once.
        for line_no, line in enumerate(f, 1):
            if line.startswith(";") or not line.strip():
                continue
            fields = line.split("\t")
            # Compare by value; identity comparison on ints is unreliable.
            if len(fields) != 3:
                raise ValueError(
                    "Expected 3 tab-separated fields but got {} "
                    "at line {} in file {}".format(
                        len(fields), line_no, filename))
            word, pinyin, freq = fields
            readings_by_word.setdefault(word, []).append(
                (pinyin, freq.strip()))

    digit_pattern = re.compile(r"\d")
    high_freq_pinyin = {}
    for word, readings in readings_by_word.items():
        if len(readings) < 2:
            # Only words with several readings are reported.
            continue
        # max() keeps the first record on ties, i.e. the earliest in the file.
        best_pinyin = max(readings, key=lambda reading: int(reading[1]))[0]
        high_freq_pinyin[word] = [digit_pattern.sub("", best_pinyin)]
    return high_freq_pinyin
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐