您的位置:首页 > 其它

使用pyhunspell检查多国语言词汇

2016-06-30 09:09 337 查看
import codecs
import os

import hunspell

# Maps a numeric language code to the Hunspell dictionary base name(s) used
# for that language.  A value may list several comma-separated names (regional
# variants); get_hunspell() loads one checker per name.
# NOTE(review): many keys coincide with Windows primary language identifiers
# (e.g. 0x0009 for English), but not all of them do -- confirm the source of
# these codes against the caller.
hun_dicts = {
    0x0001: 'ar',
    0x0002: 'bg_BG',
    0x0005: 'cs',
    0x0006: 'da_DK',
    0x0007: 'de_DE,de_AT,de_CH',
    0x0008: 'el',
    0x0009: 'en_CA,en_US,en_AU,en_ZA,en_GB',
    0x000A: 'es',
    0x000B: 'fi_FI',
    0x000C: 'fr',
    0x000D: 'he',
    0x000E: 'hu_HU',
    0x0010: 'it_IT',
    0x0012: 'ko',
    0x0013: 'nl',
    0x0014: 'nb,nn',
    0x0015: 'pl',
    0x0016: 'pt,pt_BR',
    0x0018: 'ro_RO',
    0x0019: 'ru_RU',
    0x001B: 'sk',
    0x001C: 'sq_AL',
    0x001D: 'sv_SE',
    0x001E: 'th',
    0x001F: 'tr',
    0x0020: 'ur_PK',
    0x0021: 'id_ID',
    0x0022: 'uk',
    0x0024: 'sl',
    0x0025: 'et_EE',
    0x0026: 'lv_LV',
    0x0027: 'lt',
    0x0029: 'fa',
    0x002A: 'vi',
    0x002F: 'mk_MK',
    0x0034: 'xh_ZA',
    0x0035: 'zu_ZA',
    0x0036: 'af_ZA',
    0x0039: 'hi_IN',
    0x003e: 'ms_MY',
    0x0041: 'sw_TZ',
    0x0044: 'am_ET',
    # Disabled in the original; the reason is not visible in this file.
    # 0x0058: 'hausa',
    0x0059: 'hr',
    0x0061: 'kk_KZ',
    0x0080: 'sh',
    0x0084: 'ta_IN',
    # Disabled in the original; the reason is not visible in this file.
    # 0x0093: 'yoruba',
    # 0x0095: 'igbo',
}

def get_hunspell(lang, path='/usr/share/hunspell/'):
    """Load the Hunspell checkers configured for a language code.

    Args:
        lang: Key into ``hun_dicts``.
        path: Directory holding the ``<name>.dic`` / ``<name>.aff`` file
            pairs.  Defaults to the directory the original code hard-coded;
            a trailing slash is optional.

    Returns:
        A list with one ``hunspell.HunSpell`` instance per dictionary name
        mapped to ``lang`` (in the order the names are listed), or ``None``
        when ``lang`` has no entry in ``hun_dicts``.
    """
    names = hun_dicts.get(lang)
    if names is None:
        return None

    # One language code may map to several comma-separated dictionaries.
    bases = [os.path.join(path, name) for name in names.split(',')]
    return [hunspell.HunSpell(base + '.dic', base + '.aff') for base in bases]

# Encoding names reported by Hunspell dictionaries that Python's codec
# registry does not recognise, mapped to the equivalent Python codec name.
# NOTE(review): 'microsoft-cp1251' is believed to be the name declared by the
# Bulgarian (bg_BG) dictionary -- confirm against the installed .aff file.
_HUNSPELL_CODEC_ALIASES = {
    'TIS620-2533': 'tis_620',
    'microsoft-cp1251': 'cp1251',
}


def is_spell(word, hs_list):
    """Return True if any checker in ``hs_list`` accepts ``word``.

    Args:
        word: Text string to check.  It is encoded to each dictionary's own
            encoding before being handed to that dictionary.
        hs_list: Iterable of checker objects exposing ``get_dic_encoding()``
            and ``spell(bytes)``, e.g. the list returned by
            ``get_hunspell()``.  ``None`` (what ``get_hunspell()`` returns
            for an unknown language) or an empty list yields ``False``.

    Returns:
        ``True`` as soon as one checker accepts the word, otherwise
        ``False``.
    """
    for hs in hs_list or ():
        encoding = hs.get_dic_encoding()
        encoding = _HUNSPELL_CODEC_ALIASES.get(encoding, encoding)
        try:
            encoded = word.encode(encoding)
        except UnicodeEncodeError:
            # The word contains characters this dictionary's encoding cannot
            # represent, so it cannot be in this dictionary; try the next one
            # instead of aborting the whole check.
            continue
        if hs.spell(encoded):
            return True
    return False
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: