nlp_关键词库(mysql数据库)去噪_python
2016-07-14 15:05
330 查看
1.频率归一化词库
2.去噪
4、合并三个表格为combine_fix_word和combine_fix_word_only, process_words_combine:
#!/usr/bin/python
#-*- coding:UTF-8 -*-
from __future__ import division
import MySQLdb as mdb
import chardet
# Merge the three de-noised vocabularies (fix_word_ansj, fix_word_sougou and
# fix_word_celebrity) into two tables:
#   combine_fix_word       -- keyword, nature and frequency
#   combine_fix_word_only  -- keyword only
# Each merge runs as two UNION steps through a temporary *_like table, and the
# temporary tables are dropped at the end.
# NOTE(review): the database credentials are hard-coded below.


def _copy_rows(cur, select_sql, insert_sql, frequency_col=None):
    """Run select_sql and insert every returned row using insert_sql.

    Values are bound as query parameters instead of being pasted into the
    statement text, so keywords containing quotes or backslashes can no
    longer break (or alter) the INSERT.

    cur           -- open DB-API cursor
    select_sql    -- SELECT statement producing the rows to copy
    insert_sql    -- INSERT statement with one %s placeholder per column
    frequency_col -- index of the frequency column, or None when the rows
                     carry no frequency; the value is written with eight
                     decimals ("%.8f"), exactly as the script always did
    """
    cur.execute(select_sql)
    for row in cur.fetchall():
        params = list(row)
        if frequency_col is not None:
            params[frequency_col] = "%.8f" % params[frequency_col]
        cur.execute(insert_sql, tuple(params))


con = mdb.connect('localhost','root','zxwxwz','mysql_test',charset='utf8')
try:
    cur = con.cursor()

    # Tables holding keyword, nature (part of speech) and frequency.
    cur.execute("create table if not exists combine_fix_word(id int(10) not NULL primary KEY auto_increment, keyword varchar(128) character set utf8 not NULL ,nature varchar(20) character set utf8,frequency float )default charset=utf8;")
    cur.execute("create table if not exists combine_fix_word_like like combine_fix_word;")

    # Step 1: ansj UNION sougou -> temporary table.
    _copy_rows(
        cur,
        "select keyword,nature,frequency from fix_word_ansj union select keyword,nature,frequency from fix_word_sougou",
        "insert into combine_fix_word_like(keyword,nature,frequency) values(%s,%s,%s)",
        frequency_col=2,
    )
    con.commit()

    # Step 2: temporary table UNION celebrity -> final table.
    _copy_rows(
        cur,
        "select keyword,nature,frequency from combine_fix_word_like union select keyword,nature,frequency from fix_word_celebrity",
        "insert into combine_fix_word(keyword,nature,frequency) values(%s,%s,%s)",
        frequency_col=2,
    )
    con.commit()

    # Tables holding the keyword only.
    cur.execute("create table if not exists combine_fix_word_only(id int(10) not NULL primary KEY auto_increment, keyword varchar(128) character set utf8 not NULL )default charset=utf8;")
    cur.execute("create table if not exists combine_fix_word_only_like like combine_fix_word_only;")

    _copy_rows(
        cur,
        "select keyword from fix_word_ansj union select keyword from fix_word_sougou",
        "insert into combine_fix_word_only_like(keyword) values(%s)",
    )
    con.commit()

    _copy_rows(
        cur,
        "select keyword from combine_fix_word_only_like union select keyword from fix_word_celebrity",
        "insert into combine_fix_word_only(keyword) values(%s)",
    )
    con.commit()

    # The intermediate tables are no longer needed.
    cur.execute("drop table combine_fix_word_only_like")
    cur.execute("drop table combine_fix_word_like")
    con.commit()
finally:
    # Release the connection even if one of the statements above fails.
    con.close()
#!/usr/bin/python
#-*- coding:UTF-8 -*-
# Step 1: build the frequency-normalised vocabularies nor_word_sougou,
# nor_word_ansj and nor_word_celebrity from sougou_word,
# ansj_seg_default_dic and celebrity.  Every frequency is divided by the sum
# of the frequencies of the batch of rows it was read with.
# NOTE(review): the database credentials are hard-coded below.
from __future__ import division
import MySQLdb as mdb
#import chardet


def _insert_normalized(cur, table, rows, keyword_col, nature_col, frequency_col):
    """Insert rows into table with each frequency divided by the batch total.

    cur           -- open DB-API cursor
    table         -- destination table name (taken from the fixed tuples
                     below, never from external input)
    rows          -- sequence of result rows forming one batch
    keyword_col, nature_col, frequency_col -- column positions inside a row

    Values are bound as query parameters so keywords containing quotes or
    backslashes cannot break the INSERT.  The quotient is written with eight
    decimals ("%.8f"), exactly as before.  An empty batch inserts nothing;
    a non-empty batch whose frequencies sum to zero still raises
    ZeroDivisionError.
    """
    total = sum(row[frequency_col] for row in rows)
    insert_sql = "insert into %s(keyword,nature,frequency) values(%%s,%%s,%%s)" % table
    for row in rows:
        cur.execute(
            insert_sql,
            (row[keyword_col], row[nature_col], "%.8f" % (row[frequency_col] / total)),
        )


table_ori=('ansj_seg_default_dic','celebrity')
table_new=('nor_word_ansj','nor_word_celebrity')
limit_num=10000

con = mdb.connect('localhost','root','zxwxwz','mysql_test',charset='utf8')
try:
    cur = con.cursor()
    for nor_table in ('nor_word_ansj', 'nor_word_celebrity', 'nor_word_sougou'):
        cur.execute("create table if not exists %s(id int(10) not null primary key auto_increment,keyword varchar(128) character set utf8 not null,nature varchar(20) character set utf8,frequency float(20))default charset=utf8;" % nor_table)

    # sougou_word is normalised over the whole table in one batch; its rows
    # are read as (keyword, frequency, nature).
    cur.execute("select * from sougou_word")
    _insert_normalized(cur, 'nor_word_sougou', cur.fetchall(), 0, 2, 1)

    # The other two tables are read in id ranges of limit_num rows; their
    # rows are read as (keyword, nature, frequency, ...).
    # NOTE(review): the frequency total is computed per id range, not per
    # table, so values from different ranges use different denominators --
    # confirm this is intended.
    for source, target in zip(table_ori, table_new):
        cur.execute("select count(*) from %s " % source)
        full_chunks = cur.fetchone()[0] // limit_num
        for j in range(full_chunks):
            cur.execute(
                "select * from %s where id>%%s and id<=%%s" % source,
                (limit_num * j, limit_num * (j + 1)),
            )
            _insert_normalized(cur, target, cur.fetchall(), 0, 1, 2)
            con.commit()
        # Remainder: every row whose id lies beyond the last full range.
        cur.execute("select * from %s where id>%%s " % source, (full_chunks * limit_num,))
        _insert_normalized(cur, target, cur.fetchall(), 0, 1, 2)
        con.commit()
finally:
    # Release the connection even if one of the statements above fails.
    con.close()
2.去噪
#!/usr/bin/python
#-*- coding:UTF-8 -*-
# Step 2: de-noise the normalised vocabularies.  Rows of nor_word_* are split
# into fix_word_* (kept) and stop_word_* (discarded); later passes move
# pronoun entries to the stop table and move some stop entries back.
# NOTE(review): the database credentials are hard-coded below.
from __future__ import division
import MySQLdb as mdb
import is_cn_or_en_or_number_char

table_ori=('nor_word_ansj','nor_word_celebrity','nor_word_sougou')
stop_table=('stop_word_ansj','stop_word_celebrity','stop_word_sougou')
fix_table=('fix_word_ansj','fix_word_celebrity','fix_word_sougou')

# Nature tags that mark a word as useful.  The check is a substring test on
# the lower-cased nature, so any nature containing one of these strings
# matches (for example 'pron' matches through 'n').
USEFUL_NATURES = ('a', 'ad', 'aj', 'an', 'i', 'j', 'n', 'ng', 'nr', 'nrfg',
                  'nrt', 'ns', 'nt', 'nz', 'v', 'vd', 'vg', 'vi', 'vn', 'vq')


def _char_stats(keyword):
    """Return (chinese_count, digit_count, last_chinese_char) for keyword."""
    num_cn = 0
    num_number = 0
    last_cn = ''
    for keychar in keyword:
        if is_cn_or_en_or_number_char.is_cn_char(keychar):
            num_cn += 1
            last_cn = keychar
        if is_cn_or_en_or_number_char.is_number(keychar):
            num_number += 1
    return num_cn, num_number, last_cn


def _insert_word(cur, table, keyword, nature, frequency):
    """Insert one word into table, binding the values as query parameters.

    Parameter binding keeps keywords that contain quotes or backslashes from
    breaking the statement.  The frequency is written with eight decimals
    ("%.8f"), exactly as before.  table comes from the fixed tuples above.
    """
    cur.execute(
        "insert into %s(keyword,nature,frequency) values(%%s,%%s,%%s)" % table,
        (keyword, nature, "%.8f" % frequency),
    )


def _rescue_from_stop(cur, stop_name, fix_name, rows, enough_chinese):
    """Move qualifying rows from the stop table back to the fix table.

    rows are (keyword, nature, frequency, id) tuples read from stop_name.
    A row qualifies when enough_chinese(chinese_count) is true, or when it
    contains exactly one Chinese character, that character is u'后', and it
    contains exactly two digits.  Qualifying rows are inserted into fix_name
    and deleted from stop_name by id.
    """
    for row in rows:
        keyword = u'%s' % row[0]
        nature, frequency, id_n = row[1], row[2], row[3]
        num_cn, num_number, char_cn_one = _char_stats(keyword)
        if enough_chinese(num_cn) or (num_cn == 1 and num_number == 2 and char_cn_one == u'后'):
            _insert_word(cur, fix_name, keyword, nature, frequency)
            cur.execute("delete from %s where id=%%s" % stop_name, (id_n,))


con = mdb.connect('localhost','root','zxwxwz','mysql_test',charset='utf8')
try:
    cur = con.cursor()

    # Stop-word tables first, then the tables of remaining (kept) words.
    for name in stop_table + fix_table:
        cur.execute("create table if not exists %s(keyword varchar(128) character set utf8 not null,nature varchar(20) character set utf8,frequency float(20),id int(10) not null primary key auto_increment)default charset=utf8;" % name)

    # Keep words whose nature is in the useful list and that contain at least
    # two Chinese characters; everything else goes to the stop table.
    # The range stops at len - 2, so only the first table is processed.
    for i in range(0, len(fix_table) - 2):
        cur.execute("select * from %s " % table_ori[i])
        # Report the table actually being processed.
        print(table_ori[i])
        for row in cur.fetchall():
            # nor_word_* rows are read as (id, keyword, nature, frequency).
            keyword = u'%s' % row[1]
            nature = row[2].lower()
            frequency = row[3]
            num_cn = _char_stats(keyword)[0]
            if num_cn > 1 and any(tag in nature for tag in USEFUL_NATURES):
                _insert_word(cur, fix_table[i], keyword, nature, frequency)
            else:
                _insert_word(cur, stop_table[i], keyword, nature, frequency)
        con.commit()

    # Move pronoun entries from the fix table to the stop table.
    for j in range(0, len(stop_table) - 2):
        for stop_nature in ('pron,', 'pron'):
            cur.execute("select * from %s where nature = %%s" % fix_table[j], (stop_nature,))
            for row in cur.fetchall():
                # fix_word_* rows are read as (keyword, nature, frequency, id).
                nature = row[1].lower()
                if stop_nature in nature:
                    _insert_word(cur, stop_table[j], row[0], nature, row[2])
            cur.execute("delete from %s where nature = %%s" % fix_table[j], (stop_nature,))
        con.commit()

    # From stop_word_.. pick the useful compound words (nature comb): two or
    # more Chinese characters, or the single Chinese character u'后' plus two
    # digits.  Add them to fix_word_.. and remove them from stop_word_..
    for i in range(0, len(stop_table) - 2):
        for useful_nature in ('comb', 'comb,'):
            cur.execute("select * from %s where nature = %%s" % stop_table[i], (useful_nature,))
            _rescue_from_stop(cur, stop_table[i], fix_table[i], cur.fetchall(),
                              lambda num_cn: num_cn > 1)
        con.commit()

    # From stop_word_.. pick useful entries of any nature: exactly four
    # Chinese characters, or the single Chinese character u'后' plus two
    # digits.  Add them to fix_word_.. and remove them from stop_word_..
    # With three tables this range is empty, so the pass does not run.
    for i in range(0, len(stop_table) - 3):
        cur.execute("select * from %s" % stop_table[i])
        _rescue_from_stop(cur, stop_table[i], fix_table[i], cur.fetchall(),
                          lambda num_cn: num_cn == 4)
        con.commit()
finally:
    # Release the connection even if one of the statements above fails.
    con.close()

# 3. Functions for testing whether a character is Chinese, English or a digit
def is_cn_char(i):
    """Return True if the single character i has a code point in U+4E00..U+9FA5.

    ord() raises TypeError when i is not exactly one character.
    """
    return 0x4e00 <= ord(i) < 0x9fa6


def is_chinese(uchar):
    """Return True if uchar compares within u'\\u4e00'..u'\\u9fa5' (inclusive)."""
    return u'\u4e00' <= uchar <= u'\u9fa5'


def is_number(uchar):
    """Return True if uchar compares within the ASCII digits '0'..'9'."""
    return u'\u0030' <= uchar <= u'\u0039'


def is_alphabet(uchar):
    """Return True if uchar compares within 'A'..'Z' or 'a'..'z'."""
    return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')
4、合并三个表格为combine_fix_word和combine_fix_word_only, process_words_combine:
#!/usr/bin/python
#-*- coding:UTF-8 -*-
from __future__ import division
import MySQLdb as mdb
import chardet
# Merge the three de-noised vocabularies (fix_word_ansj, fix_word_sougou and
# fix_word_celebrity) into two tables:
#   combine_fix_word       -- keyword, nature and frequency
#   combine_fix_word_only  -- keyword only
# Each merge runs as two UNION steps through a temporary *_like table, and the
# temporary tables are dropped at the end.
# NOTE(review): the database credentials are hard-coded below.


def _copy_rows(cur, select_sql, insert_sql, frequency_col=None):
    """Run select_sql and insert every returned row using insert_sql.

    Values are bound as query parameters instead of being pasted into the
    statement text, so keywords containing quotes or backslashes can no
    longer break (or alter) the INSERT.

    cur           -- open DB-API cursor
    select_sql    -- SELECT statement producing the rows to copy
    insert_sql    -- INSERT statement with one %s placeholder per column
    frequency_col -- index of the frequency column, or None when the rows
                     carry no frequency; the value is written with eight
                     decimals ("%.8f"), exactly as the script always did
    """
    cur.execute(select_sql)
    for row in cur.fetchall():
        params = list(row)
        if frequency_col is not None:
            params[frequency_col] = "%.8f" % params[frequency_col]
        cur.execute(insert_sql, tuple(params))


con = mdb.connect('localhost','root','zxwxwz','mysql_test',charset='utf8')
try:
    cur = con.cursor()

    # Tables holding keyword, nature (part of speech) and frequency.
    cur.execute("create table if not exists combine_fix_word(id int(10) not NULL primary KEY auto_increment, keyword varchar(128) character set utf8 not NULL ,nature varchar(20) character set utf8,frequency float )default charset=utf8;")
    cur.execute("create table if not exists combine_fix_word_like like combine_fix_word;")

    # Step 1: ansj UNION sougou -> temporary table.
    _copy_rows(
        cur,
        "select keyword,nature,frequency from fix_word_ansj union select keyword,nature,frequency from fix_word_sougou",
        "insert into combine_fix_word_like(keyword,nature,frequency) values(%s,%s,%s)",
        frequency_col=2,
    )
    con.commit()

    # Step 2: temporary table UNION celebrity -> final table.
    _copy_rows(
        cur,
        "select keyword,nature,frequency from combine_fix_word_like union select keyword,nature,frequency from fix_word_celebrity",
        "insert into combine_fix_word(keyword,nature,frequency) values(%s,%s,%s)",
        frequency_col=2,
    )
    con.commit()

    # Tables holding the keyword only.
    cur.execute("create table if not exists combine_fix_word_only(id int(10) not NULL primary KEY auto_increment, keyword varchar(128) character set utf8 not NULL )default charset=utf8;")
    cur.execute("create table if not exists combine_fix_word_only_like like combine_fix_word_only;")

    _copy_rows(
        cur,
        "select keyword from fix_word_ansj union select keyword from fix_word_sougou",
        "insert into combine_fix_word_only_like(keyword) values(%s)",
    )
    con.commit()

    _copy_rows(
        cur,
        "select keyword from combine_fix_word_only_like union select keyword from fix_word_celebrity",
        "insert into combine_fix_word_only(keyword) values(%s)",
    )
    con.commit()

    # The intermediate tables are no longer needed.
    cur.execute("drop table combine_fix_word_only_like")
    cur.execute("drop table combine_fix_word_like")
    con.commit()
finally:
    # Release the connection even if one of the statements above fails.
    con.close()
相关文章推荐
- MySQL中的integer 数据类型
- MySQL存储过程
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- mysql中int、bigint、smallint 和 tinyint的区别与长度
- mysql load data 导出、导入 csv
- install and upgrade scrapy
- source命令执行SQL脚本文件
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- MySQL创建用户及权限控制
- MySQL管理数据表