您的位置:首页 > 编程语言 > Python开发

nlp_关键词库(mysql数据库)去噪_python

2016-07-14 15:05 330 查看
1.频率归一化词库

#!/usr/bin/python
#-*- coding:UTF-8 -*-
from __future__ import division
import MySQLdb as mdb
#import chardet

def _insert_normalized(cursor, table, rows, total,
                       keyword_idx, nature_idx, frequency_idx):
    """Insert ``rows`` into ``table`` with each frequency divided by ``total``.

    ``keyword_idx``, ``nature_idx`` and ``frequency_idx`` are the positions of
    the keyword, nature and frequency values inside each source row.

    Values are handed to the driver as query parameters, so keywords that
    contain quotes or backslashes are escaped instead of breaking the
    statement.
    """
    # Only the table name is interpolated; it always comes from the fixed
    # names used in this script.
    sql = ("insert into %s(keyword,nature,frequency) values(%%s,%%s,%%s)"
           % table)
    for row in rows:
        # Guard against ZeroDivisionError when every frequency is zero.
        # Relies on the true division enabled by the module's
        # ``from __future__ import division``.
        share = row[frequency_idx] / total if total else 0.0
        # Same eight-decimal text representation the statement always used.
        cursor.execute(sql, (row[keyword_idx], row[nature_idx],
                             '%.8f' % share))


def _iter_id_windows(cursor, table, row_count, window):
    """Yield the rows of ``table`` in windows of ``window`` consecutive ids.

    ``row_count // window`` full windows are read with
    ``id > window*j and id <= window*(j+1)``; a final query returns every row
    whose id lies beyond the last full window.
    """
    full_windows = row_count // window
    ranged_sql = "select * from %s where id>%%s and id<=%%s" % table
    for j in range(full_windows):
        cursor.execute(ranged_sql, (window * j, window * (j + 1)))
        yield cursor.fetchall()
    cursor.execute("select * from %s where id>%%s" % table,
                   (full_windows * window,))
    yield cursor.fetchall()


# Connection settings are hard-coded for a local database.
con = mdb.connect('localhost', 'root', 'zxwxwz', 'mysql_test', charset='utf8')
try:
    cur = con.cursor()

    # Destination tables; all three share the same column layout.
    # NOTE(review): the tables are created only when missing and never
    # emptied, so running the script twice appends a second copy of every row.
    for new_table in ('nor_word_ansj', 'nor_word_celebrity', 'nor_word_sougou'):
        cur.execute(
            "create table if not exists %s("
            "id int(10) not null primary key auto_increment,"
            "keyword varchar(128) character set utf8 not null,"
            "nature varchar(20) character set utf8,"
            "frequency float(20))default charset=utf8;" % new_table)

    # sougou_word is read in one go; its rows are indexed as
    # (keyword, frequency, nature).
    cur.execute("select * from sougou_word")
    rows = cur.fetchall()
    num_freq = 0
    for row in rows:
        num_freq = num_freq + row[1]
    _insert_normalized(cur, 'nor_word_sougou', rows, num_freq, 0, 2, 1)
    con.commit()

    # The remaining tables are read in id windows; their rows are indexed as
    # (keyword, nature, frequency).
    table_ori = ('ansj_seg_default_dic', 'celebrity')
    table_new = ('nor_word_ansj', 'nor_word_celebrity')

    limit_num = 10000
    for i in range(len(table_ori)):
        cur.execute("select count(*) from %s " % table_ori[i])
        num_line = cur.fetchone()

        # Pass 1: total frequency of the WHOLE table.  Summing per window (as
        # this loop used to do) normalised each 10000-id window on its own,
        # so values from different windows were not on a common scale and
        # disagreed with how sougou_word is normalised above.
        num_freq = 0
        for rows in _iter_id_windows(cur, table_ori[i], num_line[0], limit_num):
            for row in rows:
                num_freq = num_freq + row[2]

        # Pass 2: write the normalised rows, committing window by window.
        for rows in _iter_id_windows(cur, table_ori[i], num_line[0], limit_num):
            _insert_normalized(cur, table_new[i], rows, num_freq, 0, 1, 2)
            con.commit()
finally:
    # Release the connection even when a statement fails.
    con.close()


2.去噪

#!/usr/bin/python
#-*- coding:UTF-8 -*-
from __future__ import division
import MySQLdb as mdb
import is_cn_or_en_or_number_char
# Substrings looked for in the lower-cased ``nature`` value of a word.
# NOTE(review): this is a substring test, so the multi-letter entries are
# already covered by 'a', 'n' and 'v', and a value such as 'pron' matches
# through 'n' (it is moved back out in step 2).  The full list is kept to
# record the intent.
USEFUL_NATURE_TAGS = (
    'a', 'ad', 'aj', 'an', 'i', 'j', 'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns',
    'nt', 'nz', 'v', 'vd', 'vg', 'vi', 'vn', 'vq',
)


def _count_cn_and_digits(word):
    """Return ``(num_cn, num_number, last_cn_char)`` for the string ``word``.

    ``num_cn`` counts characters accepted by ``is_cn_char``, ``num_number``
    those accepted by ``is_number``; ``last_cn_char`` is the last character
    accepted by ``is_cn_char`` (``''`` when there is none).
    """
    num_cn = 0
    num_number = 0
    last_cn_char = ''
    for keychar in word:
        if is_cn_or_en_or_number_char.is_cn_char(keychar):
            num_cn += 1
            last_cn_char = keychar
        if is_cn_or_en_or_number_char.is_number(keychar):
            num_number += 1
    return num_cn, num_number, last_cn_char


def _insert_word(cursor, table, keyword, nature, frequency):
    """Insert one (keyword, nature, frequency) row into ``table``.

    The values travel as query parameters so that quotes or backslashes in a
    keyword are escaped by the driver; only the table name, which always
    comes from the fixed tuples below, is interpolated.
    """
    cursor.execute(
        "insert into %s(keyword,nature,frequency) values(%%s,%%s,%%s)" % table,
        (keyword, nature, '%.8f' % frequency))


def _promote_from_stop(cursor, stop_name, fix_name, rows, cn_count_ok):
    """Move qualifying ``rows`` from ``stop_name`` to ``fix_name``.

    ``rows`` are indexed as (keyword, nature, frequency, id).  A row
    qualifies when ``cn_count_ok(num_cn)`` is true, or when the keyword has
    exactly one Chinese character, that character is u'后', and it contains
    exactly two digits.
    """
    delete_sql = "delete from %s where id=%%s" % stop_name
    for row in rows:
        keyword = u'%s' % row[0]
        num_cn, num_number, only_cn = _count_cn_and_digits(keyword)
        hou_with_two_digits = (num_cn == 1 and num_number == 2
                               and only_cn == u'后')
        if cn_count_ok(num_cn) or hou_with_two_digits:
            _insert_word(cursor, fix_name, keyword, row[1], row[2])
            cursor.execute(delete_sql, (row[3],))


# Connection settings are hard-coded for a local database.
con = mdb.connect('localhost', 'root', 'zxwxwz', 'mysql_test', charset='utf8')
try:
    cur = con.cursor()
    table_ori = ('nor_word_ansj', 'nor_word_celebrity', 'nor_word_sougou')
    # stop_word_*: rejected words; fix_word_*: words that are kept.
    stop_table = ('stop_word_ansj', 'stop_word_celebrity', 'stop_word_sougou')
    fix_table = ('fix_word_ansj', 'fix_word_celebrity', 'fix_word_sougou')
    for table_name in stop_table + fix_table:
        cur.execute(
            "create table if not exists %s("
            "keyword varchar(128) character set utf8 not null,"
            "nature varchar(20) character set utf8,"
            "frequency float(20),"
            "id int(10) not null primary key auto_increment)"
            "default charset=utf8;" % table_name)

    # Step 1: keep words that carry a useful nature tag and contain at least
    # two Chinese characters; everything else goes to the stop table.
    # NOTE(review): len(fix_table) - 2 == 1, so only index 0 (the ansj
    # tables) is processed here and in steps 2 and 3.
    for i in range(0, len(fix_table) - 2):
        cur.execute("select * from %s " % table_ori[i])
        # Report the table actually being processed (this used to print
        # table_ori[1] regardless of i).
        print(table_ori[i])
        # Source rows are indexed as (id, keyword, nature, frequency).
        for row in cur.fetchall():
            keyword = u'%s' % row[1]
            nature = row[2].lower()
            frequency = row[3]
            num_cn = _count_cn_and_digits(keyword)[0]
            has_useful_tag = any(tag in nature for tag in USEFUL_NATURE_TAGS)
            if num_cn > 1 and has_useful_tag:
                target = fix_table[i]
            else:
                target = stop_table[i]
            _insert_word(cur, target, keyword, nature, frequency)
        con.commit()

    # Step 2: move rows whose nature equals 'pron,' or 'pron' from the fix
    # table to the stop table.
    for j in range(0, len(stop_table) - 2):
        for stop_nature in ('pron,', 'pron'):
            cur.execute(
                "select * from %s where nature = %%s" % fix_table[j],
                (stop_nature,))
            # fix/stop rows are indexed as (keyword, nature, frequency, id).
            for row in cur.fetchall():
                nature = row[1].lower()
                if stop_nature in nature:
                    _insert_word(cur, stop_table[j], row[0], nature, row[2])
            cur.execute(
                "delete from %s where nature = %%s" % fix_table[j],
                (stop_nature,))
        con.commit()

    # Step 3: from the stop table, move 'comb' / 'comb,' entries back to the
    # fix table when they contain more than one Chinese character, or the
    # single Chinese character u'后' together with two digits.
    for i in range(0, len(stop_table) - 2):
        for useful_nature in ('comb', 'comb,'):
            cur.execute(
                "select * from %s where nature = %%s" % stop_table[i],
                (useful_nature,))
            _promote_from_stop(cur, stop_table[i], fix_table[i],
                               cur.fetchall(), lambda count: count > 1)
        con.commit()

    # Step 4: from the stop table, move back entries with exactly four
    # Chinese characters, or the single character u'后' plus two digits.
    # NOTE(review): len(stop_table) - 3 == 0, so this loop body never runs as
    # written; confirm whether the step is meant to be disabled.
    for i in range(0, len(stop_table) - 3):
        cur.execute("select * from %s" % stop_table[i])
        _promote_from_stop(cur, stop_table[i], fix_table[i],
                           cur.fetchall(), lambda count: count == 4)
        con.commit()
finally:
    # Release the connection even when a statement fails.
    con.close()
3.判断汉字或英文或数字的函数(保存为 is_cn_or_en_or_number_char.py,供第2步的去噪脚本导入)

def is_cn_char(i):
    """Return True when the single character ``i`` lies in U+4E00..U+9FA5.

    ``i`` must be exactly one character, because it is passed to ``ord``.
    """
    # Inclusive upper bound, the same range is_chinese tests.
    return 0x4e00 <= ord(i) <= 0x9fa5

def is_chinese(uchar):
    """Return True when ``uchar`` compares within u'\\u4e00'..u'\\u9fa5'."""
    # The chained comparison already yields a bool; no if/else needed.
    return u'\u4e00' <= uchar <= u'\u9fa5'

def is_number(uchar):
    """Return True when ``uchar`` compares within u'0'..u'9' (ASCII digits)."""
    # u'0' is U+0030 and u'9' is U+0039.
    return u'0' <= uchar <= u'9'

def is_alphabet(uchar):
    """Return True when ``uchar`` compares within u'A'..u'Z' or u'a'..u'z'."""
    # u'A'..u'Z' is U+0041..U+005A and u'a'..u'z' is U+0061..U+007A.
    return (u'A' <= uchar <= u'Z') or (u'a' <= uchar <= u'z')


4.合并三个表格为combine_fix_word和combine_fix_word_only, process_words_combine:

#!/usr/bin/python
#-*- coding:UTF-8 -*-
from __future__ import division
import MySQLdb as mdb
import chardet
def _copy_rows(cursor, select_sql, insert_sql, to_params):
    """Run ``select_sql`` and insert every returned row with ``insert_sql``.

    ``to_params`` turns one fetched row into the parameter tuple for
    ``insert_sql``.  Values are passed as query parameters so that keywords
    containing quotes or backslashes are escaped by the driver instead of
    breaking the statement.
    """
    cursor.execute(select_sql)
    for row in cursor.fetchall():
        cursor.execute(insert_sql, to_params(row))


def _word_params(row):
    """Parameters for a (keyword, nature, frequency) row.

    The frequency keeps the eight-decimal representation the insert
    statement always used.
    """
    return (row[0], row[1], '%.8f' % row[2])


def _keyword_params(row):
    """Parameters for a keyword-only row."""
    return (row[0],)


# Connection settings are hard-coded for a local database.
con = mdb.connect('localhost', 'root', 'zxwxwz', 'mysql_test', charset='utf8')
try:
    cur = con.cursor()

    # Table with keyword, nature and frequency.
    cur.execute("create table if not exists combine_fix_word(id int(10) not NULL primary KEY auto_increment, keyword varchar(128) character set utf8 not NULL ,nature varchar(20) character set utf8,frequency float )default charset=utf8;")
    # Scratch table holding the ansj + sougou union; dropped at the end.
    cur.execute("create table if not exists combine_fix_word_like like combine_fix_word;")

    _copy_rows(
        cur,
        "select keyword,nature,frequency from fix_word_ansj union select keyword,nature,frequency from fix_word_sougou",
        "insert into combine_fix_word_like(keyword,nature,frequency) values(%s,%s,%s)",
        _word_params)
    con.commit()

    _copy_rows(
        cur,
        "select keyword,nature,frequency from combine_fix_word_like union select keyword,nature,frequency from fix_word_celebrity",
        "insert into combine_fix_word(keyword,nature,frequency) values(%s,%s,%s)",
        _word_params)
    con.commit()

    # Table with keywords only.
    cur.execute("create table if not exists combine_fix_word_only(id int(10) not NULL primary KEY auto_increment, keyword varchar(128) character set utf8 not NULL )default charset=utf8;")
    # Scratch table for the keyword-only union; dropped at the end.
    cur.execute("create table if not exists combine_fix_word_only_like like combine_fix_word_only;")

    _copy_rows(
        cur,
        "select keyword from fix_word_ansj union select keyword from fix_word_sougou",
        "insert into combine_fix_word_only_like(keyword) values(%s)",
        _keyword_params)
    con.commit()

    _copy_rows(
        cur,
        "select keyword from combine_fix_word_only_like union select keyword from fix_word_celebrity",
        "insert into combine_fix_word_only(keyword) values(%s)",
        _keyword_params)
    con.commit()

    # NOTE(review): if an earlier run failed before reaching this point, the
    # scratch tables still hold that run's rows and "create table if not
    # exists" reuses them as they are.
    cur.execute("drop table combine_fix_word_only_like")
    cur.execute("drop table combine_fix_word_like")
    con.commit()
finally:
    # Release the connection even when a statement fails.
    con.close()

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  nlp python mysql