
Extracting keywords from each file in a file list with Python

Function description:

Scan all files under a given path, extract the 300 most frequent words from each file, and save them to a database.

Prerequisite: you need to have NLTK installed and configured.
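
If you want to check the NLTK side on its own before wiring in the database, a minimal sketch of the frequency-extraction step might look like the following (the directory path is a placeholder, and the 300-word cutoff simply mirrors the script below):

import nltk
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/path/to/your/files'   # placeholder: directory holding the files
filelists = PlaintextCorpusReader(corpus_root, '.*')

for fileid in filelists.fileids():
    # keep only alphabetic tokens longer than two characters
    words = [w for w in filelists.words(fileid) if w.isalpha() and len(w) > 2]
    fdist = nltk.FreqDist(words)
    # the (up to) 300 most frequent words in this file
    top = sorted(fdist, key=fdist.get, reverse=True)[:300]
    print fileid, top[:10]   # print the first ten as a sanity check

The full script below reads the directory from the command line and writes the same results into MySQL.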

#!/usr/bin/python
#coding=utf-8
'''
function : This script will create a database named mydb and then
           extract keywords from a set of privacy policy files.

author   : Chicho

date     : 2014/7/28

running  : python key_extract.py -d path_of_file
'''

import sys,getopt
import nltk
import MySQLdb
from nltk.corpus import PlaintextCorpusReader

corpus_root = ""

if __name__ == '__main__':

    # parse the command line: -d / --directory gives the corpus directory
    opts, args = getopt.getopt(sys.argv[1:], "d:h", ["directory=", "help"])

    # get the directory
    for op, value in opts:
        if op in ("-d", "--directory"):
            corpus_root = value

    # Actually, the getopt approach above is a little heavyweight for a single
    # argument; you can simply pass the path as a positional argument and read
    # it with sys.argv instead:
    '''
    running : python key_extract.py path_of_file
    corpus_root = sys.argv[1]
    '''

    # corpus_root is the directory of privacy policy files; all of them are HTML files
    filelists = PlaintextCorpusReader(corpus_root, '.*')

    # get the files' list
    files = filelists.fileids()

    # connect to the database (replace the placeholders with your own settings;
    # the port must be an integer)
    conn = MySQLdb.connect(host='your_host_ip_address', user='your_username',
                           port=your_port, passwd='your_password')
    # get the cursor
    curs = conn.cursor()

    # make sure the connection uses UTF-8
    conn.set_character_set('utf8')
    curs.execute('set names utf8')
    curs.execute('SET CHARACTER SET utf8;')
    curs.execute('SET character_set_connection=utf8;')

    # create a database named mydb (only needed on the first run)
    '''
    try:
        curs.execute("create database mydb")
    except Exception, e:
        print e
    '''

    conn.select_db('mydb')

    # add 300 keyword columns (key0 ... key299) to the filekeywords table
    try:
        for i in range(300):
            sql = "alter table filekeywords add " + "key" + str(i) + " varchar(45)"
            curs.execute(sql)
    except Exception, e:
        print e

    i = 0
    for privacyfile in files:
        # insert one row per file and record the file name
        sql = "insert into filekeywords set id =" + str(i)
        curs.execute(sql)
        sql = "update filekeywords set name =" + "'" + privacyfile + "' where id= " + str(i)
        curs.execute(sql)

        # get the words in the privacy policy, keeping only alphabetic
        # tokens longer than two characters
        wordlist = [w for w in filelists.words(privacyfile) if w.isalpha() and len(w) > 2]

        # get the keywords: sort the vocabulary by decreasing frequency
        fdist = nltk.FreqDist(wordlist)
        vol = sorted(fdist, key=fdist.get, reverse=True)
        key_num = len(vol)
        if key_num > 300:
            key_num = 300
        for j in range(key_num):
            sql = "update filekeywords set " + "key" + str(j) + "=" + "'" + vol[j] + "' where id=" + str(i)
            curs.execute(sql)
        i = i + 1

    conn.commit()
    curs.close()
    conn.close()
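
One thing the script takes for granted: the mydb database must already contain a table named filekeywords with at least an id column and a name column (the 300 key columns are added by the "alter table" loop above). The exact column types below are an assumption, but creating the table once beforehand could look like this:

import MySQLdb

# placeholder connection settings -- replace with your own;
# this assumes the mydb database itself already exists
conn = MySQLdb.connect(host='localhost', user='your_username',
                       port=3306, passwd='your_password', db='mydb')
curs = conn.cursor()

curs.execute("""
    create table if not exists filekeywords (
        id   int primary key,
        name varchar(255)
    )
""")

conn.commit()
curs.close()
conn.close()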


Please cite the source when reprinting: http://blog.csdn.net/chichoxian/article/details/42003603