您的位置:首页 > 编程语言 > Python开发

上班之后完成的第一个项目

2017-07-14 17:23 525 查看
今天下午,终于在3月上班后,完成了第一个项目。

项目的大致内容是,将Wikipedia上的某个特定分类(n多个子分类)的所有内容抓取下来,放到gensim的word2vec中训练出模型。

  再将不同的keywords的simword叠加起来,后面的similarity相加,取前topN个,生成一个重复的simword template。

中间进行了很多种尝试,从公司的ES,到google search,最后选定Wikipedia作为语料库。技术都不难,但是过程中趟了一些坑。比如说语料库太大,怎么训练模型等等。

粘上代码,用以将来再看的时候,能会心一笑。

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from nltk.corpus import stopwords
from util.logger import cluster_log
from util.date_util import DateUtil
import settings as job_settings
import gensim

class FilterTemplate(object):
    """Build a weighted "global template" string from campaign keywords.

    The pipeline is: split keywords into single words, look up each word's
    most similar words in a per-category word2vec model, add up the
    similarity of words that occur for several keywords, keep the highest
    totals and repeat each kept word proportionally to its total.
    """

    def __init__(self, category):
        """Store the campaign category and read tuning values from settings.

        :param category: campaign category name used to select the model.
        """
        # TODO: not defined campaign category
        self.category = category
        self.topN_similarity = job_settings.topN_similarity
        self.topN_simword = job_settings.topN_simword
        self.X = job_settings.X
        self.model_root_path = job_settings.model_root_path

        self.date_util = DateUtil()

    def model_path(self):
        """Return the model file path for ``self.category``.

        :raises Exception: if the category has no configured model.
        """
        # campaign category has different model path
        # TODO: not defined all the category
        # NOTE(review): "manifacturing" is kept as-is because it is also part
        # of the path; presumably it matches the on-disk name -- confirm.
        campaign_category = {
            "technology": self.model_root_path + "/technology/technology_model",
            "manifacturing": self.model_root_path + "/manifacturing/manifacturing_model",
        }
        if self.category not in campaign_category:
            # if category don't exist process throws an exception
            raise Exception(
                "Not defined category: {0}, failed to get model path".format(self.category))
        return campaign_category[self.category]

    def model_keyword_format(self, keywords_list):
        """Turn keywords into the ``[[word], [word], ...]`` shape the model lookup uses.

        Each keyword is split on single spaces; duplicates, English stopwords
        and non-alphabetic tokens are dropped.

        :param keywords_list: iterable of keyword strings.
        :return: list of one-element lists, one per distinct remaining word.
        """
        # a set gives O(1) membership tests in the filter below
        english_stopwords = set(stopwords.words('english'))
        word_bag = []
        for keyword in keywords_list:
            try:
                # split() of a single word yields [word], so no length check
                # is needed (the old list-vs-int comparison broke on Python 3)
                word_bag.extend(keyword.split(' '))
            except AttributeError:
                # non-string keyword: format() keeps the handler from raising
                cluster_log.info("ERROR KEYWORD: {0}".format(keyword))
                continue
        # dispose of replace word
        return [[word] for word in set(word_bag)
                if word not in english_stopwords and word.isalpha()]

    def simword_similarity(self, word_list):
        """Look up similar words for every entry of ``word_list``.

        :param word_list: entries as produced by ``model_keyword_format``.
        :return: list with one list per entry that could be looked up; each
            inner list holds the ``(simword, similarity)`` pairs returned by
            the model followed by a ``(entry, 1)`` marker for the entry itself.
        """
        model = gensim.models.Word2Vec.load(self.model_path())
        # newer gensim exposes the lookup on ``model.wv``; fall back to the
        # model itself for releases that predate that attribute
        vectors = getattr(model, "wv", model)
        all_simword_list = []
        cluster_log.info("strat to find simword and similarity in word2vector model, timestamp: {timstamp}".format(
            timstamp=self.date_util.get_current_time_str()
        ))
        for word in word_list:
            try:
                # topN_simword means to get the top n simword
                simword_similarity = vectors.most_similar(word, topn=self.topN_simword)
            except Exception:
                # best effort per word (e.g. word missing from the vocabulary),
                # but no longer traps KeyboardInterrupt / SystemExit
                cluster_log.info("{0} can't find simword in word2vector model".format(word))
                continue
            # save the word (word similarity = 1) into simword similarity list
            simword_similarity.append((word, 1))
            all_simword_list.append(simword_similarity)
        cluster_log.info("finish to find simword and similarity in word2vector model, timestamp: {timstamp}".format(
            timstamp=self.date_util.get_current_time_str()
        ))
        return all_simword_list

    def clean_simword(self, simword):
        """Tell whether a ``(token, similarity)`` pair should be counted.

        A pair is kept when the token is a purely alphabetic string and the
        similarity is not exactly 1 (1 marks the original keyword entry).
        Tokens without an ``isalpha`` method, such as the list-valued keyword
        marker, are rejected instead of raising.

        :param simword: indexable pair ``(token, similarity)``.
        :return: ``True`` if the pair should be counted, else ``False``.
        """
        token, similarity = simword[0], simword[1]
        is_alpha = getattr(token, "isalpha", None)
        if is_alpha is None:
            return False
        # isalpha() already excludes pure digits, so no isdigit() check
        return bool(is_alpha()) and similarity != 1

    def non_replace_simword(self, all_simword_list):
        """Sum similarities per distinct simword and return the top entries.

        :param all_simword_list: output of ``simword_similarity``.
        :return: list of ``(simword, total_similarity)`` tuples sorted by total
            in descending order, cut to ``self.topN_similarity`` entries.
        """
        totals = {}
        for simword_list in all_simword_list:
            label = None
            try:
                # the entry with similarity 1 identifies the source keyword;
                # it is only used for the log message below
                markers = [entry[0] for entry in simword_list if entry[1] == 1]
                if markers:
                    label = markers[0]
                for simword in simword_list:
                    if self.clean_simword(simword):
                        totals[simword[0]] = totals.get(simword[0], 0) + simword[1]
            except (AttributeError, IndexError, TypeError):
                # malformed entry: keep what was summed so far and move on
                cluster_log.info("{0} error in all_simword_list ".format(label))
                continue
        ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
        return ranked[:self.topN_similarity]

    def global_template(self, sorted_set):
        """Render ``(simword, total)`` pairs as a space-separated template.

        Each word is repeated ``int(total * self.X)`` times, or once when the
        total is 0; every repetition is followed by a single space.

        :param sorted_set: iterable of ``(simword, total_similarity)`` pairs.
        :return: the template string (empty if nothing could be rendered).
        """
        pieces = []
        for simword in sorted_set:
            try:
                word, similarity = simword[0], simword[1]
                frequency = int(similarity * self.X) if similarity != 0 else 1
                pieces.append((word + ' ') * frequency)
            except (TypeError, IndexError, ValueError, OverflowError):
                # log the whole entry: indexing it again could raise here
                cluster_log.info("Simword: {0} ERROR".format(simword))
                continue
        return "".join(pieces)

    def generate_global_template(self, keywords_list):
        """Run the full pipeline for ``keywords_list`` and return the template.

        An error is logged (and the empty string returned) when no template
        could be produced.
        """
        formatted_keywords = self.model_keyword_format(keywords_list)
        all_simword_list = self.simword_similarity(formatted_keywords)
        ranked_simwords = self.non_replace_simword(all_simword_list)
        global_template = self.global_template(ranked_simwords)
        if not global_template:
            cluster_log.error("failed to generate global template")
        return global_template
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息