您的位置:首页 > 其它

把NLTK中提取的unigram 特征转换成0,1向量表示方式

2014-08-28 12:31 357 查看
Weka 的 ARFF 格式文件要求把每条样本显式写成特征向量才能读取;NLTK 本身不需要这样做,但可以把提取出的 features 转换成 0/1 向量的形式表示出来。

本人代码设计能力实在太弱,折腾了一上午才调试出可用的脚本;脚本是用 Python 编写的。

""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

import csv

import re

# Matches any character (newlines included, because of DOTALL) followed by one
# or more copies of itself. Compiled once here rather than on every call.
_REPEATED_CHAR = re.compile(r"(.)\1{1,}", re.DOTALL)


def replaceTwoOrMore(s):
    """Collapse every run of a repeated character down to exactly two.

    Runs of length two are replaced by themselves, so in effect only runs of
    three or more characters get shortened, e.g. ``"loveeeeeee"`` becomes
    ``"lovee"`` (not ``"love"``).

    Args:
        s: The text to normalise.

    Returns:
        ``s`` with each run of identical characters cut to two characters.
    """
    return _REPEATED_CHAR.sub(r"\1\1", s)

def _read_stripped_lines(path):
    """Return every line of the text file at ``path`` with whitespace stripped.

    Blank lines are kept (as empty strings), so the position of each entry in
    the returned list still matches its line position in the file.

    Args:
        path: Name of the text file to read, one entry per line.

    Returns:
        A list with one stripped string per line of the file.
    """
    # The context manager closes the file even if reading raises.
    with open(path, "r") as handle:
        return [raw_line.strip() for raw_line in handle]


# Stop words that are removed from the tweets that get read in.
stopWords = _read_stripped_lines("stopwords.txt")

# The feature list: one previously selected unigram feature per line.
selected_features = _read_stripped_lines("selected_features.txt")

file1 = 'positive.csv'
file2 = 'negative.csv'

# Index of the CSV column that holds the tweet text. The original author used
# column 1 for their own data and noted column 5 for the 600000-tweet data set.
_TEXT_COLUMN = 1

# A token is kept only if it is purely ASCII-alphanumeric, starts with a letter
# and contains at least one further letter. Compiled once, outside the loops.
_TOKEN_PATTERN = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$")


def _tweet_tokens(text, stop_words):
    """Return the set of lower-cased, cleaned tokens found in one tweet.

    Each whitespace-separated word has its repeated characters collapsed by
    ``replaceTwoOrMore`` and surrounding quote/punctuation characters
    stripped. Stop words and words rejected by ``_TOKEN_PATTERN`` are dropped.

    Args:
        text: The raw tweet text.
        stop_words: Container of stop words supporting ``in``.

    Returns:
        A set of the surviving tokens, lower-cased.
    """
    tokens = set()
    for raw_word in text.split():
        candidate = replaceTwoOrMore(raw_word).strip('\'"?,.')
        lowered = candidate.lower()
        # Tokens are stored lower-cased, so the stop-word test must also look
        # at the lower-cased form; otherwise "The" slips past a stop list that
        # contains "the". The original-case test is kept so nothing that used
        # to be filtered is let through.
        if candidate in stop_words or lowered in stop_words:
            continue
        if _TOKEN_PATTERN.search(candidate) is None:
            continue
        tokens.add(lowered)
    return tokens


def _feature_row(tokens, features, label):
    """Build one output row: a "1,"/"0," cell per feature, then the label.

    The vector has exactly as many dimensions as ``features`` has entries.
    """
    row = ["1," if feature in tokens else "0," for feature in features]
    row.append(label)
    return row


def _vectorize_csv(path, label, features, stop_words):
    """Turn every tweet in the CSV file at ``path`` into a labelled 0/1 row.

    Args:
        path: CSV file whose column ``_TEXT_COLUMN`` holds the tweet text.
        label: Class label appended to each row (``'pos'`` or ``'neg'``).
        features: Ordered feature list; defines the vector dimensions.
        stop_words: Container of stop words supporting ``in``.

    Returns:
        A list of rows as produced by ``_feature_row``.
    """
    rows = []
    # The csv module needs a text-mode file opened with newline='' on
    # Python 3 (the previous 'rb' mode only works on Python 2). Undecodable
    # bytes are replaced rather than raising: non-ASCII tokens are rejected
    # by _TOKEN_PATTERN anyway.
    with open(path, "r", newline="", encoding="utf-8", errors="replace") as source:
        reader = csv.reader(source, delimiter=',', quotechar='"', escapechar='\\')
        for record in reader:
            # Blank lines and rows without a text column carry no tweet.
            if len(record) <= _TEXT_COLUMN:
                continue
            tokens = _tweet_tokens(record[_TEXT_COLUMN], stop_words)
            rows.append(_feature_row(tokens, features, label))
    return rows


def _write_rows(path, rows):
    """Write each row to ``path`` as one line, cells separated by a space."""
    with open(path, 'w') as sink:
        sink.writelines(' '.join(row) + "\n" for row in rows)


# Set membership is O(1); the stop-word list is consulted once per word.
_stop_word_set = set(stopWords)

# Compare every tweet with the selected feature list: a feature that occurs in
# the tweet is marked 1, otherwise 0. The vector is as long as the feature
# list (2000 entries in the author's example).
posWords = _vectorize_csv(file1, 'pos', selected_features, _stop_word_set)
negWords = _vectorize_csv(file2, 'neg', selected_features, _stop_word_set)

# Write out the converted vectors.
_write_rows('pos_array.txt', posWords)
_write_rows('neg_array.txt', negWords)

""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: