把NLTK中提取的unigram 特征转换成0,1向量表示方式
2014-08-28 12:31
357 查看
weka中arff格式文件 需要给出特征向量才可读取。NLTK中不需要,但是可以把features转成 向量的形式表示出来。
本人代码设计能力实在太弱,折腾了一上午才调试出来可用的 script;该脚本是在 Python 下完成的。
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
import csv
import re
def replaceTwoOrMore(s):
    """Collapse any run of 2+ identical characters down to exactly two.

    Normalises elongated tweet words so spelling variants map to one
    token, e.g. "loveeeeeee" -> "lovee" (note: NOT "love" — two copies
    of the repeated character are kept).
    """
    # DOTALL lets '.' match newlines too, so runs of blank lines collapse.
    return re.sub(r"(.)\1{1,}", r"\1\1", s, flags=re.DOTALL)
# Load the stop-word list, one word per line; tokens found in this list
# are dropped from the tweets below.  `with` guarantees the handle is
# closed even if reading raises (the original leaked it on error).
with open("stopwords.txt", "r") as inpfile:
    stopWords = [ln.strip() for ln in inpfile]
# Load the selected feature list (one unigram per line).  Its length
# defines the dimensionality of every output vector.  `with` guarantees
# the handle is closed even on error.
with open("selected_features.txt", "r") as inpfile:
    selected_features = [ln.strip() for ln in inpfile]
# Accumulators for the finished feature vectors.
posWords = []
negWords = []
# CSV sources; binary mode because this is Python 2's csv module.
posdata = open('positive.csv', 'rb')
negdata = open('negative.csv', 'rb')
initial_value = 0  # value every vector slot starts from
reader1 = csv.reader(posdata, delimiter=',', quotechar='"', escapechar='\\')
# --- Positive tweets -> binary feature vectors -------------------------
# Each tweet becomes one row of "1,"/"0," flags (one per selected
# feature) terminated by its class label 'pos'.
_token_re = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$")
_stop_set = set(stopWords)  # set: O(1) membership vs. O(n) list scan
for row in reader1:
    label = row[0]  # class-label column; kept for parity, unused below
    text = row[1]   # tweet text; use row[5] for the 600,000-tweet data set
    tokens = []
    for w in text.split():
        w = replaceTwoOrMore(w)  # collapse elongations: loveeee -> lovee
        w = w.strip('\'"?,.')
        # keep only alphanumeric words that start with a letter and
        # contain at least two letters; drop stop words
        if w in _stop_set or _token_re.search(w) is None:
            continue
        tokens.append(w.lower())
    present = set(tokens)
    # Mark each selected feature as present/absent.  The trailing comma in
    # "1,"/"0," is deliberate: the joined output row is ARFF-style.
    # Was hard-coded to 2000 slots (IndexError if the feature file is
    # shorter); now the vector length simply follows the feature list.
    array = ["1," if feat in present else "0," for feat in selected_features]
    array.append('pos')
    posWords.append(array)
reader2 = csv.reader(negdata, delimiter=',', quotechar='"', escapechar='\\')
# --- Negative tweets -> binary feature vectors -------------------------
# Mirrors the positive pass above; label is 'neg'.
_neg_token_re = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$")
_neg_stop_set = set(stopWords)  # set: O(1) membership vs. O(n) list scan
for row in reader2:
    label = row[0]  # class-label column; kept for parity, unused below
    text = row[1]   # tweet text; use row[5] for the 600,000-tweet data set
    tokens = []
    for w in text.split():
        w = replaceTwoOrMore(w)  # collapse elongations: loveeee -> lovee
        w = w.strip('\'"?,.')
        # keep only alphanumeric words that start with a letter and
        # contain at least two letters; drop stop words
        if w in _neg_stop_set or _neg_token_re.search(w) is None:
            continue
        tokens.append(w.lower())
    present = set(tokens)
    # Was hard-coded to 2000 slots (IndexError if the feature file is
    # shorter); now the vector length simply follows the feature list.
    array = ["1," if feat in present else "0," for feat in selected_features]
    array.append('neg')
    negWords.append(array)
# --- Dump the finished vectors, one space-separated row per tweet ------
# `with` closes each output file even if a write raises (the original
# leaked the handle on error).
pos = posWords
neg = negWords
with open('pos_array.txt', 'w') as out:
    for vec in pos:
        out.write(' '.join(str(v) for v in vec) + "\n")
with open('neg_array.txt', 'w') as out:
    for vec in neg:
        out.write(' '.join(str(v) for v in vec) + "\n")
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
本人代码设计能力实在太弱,折腾了一上午才调试出来可用的 script;该脚本是在 Python 下完成的。
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
import csv
import re
def replaceTwoOrMore(s):
    """Collapse any run of 2+ identical characters down to exactly two.

    Normalises elongated tweet words so spelling variants map to one
    token, e.g. "loveeeeeee" -> "lovee" (note: NOT "love" — two copies
    of the repeated character are kept).
    """
    # DOTALL lets '.' match newlines too, so runs of blank lines collapse.
    return re.sub(r"(.)\1{1,}", r"\1\1", s, flags=re.DOTALL)
# Load the stop-word list, one word per line; tokens found in this list
# are dropped from the tweets below.  `with` guarantees the handle is
# closed even if reading raises (the original leaked it on error).
with open("stopwords.txt", "r") as inpfile:
    stopWords = [ln.strip() for ln in inpfile]
# Load the selected feature list (one unigram per line).  Its length
# defines the dimensionality of every output vector.  `with` guarantees
# the handle is closed even on error.
with open("selected_features.txt", "r") as inpfile:
    selected_features = [ln.strip() for ln in inpfile]
# Accumulators for the finished feature vectors.
posWords = []
negWords = []
# CSV sources; binary mode because this is Python 2's csv module.
posdata = open('positive.csv', 'rb')
negdata = open('negative.csv', 'rb')
initial_value = 0  # value every vector slot starts from
reader1 = csv.reader(posdata, delimiter=',', quotechar='"', escapechar='\\')
# --- Positive tweets -> binary feature vectors -------------------------
# Each tweet becomes one row of "1,"/"0," flags (one per selected
# feature) terminated by its class label 'pos'.
_token_re = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$")
_stop_set = set(stopWords)  # set: O(1) membership vs. O(n) list scan
for row in reader1:
    label = row[0]  # class-label column; kept for parity, unused below
    text = row[1]   # tweet text; use row[5] for the 600,000-tweet data set
    tokens = []
    for w in text.split():
        w = replaceTwoOrMore(w)  # collapse elongations: loveeee -> lovee
        w = w.strip('\'"?,.')
        # keep only alphanumeric words that start with a letter and
        # contain at least two letters; drop stop words
        if w in _stop_set or _token_re.search(w) is None:
            continue
        tokens.append(w.lower())
    present = set(tokens)
    # Mark each selected feature as present/absent.  The trailing comma in
    # "1,"/"0," is deliberate: the joined output row is ARFF-style.
    # Was hard-coded to 2000 slots (IndexError if the feature file is
    # shorter); now the vector length simply follows the feature list.
    array = ["1," if feat in present else "0," for feat in selected_features]
    array.append('pos')
    posWords.append(array)
reader2 = csv.reader(negdata, delimiter=',', quotechar='"', escapechar='\\')
# --- Negative tweets -> binary feature vectors -------------------------
# Mirrors the positive pass above; label is 'neg'.
_neg_token_re = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$")
_neg_stop_set = set(stopWords)  # set: O(1) membership vs. O(n) list scan
for row in reader2:
    label = row[0]  # class-label column; kept for parity, unused below
    text = row[1]   # tweet text; use row[5] for the 600,000-tweet data set
    tokens = []
    for w in text.split():
        w = replaceTwoOrMore(w)  # collapse elongations: loveeee -> lovee
        w = w.strip('\'"?,.')
        # keep only alphanumeric words that start with a letter and
        # contain at least two letters; drop stop words
        if w in _neg_stop_set or _neg_token_re.search(w) is None:
            continue
        tokens.append(w.lower())
    present = set(tokens)
    # Was hard-coded to 2000 slots (IndexError if the feature file is
    # shorter); now the vector length simply follows the feature list.
    array = ["1," if feat in present else "0," for feat in selected_features]
    array.append('neg')
    negWords.append(array)
# --- Dump the finished vectors, one space-separated row per tweet ------
# `with` closes each output file even if a write raises (the original
# leaked the handle on error).
pos = posWords
neg = negWords
with open('pos_array.txt', 'w') as out:
    for vec in pos:
        out.write(' '.join(str(v) for v in vec) + "\n")
with open('neg_array.txt', 'w') as out:
    for vec in neg:
        out.write(' '.join(str(v) for v in vec) + "\n")
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
相关文章推荐
- 旋转不变LBP以及特征向量的提取方式
- UNIX常用的IP表示掩码方式转换为字符串的程序实现方法
- 各种旋转表示方式间的转换
- opencv中SiftDescriptorExtractor所做的SIFT特征向量提取工作简单分析
- 将图像转换为特征向量Transforming Images to Feature Vectors
- (6)文本挖掘(三)——文本特征TFIDF权重计算及文本向量空间VSM表示
- 文本分类(二)特征权重量化器(文档转向量表示)
- LIRe 源代码分析 5:提取特征向量[以颜色布局为例]
- C语言中两种方式表示时间日期值time_t和struct tm类型的相互转换
- Shp文件的几何向量在Osg中转换及轮廓提取
- 特征提取与转换
- 常见的进制表示方式和转换
- LIRe 源代码分析 5:提取特征向量[以颜色布局为例]
- LIRe 源代码分析 5:提取特征向量[以颜色布局为例]
- 颜色特征提取(三)------颜色聚合向量
- C语言中两种方式表示时间日期值time_t和struct tm类型的相互转换
- (转)---C语言中两种方式表示时间日期值time_t和struct tm类型的相互转换
- LIRe 源代码分析 5:提取特征向量[以颜色布局为例]
- Feature Selection(特征提取) 单纯高信息量unigram与参考情感词典词汇对比
- 使用opensmile提取音频的特征,得到特征向量,并扔进libsvm中进行分类训练测试