您的位置:首页 > 其它

keras + lstm 情感分类

2016-12-21 15:50 405 查看
负面评论如下:



正面评论如下:



使用 Keras 搭配 LSTM 做情感分类,效果不错。

代码:

#coding:utf-8
'''
Created on 2016-12-20

@author: 刘帅
'''
import pandas as pd #导入Pandas
import numpy as np #导入Numpy
import jieba #导入结巴分词

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU

# Load the labelled training corpora. header=None tells pandas the sheets
# have no header row, so columns get integer labels.
# `index_col=None` replaces the original `index=None`: `index` is not a
# read_excel parameter and is rejected with a TypeError by pandas versions
# whose read_excel no longer accepts arbitrary keyword arguments.
neg = pd.read_excel(r'D:\tomcat\Theano\neg.xls', header=None, index_col=None)
pos = pd.read_excel(r'D:\tomcat\Theano\pos.xls', header=None, index_col=None)

# Attach the class label: 1 for rows from pos.xls, 0 for rows from neg.xls.
pos['mark'] = 1
neg['mark'] = 0

# Merge both corpora into one frame (pos rows first) with a fresh 0..n-1 index.
pn = pd.concat([pos, neg], ignore_index=True)

# Corpus sizes.
neglen = len(neg)
poslen = len(pos)

def cw(x):
    """Segment the text *x* with jieba and return the tokens as a list."""
    return list(jieba.cut(x))


# Tokenise every training text (column 0 of the merged corpus).
raw_text = pn[0]
pn['words'] = raw_text.apply(cw)

# Load the unlabelled review table; only its 'rateContent' column is read here.
comment = pd.read_excel(r'D:\tomcat\Theano\sum.xls')
# Alternative input kept from the original:
#comment = pd.read_csv('a.csv', encoding='utf-8')

# Keep only rows that have review text. The explicit copy makes the result an
# independent frame, so adding the 'words' column below does not write into a
# slice of the original table (pandas' SettingWithCopy hazard).
comment = comment[comment['rateContent'].notnull()].copy()

# Tokenise each remaining review with the shared segmenter `cw`.
comment['words'] = comment['rateContent'].apply(cw)

# Pool the token lists of the labelled corpus and the unlabelled reviews so
# the vocabulary is built from both.
d2v_train = pd.concat([pn['words'], comment['words']], ignore_index=True)

# Flatten every token list into one long list of words.
# (The original loop body was not indented, which is an IndentationError.)
w = [word for words in d2v_train for word in words]

# One row per distinct word, holding its occurrence count; value_counts()
# orders rows from most to least frequent.
# NOTE: the name `dict` shadows the builtin. It is kept because the rest of
# the script refers to this frame by that name.
dict = pd.DataFrame(pd.Series(w).value_counts())
del w, d2v_train  # release the large intermediates

# Give each word an integer id from 1 to len(dict), in frequency order.
# Presumably 0 is left free for sequence padding -- confirm against the
# padding and embedding steps.
dict['id'] = list(range(1, len(dict) + 1))

# Word -> id mapping, materialised once as a plain mapping. The original
# indexed the pandas Series with a whole token list for every sentence, which
# its author flagged as too slow; a per-word lookup in a prebuilt mapping
# yields the same ids in the same order without the per-call pandas overhead.
_word_to_id = dict['id'].to_dict()


def get_sent(x):
    """Return the vocabulary ids of the words in *x*, in order.

    Raises KeyError if a word is not in the vocabulary.
    """
    return [_word_to_id[word] for word in x]


# Convert every tokenised training sentence into its id sequence.
pn['sent'] = pn['words'].apply(get_sent)

# Fixed sequence length handed to pad_sequences (and reused by the model's
# input_length).
maxlen = 50

print("Pad sequences (samples x time)")
padded = sequence.pad_sequences(pn['sent'], maxlen=maxlen)
# Store the result back as one row per sample.
pn['sent'] = list(padded)

# Full data set: the id sequences and their 0/1 labels as arrays.
xa = np.array(pn['sent'].tolist())
ya = np.array(pn['mark'].tolist())

# Even-indexed samples form the training set. These are strided slices of
# the full arrays.
x = xa[::2]
y = ya[::2]

# Odd-indexed samples form the test set.
xt = xa[1::2]
yt = ya[1::2]

print('Build model...')

# Layer stack, in order: embedding over the vocabulary (len(dict) + 1 rows,
# 256-dimensional vectors), a 128-unit LSTM, dropout of 0.5, and a single
# sigmoid output unit.
vocab_rows = len(dict) + 1
layer_stack = [
    Embedding(vocab_rows, 256, input_length=maxlen),
    LSTM(output_dim=128, activation='sigmoid', inner_activation='hard_sigmoid'),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)

# Binary cross-entropy loss with RMSprop, reporting accuracy during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Train on the training half (x, y) only. The original fitted on the full set
# (xa, ya) and then scored on that same data, so the printed "Test accuracy"
# was really training accuracy and the held-out xt/yt were never used.
# The author notes that training on the full set takes several hours.
model.fit(x, y, batch_size=16, nb_epoch=10)

# Score on the held-out half, which the model has not seen during training.
classes = model.predict_classes(xt)
acc = np_utils.accuracy(classes, yt)
print('Test accuracy:', acc)


4832/21105 [=====>........................] - ETA: 330s - loss: 0.1597 - acc: 0.9462
4848/21105 [=====>........................] - ETA: 330s - loss: 0.1593 - acc: 0.9464
4864/21105 [=====>........................] - ETA: 329s - loss: 0.1589 - acc: 0.9465
4880/21105 [=====>........................] - ETA: 329s - loss: 0.1588 - acc: 0.9465
4896/21105 [=====>........................] - ETA: 329s - loss: 0.1592 - acc: 0.9465
4912/21105 [=====>........................] - ETA: 328s - loss: 0.1588 - acc: 0.9467
4928/21105 [======>.......................] - ETA: 328s - loss: 0.1592 - acc: 0.9464
内容来自用户分享和网络整理,不保证内容的准确性;如有侵权内容,可联系管理员处理(点击这里给我发消息)。
标签: