您的位置:首页 > 其它

鼠标轨迹识别

2017-10-30 11:17 176 查看
import numpy as np
import pandas as pd


1  导入数据(数据以 .txt 文本形式给出,每行一条样本且没有表头,因此用 read_table 读取,并加上 header=None,避免第一行被当作列名)

# Read the raw training and test files. header=None tells pandas not to
# treat the first line as column names, so every line becomes a data row.
train, test = (
    pd.read_table(path, header=None)
    for path in ('dsjtzs_txfz_training.txt', 'dsjtzs_txfz_test1.txt')
)

2  数据处理
# Normalise the separators of every raw line and split it into tokens.
# Each line is held in a single column named 'id'. Any run of spaces,
# semicolons and commas is collapsed into ONE comma before splitting; the
# previous single ',,' -> ',' pass left empty tokens behind whenever three or
# more separators were adjacent, which later broke float() parsing.
train.columns=['id']
test.columns=['id']

_SEPARATOR_RUN = r'[ ;,]+'

train['id'] = train['id'].str.replace(_SEPARATOR_RUN, ',', regex=True)
a = train['id'].str.split(',')  # Series of token lists, one per training row

test['id'] = test['id'].str.replace(_SEPARATOR_RUN, ',', regex=True)
b = test['id'].str.split(',')   # Series of token lists, one per test row

3  提取特征
# Collect the label of every training sample: the last token of each
# tokenised row, parsed as a float. Iterating over `a` directly covers every
# row that was loaded instead of relying on a hard-coded count of 3000.

label = [float(tokens[-1]) for tokens in a]

# Per-sample statistics of the x-coordinate sequence (training set).
# For each tokenised row, every third token starting at index 1 and stopping
# before the final token is parsed as a float and summarised.
X_mean=[]        # arithmetic mean
X_median=[]      # median
X_mad=[]         # mean absolute deviation around the mean
X_var=[]         # sample variance
X_std=[]         # sample standard deviation
X_max=[]         # maximum
X_min=[]         # minimum
X_pct_change=[]  # sum of the step-to-step percentage changes
X_skew=[]        # sample skewness
X_kurt=[]        # sample kurtosis
for tokens in a:  # every loaded row, not a hard-coded 3000
    xs = pd.Series([float(v) for v in tokens[1:-1:3]])
    X_mean.append(xs.mean())
    X_median.append(xs.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    X_mad.append((xs - xs.mean()).abs().mean())
    X_var.append(xs.var())
    X_std.append(xs.std())
    X_max.append(xs.max())
    X_min.append(xs.min())
    X_pct_change.append(xs.pct_change().sum())
    X_skew.append(xs.skew())
    X_kurt.append(xs.kurt())

# Per-sample statistics of the y-coordinate sequence (training set).
# For each tokenised row, every third token starting at index 2 and stopping
# before the final token is parsed as a float and summarised.
Y_mean=[]    # arithmetic mean
Y_median=[]  # median
Y_mad=[]     # mean absolute deviation around the mean
Y_var=[]     # sample variance
Y_std=[]     # sample standard deviation
Y_max=[]     # maximum
Y_min=[]     # minimum
for tokens in a:  # every loaded row, not a hard-coded 3000
    ys = pd.Series([float(v) for v in tokens[2:-1:3]])
    Y_mean.append(ys.mean())
    Y_median.append(ys.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    Y_mad.append((ys - ys.mean()).abs().mean())
    Y_var.append(ys.var())
    Y_std.append(ys.std())
    Y_max.append(ys.max())
    Y_min.append(ys.min())

# Per-sample statistics of the time sequence (training set).
# For each tokenised row, every third token starting at index 3 and stopping
# before the last three tokens is parsed as a float and summarised.
T_mean=[]        # arithmetic mean
T_median=[]      # median
T_mad=[]         # mean absolute deviation around the mean
T_var=[]         # sample variance
T_std=[]         # sample standard deviation
T_max=[]         # maximum
T_min=[]         # minimum
T_diff=[]        # mean of the first-order differences between neighbours
T_pct_change=[]  # sum of the step-to-step percentage changes
T_skew=[]        # sample skewness
T_kurt=[]        # sample kurtosis
click_num=[]     # number of time values in the row (named "click count" by the author)
for tokens in a:  # every loaded row, not a hard-coded 3000
    ts = pd.Series([float(v) for v in tokens[3:-3:3]])
    T_mean.append(ts.mean())
    T_median.append(ts.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    T_mad.append((ts - ts.mean()).abs().mean())
    T_var.append(ts.var())
    T_std.append(ts.std())
    T_max.append(ts.max())
    T_min.append(ts.min())
    T_diff.append(ts.diff().mean())
    T_pct_change.append(ts.pct_change().sum())
    T_skew.append(ts.skew())
    T_kurt.append(ts.kurt())
    click_num.append(len(ts))
# Per-sample statistics of the x-coordinate sequence (test set).
# For each tokenised row, every third token starting at index 1 through the
# end of the row is parsed as a float and summarised.
X_mean_test=[]        # arithmetic mean
X_median_test=[]      # median
X_mad_test=[]         # mean absolute deviation around the mean
X_var_test=[]         # sample variance
X_std_test=[]         # sample standard deviation
X_max_test=[]         # maximum
X_min_test=[]         # minimum
X_pct_change_test=[]  # sum of the step-to-step percentage changes
X_skew_test=[]        # sample skewness
X_kurt_test=[]        # sample kurtosis
for tokens in b:  # every loaded row, not a hard-coded 100000
    xs = pd.Series([float(v) for v in tokens[1::3]])
    X_mean_test.append(xs.mean())
    X_median_test.append(xs.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    X_mad_test.append((xs - xs.mean()).abs().mean())
    X_var_test.append(xs.var())
    X_std_test.append(xs.std())
    X_max_test.append(xs.max())
    X_min_test.append(xs.min())
    X_pct_change_test.append(xs.pct_change().sum())
    X_skew_test.append(xs.skew())
    X_kurt_test.append(xs.kurt())

# Per-sample statistics of the y-coordinate sequence (test set).
# For each tokenised row, every third token starting at index 2 through the
# end of the row is parsed as a float and summarised.
Y_mean_test=[]    # arithmetic mean
Y_median_test=[]  # median
Y_mad_test=[]     # mean absolute deviation around the mean
Y_var_test=[]     # sample variance
Y_std_test=[]     # sample standard deviation
Y_max_test=[]     # maximum
Y_min_test=[]     # minimum
for tokens in b:  # every loaded row, not a hard-coded 100000
    ys = pd.Series([float(v) for v in tokens[2::3]])
    Y_mean_test.append(ys.mean())
    Y_median_test.append(ys.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    Y_mad_test.append((ys - ys.mean()).abs().mean())
    Y_var_test.append(ys.var())
    Y_std_test.append(ys.std())
    Y_max_test.append(ys.max())
    Y_min_test.append(ys.min())
# Per-sample statistics of the time sequence (test set).
# For each tokenised row, every third token starting at index 3 and stopping
# before the last two tokens is parsed as a float and summarised.
T_mean_test=[]        # arithmetic mean
T_median_test=[]      # median
T_mad_test=[]         # mean absolute deviation around the mean
T_var_test=[]         # sample variance
T_std_test=[]         # sample standard deviation
T_max_test=[]         # maximum
T_min_test=[]         # minimum
T_diff_test=[]        # mean of the first-order differences between neighbours
T_pct_change_test=[]  # sum of the step-to-step percentage changes
T_skew_test=[]        # sample skewness
T_kurt_test=[]        # sample kurtosis
click_num_test=[]     # number of time values in the row
for tokens in b:  # every loaded row, not a hard-coded 100000
    ts = pd.Series([float(v) for v in tokens[3:-2:3]])
    T_mean_test.append(ts.mean())
    T_median_test.append(ts.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    T_mad_test.append((ts - ts.mean()).abs().mean())
    T_var_test.append(ts.var())
    T_std_test.append(ts.std())
    T_max_test.append(ts.max())
    T_min_test.append(ts.min())
    T_diff_test.append(ts.diff().mean())
    T_pct_change_test.append(ts.pct_change().sum())
    T_skew_test.append(ts.skew())
    T_kurt_test.append(ts.kurt())
    click_num_test.append(len(ts))


# Assemble the training features into one DataFrame and show its summary.
# The frame is built in a single step from the feature lists; the column
# order below is the order the model sees, so it must stay in sync with the
# test-set frame. No row count is hard-coded: the length comes from the lists.
aa = pd.DataFrame({
    'X_mean': X_mean,
    'Y_mean': Y_mean,
    'T_mean': T_mean,
    'X_median': X_median,
    'Y_median': Y_median,
    'T_median': T_median,
    'X_mad': X_mad,
    'Y_mad': Y_mad,
    'T_mad': T_mad,
    'X_var': X_var,
    'Y_var': Y_var,
    'T_var': T_var,
    'X_std': X_std,
    'Y_std': Y_std,
    'T_std': T_std,
    'X_max': X_max,
    'Y_max': Y_max,
    'T_max': T_max,
    'X_min': X_min,
    'Y_min': Y_min,
    'T_min': T_min,
    'T_diff': T_diff,
    'X_pct_change': X_pct_change,
    'T_pct_change': T_pct_change,
    'X_skew': X_skew,
    'T_skew': T_skew,
    'X_kurt': X_kurt,
    'T_kurt': T_kurt,
    'click_num': click_num,
})
y=pd.Series(label)  # training labels, aligned with the rows of aa
# DataFrame.info() prints the summary itself and returns None, so it is
# called directly rather than wrapped in print() (which printed "None").
aa.info()

# Assemble the test features into one DataFrame and show its summary.
# Column names and order are identical to the training frame so that the
# same scaler and model can be applied. No row count is hard-coded: the
# length comes from the feature lists.
bb = pd.DataFrame({
    'X_mean': X_mean_test,
    'Y_mean': Y_mean_test,
    'T_mean': T_mean_test,
    'X_median': X_median_test,
    'Y_median': Y_median_test,
    'T_median': T_median_test,
    'X_mad': X_mad_test,
    'Y_mad': Y_mad_test,
    'T_mad': T_mad_test,
    'X_var': X_var_test,
    'Y_var': Y_var_test,
    'T_var': T_var_test,
    'X_std': X_std_test,
    'Y_std': Y_std_test,
    'T_std': T_std_test,
    'X_max': X_max_test,
    'Y_max': Y_max_test,
    'T_max': T_max_test,
    'X_min': X_min_test,
    'Y_min': Y_min_test,
    'T_min': T_min_test,
    'T_diff': T_diff_test,
    'X_pct_change': X_pct_change_test,
    'T_pct_change': T_pct_change_test,
    'X_skew': X_skew_test,
    'T_skew': T_skew_test,
    'X_kurt': X_kurt_test,
    'T_kurt': T_kurt_test,
    'click_num': click_num_test,
})
# DataFrame.info() prints the summary itself and returns None, so it is
# called directly rather than wrapped in print() (which printed "None").
bb.info()

# Fill the missing values in the feature columns listed below.
# The fill is a BACKWARD fill: a missing entry takes the value of the next
# row that has one. A forward fill follows so that missing values in the
# last rows, which a backward fill cannot reach, do not survive into
# scaling and training. `.bfill()` replaces the deprecated
# `fillna(method='bfill')` spelling.
# NOTE(review): this borrows values from neighbouring, unrelated samples;
# kept because it is the original author's choice of imputation.
_columns_with_gaps = [
    'T_var', 'T_std', 'T_diff', 'X_skew', 'T_skew', 'X_kurt', 'T_kurt',
]
for _frame in (aa, bb):
    _frame[_columns_with_gaps] = _frame[_columns_with_gaps].bfill().ffill()


4  数据标准化
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training features
# only, and the SAME fitted mean/variance is then applied to the test
# features. Re-fitting on the test set (as before) scaled the two sets
# differently, so the model saw test inputs on a different scale than the
# one it was trained on.
ss=StandardScaler()
aa=ss.fit_transform(aa)
bb=ss.transform(bb)

5  数据分割
# sklearn.cross_validation was removed from scikit-learn; the same helpers
# live in sklearn.model_selection.
from sklearn.model_selection import cross_val_score, train_test_split

# train_test_split returns (X_train, X_val, y_train, y_val) in that order.
# The previous unpacking used the names in a different order, so `y_train`
# actually held validation features and `X_val` held training labels.
X_train, X_val, y_train, y_val = train_test_split(
    aa, y, test_size=0.33, random_state=0)

# 6  Class balancing
from imblearn.over_sampling import SMOTE
sm=SMOTE(random_state=0)
# Oversample the training split only. fit_resample is the current name of
# the removed fit_sample method.
sm_train, sm_label = sm.fit_resample(X_train, y_train)

# 7  Model training
from xgboost import XGBClassifier
xgb=XGBClassifier()
model=xgb.fit(sm_train, sm_label)
print(model)

# 8  Scoring
# Mean cross-validated score on the resampled training data (as before) ...
print(cross_val_score(model, sm_train, sm_label).mean())
# ... plus the score on the held-out validation split, which contains no
# synthetic samples and was previously never used for evaluation.
print(model.score(X_val, y_val))


# 9  Predict and export the result
# Predict on the scaled test-set features `bb`. The previous code predicted
# on the mis-named validation features, so the exported file did not contain
# predictions for the test set.
predictions = xgb.predict(bb)
final=pd.DataFrame({'result':predictions.astype(np.int32)})
final.to_csv('final',index=False)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: