鼠标轨迹识别
2017-10-30 11:17
176 查看
# Third-party libraries used throughout the script.
import numpy as np
import pandas as pd
1 导入数据(数据以 .txt 文件给出,因此用 read_table 读取;文件没有表头行,所以要加 header=None,否则第一条记录会被当成列名)
# Read the training and test files, in that order. header=None tells pandas
# not to consume the first line as column names.
train, test = (
    pd.read_table(path, header=None)
    for path in ('dsjtzs_txfz_training.txt', 'dsjtzs_txfz_test1.txt')
)
2 数据处理
# Turn every separator into a comma, then split each record on ','.
# A single regex pass collapses any run of spaces, semicolons and commas
# into one comma. The previous space -> ';' -> ',,' chain only merged pairs,
# so three or more adjacent separators left empty tokens behind.
train.columns = ['id']
test.columns = ['id']
train['id'] = train['id'].str.replace(r'[ ;,]+', ',', regex=True)
test['id'] = test['id'].str.replace(r'[ ;,]+', ',', regex=True)
a = train['id'].str.split(',')  # one list of string tokens per training record
b = test['id'].str.split(',')   # one list of string tokens per test record
3 提取特征
# Collect the label of every training record: it is the last token of the
# record, converted to float. Iterating over `a` itself (rather than a
# hard-coded range(3000)) keeps this in step with the actual file size.
label = [float(tokens[-1]) for tokens in a]
# Summary statistics of the x coordinates, one value per training record.
X_mean = []        # mean
X_median = []      # median
X_mad = []         # mean absolute deviation around the mean
X_var = []         # variance
X_std = []         # standard deviation
X_max = []         # maximum
X_min = []         # minimum
X_pct_change = []  # sum of the relative changes between consecutive values
X_skew = []        # skewness
X_kurt = []        # kurtosis
# Walk every record instead of a hard-coded range(3000) so the loop follows
# the real size of the training file.
for tokens in a:
    # Every third token starting at index 1; the stop of -1 keeps the last
    # token (read as the label elsewhere in this script) out of the series.
    xs = pd.Series([float(v) for v in tokens[1:-1:3]], dtype=float)
    centre = xs.mean()
    X_mean.append(centre)
    X_median.append(xs.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    X_mad.append((xs - centre).abs().mean())
    X_var.append(xs.var())
    X_std.append(xs.std())
    X_max.append(xs.max())
    X_min.append(xs.min())
    # NOTE(review): a zero coordinate makes pct_change() yield inf - confirm
    # the downstream steps tolerate that.
    X_pct_change.append(xs.pct_change().sum())
    X_skew.append(xs.skew())
    X_kurt.append(xs.kurt())
# Summary statistics of the y coordinates, one value per training record.
Y_mean = []    # mean
Y_median = []  # median
Y_mad = []     # mean absolute deviation around the mean
Y_var = []     # variance
Y_std = []     # standard deviation
Y_max = []     # maximum
Y_min = []     # minimum
# Walk every record instead of a hard-coded range(3000).
for tokens in a:
    # Every third token starting at index 2; the stop of -1 keeps the last
    # token (read as the label elsewhere in this script) out of the series.
    ys = pd.Series([float(v) for v in tokens[2:-1:3]], dtype=float)
    centre = ys.mean()
    Y_mean.append(centre)
    Y_median.append(ys.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    Y_mad.append((ys - centre).abs().mean())
    Y_var.append(ys.var())
    Y_std.append(ys.std())
    Y_max.append(ys.max())
    Y_min.append(ys.min())
# Summary statistics of the t (time) values, one value per training record.
T_mean = []        # mean
T_median = []      # median
T_mad = []         # mean absolute deviation around the mean
T_var = []         # variance
T_std = []         # standard deviation
T_max = []         # maximum
T_min = []         # minimum
T_diff = []        # mean first-order difference between consecutive values
T_pct_change = []  # sum of the relative changes between consecutive values
T_skew = []        # skewness
T_kurt = []        # kurtosis
click_num = []     # number of t values in the record (the "click count")
# Walk every record instead of a hard-coded range(3000).
for tokens in a:
    # Every third token starting at index 3, stopping before the last three
    # tokens of the record.
    ts = pd.Series([float(v) for v in tokens[3:-3:3]], dtype=float)
    centre = ts.mean()
    T_mean.append(centre)
    T_median.append(ts.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    T_mad.append((ts - centre).abs().mean())
    T_var.append(ts.var())
    T_std.append(ts.std())
    T_max.append(ts.max())
    T_min.append(ts.min())
    T_diff.append(ts.diff().mean())
    T_pct_change.append(ts.pct_change().sum())
    T_skew.append(ts.skew())
    T_kurt.append(ts.kurt())
    click_num.append(len(ts))
# Summary statistics of the x coordinates, one value per test record.
# (The original statements were collapsed onto a single line, which is not
# valid Python; they are laid out one per line here.)
X_mean_test = []        # mean
X_median_test = []      # median
X_mad_test = []         # mean absolute deviation around the mean
X_var_test = []         # variance
X_std_test = []         # standard deviation
X_max_test = []         # maximum
X_min_test = []         # minimum
X_pct_change_test = []  # sum of the relative changes between consecutive values
X_skew_test = []        # skewness
X_kurt_test = []        # kurtosis
# Walk every record instead of a hard-coded range(100000).
for tokens in b:
    # Every third token starting at index 1, through the end of the record.
    xs = pd.Series([float(v) for v in tokens[1::3]], dtype=float)
    centre = xs.mean()
    X_mean_test.append(centre)
    X_median_test.append(xs.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    X_mad_test.append((xs - centre).abs().mean())
    X_var_test.append(xs.var())
    X_std_test.append(xs.std())
    X_max_test.append(xs.max())
    X_min_test.append(xs.min())
    X_pct_change_test.append(xs.pct_change().sum())
    X_skew_test.append(xs.skew())
    X_kurt_test.append(xs.kurt())
# Summary statistics of the y coordinates, one value per test record.
Y_mean_test = []    # mean
Y_median_test = []  # median
Y_mad_test = []     # mean absolute deviation around the mean
Y_var_test = []     # variance
Y_std_test = []     # standard deviation
Y_max_test = []     # maximum
Y_min_test = []     # minimum
# Walk every record instead of a hard-coded range(100000).
for tokens in b:
    # Every third token starting at index 2, through the end of the record.
    ys = pd.Series([float(v) for v in tokens[2::3]], dtype=float)
    centre = ys.mean()
    Y_mean_test.append(centre)
    Y_median_test.append(ys.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    Y_mad_test.append((ys - centre).abs().mean())
    Y_var_test.append(ys.var())
    Y_std_test.append(ys.std())
    Y_max_test.append(ys.max())
    Y_min_test.append(ys.min())
# Summary statistics of the t (time) values, one value per test record.
# (The original statements were collapsed onto a single line, which is not
# valid Python; they are laid out one per line here.)
T_mean_test = []        # mean
T_median_test = []      # median
T_mad_test = []         # mean absolute deviation around the mean
T_var_test = []         # variance
T_std_test = []         # standard deviation
T_max_test = []         # maximum
T_min_test = []         # minimum
T_diff_test = []        # mean first-order difference between consecutive values
T_pct_change_test = []  # sum of the relative changes between consecutive values
T_skew_test = []        # skewness
T_kurt_test = []        # kurtosis
click_num_test = []     # number of t values in the record (the "click count")
# Walk every record instead of a hard-coded range(100000).
for tokens in b:
    # Every third token starting at index 3, stopping before the last two
    # tokens of the record.
    ts = pd.Series([float(v) for v in tokens[3:-2:3]], dtype=float)
    centre = ts.mean()
    T_mean_test.append(centre)
    T_median_test.append(ts.median())
    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    T_mad_test.append((ts - centre).abs().mean())
    T_var_test.append(ts.var())
    T_std_test.append(ts.std())
    T_max_test.append(ts.max())
    T_min_test.append(ts.min())
    T_diff_test.append(ts.diff().mean())
    T_pct_change_test.append(ts.pct_change().sum())
    T_skew_test.append(ts.skew())
    T_kurt_test.append(ts.kurt())
    click_num_test.append(len(ts))
# Gather the training features into one DataFrame and show its summary.
# Building the frame in a single step replaces 29 successive joins onto a
# throwaway 'id' column of hard-coded length; column order is unchanged.
aa = pd.DataFrame({
    'X_mean': X_mean,
    'Y_mean': Y_mean,
    'T_mean': T_mean,
    'X_median': X_median,
    'Y_median': Y_median,
    'T_median': T_median,
    'X_mad': X_mad,
    'Y_mad': Y_mad,
    'T_mad': T_mad,
    'X_var': X_var,
    'Y_var': Y_var,
    'T_var': T_var,
    'X_std': X_std,
    'Y_std': Y_std,
    'T_std': T_std,
    'X_max': X_max,
    'Y_max': Y_max,
    'T_max': T_max,
    'X_min': X_min,
    'Y_min': Y_min,
    'T_min': T_min,
    'T_diff': T_diff,
    'X_pct_change': X_pct_change,
    'T_pct_change': T_pct_change,
    'X_skew': X_skew,
    'T_skew': T_skew,
    'X_kurt': X_kurt,
    'T_kurt': T_kurt,
    'click_num': click_num,
})
y = pd.Series(label)  # training labels, aligned with the rows of aa
# info() prints the summary itself and returns None, so it is not wrapped in
# print() (that only added a stray "None" line).
aa.info()
# Gather the test features into one DataFrame and show its summary.
# Column names and order match the training frame so the same scaler and
# model can be applied. Building the frame in one step replaces 29 successive
# joins onto a throwaway 'id' column of hard-coded length.
bb = pd.DataFrame({
    'X_mean': X_mean_test,
    'Y_mean': Y_mean_test,
    'T_mean': T_mean_test,
    'X_median': X_median_test,
    'Y_median': Y_median_test,
    'T_median': T_median_test,
    'X_mad': X_mad_test,
    'Y_mad': Y_mad_test,
    'T_mad': T_mad_test,
    'X_var': X_var_test,
    'Y_var': Y_var_test,
    'T_var': T_var_test,
    'X_std': X_std_test,
    'Y_std': Y_std_test,
    'T_std': T_std_test,
    'X_max': X_max_test,
    'Y_max': Y_max_test,
    'T_max': T_max_test,
    'X_min': X_min_test,
    'Y_min': Y_min_test,
    'T_min': T_min_test,
    'T_diff': T_diff_test,
    'X_pct_change': X_pct_change_test,
    'T_pct_change': T_pct_change_test,
    'X_skew': X_skew_test,
    'T_skew': T_skew_test,
    'X_kurt': X_kurt_test,
    'T_kurt': T_kurt_test,
    'click_num': click_num_test,
})
# info() prints the summary itself and returns None, so it is not wrapped in
# print() (that only added a stray "None" line).
bb.info()
# Fill the missing values in the features above by back-filling: each NaN
# takes the next valid value further down the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated in
# recent pandas releases.
# NOTE(review): a NaN in the last row(s) of a column has nothing below it and
# stays NaN - confirm the later steps can cope with that.
nan_columns = ['T_var', 'T_std', 'T_diff', 'X_skew', 'T_skew', 'X_kurt', 'T_kurt']
aa[nan_columns] = aa[nan_columns].bfill()
bb[nan_columns] = bb[nan_columns].bfill()
4 数据标准化
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler learns its mean and variance from the
# training frame only and the same transform is then applied to the test
# frame. Re-fitting on the test set (as before) would put the two sets on
# different scales and make the trained model's inputs inconsistent.
ss = StandardScaler()
aa = ss.fit_transform(aa)
bb = ss.transform(bb)
5 数据分割
# sklearn.cross_validation was removed from scikit-learn; the same helpers
# live in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# train_test_split returns (X_train, X_val, y_train, y_val) in that order.
# The earlier unpacking swapped the two middle names, so the validation
# features ended up in a variable called y_train.
X_train, X_val, y_train, y_val = train_test_split(aa, y, test_size=0.33, random_state=0)

# 6 Balance the classes
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=0)
# fit_resample is the current name of the former fit_sample method.
# Only the training split is oversampled; the validation split stays as-is.
sm_X, sm_y = sm.fit_resample(X_train, y_train)

# 7 Train the model
from xgboost import XGBClassifier

xgb = XGBClassifier()
model = xgb.fit(sm_X, sm_y)
print(model)

# 8 Score
from sklearn.model_selection import cross_val_score

# NOTE(review): this cross-validates on the oversampled data, so synthetic
# neighbours of a sample can fall into another fold and inflate the score.
print(cross_val_score(model, sm_X, sm_y).mean())
# Accuracy on the untouched validation split, which was previously never
# used for scoring.
print('held-out accuracy:', xgb.score(X_val, y_val))

# 9 Predict and export the result
# Predict on the test-set features `bb`. The earlier code passed the misnamed
# validation split here, so the exported file never contained test-set
# predictions.
predictions = xgb.predict(bb)
final = pd.DataFrame({'result': predictions.astype(np.int32)})
final.to_csv('final', index=False)
相关文章推荐
- 鼠标轨迹识别
- 第二届高校大数据比赛之鼠标轨迹识别
- 鼠标的轨迹识别
- Flex3学习轨迹:显示简单的动态鼠标
- SQL Server 游标运用:鼠标轨迹字符串分割
- Android+OpenCV实现轨迹识别
- 鼠标手势识别 [Flash]
- js鼠标轨迹
- SQL Server 游标运用:鼠标轨迹字符串分割
- 控制台上识别鼠标点击位置坐标
- 新浪微博自动转发评论 源码 按键精灵实现 详细注释 几十行代码实现 涉及图像识别模拟键盘鼠标
- 鼠标和按键在android 上的识别和区别
- kinect手势识别后,利用识别效果控制鼠标
- DX9鼠标拾取网格轨迹
- 使用Java Swing绘制随鼠标拖拽可见轨迹的矩形(不会一直绘制矩形,而是类似ps中的矩形工具)
- 微软智能PS2鼠标识别
- kinect手势识别后,利用识别效果控制鼠标
- Opencv图像识别从零到精通(13)----点线圆矩形与鼠标事件
- 备忘录模式:记录并回放鼠标运动轨迹过程
- (025)[系统故障]XP下禁止将串口设备识别成鼠标(转)