您的位置:首页 > 大数据

第二届高校大数据比赛之鼠标轨迹识别

2017-07-18 18:24 1196 查看

比赛地址http://bdc.saikr.com/c/cql/34541

赛题

鼠标轨迹识别当前广泛运用于多种人机验证产品中,不仅便于用户的理解记忆,而且极大增加了暴力破解难度。但攻击者可通过黑产工具产生类人轨迹批量操作以绕过检测,并在对抗过程中不断升级其伪造数据以持续绕过同样升级的检测技术。我们期望用机器学习算法来提高人机验证中各种机器行为的检出率,其中包括对抗过程中出现的新的攻击手段的检测。

数据格式



评测指标

F = 5PR / (2P + 3R) × 100，其中 P 为查准率（precision），R 为查全率（recall）

数据读取和处理

#####数据读取和处理
import pandas as pd
import os

def get_data(file):
    """Parse the raw competition file into a long-format DataFrame.

    Each input line looks like:
        "<id> x1,y1,t1;x2,y2,t2;...; target_x,target_y"
    i.e. three space-separated fields: a line id, a ';'-terminated list of
    trajectory points, and the aim-point coordinates.

    Returns a DataFrame with one row per trajectory point and columns
    ["id", "x", "y", "t", "target_x", "target_y"] (point values are strings;
    convert to numeric downstream as needed).
    """
    data1 = []  # one entry per trajectory point: [line_id, x, y, t]
    data2 = []  # one entry per line: [target_x, target_y]
    # Single pass over the file (the original read it twice) and strip the
    # trailing newline up front instead of chopping the last character of
    # target_y -- which corrupted the final line when the file had no newline.
    with open(file) as f:
        for line_id, line in enumerate(f, start=1):
            fields = line.strip().split(" ")
            # The point list ends with ';', so split(';') leaves a trailing
            # empty string -- drop it with [:-1].
            for point in fields[1].split(';')[:-1]:
                data1.append([line_id] + point.split(','))
            data2.append(fields[2].split(','))

    data = pd.DataFrame(data1, columns=["id", 'x', "y", "t"])
    d2 = pd.DataFrame(data2, columns=["target_x", "target_y"])
    # Derive the id range from the data instead of hard-coding 100000 lines.
    d2['id'] = range(1, len(d2) + 1)
    return pd.merge(data, d2, on="id")


数据可视化

# Visualize each trajectory and save it as a PNG for manual inspection.
# NOTE(review): relies on `data` produced by get_data() above.
import matplotlib.pyplot as plt
# %matplotlib inline  # Jupyter-only magic -- a syntax error in a plain .py
#                     # file, so it is kept here as a comment.
import os

path = 'F:\\competition_data\\Bigdata\\images'
# Create the output directory if missing (the original left os.mkdir
# commented out, which crashes savefig on a fresh machine).
os.makedirs(path, exist_ok=True)

for i in range(1, 3001):
    b = data[data.id == i]              # all points of trajectory i
    xs = list(b['x'].values)
    ys = list(b['y'].values)
    plt.plot(xs, ys, 'o-')
    fig = plt.gcf()
    fig.set_size_inches(30, 15)
    # os.path.join instead of manual backslash concatenation.
    fig.savefig(os.path.join(path, str(i) + '.png'), dpi=100)
    plt.close()


特征提取

### Feature extraction
def get_features(data):
    """Aggregate per-trajectory statistics into one feature row per id.

    `data` is the long-format frame from get_data() (numeric x/y/t and
    target_x/target_y columns). For every trajectory with at least two
    points, first differences of x/y/t yield step distance, a log-ratio
    "speed" and "angle", plus distance-to-target deltas; summary stats of
    those series become the feature columns. Returns one row per id.
    """
    import numpy as np

    feats = pd.DataFrame()
    # Iterate the ids actually present. The original looped
    # `range(len(set(ids)))`, i.e. 0..N-1 against ids 1..N: it crashed on the
    # nonexistent id 0 (`.values[-1]` on an empty frame) and dropped id N.
    for i in sorted(set(data.id.values)):
        test = data[data.id == i].copy()  # copy: we add a column below
        # Need at least two points to take first differences (the original
        # `len != 1` guard still let empty frames through).
        if len(test) < 2:
            continue
        test.index = range(len(test))
        temp = test[['x', 'y', 't']].diff(1).dropna()
        temp['distance'] = np.sqrt(temp['x']**2 + temp['y']**2)
        # log-domain ratios: log1p(a) - log1p(b) ~ log of (1+a)/(1+b)
        temp['speed'] = np.log1p(temp['distance']) - np.log1p(temp['t'])
        temp['angles'] = np.log1p(temp['y']) - np.log1p(temp['x'])
        speed_diff = temp['speed'].diff(1).dropna()
        angle_diff = temp['angles'].diff(1).dropna()
        # Euclidean distance from each point to the aim point.
        test['distance_aim_deltas'] = np.sqrt(
            (test['x'] - test['target_x'])**2 + (test['y'] - test['target_y'])**2)
        distance_aim_deltas_diff = test['distance_aim_deltas'].diff(1).dropna()

        arr = pd.DataFrame(index=[0])
        arr['id'] = i
        arr['speed_diff_median'] = speed_diff.median()
        arr['speed_diff_mean'] = speed_diff.mean()
        arr['speed_diff_var'] = speed_diff.var()
        arr['speed_diff_max'] = speed_diff.max()
        arr['angle_diff_var'] = angle_diff.var()
        arr['time_delta_min'] = temp['t'].min()
        arr['time_delta_max'] = temp['t'].max()
        arr['time_delta_var'] = temp['t'].var()

        arr['distance_deltas_max'] = temp['distance'].max()
        arr['distance_deltas_var'] = temp['distance'].var()
        arr['aim_distance_last'] = test['distance_aim_deltas'].values[-1]
        arr['aim_distance_diff_max'] = distance_aim_deltas_diff.max()
        arr['aim_distance_diff_var'] = distance_aim_deltas_diff.var()
        arr['mean_speed'] = temp['speed'].mean()
        arr['median_speed'] = temp['speed'].median()
        arr['var_speed'] = temp['speed'].var()

        arr['max_angle'] = temp['angles'].max()
        arr['var_angle'] = temp['angles'].var()
        arr['kurt_angle'] = temp['angles'].kurt()

        # (Original text was HTML-garbled here: `arr['y_min'<b60f/span>]`.)
        arr['y_min'] = test["y"].min()
        arr['y_max'] = test["y"].max()
        arr['y_var'] = test["y"].var()
        arr['y_mean'] = test["y"].mean()
        arr['x_min'] = test["x"].min()
        arr['x_max'] = test["x"].max()
        arr['x_var'] = test["x"].var()
        arr['x_mean'] = test["x"].mean()

        # Direction reversals: the smaller count of positive vs negative steps.
        x_delta = test['x'].diff(1).dropna()
        y_delta = test['y'].diff(1).dropna()
        arr['x_back_num'] = min((x_delta > 0).sum(), (x_delta < 0).sum())
        arr['y_back_num'] = min((y_delta > 0).sum(), (y_delta < 0).sum())

        arr['xs_delta_var'] = x_delta.var()
        arr['xs_delta_max'] = x_delta.max()
        arr['xs_delta_min'] = x_delta.min()
        feats = pd.concat([feats, arr])
    return feats


模型

###xgb
# Train a binary xgboost classifier on the extracted features.
# NOTE(review): relies on `train`/`test` DataFrames (feature rows from
# get_features, with a `label` column on train) and on the custom metric
# `feval`, which is defined LATER in this file -- it must be in scope
# before this script section runs.
import xgboost as xgb
# NOTE(review): positional `axis` in drop(..., 1) is deprecated in recent
# pandas -- prefer drop(columns=[...]) when modernizing.
test_x=test.drop('id',1)
train_x=train.drop(['id','label'],1)

dtest = xgb.DMatrix(test_x)
# dval = xgb.DMatrix(val_x,label=val_data.label)
dtrain = xgb.DMatrix(train_x, label=train.label)
# Hyperparameters: shallow trees, tiny learning rate, heavy row/column
# subsampling -- many rounds (1500) compensate for eta=0.007.
params={
'booster':'gbtree',
'objective': 'binary:logistic',

#   'scale_pos_weight': 1500.0/13458.0,
'eval_metric': "auc",

'gamma':0.1,#0.2 is ok
'max_depth':3,
#   'lambda':550,
'subsample':0.7,
'colsample_bytree':0.4 ,
#         'min_child_weight':2.5,
'eta': 0.007,
#     'learning_rate':0.01,
'seed':1024,
'nthread':7,

}

watchlist  = [(dtrain,'train'),
# (dval,'val')
]#The early stopping is based on last set in the evallist
model = xgb.train(
params,
dtrain,
feval=feval,
#                   maximize=False,

num_boost_round=1500,
#                   early_stopping_rounds=10,
#                   verbose_eval =30,
evals=watchlist
)
# Alternative sklearn-style API, kept for reference:
# model=xgb.XGBClassifier(
# max_depth=4,
#     learning_rate=0.007,
#     n_estimators=1500,
#     silent=True,
#     objective='binary:logistic',
# #     booster='gbtree',
# #     n_jobs=-1,
#     nthread=7,
# #     gamma=0,
# #     min_child_weight=1,
# #     max_delta_step=0,
#     subsample=0.7,
#     colsample_bytree=0.7,
# #     colsample_bylevel=0.7,
# #     reg_alpha=0,
# #     reg_lambda=1,
#     scale_pos_weight=1,
#     base_score=0.5,
# #     random_state=0,
#     seed=1024,
#     missing=None,
# )

# xgb.cv(params,dtrain,num_boost_round=1500,nfold=10,feval=feval,early_stopping_rounds=50,)
# model.save_model('./model/xgb.model')
# print "best best_ntree_limit",model.best_ntree_limit


评价函数

def eval(clf, x, y):  # noqa: A001 -- shadows builtin eval; name kept because
    #                   callers pass `scoring=eval` to cross_val_score below.
    """Competition score F = 5PR/(2P+3R)*100 for an sklearn-style classifier.

    Class 0 (machine) is treated as the positive class: P is the precision
    and R the recall of predicting 0. Returns 0.0 when either is undefined.
    """
    import numpy as np
    prob = np.asarray(clf.predict(x))
    # clf.predict already yields 0/1 class labels, so the >=1 threshold
    # simply normalizes them (vectorized instead of a Python loop).
    prob = np.where(prob >= 1, 1, 0)
    tp = ((y == 0) & (prob == 0)).sum()
    pred_neg = (prob == 0).sum()
    true_neg = (y == 0).sum()
    print("TP" + " : " + str(tp) + "  " + "预测" + " : " + str(pred_neg) + "  " + "真实" + " : " + str(true_neg))
    # Guard the denominators BEFORE dividing -- the original computed p and r
    # first, which NaNs/raises when nothing is predicted (or labeled) 0.
    if pred_neg == 0 or true_neg == 0:
        print(0.0)
        return 0.0
    p = tp / pred_neg
    r = tp / true_neg
    if p == 0 or r == 0:
        print(0.0)
        return 0.0

    f = 5 * p * r / (2 * p + 3 * r) * 100
    print(f)
    return f
def feval(pred, dtrain):
    """Custom xgboost eval metric: ("f", 5PR/(2P+3R)*100), class 0 positive.

    `pred` holds raw probabilities; they are thresholded at 0.5. Returns the
    (name, value) pair xgboost expects; 0.0 when P or R is undefined.
    """
    import numpy as np
    y = dtrain.get_label()
    # Threshold into a NEW array -- the original overwrote `pred` in place,
    # mutating the prediction buffer xgboost handed us.
    label = np.where(np.asarray(pred) >= 0.5, 1, 0)
    print("---------------------------------------------------------")
    #     print("TP"+" : "+str(((y==0)&(label==0)).sum())+"  "+"预测"+" : "+str((label==0).sum())+"  "+"真实"+" : "+str((y==0).sum()))
    tp = ((y == 0) & (label == 0)).sum()
    pred_neg = (label == 0).sum()
    true_neg = (y == 0).sum()
    # Guard denominators before dividing (same fix as eval() above).
    if pred_neg == 0 or true_neg == 0:
        print(0.0)
        return "f", 0.0
    p = tp / pred_neg
    r = tp / true_neg
    if p == 0 or r == 0:
        print(0.0)
        return "f", 0.0

    f = 5 * p * r / (2 * p + 3 * r) * 100
    print(f)
    return "f", f
def target(score, num):
    """Scale a score by the weighted sample count (40000 + 3*num) / 5."""
    weight = 40000 + 3 * num
    return score * weight / 5


线下cv

# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20
# and `DataFrame.ix` was removed in pandas 1.0 -- use the modern equivalents.
from sklearn.model_selection import cross_val_score

# `eval` (defined above) matches sklearn's scorer signature (estimator, X, y).
# iloc[:, 1:-1] skips the leading `id` column and the trailing `label` column.
score = cross_val_score(m, train.iloc[:, 1:-1], train.label, cv=10, scoring=eval)
score.mean()


提交结果

# Score the test set and submit the 20000 ids with the lowest predicted
# probability (sorted ascending, first 20000 rows).
pred = model.predict(dtest)
test['prob'] = pred
lowest = test.sort_values(by="prob").head(20000)
submit = lowest[['id']].astype(int)


线上成绩0.91

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息