第二届高校大数据比赛之鼠标轨迹识别
2017-07-18 18:24
1196 查看
比赛地址：http://bdc.saikr.com/c/cql/34541
赛题
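鼠标轨迹识别当前广泛运用于多种人机验证产品中,不仅便于用户的理解记忆,而且极大增加了暴力破解难度。但攻击者可通过黑产工具产生类人轨迹批量操作以绕过检测,并在对抗过程中不断升级其伪造数据以持续绕过同样升级的检测技术。我们期望用机器学习算法来提高人机验证中各种机器行为的检出率,其中包括对抗过程中出现的新的攻击手段的检测。
数据格式
每行一条样本,字段之间以空格分隔:第 1 个字段在下文读取代码中未被使用(推测为样本编号);第 2 个字段为轨迹点序列,形如 x,y,t;x,y,t;…(以分号结尾);第 3 个字段为目标点坐标 target_x,target_y。(以上格式依据下文读取代码的解析方式整理。)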
评测指标
$F = \dfrac{5PR}{2P + 3R} \times 100$,其中 P 为准确率(precision),R 为召回率(recall);在下文的评价函数中,二者均针对标签为 0 的样本计算。
数据读取和处理
##### Data reading and processing
import pandas as pd
import os


def get_data(file):
    """Parse a raw trajectory file into one long DataFrame of points.

    Each non-blank line is split on whitespace. Field 1 holds the
    trajectory as ``x,y,t;x,y,t;...`` and field 2 holds the target as
    ``target_x,target_y``. Field 0 is not used: the sample id is the
    1-based line number.

    Parameters
    ----------
    file : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per trajectory point with columns
        ``id, x, y, t, target_x, target_y``; the target columns repeat for
        every point of the same sample. All value columns are numeric.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three whitespace-separated fields.
    ValueError
        If a point or target does not have the expected number of
        comma-separated values, or a value is not numeric.
    """
    point_rows = []
    target_rows = []
    # A single pass over the file collects both the points and the targets.
    with open(file) as f:
        for line_no, line in enumerate(f, start=1):
            # split() without arguments also removes the trailing newline, so
            # the last field never needs a blind "drop the last character".
            fields = line.split()
            if not fields:
                continue
            for point in fields[1].split(';'):
                # The trajectory normally ends with ';', which yields one
                # empty trailing chunk; skip empty chunks wherever they occur.
                if point:
                    point_rows.append([line_no] + point.split(','))
            target_rows.append([line_no] + fields[2].split(','))

    points = pd.DataFrame(point_rows, columns=["id", 'x', "y", "t"])
    targets = pd.DataFrame(target_rows, columns=["id", "target_x", "target_y"])

    # The values are parsed as text; convert them so that downstream
    # arithmetic (differences, distances) works on numbers.
    for col in ("x", "y", "t"):
        points[col] = pd.to_numeric(points[col])
    for col in ("target_x", "target_y"):
        targets[col] = pd.to_numeric(targets[col])

    # Inner join: every point row receives the target of its sample.
    return pd.merge(points, targets, on="id")
数据可视化
import matplotlib.pyplot as plt %matplotlib inline # plt.xticks(list(range(len(b))),b['x'].values) import os path='F:\\competition_data\\Bigdata\\images' # os.mkdir(path) for i in range(1,3001): b=data[data.id==i] k=list(b['x'].values) # k.extend(set(b['target_x'].values)) l=list(b['y'].values) # l.extend(set(b['target_y'].values)) plt.plot(k,l,'o-') fig = plt.gcf() fig.set_size_inches(30, 15) fig.savefig(path+'\\'+str(i)+'.png',dpi=100) plt.close()
特征提取
### Feature extraction
def get_features(data):
    """Build one row of summary features per trajectory.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format points with columns ``id, x, y, t, target_x, target_y``.
        The five value columns are converted with ``pandas.to_numeric``
        before use, so numeric strings are accepted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``id`` (ascending) that has a number of points
        other than one; single-point trajectories produce no row. Columns
        are ``id`` followed by 32 feature columns. The index runs 0..n-1.
        The result is an empty DataFrame when no trajectory qualifies.

    Notes
    -----
    ``speed`` and ``angles`` are differences of ``log1p`` values of the
    per-step deltas, so a negative delta produces NaN for that step. This
    definition is kept unchanged.
    """
    import numpy as np

    numeric_cols = ['x', 'y', 't', 'target_x', 'target_y']
    # assign() returns a new frame, so the caller's DataFrame is not modified.
    data = data.assign(**{col: pd.to_numeric(data[col]) for col in numeric_cols})

    rows = []
    # Group by the ids actually present instead of assuming they are 0..n-1.
    for track_id, track in data.groupby('id', sort=True):
        if len(track) == 1:
            continue
        track = track.reset_index(drop=True)

        # Per-step deltas; the first row has no predecessor and is dropped.
        step = track[['x', 'y', 't']].diff(1).dropna()
        step['distance'] = np.sqrt(step['x'] ** 2 + step['y'] ** 2)
        step['speed'] = np.log1p(step['distance']) - np.log1p(step['t'])
        step['angles'] = np.log1p(step['y']) - np.log1p(step['x'])
        speed_diff = step['speed'].diff(1).dropna()
        angle_diff = step['angles'].diff(1).dropna()

        # Distance of every point to the target, and its step-to-step change.
        aim_distance = np.sqrt((track['x'] - track['target_x']) ** 2
                               + (track['y'] - track['target_y']) ** 2)
        aim_distance_diff = aim_distance.diff(1).dropna()

        x_steps = track['x'].diff(1).dropna()
        y_steps = track['y'].diff(1).dropna()

        rows.append({
            'id': track_id,
            'speed_diff_median': speed_diff.median(),
            'speed_diff_mean': speed_diff.mean(),
            'speed_diff_var': speed_diff.var(),
            'speed_diff_max': speed_diff.max(),
            'angle_diff_var': angle_diff.var(),
            'time_delta_min': step['t'].min(),
            'time_delta_max': step['t'].max(),
            'time_delta_var': step['t'].var(),
            'distance_deltas_max': step['distance'].max(),
            'distance_deltas_var': step['distance'].var(),
            'aim_distance_last': aim_distance.values[-1],
            'aim_distance_diff_max': aim_distance_diff.max(),
            'aim_distance_diff_var': aim_distance_diff.var(),
            'mean_speed': step['speed'].mean(),
            'median_speed': step['speed'].median(),
            'var_speed': step['speed'].var(),
            'max_angle': step['angles'].max(),
            'var_angle': step['angles'].var(),
            'kurt_angle': step['angles'].kurt(),
            'y_min': track['y'].min(),
            'y_max': track['y'].max(),
            'y_var': track['y'].var(),
            'y_mean': track['y'].mean(),
            'x_min': track['x'].min(),
            'x_max': track['x'].max(),
            'x_var': track['x'].var(),
            'x_mean': track['x'].mean(),
            # The smaller of the forward and backward step counts.
            'x_back_num': min((x_steps > 0).sum(), (x_steps < 0).sum()),
            'y_back_num': min((y_steps > 0).sum(), (y_steps < 0).sum()),
            'xs_delta_var': x_steps.var(),
            'xs_delta_max': x_steps.max(),
            'xs_delta_min': x_steps.min(),
            # 'label': track['label'],
        })

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows)
模型
### xgb
import xgboost as xgb

# `train`, `test` and `feval` are expected to exist already: the two feature
# frames are built elsewhere and `feval` is the custom metric from the
# evaluation-function section.
# The axis is passed by keyword: newer pandas releases reject it positionally.
test_x = test.drop('id', axis=1)
train_x = train.drop(['id', 'label'], axis=1)
dtest = xgb.DMatrix(test_x)
# dval = xgb.DMatrix(val_x,label=val_data.label)
dtrain = xgb.DMatrix(train_x, label=train.label)

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    # 'scale_pos_weight': 1500.0/13458.0,
    'eval_metric': "auc",
    'gamma': 0.1,  # 0.2 is ok
    'max_depth': 3,
    # 'lambda':550,
    'subsample': 0.7,
    'colsample_bytree': 0.4,
    # 'min_child_weight':2.5,
    'eta': 0.007,
    # 'learning_rate':0.01,
    'seed': 1024,
    'nthread': 7,
}

# Only the training set is monitored; no validation set is configured.
watchlist = [
    (dtrain, 'train'),
    # (dval,'val')
]  # The early stopping is based on last set in the evallist

model = xgb.train(
    params,
    dtrain,
    feval=feval,
    # maximize=False,
    num_boost_round=1500,
    # early_stopping_rounds=10,
    # verbose_eval =30,
    evals=watchlist,
)

# Alternative scikit-learn style configuration, kept for reference:
# model=xgb.XGBClassifier(
#     max_depth=4,
#     learning_rate=0.007,
#     n_estimators=1500,
#     silent=True,
#     objective='binary:logistic',
#     # booster='gbtree',
#     # n_jobs=-1,
#     nthread=7,
#     # gamma=0,
#     # min_child_weight=1,
#     # max_delta_step=0,
#     subsample=0.7,
#     colsample_bytree=0.7,
#     # colsample_bylevel=0.7,
#     # reg_alpha=0,
#     # reg_lambda=1,
#     scale_pos_weight=1,
#     base_score=0.5,
#     # random_state=0,
#     seed=1024,
#     missing=None,
# )
# xgb.cv(params,dtrain,num_boost_round=1500,nfold=10,feval=feval,early_stopping_rounds=50,)
# model.save_model('./model/xgb.model')
# print "best best_ntree_limit",model.best_ntree_limit
评价函数
def _weighted_f(tp, n_pred, n_true):
    """Return 5PR/(2P+3R)*100 from raw counts, or 0.0 when it is undefined.

    ``tp`` is the number of correctly predicted samples, ``n_pred`` the
    number of predicted ones and ``n_true`` the number of actual ones, so
    P = tp / n_pred and R = tp / n_true.
    """
    # Guard the divisions: an empty prediction set or an empty truth set
    # scores 0.0 instead of producing NaN and a runtime warning.
    if n_pred == 0 or n_true == 0:
        return 0.0
    p = float(tp) / n_pred
    r = float(tp) / n_true
    if p == 0 or r == 0:
        return 0.0
    return 5 * p * r / (2 * p + 3 * r) * 100


# NOTE: this name shadows the built-in eval(); it is kept because the
# cross-validation section passes it as `scoring=eval`.
def eval(clf, x, y):
    """Score a fitted estimator with the competition F metric on label 0.

    Has the ``(estimator, X, y)`` signature of a scikit-learn scorer.

    Parameters
    ----------
    clf : object
        Fitted estimator; ``clf.predict(x)`` must return a NumPy array.
        Outputs ``>= 1`` count as class 1, everything else as class 0.
    x : array-like
        Features passed through to ``clf.predict``.
    y : numpy.ndarray or pandas.Series
        True labels; samples equal to 0 are the class being scored.

    Returns
    -------
    float
        ``5PR/(2P+3R)*100``, or 0.0 when precision or recall is zero or
        undefined. The counts and the score are also printed.
    """
    # Vectorised threshold; the estimator's output array is left untouched.
    predicted_zero = ~(clf.predict(x) >= 1)
    actual_zero = (y == 0)
    tp = int((actual_zero & predicted_zero).sum())
    n_pred = int(predicted_zero.sum())
    n_true = int(actual_zero.sum())
    print("TP"+" : "+str(tp)+" "+"预测"+" : "+str(n_pred)+" "+"真实"+" : "+str(n_true))
    f = _weighted_f(tp, n_pred, n_true)
    print(f)
    return f


def feval(pred, dtrain):
    """Custom xgboost metric: the competition F metric on label 0.

    Parameters
    ----------
    pred : numpy.ndarray
        Predicted scores; values ``>= 0.5`` count as class 1, everything
        else as class 0. The array is not modified.
    dtrain : object
        Provides ``get_label()`` returning the true labels.

    Returns
    -------
    tuple
        ``("f", score)`` where score is ``5PR/(2P+3R)*100``, or 0.0 when
        precision or recall is zero or undefined. The score is also printed.
    """
    y = dtrain.get_label()
    # Threshold into a new mask instead of overwriting the caller's array.
    predicted_zero = ~(pred >= 0.5)
    actual_zero = (y == 0)
    tp = int((actual_zero & predicted_zero).sum())
    n_pred = int(predicted_zero.sum())
    n_true = int(actual_zero.sum())
    print("---------------------------------------------------------")
    # print("TP"+" : "+str(tp)+" "+"预测"+" : "+str(n_pred)+" "+"真实"+" : "+str(n_true))
    f = _weighted_f(tp, n_pred, n_true)
    print(f)
    return "f", f


def target(score, num):
    """Return ``score * (40000 + 3 * num) / 5``.

    NOTE(review): with P = TP/num and R = TP/20000, the metric
    F = 5PR/(2P+3R) rearranges to TP = F*(40000 + 3*num)/5, so this looks
    like the number of correct hits implied by a score for `num` submitted
    samples, assuming 20000 true samples and a score not multiplied by
    100. Confirm against the competition setup.
    """
    x = score * (40000 + 3 * num) / 5
    return x
线下cv
# Offline 10-fold cross-validation scored with the custom `eval` scorer.
# `sklearn.cross_validation` was replaced by `sklearn.model_selection`;
# fall back to the old module only when the new one is missing.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# NOTE(review): `m` (the estimator) and `train` are not defined in this
# snippet and must already exist.
# `.iloc[:, 1:-1]` selects every column except the first and the last by
# position (presumably `id` and `label` -- confirm the column order of
# `train`). It replaces the removed `.ix` indexer.
score = cross_val_score(m, train.iloc[:, 1:-1], train.label, cv=10, scoring=eval)
score.mean()
提交结果
# Score the test set with the trained booster and keep the score per row.
pred = model.predict(dtest)
test['prob'] = pred

# Submit the ids of the 20000 rows with the lowest predicted score,
# cast to integers.
lowest_scored = test.sort_values(by="prob").head(20000)
submit = lowest_scored[['id']]
submit = submit.astype(int)
线上成绩0.91
相关文章推荐
- 京东猪脸识别比赛数据预处理:用Python将视频每一帧提取存储为图片
- 第二届“中国高校计算机大赛-大数据挑战赛” 20名
- 第二届“中国高校计算机大赛-大数据挑战赛”实战
- 狗狗识别-百度西安交通大学大数据比赛baseline=0.2代码
- 鼠标轨迹识别
- 鼠标的轨迹识别
- 鼠标轨迹识别
- OpenCV仪表数据识别(四):图像倾斜矫正
- 阿里大数据比赛
- nokogiri 足球比赛数据
- 轨迹数据挖掘(trajectory data mining)
- PHP默认识别的数据类型是application/x-www-form-urlencoded标准的数据类型
- 数据挖掘之数据处理——SVM神经网络的数据分类预测-意大利葡萄酒种类识别
- jmeter从外部文件取值问题,如果文件中的参数值为纯数字形式的,jmeter会默认将其识别成int型数据
- (转)浅析linux中鼠标数据读取
- Extjs 中实现combox多选,已经解决了原有的bug 【选择多条记录后,鼠标点击其他空白处,选择的数据丢失,】
- 在文本框输入数据后,因为有历史记录,鼠标点或者敲回车这个历史记录时,请问会触发什么JS事件
- 使用jQuery或者原生js实现鼠标滚动加载页面新数据
- 模式识别之分类器knn---c语言实现带训练数据---反余弦匹配
- 【Java】在JTable中设置鼠标监听器,点击操作对应数据