您的位置:首页 > 其它

scikit-learn 常用分类算法的使用

2017-06-10 20:05 260 查看
scikit-learn 提供的机器学习分类算法包括逻辑回归、朴素贝叶斯、KNN、支持向量机、决策树和随机森林等。这些分类器的调用形式基本一致:训练用 fit 方法,预测用 predict 方法。训练好的模型可以用 joblib.dump 方法保存,用 joblib.load 方法重新载入。

下面是测试程序。测试数据采用小麦种子数据集 (seeds):数据文件每行一个样本,字段之间以制表符分隔,最后一个字段是类别标签,其余字段是数值特征。

# -*- coding: utf-8 -*-

import numpy as np
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Human-readable names of the seven numeric feature columns of the seeds
# data set, in column order.
feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',  # was truncated to 'asymmetry coefficien'
    'length of kernel groove',
]

# Not referenced by the code visible in this file.
# NOTE(review): presumably a switch for colour vs. greyscale plots -- confirm.
COLOUR_FIGURE = False

def load_csv_data(filename):
    """Load a tab-separated data file into feature and label arrays.

    Every non-blank line is stripped and split on tab characters. All fields
    except the last are parsed as floats and form one feature row; the last
    field is kept, as a string, as that sample's label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(data, labels)`` tuple of NumPy arrays: ``data`` holds the float
        features (one row per sample) and ``labels`` the label strings.

    Raises:
        ValueError: If a feature field cannot be parsed as a float.
    """
    data = []
    labels = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename) as datafile:
        for line in datafile:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) would otherwise add
                # an empty feature row and an empty label.
                continue
            fields = stripped.split('\t')
            data.append([float(field) for field in fields[:-1]])
            labels.append(fields[-1])
    return np.array(data), np.array(labels)

def accuracy(test_labels, pred_lables):
    """Return the fraction of predictions that equal the true labels.

    Args:
        test_labels: Sequence or array of true labels.
        pred_lables: Sequence or array of predicted labels, with the same
            shape as ``test_labels``.

    Returns:
        The number of matching positions divided by the number of labels,
        as a float.

    Raises:
        ValueError: If the two inputs do not have the same shape.
        ZeroDivisionError: If ``test_labels`` is empty.
    """
    # Coerce to arrays so that plain lists are compared element-wise; with
    # lists, ``==`` would compare the whole sequences and yield one bool.
    expected = np.asarray(test_labels)
    predicted = np.asarray(pred_lables)
    if expected.shape != predicted.shape:
        # Without this check, broadcasting could silently compare the wrong
        # elements and report a meaningless score.
        raise ValueError(
            'label shapes differ: %s vs %s' % (expected.shape, predicted.shape)
        )
    correct = np.sum(expected == predicted)
    n = len(expected)
    return float(correct) / n

#------------------------------------------------------------------------------
#逻辑回归
#------------------------------------------------------------------------------
def testLR(features, labels):
    """Print the per-fold accuracies of logistic regression.

    A ``LogisticRegression`` with default settings is fitted on the training
    part of each of three shuffled folds and scored on the held-out part.

    Args:
        features: Feature rows, indexable by an array of row indices.
        labels: Labels aligned with ``features``, indexable the same way.
    """
    # NOTE(review): this is the legacy sklearn.cross_validation.KFold call
    # signature (sample count + n_folds); newer scikit-learn releases expose
    # KFold from sklearn.model_selection with a different API -- confirm the
    # installed version before changing it.
    folds = KFold(len(features), n_folds=3, shuffle=True)
    model = LogisticRegression()
    fold_scores = []
    for train_idx, test_idx in folds:
        model.fit(features[train_idx], labels[train_idx])
        predicted = model.predict(features[test_idx])
        fold_scores.append(accuracy(labels[test_idx], predicted))
    print(fold_scores)

#------------------------------------------------------------------------------
#朴素贝叶斯
#------------------------------------------------------------------------------
def testNaiveBayes(features, labels):
    """Print the per-fold accuracies of Gaussian naive Bayes.

    A ``GaussianNB`` with default settings is fitted on the training part of
    each of three shuffled folds and scored on the held-out part.

    Args:
        features: Feature rows, indexable by an array of row indices.
        labels: Labels aligned with ``features``, indexable the same way.
    """
    # NOTE(review): legacy sklearn.cross_validation.KFold signature; newer
    # scikit-learn releases use sklearn.model_selection.KFold -- confirm.
    folds = KFold(len(features), n_folds=3, shuffle=True)
    model = GaussianNB()
    fold_scores = []
    for train_idx, test_idx in folds:
        model.fit(features[train_idx], labels[train_idx])
        predicted = model.predict(features[test_idx])
        fold_scores.append(accuracy(labels[test_idx], predicted))
    print(fold_scores)

#------------------------------------------------------------------------------
#K最近邻
#------------------------------------------------------------------------------
def testKNN(features, labels):
    """Print the per-fold accuracies of a 5-nearest-neighbours classifier.

    A ``KNeighborsClassifier`` with ``n_neighbors=5`` is fitted on the
    training part of each of three shuffled folds and scored on the held-out
    part.

    Args:
        features: Feature rows, indexable by an array of row indices.
        labels: Labels aligned with ``features``, indexable the same way.
    """
    # NOTE(review): legacy sklearn.cross_validation.KFold signature; newer
    # scikit-learn releases use sklearn.model_selection.KFold -- confirm.
    folds = KFold(len(features), n_folds=3, shuffle=True)
    model = KNeighborsClassifier(n_neighbors=5)
    fold_scores = []
    for train_idx, test_idx in folds:
        model.fit(features[train_idx], labels[train_idx])
        predicted = model.predict(features[test_idx])
        fold_scores.append(accuracy(labels[test_idx], predicted))
    print(fold_scores)

#------------------------------------------------------------------------------
#--- 支持向量机
#------------------------------------------------------------------------------
def testSVM(features, labels):
    """Print the per-fold accuracies of a support vector classifier.

    An ``svm.SVC`` with default settings is fitted on the training part of
    each of three shuffled folds and scored on the held-out part.

    Args:
        features: Feature rows, indexable by an array of row indices.
        labels: Labels aligned with ``features``, indexable the same way.
    """
    # NOTE(review): legacy sklearn.cross_validation.KFold signature; newer
    # scikit-learn releases use sklearn.model_selection.KFold -- confirm.
    folds = KFold(len(features), n_folds=3, shuffle=True)
    model = svm.SVC()
    fold_scores = []
    for train_idx, test_idx in folds:
        model.fit(features[train_idx], labels[train_idx])
        predicted = model.predict(features[test_idx])
        fold_scores.append(accuracy(labels[test_idx], predicted))
    print(fold_scores)

#------------------------------------------------------------------------------
#--- 决策树
#------------------------------------------------------------------------------
def testDecisionTree(features, labels):
    """Print the per-fold accuracies of a decision tree classifier.

    A ``DecisionTreeClassifier`` with default settings is fitted on the
    training part of each of three shuffled folds and scored on the held-out
    part.

    Args:
        features: Feature rows, indexable by an array of row indices.
        labels: Labels aligned with ``features``, indexable the same way.
    """
    # NOTE(review): legacy sklearn.cross_validation.KFold signature; newer
    # scikit-learn releases use sklearn.model_selection.KFold -- confirm.
    folds = KFold(len(features), n_folds=3, shuffle=True)
    model = DecisionTreeClassifier()
    fold_scores = []
    for train_idx, test_idx in folds:
        model.fit(features[train_idx], labels[train_idx])
        predicted = model.predict(features[test_idx])
        fold_scores.append(accuracy(labels[test_idx], predicted))
    print(fold_scores)

#------------------------------------------------------------------------------
#--- 随机森林
#------------------------------------------------------------------------------
def testRandomForest(features, labels):
    """Print the per-fold accuracies of a random forest classifier.

    A ``RandomForestClassifier`` with default settings is fitted on the
    training part of each of three shuffled folds and scored on the held-out
    part.

    Args:
        features: Feature rows, indexable by an array of row indices.
        labels: Labels aligned with ``features``, indexable the same way.
    """
    # NOTE(review): legacy sklearn.cross_validation.KFold signature; newer
    # scikit-learn releases use sklearn.model_selection.KFold -- confirm.
    folds = KFold(len(features), n_folds=3, shuffle=True)
    model = RandomForestClassifier()
    fold_scores = []
    for train_idx, test_idx in folds:
        model.fit(features[train_idx], labels[train_idx])
        predicted = model.predict(features[test_idx])
        fold_scores.append(accuracy(labels[test_idx], predicted))
    print(fold_scores)

if __name__ == '__main__':
    # Load the seeds data set and show the parsed feature matrix.
    features, labels = load_csv_data('data/seeds_dataset.txt')
    print(features)

    # Each entry pairs the heading printed before a run with the function
    # that cross-validates the corresponding classifier; order is preserved.
    experiments = (
        ('LogisticRegression: \r', testLR),
        ('GaussianNB: \r', testNaiveBayes),
        ('KNN: \r', testKNN),
        ('SVM: \r', testSVM),
        ('Decision Tree: \r', testDecisionTree),
        ('Random Forest: \r', testRandomForest),
    )
    for heading, run_experiment in experiments:
        print(heading)
        run_experiment(features, labels)


在 Spyder 中调试运行,结果如下。程序先打印特征矩阵,然后对每种分类器打印一个列表,列表中的三个数值分别是 3 折交叉验证中各折的准确率。

runfile('E:/MyProject/_python/ScikitLearn/demo_clf.py', wdir='E:/MyProject/_python/ScikitLearn')
[[ 15.26    14.84     0.871  ...,   3.312    2.221    5.22  ]
[ 14.88    14.57     0.8811 ...,   3.333    1.018    4.956 ]
[ 14.29    14.09     0.905  ...,   3.337    2.699    4.825 ]
...,
[ 13.2     13.66     0.8883 ...,   3.232    8.315    5.056 ]
[ 11.84    13.21     0.8521 ...,   2.836    3.598    5.044 ]
[ 12.3     13.34     0.8684 ...,   2.974    5.637    5.063 ]]
LogisticRegression:
[0.9142857142857143, 0.9714285714285714, 0.8857142857142857]
GaussianNB:
[0.9428571428571428, 0.8714285714285714, 0.9]
KNN:
[0.9285714285714286, 0.8571428571428571, 0.8857142857142857]
SVM:
[0.9, 0.9285714285714286, 0.8571428571428571]
Decision Tree:
[0.8714285714285714, 0.9714285714285714, 0.9142857142857143]
Random Forest:
[0.8857142857142857, 0.9142857142857143, 0.8428571428571429]
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: