您的位置:首页 > 编程语言 > Python开发

kaggle Digit Recognizer 数字识别

2017-07-06 16:17 375 查看
https://www.kaggle.com/c/digit-recognizer

首先看一下提供的训练文件train.csv

import pandas as pd

# Load the Kaggle training CSV into a DataFrame.
trainingFile = pd.read_csv('C:/Users/Administrator/Desktop/train.csv')

# Preview the first five rows; the string literal below records the
# output that was observed (it is a no-op expression, not executed code).
print(trainingFile.head(5))
'''
label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0
1      0       0       0       0       0       0       0       0       0
2      1       0       0       0       0       0       0       0       0
3      4       0       0       0       0       0       0       0       0
4      0       0       0       0       0       0       0       0       0

pixel8    ...     pixel774  pixel775  pixel776  pixel777  pixel778  \
0       0    ...            0         0         0         0         0
1       0    ...            0         0         0         0         0
2       0    ...            0         0         0         0         0
3       0    ...            0         0         0         0         0
4       0    ...            0         0         0         0         0

pixel779  pixel780  pixel781  pixel782  pixel783
0         0         0         0         0         0
1         0         0         0         0         0
2         0         0         0         0         0
3         0         0         0         0         0
4         0         0         0         0         0

[5 rows x 785 columns]
'''
# Print the number of rows (samples) in the training file.
print(trainingFile.shape[0])
'''
42000
'''
根据它的描述可以知道：label 是指这个数字是几，pixel0～pixel783 是指 784 个像素点，共 42000 条数据。

首先尝试用kNN算法

点击打开kNN.py

先让前 41900 个数据当训练集，后 100 个用作测试，看看正确率。

import numpy as np
import pandas as pd

import kNN

# Load the data
def loadDataSet(csvPath='C:/Users/Administrator/Desktop/train.csv',
                numTrain=41900):
    """Load the labelled CSV and split it into a training and a hold-out set.

    Every pixel value greater than 0 is replaced by 1, so the returned
    feature arrays only contain 0 and 1.

    Args:
        csvPath: Path of the labelled CSV file. It must contain a ``label``
            column; all other columns are used as features.
        numTrain: Number of leading rows used for training. The remaining
            rows form the hold-out set.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of NumPy arrays.
    """
    # Get the training set
    print('获取训练集...')

    # The file is parsed once; both splits are slices of the same arrays.
    trainingFile = pd.read_csv(csvPath)
    # ``columns=`` is used because pandas 2.x rejects a positional ``axis``.
    pixels = np.array(trainingFile.drop(columns='label'))
    pixels[pixels > 0] = 1
    labels = np.array(trainingFile['label'])

    train_x = pixels[:numTrain]
    train_y = labels[:numTrain]

    # Get the test set
    print('获取测试集...')

    test_x = pixels[numTrain:]
    test_y = labels[numTrain:]

    return train_x, train_y, test_x, test_y

# Handwritten-digit test
def testHandWritingClass():
    """Classify the hold-out samples with kNN (k=3) and print the accuracy.

    The data comes from ``loadDataSet()``; every hold-out sample is passed to
    ``kNN.kNNClassify`` together with the full training set, and the share of
    predictions equal to the true label is printed as a percentage.
    """
    # Load the data
    print('加载数据...')

    train_x, train_y, test_x, test_y = loadDataSet()

    # Training
    print('训练中...')

    # Nothing to do here: the classifier is handed the whole training set
    # on every prediction call below.

    # Testing
    print('测试中...')

    numTestSamples = len(test_x)
    matchCount = 0
    for sample, expected in zip(test_x, test_y):
        predict = kNN.kNNClassify(sample, train_x, train_y, 3)
        if predict == expected:
            matchCount += 1

    # Guard against an empty hold-out set instead of dividing by zero.
    accuracy = float(matchCount) / numTestSamples if numTestSamples else 0.0

    # Output the result
    print('输出结果...')

    print('分类准确率为: %.2f%%' % (accuracy * 100))


if __name__ == '__main__':
    testHandWritingClass()
输出结果:
加载数据...
获取训练集...
获取测试集...
训练中...
测试中...
输出结果...
分类准确率为: 99.00%


看正确率还不错，于是直接把整个 train.csv 作为训练集，对 test.csv 中的每个样本进行预测，并照着 sample_submission.csv 的格式把答案存到 result.csv。
import numpy as np
import pandas as pd

import kNN

# Load the data
def loadDataSet(trainPath='C:/Users/Administrator/Desktop/train.csv',
                testPath='C:/Users/Administrator/Desktop/test.csv'):
    """Load the full training CSV and the unlabelled test CSV.

    Every pixel value greater than 0 is replaced by 1 in both feature arrays.

    Args:
        trainPath: Path of the labelled CSV; it must contain a ``label``
            column, all other columns are used as features.
        testPath: Path of the unlabelled CSV; every column is a feature.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. ``test_y`` is always
        an empty list because the test file carries no labels.
    """
    # Get the training set
    print('获取训练集...')

    trainingFile = pd.read_csv(trainPath)
    # ``columns=`` is used because pandas 2.x rejects a positional ``axis``.
    train_x = np.array(trainingFile.drop(columns='label'))
    train_x[train_x > 0] = 1
    train_y = np.array(trainingFile['label'])

    # Get the test set
    print('获取测试集...')

    testingFile = pd.read_csv(testPath)
    test_x = np.array(testingFile)
    test_x[test_x > 0] = 1
    test_y = []

    return train_x, train_y, test_x, test_y

# Handwritten-digit test
def testHandWritingClass():
    """Predict every test sample with kNN (k=4) and write the results to CSV.

    The data comes from ``loadDataSet()``. Each prediction is stored with a
    1-based ``ImageId`` and the table is written, without the index column,
    to ``C:/Users/Administrator/Desktop/result.csv``.
    """
    # Load the data
    print('加载数据...')

    train_x, train_y, test_x, test_y = loadDataSet()

    # Training
    print('训练中...')

    # Nothing to do here: the classifier is handed the whole training set
    # on every prediction call below.

    # Testing
    print('测试中...')

    numTestSamples = len(test_x)
    result = []
    for i, sample in enumerate(test_x):
        predict = kNN.kNNClassify(sample, train_x, train_y, 4)
        # ImageId in the output file starts at 1.
        result.append([i + 1, predict])
        # Report progress every 100 samples.
        if i % 100 == 0:
            print('进度:', i, '/', numTestSamples)

    # Output the result
    print('输出结果...')

    pd.DataFrame(result, columns=['ImageId', 'Label']).to_csv(
        'C:/Users/Administrator/Desktop/result.csv', index=False)


if __name__ == '__main__':
    testHandWritingClass()

最后把result.csv提交分数为0.96543

使用scikit-learn库的kNN
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

# Load the data
def loadDataSet(trainPath='C:/Users/Administrator/Desktop/train.csv',
                testPath='C:/Users/Administrator/Desktop/test.csv'):
    """Load the full training CSV and the unlabelled test CSV, binarized.

    Both feature arrays are passed through scikit-learn's ``Binarizer`` with
    its default settings.

    Args:
        trainPath: Path of the labelled CSV; it must contain a ``label``
            column, all other columns are used as features.
        testPath: Path of the unlabelled CSV; every column is a feature.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. ``test_y`` is always
        an empty list because the test file carries no labels.
    """
    # Get the training set
    print('获取训练集...')

    trainingFile = pd.read_csv(trainPath)
    # ``columns=`` is used because pandas 2.x rejects a positional ``axis``.
    train_x = np.array(trainingFile.drop(columns='label'))
    # ``fit`` alone only returns the estimator; the transformed array has to
    # be taken from ``fit_transform`` or the data stays un-binarized.
    train_x = preprocessing.Binarizer().fit_transform(train_x)
    train_y = np.array(trainingFile['label'])

    # Get the test set
    print('获取测试集...')

    testingFile = pd.read_csv(testPath)
    test_x = np.array(testingFile)
    test_x = preprocessing.Binarizer().fit_transform(test_x)
    test_y = []

    return train_x, train_y, test_x, test_y

# Handwritten-digit test
def testHandWritingClass():
    """Fit scikit-learn's kNN classifier and write its predictions to CSV.

    The data comes from ``loadDataSet()``. A ``KNeighborsClassifier`` with
    default settings is fitted on the training set, the test set is
    predicted in one call, and the predictions are written with a 1-based
    ``ImageId`` to ``C:/Users/Administrator/Desktop/result.csv``.
    """
    # Load the data
    print('加载数据...')

    train_x, train_y, test_x, test_y = loadDataSet()

    # Training
    print('训练中...')

    model = KNeighborsClassifier()
    model.fit(train_x, train_y)

    # Testing
    print('测试中...')

    predict = model.predict(test_x)

    # Output the result
    print('输出结果...')

    # Pair each prediction with its 1-based row number.
    result = list(enumerate(predict, 1))
    pd.DataFrame(result, columns=['ImageId', 'Label']).to_csv(
        'C:/Users/Administrator/Desktop/result.csv', index=False)


if __name__ == '__main__':
    testHandWritingClass()
最后把result.csv提交分数为0.96800
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息