# -*- coding: utf-8 -*-
import argparse as ap
import cv2
import imutils
import numpy as np
import os
from sklearn.externals import joblib
from scipy.cluster.vq import *
from sklearn import preprocessing
import math
import sys
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import  CountVectorizer,TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import codecs
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import cross_validation


def load_data(path):
    """Collect image file paths and their class labels from a dataset folder.

    The folder is walked recursively. Every file found is recorded, and its
    label is the name of the directory that directly contains it.

    :param path: path of the root folder holding one sub-folder per category
    :return: tuple ``(img_pathes, labels)`` of two equally long lists: the
        file paths and, for each path, the name of its containing directory
    :raises OSError: if ``path`` is not an existing directory
    """
    if not os.path.isdir(path):
        # os.walk silently yields nothing for a bad root; fail loudly instead.
        raise OSError("not a directory: %r" % (path,))

    img_pathes = []
    labels = []
    for dirpath, dirnames, filenames in os.walk(path):
        # Sorting in place pins the traversal order, which os.walk otherwise
        # leaves to the filesystem, so repeated runs return identical lists.
        dirnames.sort()
        filenames.sort()
        print(dirpath)
        if not filenames:
            continue
        img_pathes.extend(os.path.join(dirpath, name) for name in filenames)
        # normpath drops a trailing separator and basename understands the
        # platform's separators, so the label is always the directory name.
        label = os.path.basename(os.path.normpath(dirpath))
        labels.extend([label] * len(filenames))
    return img_pathes, labels

def _make_sift_extractor():
    """Return a callable mapping an image to ``(keypoints, descriptors)``."""
    if hasattr(cv2, "FeatureDetector_create"):
        # OpenCV 2.4 factory API.
        detector = cv2.FeatureDetector_create("SIFT")
        extractor = cv2.DescriptorExtractor_create("SIFT")

        def extract(image):
            keypoints = detector.detect(image)
            return extractor.compute(image, keypoints)

        return extract

    # The factory functions are gone in OpenCV 3+; SIFT is exposed either in
    # the main module or in the contrib ``xfeatures2d`` module.
    if hasattr(cv2, "SIFT_create"):
        sift = cv2.SIFT_create()
    else:
        sift = cv2.xfeatures2d.SIFT_create()
    return lambda image: sift.detectAndCompute(image, None)


def cal_bow(image_paths, numWords):
    """Extract bag-of-visual-words features for a list of images.

    SIFT descriptors are computed for every image and clustered with k-means
    into a visual vocabulary. Each image is then described by its histogram
    of visual words, re-weighted with TF-IDF and L2-normalised.

    :param image_paths: sequence of image file paths
    :param numWords: number of visual words (k-means clusters) requested
    :return: array of shape ``(len(image_paths), numWords)``, one row per
        image; an image without any SIFT descriptor yields an all-zero row
    :raises ValueError: if ``image_paths`` is empty or no image produced any
        descriptor
    :raises IOError: if an image file cannot be read
    """
    total = len(image_paths)
    if total == 0:
        raise ValueError("image_paths is empty; cannot build a vocabulary")

    extract = _make_sift_extractor()

    # One entry per image: its descriptor matrix, or None when the image
    # produced no descriptors.
    des_list = []
    for i, image_path in enumerate(image_paths):
        im = cv2.imread(image_path)
        print("Extract SIFT of %s image, %d of %d images" % (image_path, i, total))
        if im is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise IOError("cannot read image: %s" % image_path)
        kpts, des = extract(im)
        print(len(kpts))
        if des is not None and len(des) == 0:
            des = None
        des_list.append(des)

    # Stack all descriptors vertically with a single call; stacking pairwise
    # inside a loop would re-copy the growing array on every iteration.
    usable = [des for des in des_list if des is not None]
    if not usable:
        raise ValueError("no SIFT descriptors were found in any image")
    descriptors = np.vstack(usable)

    print("Start k-means: %d words, %d key points" % (numWords, descriptors.shape[0]))
    voc, _ = kmeans(descriptors, numWords, iter=1)

    # Bag-of-words matrix: each row is an image, each column a visual word.
    # The loop below counts how often each visual word occurs per image.
    im_features = np.zeros((total, numWords), "float32")
    for i, des in enumerate(des_list):
        if des is None:
            continue
        # Assign every descriptor to its nearest cluster centre; ``voc``
        # holds the centres produced by k-means.
        words, _ = vq(des, voc)
        im_features[i] = np.bincount(words, minlength=numWords)

    # Perform Tf-Idf vectorization
    nbr_occurences = np.sum(im_features > 0, axis=0)
    idf = np.array(np.log((1.0 * total + 1) / (1.0 * nbr_occurences + 1)), "float32")

    im_features = im_features * idf
    im_features = preprocessing.normalize(im_features, norm='l2')
    return im_features

def train_clf2(train_data, train_tags):
    """Train a linear support-vector classifier on the given samples.

    :param train_data: feature matrix passed to ``LinearSVC.fit``
    :param train_tags: class labels passed to ``LinearSVC.fit``
    :return: the fitted ``LinearSVC`` instance (``C=1100.0``)
    """
    clf = LinearSVC(C=1100.0)
    # The estimator must be fitted before it is returned; calling predict()
    # on an unfitted LinearSVC raises an error.
    clf.fit(train_data, train_tags)
    return clf

def evaluate(actual, pred):
    """Print macro-averaged precision, recall and F1 score of predictions.

    :param actual: ground-truth labels
    :param pred: predicted labels to compare against ``actual``
    """
    # Single-argument print(...) calls emit the same text on Python 2 and 3.
    m_precision = metrics.precision_score(actual, pred, average="macro")
    m_recall = metrics.recall_score(actual, pred, average="macro")
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    m_f1 = metrics.f1_score(actual, pred, average="macro")
    print('f1-score:{0:.8f}'.format(m_f1))
# Extract the image features and save them to disk.
# NOTE(review): the original note says the image size is 20*20, but nothing
# in this script resizes the images -- confirm against the dataset.
path = 'D:/data/Caltech_101/101_part/'
img_pathes, labels = load_data(path)
im_features = cal_bow(img_pathes, numWords=500)
joblib.dump((im_features, labels), "bof.pkl", compress=3)

# Read the features and labels back from the file written above.
im_features, labels = joblib.load("bof.pkl")

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# the same between runs.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    im_features, labels, test_size=0.3, random_state=0)
clf = train_clf2(X_train, y_train)

pred = clf.predict(X_test)
print(pred)
print(y_test)
evaluate(y_test, pred)

