您的位置:首页 > 其它

sklearn查看数据分布

2017-02-10 19:57 337 查看
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

# Load the training table; the code below requires an 'id' and a 'species' column.
train_data = pd.read_csv("train.csv")
# Keep the raw per-row species names (one entry per sample) before the column
# is removed from the frame.
LABELS = train_data['species']

# Pop the 'id' column out of train_data so it is not used as a feature.
ID = train_data.pop('id')

# Pop the 'species' column out of train_data; the remaining columns are the
# inputs passed to the scaler below.
y = train_data.pop('species')
# Encode the species names as integer class ids.
y = LabelEncoder().fit(y).transform(y)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print y` statement is a SyntaxError on Python 3.
print(y)

# standardize the data by setting the mean to 0 and std to 1
standardize = True
X = StandardScaler().fit(train_data).transform(train_data) if standardize else train_data.values
# Show the first row as a quick sanity check.
print(X[0:1])

from sklearn.decomposition import PCA, IncrementalPCA
# Reduce X to two components with both estimators so the two projections can
# be compared: IncrementalPCA fits in batches of 10 samples, PCA fits in one go.
n_components = 2
ipca, pca = (
    IncrementalPCA(batch_size=10, n_components=n_components),
    PCA(n_components=n_components),
)

X_ipca = ipca.fit_transform(X)
X_pca = pca.fit_transform(X)

# Ten-colour palette repeated three times (30 entries in total); the colours
# start over after every tenth entry.
colors = [
    'navy', 'turquoise', 'darkorange', 'blue', 'purple',
    'green', 'yellow', 'red', 'pink', 'palegoldenrod',
] * 3

# Class names indexed by the integer ids stored in `y`. LabelEncoder numbers
# the classes by their sorted unique values, which is the order np.unique
# returns. Iterating LABELS directly would instead yield the species of the
# first rows of the file, so legend entries would not match the plotted class.
class_names = np.unique(LABELS)
# Only the classes with ids 0..24 are drawn.
n_classes_shown = 25

# One figure per projection: first the IncrementalPCA result, then the PCA result.
for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
    plt.figure(figsize=(8, 8))
    # zip stops at its shortest input, so at most n_classes_shown classes
    # (and never more than there are colours or class names) are plotted.
    for color, i, target_name in zip(colors, range(n_classes_shown), class_names):
        members = y == i
        plt.scatter(X_transformed[members, 0], X_transformed[members, 1],
                    color=color, lw=2, label=target_name)

    # NOTE(review): the titles say "iris dataset" although the data is read
    # from train.csv -- confirm the intended wording.
    if "Incremental" in title:
        # Absolute values are compared so that a sign difference between the
        # two projections does not count as an error.
        err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
        plt.title(title + " of iris dataset\nMean absolute unsigned error "
                  "%.6f" % err)
    else:
        plt.title(title + " of iris dataset")
    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.axis([-10, 10, -10, 10])

plt.show()






import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

# Load the training table; the code below requires an 'id' and a 'species' column.
train_data = pd.read_csv("train.csv")
# Keep the raw per-row species names (one entry per sample) before the column
# is removed from the frame.
LABELS = train_data['species']

# Pop the 'id' column out of train_data so it is not used as a feature.
ID = train_data.pop('id')

# Pop the 'species' column out of train_data; the remaining columns are the
# inputs passed to the scaler below.
y = train_data.pop('species')
# Encode the species names as integer class ids.
y = LabelEncoder().fit(y).transform(y)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print y` statement is a SyntaxError on Python 3.
print(y)

# standardize the data by setting the mean to 0 and std to 1
standardize = True
X = StandardScaler().fit(train_data).transform(train_data) if standardize else train_data.values
# Show the first row as a quick sanity check.
print(X[0:1])

from sklearn.decomposition import PCA, IncrementalPCA
# Reduce X to two components with both estimators so the two projections can
# be compared: IncrementalPCA fits in batches of 10 samples, PCA fits in one go.
n_components = 2
ipca, pca = (
    IncrementalPCA(batch_size=10, n_components=n_components),
    PCA(n_components=n_components),
)

X_ipca = ipca.fit_transform(X)
X_pca = pca.fit_transform(X)

# Ten-colour palette repeated three times (30 entries in total); the colours
# start over after every tenth entry.
colors = [
    'navy', 'turquoise', 'darkorange', 'blue', 'purple',
    'green', 'yellow', 'red', 'pink', 'palegoldenrod',
] * 3

# Class names indexed by the integer ids stored in `y`. LabelEncoder numbers
# the classes by their sorted unique values, which is the order np.unique
# returns. Iterating LABELS directly would instead yield the species of the
# first rows of the file, so each scatter label would not match its class.
class_names = np.unique(LABELS)
# Only the classes with ids 0..24 are drawn.
n_classes_shown = 25

# One figure per projection: first the IncrementalPCA result, then the PCA result.
for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
    plt.figure(figsize=(8, 8))
    # zip stops at its shortest input, so at most n_classes_shown classes
    # (and never more than there are colours or class names) are plotted.
    for color, i, target_name in zip(colors, range(n_classes_shown), class_names):
        members = y == i
        plt.scatter(X_transformed[members, 0], X_transformed[members, 1],
                    color=color, lw=2, label=target_name)

    # NOTE(review): the titles say "iris dataset" although the data is read
    # from train.csv -- confirm the intended wording.
    if "Incremental" in title:
        # Absolute values are compared so that a sign difference between the
        # two projections does not count as an error.
        err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
        plt.title(title + " of iris dataset\nMean absolute unsigned error "
                  "%.6f" % err)
    else:
        plt.title(title + " of iris dataset")
    # No legend is drawn in this version of the plot; the scatter labels are
    # still set so a legend can be added later without touching the loop.
    plt.axis([-10, 10, -10, 10])

plt.show()






如果帮到你了,请赞赏支持:

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  数据可视化