2017. 8. 17. 21:02 · Server Programming
Linear Regression
import sklearn
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error  # MSE: mean squared error
from sklearn.metrics import r2_score  # coefficient of determination (R^2)
import seaborn as sns
df = pd.read_csv(
"https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/housing/housing.data",
delimiter=r"\s+",
names=["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"])
df.head()
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
df.shape
(506, 14)
df[pd.isnull(df).any(axis=1)]  # rows containing any missing value; the empty result below means there are none
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
sns.set(style='whitegrid',context='notebook')
cols = ['LSTAT','INDUS','NOX','RM','MEDV']
sns.pairplot(df[cols], height=2.5)  # 'size' was renamed to 'height' in seaborn 0.9+
plt.show()  # display the scatterplot matrix
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.5)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
annot_kws={'size':15},yticklabels=cols,xticklabels=cols)
plt.show()  # display the correlation matrix
# LSTAT has the strongest correlation with MEDV (though the scatterplot matrix showed the relationship as nonlinear)
# RM and MEDV show a linear relationship in the scatterplot matrix
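The ranking the heatmap shows can also be read off numerically. As an illustrative aside (not in the original post), this sorts every feature by the absolute value of its correlation with MEDV:
# Illustrative sketch: rank features by |correlation| with the target MEDV
corr_with_medv = df.corr()['MEDV'].drop('MEDV')
print(corr_with_medv.reindex(corr_with_medv.abs().sort_values(ascending=False).index))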
# explanatory variables (features)
X = df[["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT"]]
# target variable (response): the variable we want to predict from the explanatory variables
y = df[["MEDV"]]
# split the data into training and test sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20, random_state=5)
reg = linear_model.LinearRegression()
# train (estimate the model)
reg.fit(X_train,y_train)
print(reg.intercept_)  # estimated intercept
print(reg.coef_)  # estimated weight vector (hyperplane coefficients)
[ 37.91248701]
[[ -1.30799852e-01 4.94030235e-02 1.09535045e-03 2.70536624e+00
-1.59570504e+01 3.41397332e+00 1.11887670e-03 -1.49308124e+00
3.64422378e-01 -1.31718155e-02 -9.52369666e-01 1.17492092e-02
-5.94076089e-01]]
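The raw coefficient array above is hard to read on its own. A small illustrative sketch (not in the original post) that pairs each estimated weight with its feature name:
# Illustrative sketch: label each coefficient with its feature name
coef_table = pd.Series(reg.coef_.ravel(), index=X.columns)
print(coef_table.sort_values())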
print(X_train.shape)
print(X_test.shape)
(404, 13)
(102, 13)
# generate predictions to check how well the regression model was built
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
# residual plot: a graphical diagnostic for the regression model
plt.scatter(y_train_pred,y_train_pred-y_train, c='blue',marker='o',label='Training data')
plt.scatter(y_test_pred,y_test_pred-y_test, c='lightgreen',marker='s',label='Test data')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.hlines(y=0,xmin=-10,xmax=50,lw=2,color='red')
plt.xlim([-10,50])
plt.show()
# measure goodness of fit
# print the mean squared error (MSE)
print('MSE train:%.3f, test:%.3f'%(
mean_squared_error(y_train,y_train_pred),
mean_squared_error(y_test,y_test_pred)))
# print the coefficient of determination (R^2): how well the fitted model explains the given data
print('R^2 train:%.3f, test:%.3f'%(
r2_score(y_train,y_train_pred),
r2_score(y_test,y_test_pred)))
MSE train:22.477, test:20.869
R^2 train:0.738, test:0.733
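MSE is in squared units of the target, so it is hard to interpret directly; taking the square root puts the error back on the scale of MEDV (thousands of dollars). An illustrative sketch using the arrays already computed above:
# Illustrative sketch: RMSE reads as average error in MEDV's own units
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('RMSE train:%.3f, test:%.3f' % (rmse_train, rmse_test))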
y_test_m = y_test.to_numpy()  # .as_matrix() was removed from pandas; use .to_numpy()
plt.figure(figsize=(15,10))
plt.plot(y_test_m)
plt.plot(y_test_pred)
legend_list = ['y_test_m','y_test_pred']
plt.legend(legend_list, loc=4, fontsize=25)
plt.show()
---------------------------------------------------
Classification: Decision Tree
import sklearn
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import Image
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
df = pd.read_table(
"https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data",
sep=',',
names=["buying","maint","doors","persons","lug_boot","safety","eval"])
df.shape
(1728, 7)
df[pd.isnull(df).any(axis=1)]  # again an empty result, so no missing values
| buying | maint | doors | persons | lug_boot | safety | eval |
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc |
car_counts = pd.DataFrame(df['eval'].value_counts())
car_counts['Percentage'] = car_counts['eval'] / car_counts['eval'].sum()
car_counts.head()
| eval | Percentage |
unacc | 1210 | 0.700231 |
acc | 384 | 0.222222 |
good | 69 | 0.039931 |
vgood | 65 | 0.037616 |
plt.figure(figsize=(8,8))
plt.pie(car_counts["Percentage"], labels=['Unacceptable','Acceptable','Good','Very Good'])
plt.show()
le = preprocessing.LabelEncoder()
encoded_buying = le.fit(df['buying'])
encoded_buying.classes_
array(['high', 'low', 'med', 'vhigh'], dtype=object)
encoded_buying.transform(['high'])
encoded_buying.transform(['low'])
encoded_buying.transform(['med'])
encoded_buying.transform(['vhigh'])
encoded_buying.inverse_transform([1])[0]  # newer scikit-learn expects an array-like here
'low'
for i in range(0,4):
    print(i, ":", encoded_buying.inverse_transform([i])[0])
0 : high
1 : low
2 : med
3 : vhigh
df['e.buying'] = df['buying'].map(lambda x:encoded_buying.transform([x]))
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | [3] |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | [3] |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | [3] |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | [3] |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | [3] |
df['e.buying'] = df['e.buying'].map(lambda x:x[0])
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | 3 |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | 3 |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | 3 |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | 3 |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | 3 |
encoded_maint = le.fit(df['maint'])
encoded_maint.classes_
array(['high', 'low', 'med', 'vhigh'], dtype=object)
df['e.maint'] = df['maint'].map(lambda x:encoded_maint.transform([x]))
df['e.maint'] = df['e.maint'].map(lambda x:x[0])
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying | e.maint |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | 3 | 3 |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | 3 | 3 |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | 3 | 3 |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | 3 | 3 |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | 3 | 3 |
def encode_col(col_name):
    # label-encode df[col_name] into a new numeric column named "e.<col_name>"
    encodes = le.fit(df[col_name])
    new_col_name = "e." + col_name
    df[new_col_name] = df[col_name].map(lambda x: encodes.transform([x]))
    df[new_col_name] = df[new_col_name].map(lambda x: x[0])
    return
encode_col('doors')
encode_col('persons')
encode_col('lug_boot')
encode_col('safety')
encode_col('eval')
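Calling encode_col once per column works; as an illustrative alternative (not in the original post), LabelEncoder's fit_transform collapses the fit, transform, and unwrap steps into one line per column:
# Illustrative alternative: encode each column in a single pass with fit_transform
for col in ["buying", "maint", "doors", "persons", "lug_boot", "safety", "eval"]:
    df["e." + col] = preprocessing.LabelEncoder().fit_transform(df[col])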
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying | e.maint | e.doors | e.persons | e.lug_boot | e.safety | e.eval |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | 3 | 3 | 0 | 0 | 2 | 1 | 2 |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | 3 | 3 | 0 | 0 | 2 | 2 | 2 |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | 3 | 3 | 0 | 0 | 2 | 0 | 2 |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | 3 | 3 | 0 | 0 | 1 | 1 | 2 |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | 3 | 3 | 0 | 0 | 1 | 2 | 2 |
pd.DataFrame(df['eval'].value_counts())
| eval |
unacc | 1210 |
acc | 384 |
good | 69 |
vgood | 65 |
pd.DataFrame(df['e.eval'].value_counts())
| e.eval |
2 | 1210 |
0 | 384 |
1 | 69 |
3 | 65 |
X = df[['e.buying','e.maint','e.doors','e.persons','e.lug_boot', 'e.safety']]
print(type(X))
X.shape
<class 'pandas.core.frame.DataFrame'>
(1728, 6)
y = df['e.eval']
print(type(y))
y.shape
<class 'pandas.core.series.Series'>
(1728,)
X_train,X_test,y_train,y_test = train_test_split(X,y,
test_size=0.25,
random_state=5)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
(1296, 6)
(1296,)
(432, 6)
(432,)
# train using a decision tree classifier
clf_dt = tree.DecisionTreeClassifier(random_state=10)
clf_dt.fit(X_train,y_train)  # run the training
y_pred_dt = clf_dt.predict(X_test)  # generate predictions
print(type(y_pred_dt))
print(y_pred_dt.shape)
y_pred_dt
<class 'numpy.ndarray'>
(432,)
array([0, 2, 0, 0, 0, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0,
2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2,
2, 3, 2, 2, 0, 1, 2, 2, 2, 0, 2, 0, 1, 3, 2, 2, 1, 2, 2, 2, 0, 0, 0,
2, 0, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 0, 0, 1, 2, 2, 2,
2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2,
0, 2, 0, 2, 2, 2, 2, 3, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 0, 2, 0, 2,
2, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,
2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 3, 2, 2,
2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 0, 0, 2, 2, 0, 1, 2,
2, 2, 2, 0, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2,
0, 3, 2, 0, 2, 1, 2, 0, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2,
2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 1, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0,
2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2,
2, 0, 2, 0, 2, 1, 2, 2, 1, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0,
2, 0, 2, 0, 2, 2, 0, 2, 0, 3, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,
2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0])
metrics.accuracy_score(y_test,y_pred_dt)
0.96527777777777779
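An accuracy of about 0.965 comes from a single train/test split, so it can be optimistic or pessimistic depending on the split. An illustrative sketch (not in the original post) that cross-validates the same tree over five folds:
# Illustrative sketch: 5-fold cross-validated accuracy for the same classifier
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(tree.DecisionTreeClassifier(random_state=10), X, y, cv=5)
print("CV accuracy: %.3f (+/- %.3f)" % (cv_scores.mean(), cv_scores.std()))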
correct_pred_dt = []
wrong_pred_dt = []
y_test2 = y_test.reset_index(drop=True)
y_test2 = y_test2.to_numpy()  # .as_matrix() was removed from pandas
for i in range(0,432):
    if y_test2[i] != y_pred_dt[i]:  # the prediction was wrong
        wrong_pred_dt.append(i)
    else:
        correct_pred_dt.append(i)
print("Correctly identified labels:",len(correct_pred_dt))
print(" ")
print("Wrongly identified labels:",len(wrong_pred_dt))
print("----------------------")
print(y_test[10:20])
print(y_pred_dt[10:20])
wrong_pred_dt
Correctly identified labels: 417
Wrongly identified labels: 15
----------------------
464 2
269 0
1081 2
871 2
825 2
342 2
1273 2
1061 2
346 2
1090 0
Name: e.eval, dtype: int64
[2 0 2 2 2 2 2 2 2 2]
[19, 41, 45, 54, 74, 101, 104, 167, 168, 226, 259, 273, 328, 369, 428]
def dt_probs(index_num):
    # look up one test sample by its original dataframe index
    X_param = X_test.loc[index_num]  # .ix was removed from pandas; use .loc
    X_param = X_param.to_frame()
    X_param = X_param.transpose()
    temp_pred = clf_dt.predict_proba(X_param)
    temp_pred_1 = temp_pred[0]
    y_actual = y_test[index_num]
    # order must match the LabelEncoder's alphabetical classes: acc, good, unacc, vgood
    y_range = ['Acceptable','Good','Unacceptable','Very Good']
    print("======================================")
    print("For index number:",index_num)
    print(" ")
    print("Features entered:")
    print(X_param)
    print(" ")
    print("Actual score:")
    print(y_actual,"(",y_range[y_actual],")")
    print(" ")
    print("Predicted probabilities:")
    for i in range(0,4):
        print(y_range[i],":",temp_pred_1[i])
    return
dt_probs(805)
dt_probs(50)
======================================
For index number: 805
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
805 0 1 1 2 1 2
Actual score:
0 ( Acceptable )
Predicted probabilities:
Acceptable : 1.0
Good : 0.0
Unacceptable : 0.0
Very Good : 0.0
======================================
For index number: 50
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
50 3 3 1 2 1 0
Actual score:
2 ( Unacceptable )
Predicted probabilities:
Acceptable : 0.0
Good : 0.0
Unacceptable : 1.0
Very Good : 0.0
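The probabilities of exactly 1.0 reflect that each test sample lands in a pure leaf of the tree. The learned rules themselves can be dumped as text; an illustrative sketch assuming scikit-learn 0.21 or newer, where export_text was introduced:
# Illustrative sketch: print the decision rules the tree learned
from sklearn.tree import export_text
print(export_text(clf_dt, feature_names=list(X.columns)))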
---------------------------------
Classification: kNN
import sklearn
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import Image
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
df = pd.read_table(
"https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data",
sep=',',
names=["buying","maint","doors","persons","lug_boot","safety","eval"])
le = preprocessing.LabelEncoder()
def encode_col(col_name):
    encodes = le.fit(df[col_name])
    new_col_name = "e." + col_name
    df[new_col_name] = df[col_name].map(lambda x: encodes.transform([x]))
    df[new_col_name] = df[new_col_name].map(lambda x: x[0])
    return
encode_col('buying')
encode_col('maint')
encode_col('doors')
encode_col('persons')
encode_col('lug_boot')
encode_col('safety')
encode_col('eval')
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying | e.maint | e.doors | e.persons | e.lug_boot | e.safety | e.eval |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | 3 | 3 | 0 | 0 | 2 | 1 | 2 |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | 3 | 3 | 0 | 0 | 2 | 2 | 2 |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | 3 | 3 | 0 | 0 | 2 | 0 | 2 |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | 3 | 3 | 0 | 0 | 1 | 1 | 2 |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | 3 | 3 | 0 | 0 | 1 | 2 | 2 |
X = df[['e.buying','e.maint','e.doors','e.persons','e.lug_boot','e.safety']]
y = df['e.eval']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25, random_state=5)
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_knn.fit(X_train,y_train)
y_pred = clf_knn.predict(X_test)
print(metrics.accuracy_score(y_test,y_pred))
0.898148148148
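n_neighbors=5 is simply the default, and a different k may suit this data better. An illustrative sketch (not in the original post) that sweeps a few odd values of k on the same split:
# Illustrative sketch: test-set accuracy for several choices of k
for k in [1, 3, 5, 7, 9, 11]:
    clf_k = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print(k, metrics.accuracy_score(y_test, clf_k.predict(X_test)))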
correct_pred = []
wrong_pred = []
y_test2 = y_test.reset_index(drop=True)
y_test2 = y_test2.to_numpy()  # .as_matrix() was removed from pandas
for i in range(0,432):
    if y_test2[i] != y_pred[i]:
        wrong_pred.append(i)
    else:
        correct_pred.append(i)
print("Correctly identified labels:",len(correct_pred))
print(" ")
print("Wrongly identified labels:",len(wrong_pred))
Correctly identified labels: 388
Wrongly identified labels: 44
y_test3 = y_test.to_frame()
y_test3 = y_test3.reset_index()
y_test4 = y_test3.drop(columns='e.eval')  # the positional axis argument is deprecated in newer pandas
y_test4.head()
| index |
0 | 805 |
1 | 50 |
2 | 1171 |
3 | 1177 |
4 | 395 |
wrong_list = []
for i in wrong_pred:
    wrong_index = y_test4.iloc[i]
    wrong_index1 = wrong_index.iloc[0]  # positional access; plain [0] is deprecated on a Series
    wrong_list.append(wrong_index1)
print(wrong_pred)
print(wrong_list)
[0, 18, 19, 25, 26, 41, 54, 74, 82, 85, 96, 101, 104, 111, 117, 134, 140, 145, 157, 167, 217, 237, 247, 251, 256, 259, 261, 262, 273, 274, 299, 300, 311, 314, 332, 344, 345, 369, 373, 376, 378, 399, 416, 428]
[805, 346, 1090, 556, 668, 1130, 344, 1532, 1529, 1694, 1534, 904, 1631, 1712, 421, 1312, 1279, 1715, 1069, 1198, 1615, 562, 1307, 1618, 1016, 1414, 1441, 1253, 1630, 664, 230, 1538, 1448, 1549, 1235, 880, 107, 1336, 1663, 1685, 823, 1612, 1525, 1522]
def knn_probs(index_num):
    X_param = X_test.loc[index_num]  # .ix was removed from pandas; use .loc
    X_param = X_param.to_frame()
    X_param = X_param.transpose()
    temp_pred = clf_knn.predict_proba(X_param)
    temp_pred_1 = temp_pred[0]
    y_actual = y_test[index_num]
    # order must match the LabelEncoder's alphabetical classes: acc, good, unacc, vgood
    y_range = ['Acceptable','Good','Unacceptable','Very Good']
    print("======================================")
    print("For index number:",index_num)
    print(" ")
    print("Features entered:")
    print(X_param)
    print(" ")
    print("Actual score:")
    print(y_actual,"(",y_range[y_actual],")")
    print(" ")
    print("Predicted probabilities:")
    for i in range(0,4):
        print(y_range[i],":",temp_pred_1[i])
    print(" ")
    if index_num in wrong_list:
        print("Label predicted: Wrongly")
    else:
        print("Label predicted: Correctly")
    return
knn_probs(805)
knn_probs(50)
knn_probs(1171)
======================================
For index number: 805
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
805 0 1 1 2 1 2
Actual score:
0 ( Acceptable )
Predicted probabilities:
Acceptable : 0.4
Good : 0.0
Unacceptable : 0.6
Very Good : 0.0
Label predicted: Wrongly
======================================
For index number: 50
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
50 3 3 1 2 1 0
Actual score:
2 ( Unacceptable )
Predicted probabilities:
Acceptable : 0.4
Good : 0.0
Unacceptable : 0.6
Very Good : 0.0
Label predicted: Correctly
======================================
For index number: 1171
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
1171 2 2 3 1 2 2
Actual score:
0 ( Acceptable )
Predicted probabilities:
Acceptable : 0.4
Good : 0.2
Unacceptable : 0.4
Very Good : 0.0
Label predicted: Correctly
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
cm = confusion_matrix(y_test,y_pred)
# row/column order follows the encoded class values 0..3: acc, good, unacc, vgood
labels = ['Acceptable','Good','Unacceptable','Very Good']
df_cm = pd.DataFrame(cm, index=[i for i in labels],
                     columns=[i for i in labels])
plt.figure(figsize=(10,10))
sns.heatmap(df_cm,annot=True)
plt.xlabel('Predicted',fontsize=20)
plt.ylabel('Actual',fontsize=20)
plt.show()
# accuracy: the fraction of all predictions that are correct
# precision: of the samples predicted as a class, the fraction that truly belong to it
# recall: of the samples truly in a class, the fraction that were found
# f(beta) score: the weighted harmonic mean of precision and recall
scr_clf_knn = precision_recall_fscore_support(y_test,y_pred,
average='weighted')
print("Classfier's precision:"+str(scr_clf_knn[0]))
print("Classfier's recall:"+str(scr_clf_knn[1]))
print("Classfier's fbeta_score:"+str(scr_clf_knn[2]))
print("Classfier's accuracy:"+str(metrics.accuracy_score(y_test,y_pred)))
Classfier's precision:0.89500373968
Classfier's recall:0.898148148148
Classfier's fbeta_score:0.891002785157
Classfier's accuracy:0.898148148148
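The weighted averages above hide how each class behaves; with unacc making up 70% of the data, the rare good and vgood classes are where kNN is most likely to struggle. A closing illustrative sketch (not in the original post) that prints per-class precision, recall, and F1:
# Illustrative sketch: per-class metrics, names listed in encoded-class order (acc, good, unacc, vgood)
print(metrics.classification_report(y_test, y_pred,
      target_names=['Acceptable','Good','Unacceptable','Very Good']))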