Python Scikit-learn을 활용한 회귀, 분류 예제

2017. 8. 17. 21:02 서버 프로그래밍

Linear Regression


import sklearn

import pandas as pd

import numpy as np

import matplotlib

import matplotlib.pyplot as plt

matplotlib.style.use('ggplot')

%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn import linear_model

from sklearn.metrics import mean_squared_error#MSE:평균제곱오차

from sklearn.metrics import r2_score#결정계수

import seaborn as sns

 

# Load the housing data set. The file is whitespace-separated and has no
# header row, so the 14 column names are supplied explicitly.
housing_url = "https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/housing/housing.data"
df = pd.read_csv(
    housing_url,
    sep=r"\s+",
    header=None,
    names=["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"],
)
df.head()

 

CRIM

ZN

INDUS

CHAS

NOX

RM

AGE

DIS

RAD

TAX

PTRATIO

B

LSTAT

MEDV

0

0.00632

18.0

2.31

0

0.538

6.575

65.2

4.0900

1

296.0

15.3

396.90

4.98

24.0

1

0.02731

0.0

7.07

0

0.469

6.421

78.9

4.9671

2

242.0

17.8

396.90

9.14

21.6

2

0.02729

0.0

7.07

0

0.469

7.185

61.1

4.9671

2

242.0

17.8

392.83

4.03

34.7

3

0.03237

0.0

2.18

0

0.458

6.998

45.8

6.0622

3

222.0

18.7

394.63

2.94

33.4

4

0.06905

0.0

2.18

0

0.458

7.147

54.2

6.0622

3

222.0

18.7

396.90

5.33

36.2

 

df.shape

(506, 14)


# Select every row that has a missing value in at least one column;
# an empty result means the data set contains no nulls.
df[pd.isnull(df).any(axis=1)]

CRIM

ZN

INDUS

CHAS

NOX

RM

AGE

DIS

RAD

TAX

PTRATIO

B

LSTAT

MEDV

  

# Scatter-plot matrix of a few selected columns against the target MEDV.
sns.set(style='whitegrid', context='notebook')
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
# `height` is the per-facet size in inches; it replaces the old `size`
# keyword, which seaborn renamed in 0.9 and no longer accepts.
sns.pairplot(df[cols], height=2.5)
plt.show()  # display the scatter-plot matrix

 

# Correlation matrix of the selected columns (each column is one variable).
cm = np.corrcoef(df[cols].values, rowvar=False)
sns.set(font_scale=1.5)
hm = sns.heatmap(
    cm,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 15},
    square=True,
    cbar=True,
    xticklabels=cols,
    yticklabels=cols,
)
plt.show()  # display the correlation matrix
# LSTAT and MEDV have the strongest correlation (although the scatter-plot
# matrix showed that their relationship is nonlinear).
# RM and MEDV look linearly related in the scatter-plot matrix.


# Explanatory variables: every column except the target.
X = df.drop("MEDV", axis=1)
# Target (response) variable: the value predicted from the explanatory ones.
y = df[["MEDV"]]
# Split the data into a training part and a test part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=5, test_size=0.20)
# Train (estimate) the model; fit() returns the estimator itself.
reg = linear_model.LinearRegression().fit(X_train, y_train)
print(reg.intercept_)  # estimated constant term
print(reg.coef_)  # estimated weight vector (hyperplane coefficients)

 

[ 37.91248701]

[[ -1.30799852e-01   4.94030235e-02   1.09535045e-03   2.70536624e+00

  -1.59570504e+01   3.41397332e+00   1.11887670e-03  -1.49308124e+00

   3.64422378e-01  -1.31718155e-02  -9.52369666e-01   1.17492092e-02

  -5.94076089e-01]]


# Show the size of the training and test feature matrices.
for feature_part in (X_train, X_test):
    print(feature_part.shape)

(404, 13)

(102, 13)


# Predictions on both splits, used to check how well the model was built.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
# Residual plot: a graphical diagnostic for the regression model.
for predicted, actual, colour, shape, legend_name in (
        (y_train_pred, y_train, 'blue', 'o', 'Training data'),
        (y_test_pred, y_test, 'lightgreen', 's', 'Test data')):
    plt.scatter(predicted, predicted - actual,
                c=colour, marker=shape, label=legend_name)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
# Zero-residual reference line across the visible x range.
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.xlim([-10, 50])
plt.show()

 

# Goodness-of-fit measures.
# Mean squared error (MSE) on each split.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print('MSE train:%.3f, test:%.3f'%(mse_train, mse_test))
# Coefficient of determination: how well the estimated model fits the data.
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print('R^2 train:%.3f, test:%.3f'%(r2_train, r2_test))

 

MSE train:22.477, test:20.869

R^2 train:0.738, test:0.733


 

# Plot the true test targets and the predictions on the same axes.
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
y_test_m = y_test.values
plt.figure(figsize=(15, 10))
plt.plot(y_test_m)
plt.plot(y_test_pred)
legend_list = ['y_test_m', 'y_test_pred']
# loc=4 places the legend in the lower-right corner.
plt.legend(legend_list, loc=4, fontsize=25)
plt.show()

 

--------------------------------------------------- 

Classification : DecisionTree

 

import sklearn

import pandas as pd

import numpy as np

import matplotlib

import matplotlib.pyplot as plt

import seaborn as sns

%matplotlib inline

from IPython.display import Image

from sklearn import preprocessing

from sklearn.model_selection import train_test_split

from sklearn import tree

from sklearn import metrics

 

# Load the car-evaluation data: comma-separated, no header row, so the
# seven column names are supplied explicitly.
car_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
df = pd.read_csv(
    car_data_url,
    sep=',',
    names=["buying","maint","doors","persons","lug_boot","safety","eval"],
)
df.shape

 

(1728, 7)


# Select every row that has a missing value in at least one column;
# an empty result means the data set contains no nulls.
df[pd.isnull(df).any(axis=1)]

buying

maint

doors

persons

lug_boot

safety

eval

 

df.head()

 

buying

maint

doors

persons

lug_boot

safety

eval

0

vhigh

vhigh

2

2

small

low

unacc

1

vhigh

vhigh

2

2

small

med

unacc

2

vhigh

vhigh

2

2

small

high

unacc

3

vhigh

vhigh

2

2

med

low

unacc

4

vhigh

vhigh

2

2

med

med

unacc

 

# Count how often each evaluation class occurs and its share of all rows.
eval_counts = df['eval'].value_counts()
# Name the count column explicitly: the name pandas gives a value_counts()
# result differs between versions ('eval' in older releases, 'count' in 2.x).
car_counts = eval_counts.rename('eval').to_frame()
car_counts['Percentage'] = eval_counts / eval_counts.sum()
car_counts.head()


eval

 Percentage

unacc

1210

0.700231

acc

384

0.222222

good

69

0.039931

vgood

65

0.037616


 

# Pie chart of the class shares. The labels are listed in the same order as
# the rows of car_counts (unacc, acc, good, vgood).
pie_labels = ['Unacceptable','acceptable','Good','Very Good']
plt.figure(figsize=(8, 8))
plt.pie(car_counts["Percentage"], labels=pie_labels)
plt.show()

 


# Fit a label encoder on the 'buying' column and inspect the classes it learned.
le = preprocessing.LabelEncoder()

# NOTE(review): fit() presumably returns the encoder itself, in which case
# `encoded_buying` is just another name for `le` and is refitted whenever
# `le.fit(...)` is called again later — confirm before relying on it.
encoded_buying = le.fit(df['buying'])

encoded_buying.classes_

 

array(['high', 'low', 'med', 'vhigh'], dtype=object)


# Map each class label to its integer code, then map one code back.
encoded_buying.transform(['high'])
encoded_buying.transform(['low'])
encoded_buying.transform(['med'])
encoded_buying.transform(['vhigh'])
# inverse_transform expects an array-like of codes (a bare scalar is rejected
# by current scikit-learn), so wrap the code and unwrap the single result.
encoded_buying.inverse_transform([1])[0]

'low'


# Print every integer code with the label it stands for. A code is the
# position of its label in classes_, so this works for any number of classes
# and avoids calling inverse_transform with a scalar.
for i, label in enumerate(encoded_buying.classes_):
    print(i, ":", label)

0 : high

1 : low

2 : med

3 : vhigh


# Encode 'buying' one value at a time. transform() is given a one-element
# list, so each cell of the new column holds a one-element array, not a scalar.
df['e.buying'] = df['buying'].map(lambda x:encoded_buying.transform([x]))

df.head()

 

buying

maint

doors

persons

lug_boot

safety

eval

e.buying

0

vhigh

vhigh

2

2

small

low

unacc

[3]

1

vhigh

vhigh

2

2

small

med

unacc

[3]

2

vhigh

vhigh

2

2

small

high

unacc

[3]

3

vhigh

vhigh

2

2

med

low

unacc

[3]

4

vhigh

vhigh

2

2

med

med

unacc

[3]

 

# Unwrap the one-element arrays so the column holds plain integer codes.
df['e.buying'] = df['e.buying'].map(lambda x:x[0])

df.head()

 

buying

maint

doors

persons

lug_boot

safety

eval

e.buying

0

vhigh

vhigh

2

2

small

low

unacc

3

1

vhigh

vhigh

2

2

small

med

unacc

3

2

vhigh

vhigh

2

2

small

high

unacc

3

3

vhigh

vhigh

2

2

med

low

unacc

3

4

vhigh

vhigh

2

2

med

med

unacc

3

 

# Refit the same encoder object on the 'maint' column and inspect its classes.
encoded_maint = le.fit(df['maint'])

encoded_maint.classes_

array(['high', 'low', 'med', 'vhigh'], dtype=object)


# Encode the whole 'maint' column in one vectorised call. This yields the
# same integer codes as encoding row by row and then unwrapping each
# one-element array.
df['e.maint'] = encoded_maint.transform(df['maint'])
df.head()

 

buying

maint

doors

persons

lug_boot

safety

eval

e.buying

e.maint

0

vhigh

vhigh

2

2

small

low

unacc

3

3

1

vhigh

vhigh

2

2

small

med

unacc

3

3

2

vhigh

vhigh

2

2

small

high

unacc

3

3

3

vhigh

vhigh

2

2

med

low

unacc

3

3

4

vhigh

vhigh

2

2

med

med

unacc

3

3

 

def encode_col(col_name):
    """Add an integer-encoded copy of a column to the global ``df``.

    The module-level encoder ``le`` is refitted on ``df[col_name]`` and the
    resulting codes are stored in a new column named ``"e." + col_name``.

    Args:
        col_name: Name of the categorical column in ``df`` to encode.

    Returns:
        None. ``df`` is modified in place.
    """
    # fit_transform encodes the whole column at once, instead of calling
    # transform() per row and unwrapping one-element arrays afterwards.
    df["e." + col_name] = le.fit_transform(df[col_name])

# Encode the remaining categorical columns, including the target 'eval'.
for column_name in ('doors', 'persons', 'lug_boot', 'safety', 'eval'):
    encode_col(column_name)
df.head()

buying

maint

doors

persons

lug_boot

safety

eval

e.buying

e.maint

e.doors

e.persons

e.lug_boot

e.safety

e.eval

 

0

vhigh

vhigh

2

2

small

low

unacc

3

3

0

0

2

1

2

1

vhigh

vhigh

2

2

small

med

unacc

3

3

0

0

2

2

2

2

vhigh

vhigh

2

2

small

high

unacc

3

3

0

0

2

0

2

3

vhigh

vhigh

2

2

med

low

unacc

3

3

0

0

1

1

2

4

vhigh

vhigh

2

2

med

med

unacc

3

3

0

0

1

2

2

 

 

# Number of rows per original evaluation label.
pd.DataFrame(df['eval'].value_counts())


 eval

unacc

1210

acc

384

good

69

vgood

65

 

# Number of rows per encoded evaluation label. Comparing these counts with
# the counts of the original labels shows which code stands for which label.
pd.DataFrame(df['e.eval'].value_counts())

 

e.eval

2

1210

0

384

1

69

3

65


 

# Feature matrix: the six encoded input columns.
X = df[['e.buying','e.maint','e.doors','e.persons','e.lug_boot', 'e.safety']]

print(type(X))

X.shape

<class 'pandas.core.frame.DataFrame'>

(1728, 6)


# Target vector: the encoded evaluation label.
y = df['e.eval']

print(type(y))

y.shape

<class 'pandas.core.series.Series'>

(1728,)


# Hold out 25% of the rows as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=5, test_size=0.25)
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

 

(1296, 6)

(1296,)

(432, 6)

(432,)


# Train a decision-tree classifier (fit() returns the estimator itself),
# then predict the labels of the test rows.
clf_dt = tree.DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
y_pred_dt = clf_dt.predict(X_test)
for description in (type(y_pred_dt), y_pred_dt.shape):
    print(description)
y_pred_dt

 

<class 'numpy.ndarray'>

(432,)


array([0, 2, 0, 0, 0, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

      2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0,

      2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2,

      2, 3, 2, 2, 0, 1, 2, 2, 2, 0, 2, 0, 1, 3, 2, 2, 1, 2, 2, 2, 0, 0, 0,

      2, 0, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 0, 0, 1, 2, 2, 2,

      2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2,

      0, 2, 0, 2, 2, 2, 2, 3, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 0, 2, 0, 2,

      2, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2,

      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,

      2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 3, 2, 2,

      2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 0, 0, 2, 2, 0, 1, 2,

      2, 2, 2, 0, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2,

      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2,

      0, 3, 2, 0, 2, 1, 2, 0, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2,

      2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 1, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0,

      2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2,

      2, 0, 2, 0, 2, 1, 2, 2, 1, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0,

      2, 0, 2, 0, 2, 2, 0, 2, 0, 3, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,

      2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0])


# Accuracy of the decision tree on the test set.
metrics.accuracy_score(y_test,y_pred_dt)

0.96527777777777779


# Split the test-set positions into correctly and wrongly predicted ones.
correct_pred_dt = []
wrong_pred_dt = []
# Plain ndarray of the true labels, so position i lines up with y_pred_dt[i].
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
y_test2 = y_test.values
# Iterate over the actual data instead of a hard-coded row count.
for i, (actual, predicted) in enumerate(zip(y_test2, y_pred_dt)):
    if actual != predicted:  # the prediction was wrong
        wrong_pred_dt.append(i)
    else:
        correct_pred_dt.append(i)
print("Cerrectly indetified labels:",len(correct_pred_dt))
print(" ")
print("Wrong indetified labels:",len(wrong_pred_dt))
print("----------------------")
print(y_test[10:20])
print(y_pred_dt[10:20])
wrong_pred_dt

 

Cerrectly indetified labels: 417

 

Wrong indetified labels: 15

----------------------

464     2

269     0

1081    2

871     2

825     2

342     2

1273    2

1061    2

346     2

1090    0

Name: e.eval, dtype: int64

[2 0 2 2 2 2 2 2 2 2]


[19, 41, 45, 54, 74, 101, 104, 167, 168, 226, 259, 273, 328, 369, 428]


def dt_probs(index_num):
    """Print features, true label and decision-tree probabilities for one test row.

    Args:
        index_num: Index label of the row in ``X_test`` / ``y_test`` (the
            row's label from the original DataFrame, not its position).

    Raises:
        KeyError: If ``index_num`` is not a label of the test set.
    """
    # .loc with a one-element list keeps the row as a one-row DataFrame, which
    # is what predict_proba expects. It replaces the removed `.ix` indexer.
    X_param = X_test.loc[[index_num]]
    temp_pred_1 = clf_dt.predict_proba(X_param)[0]
    y_actual = y_test.loc[index_num]
    # Display names indexed by encoded label. LabelEncoder numbers the classes
    # in sorted order of the original strings: acc=0, good=1, unacc=2, vgood=3.
    y_range = ['Acceptable', 'Good', 'Unacceptable', 'Very Good']
    print("======================================")
    print("For index number:",index_num)
    print(" ")
    print("Fetures entered:")
    print(X_param)
    print(" ")
    print("Actual score:")
    print(y_actual,"(",y_range[y_actual],")")
    print(" ")
    print("Predicted probabilities:")
    # Probability columns follow clf_dt.classes_, so pair each with its class.
    for encoded_label, probability in zip(clf_dt.classes_, temp_pred_1):
        print(y_range[encoded_label],":",probability)

# Inspect two rows of the test set, addressed by their row labels.
dt_probs(805)

dt_probs(50)

 

======================================

For index number: 805

 

Fetures entered:

    e.buying  e.maint  e.doors  e.persons  e.lug_boot  e.safety

805         0        1        1          2           1         2

 

Actual score:

0 ( Unacceptable )

 

Predicted probabilities:

Unacceptable : 1.0

Acceptable : 0.0

Good : 0.0

Very Good : 0.0

======================================

For index number: 50

 

Fetures entered:

   e.buying  e.maint  e.doors  e.persons  e.lug_boot  e.safety

50         3        3        1          2           1         0

 

Actual score:

2 ( Good )

 

Predicted probabilities:

Unacceptable : 0.0

Acceptable : 0.0

Good : 1.0

Very Good : 0.0


---------------------------------


Classification : kNN

 

import sklearn

import pandas as pd

import numpy as np

import matplotlib

import matplotlib.pyplot as plt

import seaborn as sns

%matplotlib inline

from IPython.display import Image

from sklearn import preprocessing

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

 

# Load the car-evaluation data: comma-separated, no header row, so the
# seven column names are supplied explicitly.
car_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
df = pd.read_csv(
    car_data_url,
    sep=',',
    names=["buying","maint","doors","persons","lug_boot","safety","eval"],
)

 

le = preprocessing.LabelEncoder()

def encode_col(col_name):
    """Add an integer-encoded copy of a column to the global ``df``.

    The module-level encoder ``le`` is refitted on ``df[col_name]`` and the
    resulting codes are stored in a new column named ``"e." + col_name``.

    Args:
        col_name: Name of the categorical column in ``df`` to encode.

    Returns:
        None. ``df`` is modified in place.
    """
    # fit_transform encodes the whole column at once, instead of calling
    # transform() per row and unwrapping one-element arrays afterwards.
    df["e." + col_name] = le.fit_transform(df[col_name])

 

# Encode every categorical column, including the target 'eval'.
for column_name in ('buying', 'maint', 'doors', 'persons',
                    'lug_boot', 'safety', 'eval'):
    encode_col(column_name)
df.head()

 

buying

maint

doors

persons

lug_boot

safety

eval

e.buying

e.maint

e.doors

e.persons

e.lug_boot

e.safety

e.eval

0

vhigh

vhigh

2

2

small

low

unacc

3

3

0

0

2

1

2

1

vhigh

vhigh

2

2

small

med

unacc

3

3

0

0

2

2

2

2

vhigh

vhigh

2

2

small

high

unacc

3

3

0

0

2

0

2

3

vhigh

vhigh

2

2

med

low

unacc

3

3

0

0

1

1

2

4

vhigh

vhigh

2

2

med

med

unacc

3

3

0

0

1

2

2

  

# Features are the six encoded inputs; the target is the encoded evaluation.
X = df[['e.buying','e.maint','e.doors','e.persons','e.lug_boot','e.safety']]
y = df['e.eval']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=5, test_size=0.25)
# Fit a 5-nearest-neighbours classifier (fit() returns the estimator itself)
# and predict the labels of the test rows.
clf_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = clf_knn.predict(X_test)

 

# Accuracy of the kNN classifier on the test set.
print(metrics.accuracy_score(y_test,y_pred))

0.898148148148


# Split the test-set positions into correctly and wrongly predicted ones.
correct_pred = []
wrong_pred = []
# Plain ndarray of the true labels, so position i lines up with y_pred[i].
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
y_test2 = y_test.values
# Iterate over the actual data instead of a hard-coded row count.
for i, (actual, predicted) in enumerate(zip(y_test2, y_pred)):
    if actual != predicted:
        wrong_pred.append(i)
    else:
        correct_pred.append(i)
print("correct indetified labels:",len(correct_pred))
print(" ")
print("wrong indetified labels:",len(wrong_pred))

 

correct indetified labels: 388

 

wrong indetified labels: 44


 

# Build a frame whose only column, 'index', holds the original row label of
# each test row, in test-set order.
y_test3 = y_test.to_frame()
y_test3 = y_test3.reset_index()
# `axis` must be passed by keyword: pandas 2 no longer accepts it positionally.
y_test4 = y_test3.drop('e.eval', axis=1)
y_test4.head()

 

index

0

805

1

50

2

1171

3

1177

4

395


 

# Translate the positions of the wrong predictions into the original row
# labels of those test rows. The labels are read straight from y_test's
# index, which avoids positional `[0]` lookups on a label-indexed Series.
wrong_list = [int(y_test.index[position]) for position in wrong_pred]
print(wrong_pred)
print(wrong_list)

[0, 18, 19, 25, 26, 41, 54, 74, 82, 85, 96, 101, 104, 111, 117, 134, 140, 145, 157, 167, 217, 237, 247, 251, 256, 259, 261, 262, 273, 274, 299, 300, 311, 314, 332, 344, 345, 369, 373, 376, 378, 399, 416, 428]

[805, 346, 1090, 556, 668, 1130, 344, 1532, 1529, 1694, 1534, 904, 1631, 1712, 421, 1312, 1279, 1715, 1069, 1198, 1615, 562, 1307, 1618, 1016, 1414, 1441, 1253, 1630, 664, 230, 1538, 1448, 1549, 1235, 880, 107, 1336, 1663, 1685, 823, 1612, 1525, 1522]


def knn_probs(index_num):
    """Print features, true label and kNN class probabilities for one test row.

    Also reports whether the row's label appears in the global ``wrong_list``
    of misclassified row labels.

    Args:
        index_num: Index label of the row in ``X_test`` / ``y_test`` (the
            row's label from the original DataFrame, not its position).

    Raises:
        KeyError: If ``index_num`` is not a label of the test set.
    """
    # .loc with a one-element list keeps the row as a one-row DataFrame, which
    # is what predict_proba expects. It replaces the removed `.ix` indexer.
    X_param = X_test.loc[[index_num]]
    temp_pred_1 = clf_knn.predict_proba(X_param)[0]
    y_actual = y_test.loc[index_num]
    # Display names indexed by encoded label. LabelEncoder numbers the classes
    # in sorted order of the original strings: acc=0, good=1, unacc=2, vgood=3.
    y_range = ['Acceptable', 'Good', 'Unacceptable', 'Very Good']
    print("======================================")
    print("For index number:",index_num)
    print(" ")
    print("Fetures entered:")
    print(X_param)
    print(" ")
    print("Actual score:")
    print(y_actual,"(",y_range[y_actual],")")
    print(" ")
    print("Predicted probabilities:")
    # Probability columns follow clf_knn.classes_, so pair each with its class.
    for encoded_label, probability in zip(clf_knn.classes_, temp_pred_1):
        print(y_range[encoded_label],":",probability)
    print(" ")
    if index_num in wrong_list:
        print("Label predicted:Wrongly")
    else:
        print("Label predicted:Correctly")

 

# Inspect three rows of the test set, addressed by their row labels.
knn_probs(805)

knn_probs(50)

knn_probs(1171)

 

======================================

For index number: 805

 

Fetures entered:

    e.buying  e.maint  e.doors  e.persons  e.lug_boot  e.safety

805         0        1        1          2           1         2

 

Actual score:

0 ( Unacceptable )

 

Predicted probabilities:

Unacceptable : 0.4

Acceptable : 0.0

Good : 0.6

Very Good : 0.0

 

Label predicted:Wrongly

======================================

For index number: 50

 

Fetures entered:

   e.buying  e.maint  e.doors  e.persons  e.lug_boot  e.safety

50         3        3        1          2           1         0

 

Actual score:

2 ( Good )

 

Predicted probabilities:

Unacceptable : 0.4

Acceptable : 0.0

Good : 0.6

Very Good : 0.0

 

Label predicted:Correctly

======================================

For index number: 1171

 

Fetures entered:

     e.buying  e.maint  e.doors  e.persons  e.lug_boot  e.safety

1171         2        2        3          1           2         2

 

Actual score:

0 ( Unacceptable )

 

Predicted probabilities:

Unacceptable : 0.4

Acceptable : 0.2

Good : 0.4

Very Good : 0.0

 

Label predicted:Correctly


from sklearn.metrics import confusion_matrix

from sklearn.metrics import precision_recall_fscore_support

 

# Confusion matrix of the kNN predictions (rows: actual, columns: predicted).
# Passing the label codes explicitly fixes the row/column order and keeps the
# matrix 4x4 even if a class is missing from the test split.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])
# Display names in encoded-label order. LabelEncoder numbers the classes in
# sorted order of the original strings: acc=0, good=1, unacc=2, vgood=3.
labels = ['Acceptable', 'Good', 'Unacceptable', 'Very Good']
df_cm = pd.DataFrame(cm, index=labels, columns=labels)
plt.figure(figsize=(10, 10))
sns.heatmap(df_cm, annot=True)
plt.xlabel('Predicted', fontsize=20)
plt.ylabel('Actual', fontsize=20)
plt.show()


# accuracy     : share of correct predictions
# precision    : precision
# recall       : recall
# f(beta) score: weighted harmonic mean of precision and recall
scr_clf_knn = precision_recall_fscore_support(
    y_test, y_pred, average='weighted')
knn_precision, knn_recall, knn_fbeta = scr_clf_knn[:3]
knn_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Classfier's precision:"+str(knn_precision))
print("Classfier's recall:"+str(knn_recall))
print("Classfier's fbeta_score:"+str(knn_fbeta))
print("Classfier's accuracy:"+str(knn_accuracy))

 

Classfier's precision:0.89500373968

Classfier's recall:0.898148148148

Classfier's fbeta_score:0.891002785157

Classfier's accuracy:0.898148148148