2017. 8. 17. 21:02 · Server Programming
Linear Regression
import sklearn
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error  # MSE: mean squared error
from sklearn.metrics import r2_score  # coefficient of determination (R^2)
import seaborn as sns
df = pd.read_csv(
"https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/housing/housing.data",
delimiter=r"\s+",
names=["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"])
df.head()
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
df.shape
(506, 14)
df[pd.isnull(df).any(axis=1)]  # rows containing any missing value; the empty result below means there are none
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
sns.set(style='whitegrid',context='notebook')
cols = ['LSTAT','INDUS','NOX','RM','MEDV']
sns.pairplot(df[cols], height=2.5)  # 'size' was renamed to 'height' in seaborn 0.9+
plt.show()  # display the scatterplot matrix
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.5)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
annot_kws={'size':15},yticklabels=cols,xticklabels=cols)
plt.show()  # display the correlation matrix
# LSTAT has the strongest correlation with MEDV (though the scatterplot matrix showed the relationship as nonlinear)
# RM and MEDV show a linear relationship in the scatterplot matrix
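The ranking the heatmap shows can also be read off numerically. As an illustrative aside (not in the original post), this sorts every feature by the absolute value of its correlation with MEDV:
# Illustrative sketch: rank features by |correlation| with the target MEDV
corr_with_medv = df.corr()['MEDV'].drop('MEDV')
print(corr_with_medv.reindex(corr_with_medv.abs().sort_values(ascending=False).index))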
# explanatory variables (features)
X = df[["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT"]]
# target variable (response): the variable we want to predict from the explanatory variables
y = df[["MEDV"]]
# split the data into training and test sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20, random_state=5)
reg = linear_model.LinearRegression()
# train (estimate the model)
reg.fit(X_train,y_train)
print(reg.intercept_)  # estimated intercept
print(reg.coef_)  # estimated weight vector (hyperplane coefficients)
[ 37.91248701]
[[ -1.30799852e-01 4.94030235e-02 1.09535045e-03 2.70536624e+00
-1.59570504e+01 3.41397332e+00 1.11887670e-03 -1.49308124e+00
3.64422378e-01 -1.31718155e-02 -9.52369666e-01 1.17492092e-02
-5.94076089e-01]]
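The raw coefficient array above is hard to read on its own. A small illustrative sketch (not in the original post) that pairs each estimated weight with its feature name:
# Illustrative sketch: label each coefficient with its feature name
coef_table = pd.Series(reg.coef_.ravel(), index=X.columns)
print(coef_table.sort_values())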
print(X_train.shape)
print(X_test.shape)
(404, 13)
(102, 13)
# generate predictions to check how well the regression model was built
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
# residual plot: a graphical diagnostic for the regression model
plt.scatter(y_train_pred,y_train_pred-y_train, c='blue',marker='o',label='Training data')
plt.scatter(y_test_pred,y_test_pred-y_test, c='lightgreen',marker='s',label='Test data')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.hlines(y=0,xmin=-10,xmax=50,lw=2,color='red')
plt.xlim([-10,50])
plt.show()
# measure goodness of fit
# print the mean squared error (MSE)
print('MSE train:%.3f, test:%.3f'%(
mean_squared_error(y_train,y_train_pred),
mean_squared_error(y_test,y_test_pred)))
# print the coefficient of determination (R^2): how well the fitted model explains the given data
print('R^2 train:%.3f, test:%.3f'%(
r2_score(y_train,y_train_pred),
r2_score(y_test,y_test_pred)))
MSE train:22.477, test:20.869
R^2 train:0.738, test:0.733
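MSE is in squared units of the target, so it is hard to interpret directly; taking the square root puts the error back on the scale of MEDV (thousands of dollars). An illustrative sketch using the arrays already computed above:
# Illustrative sketch: RMSE reads as average error in MEDV's own units
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('RMSE train:%.3f, test:%.3f' % (rmse_train, rmse_test))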
y_test_m = y_test.to_numpy()  # .as_matrix() was removed from pandas; use .to_numpy()
plt.figure(figsize=(15,10))
plt.plot(y_test_m)
plt.plot(y_test_pred)
legend_list = ['y_test_m','y_test_pred']
plt.legend(legend_list, loc=4, fontsize=25)
plt.show()
---------------------------------------------------
Classification: Decision Tree
import sklearn
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import Image
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
df = pd.read_table(
"https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data",
sep=',',
names=["buying","maint","doors","persons","lug_boot","safety","eval"])
df.shape
(1728, 7)
df[pd.isnull(df).any(axis=1)]  # again an empty result, so no missing values
| buying | maint | doors | persons | lug_boot | safety | eval |
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc |
car_counts = pd.DataFrame(df['eval'].value_counts())
car_counts['Percentage'] = car_counts['eval'] / car_counts['eval'].sum()
car_counts.head()
| eval | Percentage |
unacc | 1210 | 0.700231 |
acc | 384 | 0.222222 |
good | 69 | 0.039931 |
vgood | 65 | 0.037616 |
plt.figure(figsize=(8,8))
plt.pie(car_counts["Percentage"], labels=['Unacceptable','Acceptable','Good','Very Good'])
plt.show()
le = preprocessing.LabelEncoder()
encoded_buying = le.fit(df['buying'])
encoded_buying.classes_
array(['high', 'low', 'med', 'vhigh'], dtype=object)
encoded_buying.transform(['high'])
encoded_buying.transform(['low'])
encoded_buying.transform(['med'])
encoded_buying.transform(['vhigh'])
encoded_buying.inverse_transform([1])[0]  # newer scikit-learn expects an array-like here
'low'
for i in range(0,4):
    print(i, ":", encoded_buying.inverse_transform([i])[0])
0 : high
1 : low
2 : med
3 : vhigh
df['e.buying'] = df['buying'].map(lambda x:encoded_buying.transform([x]))
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | [3] |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | [3] |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | [3] |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | [3] |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | [3] |
df['e.buying'] = df['e.buying'].map(lambda x:x[0])
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | 3 |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | 3 |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | 3 |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | 3 |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | 3 |
encoded_maint = le.fit(df['maint'])
encoded_maint.classes_
array(['high', 'low', 'med', 'vhigh'], dtype=object)
df['e.maint'] = df['maint'].map(lambda x:encoded_maint.transform([x]))
df['e.maint'] = df['e.maint'].map(lambda x:x[0])
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying | e.maint |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | 3 | 3 |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | 3 | 3 |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | 3 | 3 |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | 3 | 3 |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | 3 | 3 |
def encode_col(col_name):
    # label-encode df[col_name] into a new numeric column named "e.<col_name>"
    encodes = le.fit(df[col_name])
    new_col_name = "e." + col_name
    df[new_col_name] = df[col_name].map(lambda x: encodes.transform([x]))
    df[new_col_name] = df[new_col_name].map(lambda x: x[0])
    return
encode_col('doors')
encode_col('persons')
encode_col('lug_boot')
encode_col('safety')
encode_col('eval')
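Calling encode_col once per column works; as an illustrative alternative (not in the original post), LabelEncoder's fit_transform collapses the fit, transform, and unwrap steps into one line per column:
# Illustrative alternative: encode each column in a single pass with fit_transform
for col in ["buying", "maint", "doors", "persons", "lug_boot", "safety", "eval"]:
    df["e." + col] = preprocessing.LabelEncoder().fit_transform(df[col])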
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying | e.maint | e.doors | e.persons | e.lug_boot | e.safety | e.eval |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | 3 | 3 | 0 | 0 | 2 | 1 | 2 |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | 3 | 3 | 0 | 0 | 2 | 2 | 2 |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | 3 | 3 | 0 | 0 | 2 | 0 | 2 |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | 3 | 3 | 0 | 0 | 1 | 1 | 2 |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | 3 | 3 | 0 | 0 | 1 | 2 | 2 |
pd.DataFrame(df['eval'].value_counts())
| eval |
unacc | 1210 |
acc | 384 |
good | 69 |
vgood | 65 |
pd.DataFrame(df['e.eval'].value_counts())
| e.eval |
2 | 1210 |
0 | 384 |
1 | 69 |
3 | 65 |
X = df[['e.buying','e.maint','e.doors','e.persons','e.lug_boot', 'e.safety']]
print(type(X))
X.shape
<class 'pandas.core.frame.DataFrame'>
(1728, 6)
y = df['e.eval']
print(type(y))
y.shape
<class 'pandas.core.series.Series'>
(1728,)
X_train,X_test,y_train,y_test = train_test_split(X,y,
test_size=0.25,
random_state=5)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
(1296, 6)
(1296,)
(432, 6)
(432,)
# train using a decision tree classifier
clf_dt = tree.DecisionTreeClassifier(random_state=10)
clf_dt.fit(X_train,y_train)  # run the training
y_pred_dt = clf_dt.predict(X_test)  # generate predictions
print(type(y_pred_dt))
print(y_pred_dt.shape)
y_pred_dt
<class 'numpy.ndarray'>
(432,)
array([0, 2, 0, 0, 0, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0,
2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2,
2, 3, 2, 2, 0, 1, 2, 2, 2, 0, 2, 0, 1, 3, 2, 2, 1, 2, 2, 2, 0, 0, 0,
2, 0, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 0, 0, 1, 2, 2, 2,
2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2,
0, 2, 0, 2, 2, 2, 2, 3, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 0, 2, 0, 2,
2, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,
2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 3, 2, 2,
2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 0, 0, 2, 2, 0, 1, 2,
2, 2, 2, 0, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2,
0, 3, 2, 0, 2, 1, 2, 0, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2,
2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 1, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0,
2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2,
2, 0, 2, 0, 2, 1, 2, 2, 1, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0,
2, 0, 2, 0, 2, 2, 0, 2, 0, 3, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,
2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0])
metrics.accuracy_score(y_test,y_pred_dt)
0.96527777777777779
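An accuracy of about 0.965 comes from a single train/test split, so it can be optimistic or pessimistic depending on the split. An illustrative sketch (not in the original post) that cross-validates the same tree over five folds:
# Illustrative sketch: 5-fold cross-validated accuracy for the same classifier
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(tree.DecisionTreeClassifier(random_state=10), X, y, cv=5)
print("CV accuracy: %.3f (+/- %.3f)" % (cv_scores.mean(), cv_scores.std()))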
correct_pred_dt = []
wrong_pred_dt = []
y_test2 = y_test.reset_index(drop=True)
y_test2 = y_test2.to_numpy()  # .as_matrix() was removed from pandas
for i in range(0,432):
    if y_test2[i] != y_pred_dt[i]:  # the prediction was wrong
        wrong_pred_dt.append(i)
    else:
        correct_pred_dt.append(i)
print("Correctly identified labels:",len(correct_pred_dt))
print(" ")
print("Wrongly identified labels:",len(wrong_pred_dt))
print("----------------------")
print(y_test[10:20])
print(y_pred_dt[10:20])
wrong_pred_dt
Correctly identified labels: 417
Wrongly identified labels: 15
----------------------
464 2
269 0
1081 2
871 2
825 2
342 2
1273 2
1061 2
346 2
1090 0
Name: e.eval, dtype: int64
[2 0 2 2 2 2 2 2 2 2]
[19, 41, 45, 54, 74, 101, 104, 167, 168, 226, 259, 273, 328, 369, 428]
def dt_probs(index_num):
    # look up one test sample by its original dataframe index
    X_param = X_test.loc[index_num]  # .ix was removed from pandas; use .loc
    X_param = X_param.to_frame()
    X_param = X_param.transpose()
    temp_pred = clf_dt.predict_proba(X_param)
    temp_pred_1 = temp_pred[0]
    y_actual = y_test[index_num]
    # order must match the LabelEncoder's alphabetical classes: acc, good, unacc, vgood
    y_range = ['Acceptable','Good','Unacceptable','Very Good']
    print("======================================")
    print("For index number:",index_num)
    print(" ")
    print("Features entered:")
    print(X_param)
    print(" ")
    print("Actual score:")
    print(y_actual,"(",y_range[y_actual],")")
    print(" ")
    print("Predicted probabilities:")
    for i in range(0,4):
        print(y_range[i],":",temp_pred_1[i])
    return
dt_probs(805)
dt_probs(50)
======================================
For index number: 805
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
805 0 1 1 2 1 2
Actual score:
0 ( Acceptable )
Predicted probabilities:
Acceptable : 1.0
Good : 0.0
Unacceptable : 0.0
Very Good : 0.0
======================================
For index number: 50
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
50 3 3 1 2 1 0
Actual score:
2 ( Unacceptable )
Predicted probabilities:
Acceptable : 0.0
Good : 0.0
Unacceptable : 1.0
Very Good : 0.0
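The probabilities of exactly 1.0 reflect that each test sample lands in a pure leaf of the tree. The learned rules themselves can be dumped as text; an illustrative sketch assuming scikit-learn 0.21 or newer, where export_text was introduced:
# Illustrative sketch: print the decision rules the tree learned
from sklearn.tree import export_text
print(export_text(clf_dt, feature_names=list(X.columns)))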
---------------------------------
Classification: kNN
import sklearn
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import Image
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
df = pd.read_table(
"https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data",
sep=',',
names=["buying","maint","doors","persons","lug_boot","safety","eval"])
le = preprocessing.LabelEncoder()
def encode_col(col_name):
    encodes = le.fit(df[col_name])
    new_col_name = "e." + col_name
    df[new_col_name] = df[col_name].map(lambda x: encodes.transform([x]))
    df[new_col_name] = df[new_col_name].map(lambda x: x[0])
    return
encode_col('buying')
encode_col('maint')
encode_col('doors')
encode_col('persons')
encode_col('lug_boot')
encode_col('safety')
encode_col('eval')
df.head()
| buying | maint | doors | persons | lug_boot | safety | eval | e.buying | e.maint | e.doors | e.persons | e.lug_boot | e.safety | e.eval |
0 | vhigh | vhigh | 2 | 2 | small | low | unacc | 3 | 3 | 0 | 0 | 2 | 1 | 2 |
1 | vhigh | vhigh | 2 | 2 | small | med | unacc | 3 | 3 | 0 | 0 | 2 | 2 | 2 |
2 | vhigh | vhigh | 2 | 2 | small | high | unacc | 3 | 3 | 0 | 0 | 2 | 0 | 2 |
3 | vhigh | vhigh | 2 | 2 | med | low | unacc | 3 | 3 | 0 | 0 | 1 | 1 | 2 |
4 | vhigh | vhigh | 2 | 2 | med | med | unacc | 3 | 3 | 0 | 0 | 1 | 2 | 2 |
X = df[['e.buying','e.maint','e.doors','e.persons','e.lug_boot','e.safety']]
y = df['e.eval']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25, random_state=5)
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_knn.fit(X_train,y_train)
y_pred = clf_knn.predict(X_test)
print(metrics.accuracy_score(y_test,y_pred))
0.898148148148
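n_neighbors=5 is simply the default, and a different k may suit this data better. An illustrative sketch (not in the original post) that sweeps a few odd values of k on the same split:
# Illustrative sketch: test-set accuracy for several choices of k
for k in [1, 3, 5, 7, 9, 11]:
    clf_k = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print(k, metrics.accuracy_score(y_test, clf_k.predict(X_test)))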
correct_pred = []
wrong_pred = []
y_test2 = y_test.reset_index(drop=True)
y_test2 = y_test2.to_numpy()  # .as_matrix() was removed from pandas
for i in range(0,432):
    if y_test2[i] != y_pred[i]:
        wrong_pred.append(i)
    else:
        correct_pred.append(i)
print("Correctly identified labels:",len(correct_pred))
print(" ")
print("Wrongly identified labels:",len(wrong_pred))
Correctly identified labels: 388
Wrongly identified labels: 44
y_test3 = y_test.to_frame()
y_test3 = y_test3.reset_index()
y_test4 = y_test3.drop(columns='e.eval')  # the positional axis argument is deprecated in newer pandas
y_test4.head()
| index |
0 | 805 |
1 | 50 |
2 | 1171 |
3 | 1177 |
4 | 395 |
wrong_list = []
for i in wrong_pred:
    wrong_index = y_test4.iloc[i]
    wrong_index1 = wrong_index.iloc[0]  # positional access; plain [0] is deprecated on a Series
    wrong_list.append(wrong_index1)
print(wrong_pred)
print(wrong_list)
[0, 18, 19, 25, 26, 41, 54, 74, 82, 85, 96, 101, 104, 111, 117, 134, 140, 145, 157, 167, 217, 237, 247, 251, 256, 259, 261, 262, 273, 274, 299, 300, 311, 314, 332, 344, 345, 369, 373, 376, 378, 399, 416, 428]
[805, 346, 1090, 556, 668, 1130, 344, 1532, 1529, 1694, 1534, 904, 1631, 1712, 421, 1312, 1279, 1715, 1069, 1198, 1615, 562, 1307, 1618, 1016, 1414, 1441, 1253, 1630, 664, 230, 1538, 1448, 1549, 1235, 880, 107, 1336, 1663, 1685, 823, 1612, 1525, 1522]
def knn_probs(index_num):
    X_param = X_test.loc[index_num]  # .ix was removed from pandas; use .loc
    X_param = X_param.to_frame()
    X_param = X_param.transpose()
    temp_pred = clf_knn.predict_proba(X_param)
    temp_pred_1 = temp_pred[0]
    y_actual = y_test[index_num]
    # order must match the LabelEncoder's alphabetical classes: acc, good, unacc, vgood
    y_range = ['Acceptable','Good','Unacceptable','Very Good']
    print("======================================")
    print("For index number:",index_num)
    print(" ")
    print("Features entered:")
    print(X_param)
    print(" ")
    print("Actual score:")
    print(y_actual,"(",y_range[y_actual],")")
    print(" ")
    print("Predicted probabilities:")
    for i in range(0,4):
        print(y_range[i],":",temp_pred_1[i])
    print(" ")
    if index_num in wrong_list:
        print("Label predicted: Wrongly")
    else:
        print("Label predicted: Correctly")
    return
knn_probs(805)
knn_probs(50)
knn_probs(1171)
======================================
For index number: 805
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
805 0 1 1 2 1 2
Actual score:
0 ( Acceptable )
Predicted probabilities:
Acceptable : 0.4
Good : 0.0
Unacceptable : 0.6
Very Good : 0.0
Label predicted: Wrongly
======================================
For index number: 50
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
50 3 3 1 2 1 0
Actual score:
2 ( Unacceptable )
Predicted probabilities:
Acceptable : 0.4
Good : 0.0
Unacceptable : 0.6
Very Good : 0.0
Label predicted: Correctly
======================================
For index number: 1171
Features entered:
e.buying e.maint e.doors e.persons e.lug_boot e.safety
1171 2 2 3 1 2 2
Actual score:
0 ( Acceptable )
Predicted probabilities:
Acceptable : 0.4
Good : 0.2
Unacceptable : 0.4
Very Good : 0.0
Label predicted: Correctly
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
cm = confusion_matrix(y_test,y_pred)
# row/column order follows the encoded class values 0..3: acc, good, unacc, vgood
labels = ['Acceptable','Good','Unacceptable','Very Good']
df_cm = pd.DataFrame(cm, index=[i for i in labels],
                     columns=[i for i in labels])
plt.figure(figsize=(10,10))
sns.heatmap(df_cm,annot=True)
plt.xlabel('Predicted',fontsize=20)
plt.ylabel('Actual',fontsize=20)
plt.show()
# accuracy: the fraction of all predictions that are correct
# precision: of the samples predicted as a class, the fraction that truly belong to it
# recall: of the samples truly in a class, the fraction that were found
# f(beta) score: the weighted harmonic mean of precision and recall
scr_clf_knn = precision_recall_fscore_support(y_test,y_pred,
average='weighted')
print("Classfier's precision:"+str(scr_clf_knn[0]))
print("Classfier's recall:"+str(scr_clf_knn[1]))
print("Classfier's fbeta_score:"+str(scr_clf_knn[2]))
print("Classfier's accuracy:"+str(metrics.accuracy_score(y_test,y_pred)))
Classfier's precision:0.89500373968
Classfier's recall:0.898148148148
Classfier's fbeta_score:0.891002785157
Classfier's accuracy:0.898148148148
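The weighted averages above hide how each class behaves; with unacc making up 70% of the data, the rare good and vgood classes are where kNN is most likely to struggle. A closing illustrative sketch (not in the original post) that prints per-class precision, recall, and F1:
# Illustrative sketch: per-class metrics, names listed in encoded-class order (acc, good, unacc, vgood)
print(metrics.classification_report(y_test, y_pred,
      target_names=['Acceptable','Good','Unacceptable','Very Good']))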