Otto Group Product Classification Challenge
# Load the Kaggle training data.
# NOTE(review): absolute local Windows path — adjust for your environment.
import pandas as pd
df = pd.read_csv("C:/kaggle/otto-group-product-classification-challenge/train.csv")
df
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
3 | 4 | 1 | 0 | 0 | 1 | 6 | 1 | 5 | 0 | 0 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | Class_1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
61873 | 61874 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | Class_9 |
61874 | 61875 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | Class_9 |
61875 | 61876 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | ... | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | Class_9 |
61876 | 61877 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 10 | 0 | Class_9 |
61877 | 61878 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | Class_9 |
61878 rows × 95 columns
# Drop the id column — it is a row identifier, not a predictive feature.
df = df.drop(columns=['id'])

# Encode the string targets (Class_1 .. Class_9) as integers with LabelEncoder.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])
시각화로 확인
# Visualize the class distribution.
# After label encoding, targets Class_1..Class_9 became integers 0..8.
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(12,5))
# Pass the column as a keyword argument: seaborn >= 0.12 no longer accepts it
# positionally (the FutureWarning emitted by the original run).
sns.countplot(x=df['target'])
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(<AxesSubplot:xlabel='target', ylabel='count'>
트레이닝과 테스트를 위한 모델 준비
# Prepare the data for training and evaluation.
# Separate the label from the features, then hold out 20% of rows for testing.
y = df['target']
x = df.drop(['target'], axis=1)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
로지스틱 회귀와 정확도 점수 측정을 위해 패키지 불러오기
# Import logistic regression and the accuracy metric.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Accumulators pairing each model's name with its test-set accuracy,
# used for the final comparison bar chart.
list_models = []
list_scores = []

# max_iter caps the optimizer's iterations (lbfgs by default); raised far above
# the default of 100 so the solver can converge on this dataset.
lr = LogisticRegression(max_iter=100000)
lr.fit(x_train, y_train)
pred_1 = lr.predict(x_test)
score_1 = accuracy_score(y_test, pred_1)
list_scores.append(score_1)
list_models.append('logistic regression')
# Side-by-side class distributions: logistic-regression predictions vs. actual test labels.
fig, axes = plt.subplots(1, 2)
fig.set_size_inches(11.7, 8.27)
# Keyword args avoid the seaborn FutureWarning about positional data.
sns.countplot(x=pred_1, ax=axes[0])
sns.countplot(x=y_test, ax=axes[1])
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='target', ylabel='count'>
# Random forest classifier, evaluated the same way as logistic regression.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
pred_2 = rfc.predict(x_test)
score_2 = accuracy_score(y_test, pred_2)
list_scores.append(score_2)
list_models.append('random forest classifier')
score_2  # notebook cell echo (~0.806 in the original run); duplicate echo line removed
0.8062378797672916
# Side-by-side class distributions: random-forest predictions vs. actual test labels.
fig, axes = plt.subplots(1, 2)
fig.set_size_inches(11.7, 8.27)
# Keyword args avoid the seaborn FutureWarning about positional data.
sns.countplot(x=pred_2, ax=axes[0])
sns.countplot(x=y_test, ax=axes[1])
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn( <AxesSubplot:xlabel='target', ylabel='count'>
로지스틱 회귀 모형과 랜덤 포레스트 모형의 예측을 비교해 봅시다.
# Compare predictions between the logistic regression and random forest models.
fig, axes = plt.subplots(1, 2)
fig.set_size_inches(11.7, 8.27)
sns.countplot(x=pred_1, ax=axes[0])
# legend() with no labeled artists emitted "No handles with labels found" in the
# original run; a subplot title is the right tool for labelling each panel.
axes[0].set_title('predictions by logistic regression')
sns.countplot(x=pred_2, ax=axes[1])
axes[1].set_title('predictions by random forest')
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
No handles with labels found to put in legend.
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
No handles with labels found to put in legend.
<matplotlib.legend.Legend at 0x126d01a6bb0>
위의 관측에서, 두 모형의 예측 사이의 유일한 주요 차이는 로지스틱 회귀의 예측에 비해 랜덤 포레스트에서 클래스 1의 개수가 더 적고 클래스 2의 개수가 더 많다는 것이라고 결론 내릴 수 있다.
# Support vector machine (SVC defaults), same evaluation protocol as above.
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train, y_train)
pred_3 = svm.predict(x_test)
score_3 = accuracy_score(y_test, pred_3)
list_scores.append(score_3)
list_models.append('support vector machines')
score_3  # notebook cell echo (~0.780 in the original run); duplicate echo line removed
0.7798965740142211
!pip install xgboost
Collecting xgboost
Downloading xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Requirement already satisfied: scipy in c:\users\administrator\anaconda3\lib\site-packages (from xgboost) (1.5.2)
Requirement already satisfied: numpy in c:\users\administrator\anaconda3\lib\site-packages (from xgboost) (1.19.2)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2
from xgboost import XGBClassifier
# use_label_encoder=False: the target is already integer-encoded (LabelEncoder
# above, values 0..8), and XGBoost's built-in label encoder is deprecated — the
# UserWarning in the original run explicitly asks for this.
# eval_metric='mlogloss' sets the metric explicitly, silencing the
# changed-default warning; it does not alter the fitted model's predictions.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(x_train, y_train)
pred_4 = xgb.predict(x_test)
score_4 = accuracy_score(y_test, pred_4)
list_models.append('xgboost classifier')
list_scores.append(score_4)
C:\Users\Administrator\anaconda3\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:34:09] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
score_4
score_4
0.8122979961215255
# Bar chart comparing test accuracy across all four models.
plt.figure(figsize=(12,5))
plt.bar(list_models, list_scores, width=0.3)
plt.xlabel('classification models')  # typo fixed: was 'classifictions models'
plt.ylabel('accuracy scores')
plt.show()
Leave a comment