Otto Group Product Classification Challenge
# Load the Kaggle training data.
# NOTE(review): absolute local Windows path — adjust for your environment.
import pandas as pd
df = pd.read_csv("C:/kaggle/otto-group-product-classification-challenge/train.csv")
df
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
3 | 4 | 1 | 0 | 0 | 1 | 6 | 1 | 5 | 0 | 0 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | Class_1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
61873 | 61874 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | Class_9 |
61874 | 61875 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | Class_9 |
61875 | 61876 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | ... | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | Class_9 |
61876 | 61877 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 10 | 0 | Class_9 |
61877 | 61878 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | Class_9 |
61878 rows × 95 columns
# Drop the id column — it is a row identifier, not a predictive feature.
df = df.drop(columns=['id'])

# Encode the string targets (Class_1 .. Class_9) as integers with LabelEncoder.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])
시각화로 확인
# Visualize the class distribution.
# After label encoding, targets Class_1..Class_9 became integers 0..8.
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(12,5))
# Pass the column as a keyword argument: seaborn >= 0.12 no longer accepts it
# positionally (the FutureWarning emitted by the original run).
sns.countplot(x=df['target'])
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(<AxesSubplot:xlabel='target', ylabel='count'>
트레이닝과 테스트를 위한 모델 준비
# Prepare the data for training and evaluation.
# Separate the label from the features, then hold out 20% of rows for testing.
y = df['target']
x = df.drop(['target'], axis=1)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
로지스틱 회귀와 정확도 점수 측정을 위해 패키지 불러오기
# Import logistic regression and the accuracy metric.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Accumulators pairing each model's name with its test-set accuracy,
# used for the final comparison bar chart.
list_models = []
list_scores = []

# max_iter caps the optimizer's iterations (lbfgs by default); raised far above
# the default of 100 so the solver can converge on this dataset.
lr = LogisticRegression(max_iter=100000)
lr.fit(x_train, y_train)
pred_1 = lr.predict(x_test)
score_1 = accuracy_score(y_test, pred_1)
list_scores.append(score_1)
list_models.append('logistic regression')
# Side-by-side class distributions: logistic-regression predictions vs. actual test labels.
fig, axes = plt.subplots(1, 2)
fig.set_size_inches(11.7, 8.27)
# Keyword args avoid the seaborn FutureWarning about positional data.
sns.countplot(x=pred_1, ax=axes[0])
sns.countplot(x=y_test, ax=axes[1])
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='target', ylabel='count'>
# Random forest classifier, evaluated the same way as logistic regression.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
pred_2 = rfc.predict(x_test)
score_2 = accuracy_score(y_test, pred_2)
list_scores.append(score_2)
list_models.append('random forest classifier')
score_2  # notebook cell echo (~0.806 in the original run); duplicate echo line removed
0.8062378797672916
# Side-by-side class distributions: random-forest predictions vs. actual test labels.
fig, axes = plt.subplots(1, 2)
fig.set_size_inches(11.7, 8.27)
# Keyword args avoid the seaborn FutureWarning about positional data.
sns.countplot(x=pred_2, ax=axes[0])
sns.countplot(x=y_test, ax=axes[1])
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn( <AxesSubplot:xlabel='target', ylabel='count'>
로지스틱 회귀 모형과 랜덤 포레스트 모형의 예측을 비교해 봅시다.
# Compare predictions between the logistic regression and random forest models.
fig, axes = plt.subplots(1, 2)
fig.set_size_inches(11.7, 8.27)
sns.countplot(x=pred_1, ax=axes[0])
# legend() with no labeled artists emitted "No handles with labels found" in the
# original run; a subplot title is the right tool for labelling each panel.
axes[0].set_title('predictions by logistic regression')
sns.countplot(x=pred_2, ax=axes[1])
axes[1].set_title('predictions by random forest')
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
No handles with labels found to put in legend.
C:\Users\Administrator\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
No handles with labels found to put in legend.
<matplotlib.legend.Legend at 0x126d01a6bb0>
위의 관측에서, 두 모형의 예측 사이의 유일한 주요 차이는 로지스틱 회귀의 예측에 비해 랜덤 포레스트에서 클래스 1의 개수가 더 적고 클래스 2의 개수가 더 많다는 것이라고 결론 내릴 수 있다.
# Support vector machine (SVC defaults), same evaluation protocol as above.
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train, y_train)
pred_3 = svm.predict(x_test)
score_3 = accuracy_score(y_test, pred_3)
list_scores.append(score_3)
list_models.append('support vector machines')
score_3  # notebook cell echo (~0.780 in the original run); duplicate echo line removed
0.7798965740142211
!pip install xgboost
Collecting xgboost
Downloading xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Requirement already satisfied: scipy in c:\users\administrator\anaconda3\lib\site-packages (from xgboost) (1.5.2)
Requirement already satisfied: numpy in c:\users\administrator\anaconda3\lib\site-packages (from xgboost) (1.19.2)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2
from xgboost import XGBClassifier
# use_label_encoder=False: the target is already integer-encoded (LabelEncoder
# above, values 0..8), and XGBoost's built-in label encoder is deprecated — the
# UserWarning in the original run explicitly asks for this.
# eval_metric='mlogloss' sets the metric explicitly, silencing the
# changed-default warning; it does not alter the fitted model's predictions.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(x_train, y_train)
pred_4 = xgb.predict(x_test)
score_4 = accuracy_score(y_test, pred_4)
list_models.append('xgboost classifier')
list_scores.append(score_4)
C:\Users\Administrator\anaconda3\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:34:09] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
score_4
score_4
0.8122979961215255
# Bar chart comparing test accuracy across all four models.
plt.figure(figsize=(12,5))
plt.bar(list_models, list_scores, width=0.3)
plt.xlabel('classification models')  # typo fixed: was 'classifictions models'
plt.ylabel('accuracy scores')
plt.show()
Leave a comment