# 타이타닉 데이터 로드 후 정보 확인
import numpy as np
import pandas as pd
# Load the Titanic passenger data from the current working directory.
df = pd.read_csv('./titanic.csv')
print('Train 데이터 정보')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()
# Notebook-style preview of the first three rows (only rendered when this is
# the last expression of a notebook cell).
df.head(3)
* PassengerId: 탑승자 데이터 일련번호
* Survived: 생존 여부, 0 = 사망, 1 = 생존
* Pclass: 티켓의 선실 등급, 1 = 일등석, 2 = 이등석, 3 = 삼등석
* Sex: 탑승자 성별
* Name: 탑승자 이름
* Age: 탑승자 나이
* SibSp: 같이 탑승한 형제자매 또는 배우자 인원수
* Parch: 같이 탑승한 부모 또는 자녀 인원수
* Ticket: 티켓 번호
* Fare: 요금
* Cabin: 선실 번호
* Embarked: 탑승한 항구, C = Cherbourg, Q = Queenstown, S = Southampton
# 데이터 전처리
## 결측치 처리
- Age >>>> 평균 값으로 채우기
- Cabin >>>> N으로 채우기
- Embarked >>>> N으로 채우기
def fillna(df):
    """Return a copy of ``df`` with its missing values filled in.

    ``Age`` gaps are replaced with the mean of the ``Age`` column, while
    ``Cabin`` and ``Embarked`` gaps are replaced with the placeholder
    value ``'N'``.

    Args:
        df: DataFrame containing ``Age``, ``Cabin`` and ``Embarked`` columns.

    Returns:
        A new DataFrame with the three columns filled; the frame passed in
        by the caller is left unmodified.

    Raises:
        KeyError: If any of the three columns is absent from ``df``.
    """
    # Work on a copy so the caller's frame is not changed as a side effect
    # (this also keeps the function safe when it is handed a slice).
    filled = df.copy()
    filled['Age'] = filled['Age'].fillna(filled['Age'].mean())
    for column in ('Cabin', 'Embarked'):
        filled[column] = filled[column].fillna('N')
    return filled
## 필요 없는 컬럼 제거
- PassengerId
- Name
- Ticket
def drop_features(df):
    """Return a new DataFrame without the columns that are not used as features.

    Args:
        df: DataFrame containing ``PassengerId``, ``Name`` and ``Ticket``.

    Returns:
        A new DataFrame with those three columns removed; ``df`` itself is
        not modified.

    Raises:
        KeyError: If any of the three columns is absent from ``df``.
    """
    return df.drop(columns=['PassengerId', 'Name', 'Ticket'])
## 범주형 데이터 처리
- Sex >>>> 숫자형 변환
- Embarked >>>> 숫자형 변환
- Cabin >>>> 첫 번째 문자만 추출 후 숫자형 변환
from sklearn.preprocessing import LabelEncoder
def format_features(df):
    """Return a copy of ``df`` with the categorical columns label-encoded.

    ``Cabin`` is first reduced to the first character of its string form;
    then ``Cabin``, ``Sex`` and ``Embarked`` are each replaced by the integer
    labels produced by a ``LabelEncoder`` fitted on that column.

    Args:
        df: DataFrame containing ``Cabin``, ``Sex`` and ``Embarked`` columns.

    Returns:
        A new DataFrame with the three columns encoded; the frame passed in
        by the caller is left unmodified.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    encoded = df.copy()
    # Keep only the first character of each cabin value before encoding.
    encoded['Cabin'] = encoded['Cabin'].astype(str).str[:1]
    for feature in ('Cabin', 'Sex', 'Embarked'):
        # A fresh encoder per column, so each column gets its own mapping.
        encoded[feature] = LabelEncoder().fit_transform(encoded[feature])
    return encoded
## 데이터 전처리 함수
def transform_features(df):
    """Run the full preprocessing pipeline on ``df`` and return the result.

    The steps are applied in this order: ``fillna`` (fill missing values),
    ``drop_features`` (remove unused columns), ``format_features`` (encode
    the categorical columns).

    Args:
        df: Raw feature DataFrame accepted by ``fillna``.

    Returns:
        The DataFrame returned by ``format_features``.
    """
    return format_features(drop_features(fillna(df)))
## 데이터 분리(X, y)
# Target vector: the survival label column.
y_titanic_df = df['Survived']
# Feature matrix: every other column, run through the preprocessing pipeline.
X_titanic_df = transform_features(df.drop(columns='Survived'))

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
    random_state=11,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Notebook-style inspection: the processed features and the per-column count
# of remaining missing values.
X_titanic_df
X_titanic_df.isnull().sum()
## 모델 학습
- DecisionTreeClassifier
- RandomForestClassifier
- LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Classifiers to compare, keyed by the label used in the printed reports.
models = {}
models['DecisionTree'] = DecisionTreeClassifier(random_state=44)
models['RandomForest'] = RandomForestClassifier(random_state=44)
models['LogisticRegression'] = LogisticRegression(solver='liblinear')
## 평가
def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit every model, print its test-set metrics, and collect the accuracies.

    For each entry the model is fitted on the training data and used to
    predict the test data; its accuracy, confusion matrix and classification
    report are printed.

    Args:
        models: Mapping of display name to an estimator exposing ``fit`` and
            ``predict``. Each estimator is fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        dict mapping each model name to its accuracy on the test data.
    """
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        acc = accuracy_score(y_test, pred)
        results[name] = acc
        print(f'{name} 정확도: {acc:.4f}')
        print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
        print("Classification Report:\n", classification_report(y_test, pred))
    return results
# Evaluate every candidate model on the held-out split.
results = evaluate_models(models, X_train, X_test, y_train, y_test)

import matplotlib.pyplot as plt

# Bar chart comparing the accuracy of each model.
model_names = results.keys()
accuracies = results.values()
plt.bar(model_names, accuracies)
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.show()
## 모델 학습
- K-Fold 교차 검증
# for 문
from sklearn.model_selection import KFold
import numpy as np
def exec_kfold(clf, folds=5):
    """Run K-fold cross-validation of ``clf`` and print the accuracies.

    The data is read from the module-level ``X_titanic_df`` (features) and
    ``y_titanic_df`` (labels). For each fold the classifier is refitted on
    the training indices and scored on the held-out indices; the per-fold
    accuracy and finally the mean accuracy are printed.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``; it is refitted in
            place on every fold.
        folds: Number of splits passed to ``KFold``.

    Returns:
        None. Results are only printed.
    """
    kfold = KFold(n_splits=folds)
    # Convert to arrays once up front; the data does not change between folds.
    features = X_titanic_df.values
    labels = y_titanic_df.values
    scores = []
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        print(f'교차 검증 {iter_count} 정확도: {accuracy:.4f}')
    mean_score = np.mean(scores)
    print(f'평균 정확도: {mean_score:.4f}')


# Run the manual K-fold loop with a decision tree classifier.
exec_kfold(DecisionTreeClassifier(random_state=11))
## cross_val_score()
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a decision tree via scikit-learn's helper,
# which returns one score per fold.
scores = cross_val_score(DecisionTreeClassifier(random_state=44), X_titanic_df, y_titanic_df, cv=5)
for iter_count, accuracy in enumerate(scores):
    print(f'교차 검증: {iter_count}, 정확도: {accuracy}')
# Printed once, after all per-fold scores.
print(f'평균 정확도 >>>> {np.mean(scores):.4f}')
## 하이퍼파라미터 튜닝(알려 드릴 예정)
from sklearn.model_selection import GridSearchCV

# Candidate values for each decision-tree hyperparameter in the grid.
parameters = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}
grid_dclf = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=44),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
# The search is fitted on the training split only.
grid_dclf.fit(X_train, y_train)
print(f'GridSearchCV 최적 하이퍼 파라미터 >>>> {grid_dclf.best_params_}')
print(f'GridSearchCV 최고 정확도: {grid_dclf.best_score_}')
'Machine learning' 카테고리의 다른 글
[Python 지도학습 실습 RETURN 2] (4) | 2024.12.27 |
---|---|
[Python 지도학습 실습 RETURN 1] (1) | 2024.12.26 |
[머신러닝: 비지도학습 실습] (2) | 2024.12.19 |
[머신러닝: 지도학습 실습 2] (6) | 2024.12.19 |
[머신 러닝: 지도학습 실습 1] (4) | 2024.12.18 |