# Source Dataset: Pima Indians Diabetes Database (Kaggle)
# Dataset berisi informasi medis pasien beserta label apakah pasien terdiagnosis diabetes (1) atau tidak (0).
import pandas as pd

# Location of the raw CSV analysed by the rest of the script.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Labels applied to the CSV columns, in file order.
col_names = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]

# The names are supplied explicitly, so every row of the file is read as data
# (no row is consumed as a header).
df = pd.read_csv(url, header=None, names=col_names)

# Print column dtypes and non-null counts as a quick sanity check.
df.info()
# Analisis distribusi, korelasi, dan outlier dari data.
import matplotlib.pyplot as plt
import seaborn as sns

# Each plot gets its own figure and axes. Without this, both seaborn calls
# draw onto the same implicit axes and the heatmap is painted over the
# count plot.
fig_counts, ax_counts = plt.subplots()
sns.countplot(x='Outcome', data=df, ax=ax_counts)

# A larger canvas keeps the annotated correlation cells readable.
fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, ax=ax_corr)

# Summary statistics are printed explicitly: a bare expression is only
# echoed in an interactive session, not when the file runs as a script.
print(df.describe())

# Render both figures (needed when running outside a notebook).
plt.show()
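# Preprocessing: nilai 0 pada kolom pengukuran (dianggap data hilang) diganti dengan median kolom, lalu fitur distandardisasi dengan StandardScaler.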
from sklearn.preprocessing import StandardScaler
import numpy as np

# In these measurement columns a value of 0 is treated as missing:
# it is turned into NaN and then filled with the column median.
for col in ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]:
    df[col] = df[col].replace(0, np.nan)
    df[col] = df[col].fillna(df[col].median())

# NOTE(review): the medians above and the scaler below are computed on the
# full dataset, i.e. before any train/test split, so test rows influence the
# preprocessing statistics. Consider fitting them on the training rows only.

# Everything except the target column is standardized.
features = df.drop('Outcome', axis=1)

scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(features),
    # Take labels from the frame that was actually scaled instead of assuming
    # that 'Outcome' is the last column of df.
    columns=features.columns,
    # Keep the original row labels so the assignment below aligns row by row.
    index=features.index,
)
df_scaled['Outcome'] = df['Outcome']
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Separate the scaled feature columns from the target column.
X = df_scaled.drop("Outcome", axis=1)
y = df_scaled["Outcome"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline linear classifier with library defaults.
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

# Seeded like the split above: an unseeded forest builds different trees on
# every run, which makes the reported metrics non-reproducible.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
# Evaluasi kedua model dengan metrik klasifikasi: akurasi, precision, recall, F1, dan confusion matrix.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predictions of both fitted models on the held-out rows.
pred_lr = model_lr.predict(X_test)
pred_rf = model_rf.predict(X_test)

# Report the same set of metrics for each model. The accuracy and the
# confusion matrix were imported but previously never shown.
for model_label, model_pred in (
    ("Logistic Regression", pred_lr),
    ("Random Forest", pred_rf),
):
    print(model_label)
    print(classification_report(y_test, model_pred))
    print("Accuracy:", accuracy_score(y_test, model_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, model_pred))
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 2 x 2 = 4 combinations, each scored with
# 5-fold cross-validation on the training rows.
params = {'n_estimators': [50, 100], 'max_depth': [5, 10]}

# The forest is seeded so the selected parameters and the persisted model are
# reproducible; unseeded, both can change from run to run.
gs = GridSearchCV(RandomForestClassifier(random_state=42), params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_params_)

import joblib

# NOTE(review): only the estimator is persisted. It was fitted on X_train,
# so whoever loads this file must feed it inputs preprocessed the same way;
# confirm that the preprocessing is saved or reproduced by the consumer.
joblib.dump(gs.best_estimator_, 'model_diabetes.pkl')