Bir Python kodum var; ancak onu test ettiğimde yeterince iyi sonuçlar alamıyorum. Belki bu, kötü bir veri seti kullanmamdan kaynaklanıyor olabilir; ama veri setimi değiştiremiyorum. Yardımcı olursanız sevinirim.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import precision_score, f1_score, recall_score
from sklearn.model_selection import cross_val_score
import optuna
import joblib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping # Early stopping import edilmesi
# Load the raw dataset.
# Fixed: raw string for the Windows path ("\U" in "C:\Users" is an invalid
# unicode escape in a plain string literal) and straight quotes instead of
# the smart quotes the paste introduced.
df = pd.read_excel(r"C:\Users\qwerty\Desktop\hepsi\rawdata\rawdata.xlsx")

# Label-encode every non-numeric (object) column, keeping the fitted
# encoders so the exact same mapping can be reapplied at inference time.
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Impute missing values with the per-column mean.
imputer = SimpleImputer(strategy="mean")
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
# Winsorize outliers: clip every numeric column to the Tukey IQR fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR), the same bounds the explicit np.where
# version computed.
for col in df_imputed.select_dtypes(include=[np.number]).columns:
    q25, q75 = np.percentile(df_imputed[col], [25, 75])
    spread = q75 - q25
    df_imputed[col] = df_imputed[col].clip(
        lower=q25 - 1.5 * spread,
        upper=q75 + 1.5 * spread,
    )
# Features / targets: every column except the last two is a feature;
# the last two columns are the two classification targets.
X = df_imputed.iloc[:, :-2]
y1 = df_imputed.iloc[:, -2].astype(int)  # first target
y2 = df_imputed.iloc[:, -1].astype(int)  # second target

# Split features and BOTH targets in a single call so the rows stay aligned.
# Fixed two defects: (1) the original indexed y2 with .iloc using
# y1_train.index — positional indexing with index *labels*, which only works
# by accident while the index is a default RangeIndex; (2) the section header
# promised a stratified split but no `stratify` was passed. Stratifying on y1
# keeps its class balance in both train and test.
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
    X, y1, y2, test_size=0.3, random_state=42, stratify=y1)

# Standardize features; fit on the training split only to avoid leakage.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection: keep the 9 strongest features by recursive feature
# elimination with a random-forest ranker (seeded for reproducibility).
estimator = RandomForestClassifier(random_state=42)
selector = RFE(estimator, n_features_to_select=9, step=1)
X_train_selected = selector.fit_transform(X_train_scaled, y1_train)
X_test_selected = selector.transform(X_test_scaled)
def create_keras_model(num_layers, units, learning_rate):
    """Build and compile a binary-classification MLP.

    Args:
        num_layers: number of hidden Dense layers.
        units: neurons per hidden layer.
        learning_rate: Adam learning rate.

    Returns:
        A compiled ``keras.Sequential`` model with a sigmoid output and
        binary cross-entropy loss.
    """
    model = keras.Sequential()
    for _ in range(num_layers):
        model.add(layers.Dense(units, activation="relu"))
        model.add(layers.Dropout(0.2))  # regularization against overfitting
    model.add(layers.Dense(1, activation="sigmoid"))
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model
# Trial-by-trial performance log for the Optuna search.
# Fixed: the original line had no right-hand side (SyntaxError).
performance_data = []

def objective(trial, y_train):
    """Optuna objective: sample a model family and its hyperparameters.

    Args:
        trial: the Optuna trial used to sample parameters.
        y_train: training target for this study (y1_train or y2_train).

    Returns:
        Mean 5-fold cross-validated accuracy for sklearn-style models, or
        held-out validation accuracy for the Keras model.
    """
    model_name = trial.suggest_categorical(
        "model", ["rf", "knn", "dt", "mlp", "xgb", "lgbm", "catboost", "keras"])
    if model_name == "rf":
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        max_depth = trial.suggest_int("max_depth", 2, 50)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    elif model_name == "knn":
        n_neighbors = trial.suggest_int("n_neighbors", 2, 20)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif model_name == "dt":
        max_depth = trial.suggest_int("max_depth", 2, 50)
        model = DecisionTreeClassifier(max_depth=max_depth)
    elif model_name == "mlp":
        hidden_layer_sizes = trial.suggest_int("hidden_layer_sizes", 50, 300)
        alpha = trial.suggest_float("alpha", 1e-5, 1e-1)
        model = MLPClassifier(hidden_layer_sizes=(hidden_layer_sizes,), alpha=alpha,
                              max_iter=1000)
    elif model_name == "xgb":
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
        max_depth = trial.suggest_int("max_depth", 2, 50)
        model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                              max_depth=max_depth, use_label_encoder=False)
    elif model_name == "lgbm":
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
        num_leaves = trial.suggest_int("num_leaves", 2, 256)
        model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                               num_leaves=num_leaves)
    elif model_name == "catboost":
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
        depth = trial.suggest_int("depth", 2, 16)
        model = CatBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                                   depth=depth, verbose=0)
    elif model_name == "keras":
        num_layers = trial.suggest_int("num_layers", 1, 5)
        units = trial.suggest_int("units", 32, 128)
        # log=True: sample the learning rate on a log scale — the natural
        # scale for a 1e-5..1e-2 range.
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
        model = create_keras_model(num_layers, units, learning_rate)
        # Fixed: the original evaluated on the very data it trained on,
        # which rewards overfitting and biases the study toward "keras".
        # Score on a held-out 20% split instead.
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train_selected, y_train, test_size=0.2, random_state=42)
        model.fit(X_tr, y_tr, epochs=50, batch_size=32, verbose=0)
        score = model.evaluate(X_val, y_val, verbose=0)[1]
        performance_data.append(
            {"trial": len(performance_data) + 1, "model": model_name, "score": score})
        return score
    # sklearn-compatible models: mean 5-fold cross-validated accuracy.
    score = cross_val_score(model, X_train_selected, y_train, cv=5,
                            scoring="accuracy").mean()
    # Record this trial's result for the later Excel report.
    performance_data.append(
        {"trial": len(performance_data) + 1, "model": model_name, "score": score})
    return score
# Search the best model family + hyperparameters for each target
# independently (150 trials each, maximizing accuracy).
study_y1 = optuna.create_study(direction="maximize")
study_y1.optimize(lambda trial: objective(trial, y1_train), n_trials=150)
best_params_y1 = study_y1.best_params

study_y2 = optuna.create_study(direction="maximize")
study_y2.optimize(lambda trial: objective(trial, y2_train), n_trials=150)
best_params_y2 = study_y2.best_params
def train_best_model(best_params, X_train, y_train):
    """Rebuild the model Optuna selected and fit it on the full training set.

    Args:
        best_params: ``study.best_params`` — contains "model" plus that
            model's sampled hyperparameters.
        X_train: training feature matrix (already scaled + RFE-selected).
        y_train: training target.

    Returns:
        The fitted model.

    Raises:
        ValueError: if ``best_params["model"]`` is not a known model name
            (the original silently hit an unbound ``model`` here).
    """
    model_name = best_params["model"]
    if model_name == "keras":
        model = create_keras_model(best_params["num_layers"], best_params["units"],
                                   best_params["learning_rate"])
        # Early stopping on a 20% validation split; roll back to the best
        # weights seen instead of the last epoch's.
        early_stopping = EarlyStopping(monitor='val_loss', patience=10,
                                       restore_best_weights=True)
        model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1,
                  validation_split=0.2, callbacks=[early_stopping])
        return model
    if model_name == "rf":
        model = RandomForestClassifier(n_estimators=best_params["n_estimators"],
                                       max_depth=best_params["max_depth"])
    elif model_name == "knn":
        model = KNeighborsClassifier(n_neighbors=best_params["n_neighbors"])
    elif model_name == "dt":
        model = DecisionTreeClassifier(max_depth=best_params["max_depth"])
    elif model_name == "mlp":
        model = MLPClassifier(hidden_layer_sizes=(best_params["hidden_layer_sizes"],),
                              alpha=best_params["alpha"], max_iter=1000)
    elif model_name == "xgb":
        model = XGBClassifier(n_estimators=best_params["n_estimators"],
                              learning_rate=best_params["learning_rate"],
                              max_depth=best_params["max_depth"],
                              use_label_encoder=False)
    elif model_name == "lgbm":
        model = LGBMClassifier(n_estimators=best_params["n_estimators"],
                               learning_rate=best_params["learning_rate"],
                               num_leaves=best_params["num_leaves"])
    elif model_name == "catboost":
        model = CatBoostClassifier(n_estimators=best_params["n_estimators"],
                                   learning_rate=best_params["learning_rate"],
                                   depth=best_params["depth"], verbose=0)
    else:
        raise ValueError(f"Unknown model name: {model_name}")
    model.fit(X_train, y_train)
    return model
# Fit the single best model per target (kept for comparison / inspection).
model_y1 = train_best_model(best_params_y1, X_train_selected, y1_train)
model_y2 = train_best_model(best_params_y2, X_train_selected, y2_train)

# Soft-voting ensemble over a fixed pool of base learners.
# NOTE(review): despite the "stacking" variable names this is a
# VotingClassifier — no meta-learner is trained. Consider
# sklearn.ensemble.StackingClassifier if true stacking is intended.
base_learners_y1 = [
    ("rf", RandomForestClassifier(n_estimators=100, max_depth=15)),
    ("knn", KNeighborsClassifier(n_neighbors=5)),
    ("dt", DecisionTreeClassifier(max_depth=15)),
    ("mlp", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)),
    ("xgb", XGBClassifier(n_estimators=100, max_depth=5)),
    ("lgbm", LGBMClassifier(n_estimators=100, max_depth=5)),
    ("catboost", CatBoostClassifier(iterations=100, depth=5, learning_rate=0.05)),
]
base_learners_y2 = base_learners_y1  # same pool for the second target

stacking_model_y1 = VotingClassifier(estimators=base_learners_y1, voting="soft")
stacking_model_y2 = VotingClassifier(estimators=base_learners_y2, voting="soft")
stacking_model_y1.fit(X_train_selected, y1_train)
stacking_model_y2.fit(X_train_selected, y2_train)
def evaluate_model(model, X_test, y_test):
    """Return (precision, recall, f1), each weighted-averaged over classes.

    Fixed: the original special-cased VotingClassifier by stacking the base
    learners' ``predict_proba`` outputs into an array of shape
    (n_models, n_samples, n_classes) and then transposing it, so
    ``.mean(axis=0)`` averaged over the *class* axis instead of the model
    axis — producing wrong predictions. A soft VotingClassifier's own
    ``predict`` already returns the argmax of the model-averaged
    probabilities, so no special case is needed at all.

    Args:
        model: any fitted classifier exposing ``predict``.
        X_test: test feature matrix.
        y_test: ground-truth labels.

    Returns:
        Tuple of weighted precision, recall, and F1 scores.
    """
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return precision, recall, f1
# Evaluate both ensembles on the held-out test set.
precision_y1, recall_y1, f1_y1 = evaluate_model(stacking_model_y1, X_test_selected, y1_test)
print(f"y1 için Precision: {precision_y1}")
print(f"y1 için Recall: {recall_y1}")
print(f"y1 için F1 Skoru: {f1_y1}")

precision_y2, recall_y2, f1_y2 = evaluate_model(stacking_model_y2, X_test_selected, y2_test)
print(f"y2 için Precision: {precision_y2}")
print(f"y2 için Recall: {recall_y2}")
print(f"y2 için F1 Skoru: {f1_y2}")

# Collect the metrics and write a plain-text report.
performance_metrics = {
    "y1": {"Precision": precision_y1, "Recall": recall_y1, "F1": f1_y1},
    "y2": {"Precision": precision_y2, "Recall": recall_y2, "F1": f1_y2},
}
with open(r"C:\Users\qwerty\Desktop\hepsi\rawdata\performance_metrics_c.txt", "w") as f:
    for target, metrics in performance_metrics.items():
        f.write(f"{target} için:\n")
        for metric, value in metrics.items():
            f.write(f"{metric}: {value}\n")
        f.write("\n")

# Persist the fitted models and every preprocessing artifact so the exact
# same pipeline (encode -> impute -> scale -> select) can be replayed at
# inference time.
joblib.dump(stacking_model_y1, r'C:\Users\qwerty\Desktop\hepsi\rawdata\stacking_model_y1_c.pkl')
joblib.dump(stacking_model_y2, r'C:\Users\qwerty\Desktop\hepsi\rawdata\stacking_model_y2_c.pkl')
joblib.dump(scaler, r'C:\Users\qwerty\Desktop\hepsi\rawdata\scaler03072024_c.pkl')
joblib.dump(imputer, r'C:\Users\qwerty\Desktop\hepsi\rawdata\imputer03072024_c.pkl')
joblib.dump(label_encoders, r'C:\Users\qwerty\Desktop\hepsi\rawdata\label_encoders03072024_c.pkl')
joblib.dump(selector, r'C:\Users\qwerty\Desktop\hepsi\rawdata\selector03072024_c.pkl')

# Dump the per-trial optimization log to Excel.
performance_df = pd.DataFrame(performance_data)
performance_df.to_excel(r"C:\Users\qwerty\Desktop\hepsi\rawdata\performance_trials.xlsx", index=False)
# Collect flat (1-D) test-set predictions for both targets.
y1_predictions = stacking_model_y1.predict(X_test_selected).ravel()
y2_predictions = stacking_model_y2.predict(X_test_selected).ravel()

# Sanity-check that predictions line up with the ground truth.
print("y1_test boyutu:", y1_test.shape)
print("y1_predictions boyutu:", y1_predictions.shape)
print("y2_test boyutu:", y2_test.shape)
print("y2_predictions boyutu:", y2_predictions.shape)

# One row per test sample: true vs. predicted for each target.
results_df = pd.DataFrame({
    'True_iy': y1_test.values,
    'Predicted_iy': y1_predictions,
    'True_ms': y2_test.values,
    'Predicted_ms': y2_predictions,
})

# Flag correct / incorrect predictions per target.
results_df['Correct_iy'] = results_df['True_iy'] == results_df['Predicted_iy']
results_df['Correct_ms'] = results_df['True_ms'] == results_df['Predicted_ms']

# Export the per-sample results.
results_df.to_excel(r"C:\Users\qwerty\Desktop\hepsi\rawdata\predictions_results_c.xlsx", index=False)
print("Tahmin sonuçları başarıyla kaydedildi.")