#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, recall_score, precision_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Extracted plot colours
left_color_hex = '#72b6a1'  # green
right_color_hex = '#e99675'  # orange

# Set global font to Times New Roman and increase font sizes
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman']
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 12

# Load the dataset
file_path = 'S4.Raw Data.xlsx'
data = pd.read_excel(file_path)
target = 'T2DM'

# Handle NaN values using IterativeImputer
imputer = IterativeImputer(random_state=42)
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
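
# Optional sanity check (not part of the original analysis): report how many values were
# missing and confirm the imputer left no NaNs behind.
print(f"Missing values before imputation: {int(data.isnull().values.sum())}")
assert not data_imputed.isnull().values.any(), "Imputation left missing values"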

# Exclude columns 'OGTT', 'FPG', 'T2DM_Report', 'Medical insurance' from features
exclude_columns = ['OGTT', 'FPG', 'T2DM_Report', 'Medical insurance']
X = data_imputed.drop(columns=[target] + exclude_columns)
y = data_imputed[target]

# Count the number of samples in each class before under-sampling
print("Original class distribution:")
print(y.value_counts())

# Count the number of samples in each class
class_counts = y.value_counts()
minority_class_count = class_counts.min()

# Under-sample the majority class down to the minority class size (the minority class is kept intact)
sampling_strategy = {class_counts.idxmin(): minority_class_count, class_counts.idxmax(): minority_class_count}

rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Count the number of samples in each class after under-sampling
print("\nClass distribution after under-sampling:")
print(y_resampled.value_counts())

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Continuous and categorical features
continuous_features = ['BMI', 'Sedentary time', 'Age', 'Daily sleep duration']
categorical_features = [col for col in X_train.columns if col not in continuous_features]

# Preprocessing pipelines
preprocessor_standard = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),
        # handle_unknown='ignore' prevents errors if the test split contains a category level unseen in training
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

preprocessor_no_transform = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', continuous_features),
        ('cat', 'passthrough', categorical_features)
    ])
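
# Illustrative check (an assumption, not in the original notebook): one-hot encoding expands
# each categorical column into one indicator column per observed level, so the transformed
# feature matrix is wider than the raw one.
_encoded_shape = preprocessor_standard.fit_transform(X_train).shape
print(f"Feature matrix after scaling + one-hot encoding: {_encoded_shape[1]} columns")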


# In[ ]:


from sklearn.model_selection import RandomizedSearchCV

# Define the models with their initial hyperparameter grids
initial_param_grids = {
    'Logistic Regression': {
        'logreg__penalty': ['l2', 'l1'],
        'logreg__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'logreg__solver': ['liblinear', 'saga'],
        'logreg__class_weight': [None, 'balanced']
    },
    'SVM': {
        'svc__C': [0.1, 1, 10, 100],
        'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svc__gamma': ['scale', 'auto'],
        'svc__class_weight': [None, 'balanced']
    },
    'Random Forest': {
        'rf__n_estimators': [100, 200, 300, 400, 500],
        'rf__max_depth': [None, 10, 20, 30],
        'rf__min_samples_split': [2, 5, 10],
        'rf__min_samples_leaf': [1, 2, 4],
        'rf__bootstrap': [True, False]
    },
    'XGBoost': {
        'xgb__n_estimators': [100, 200, 300, 400, 500],
        'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3],
        'xgb__max_depth': [3, 4, 5, 6, 7],
        'xgb__min_child_weight': [1, 3, 5],
        'xgb__gamma': [0, 0.1, 0.2, 0.3],
        'xgb__subsample': [0.6, 0.8, 1.0],
        'xgb__colsample_bytree': [0.6, 0.8, 1.0]
    },
    'LightGBM': {
        'lgbm__num_leaves': [20, 30, 40, 50],
        'lgbm__max_depth': [None, 10, 20, 30],
        'lgbm__learning_rate': [0.01, 0.1, 0.2],
        'lgbm__n_estimators': [100, 200, 300, 400, 500],
        'lgbm__min_child_samples': [20, 30, 40],
        'lgbm__subsample': [0.6, 0.8, 1.0]
    }
}

# Define models with default hyperparameters
pipelines = {
    'Logistic Regression': Pipeline([
        ('preprocessor', preprocessor_standard),
        ('logreg', LogisticRegression(max_iter=10000))
    ]),
    'SVM': Pipeline([
        ('preprocessor', preprocessor_standard),
        ('svc', SVC(probability=True))
    ]),
    'Random Forest': Pipeline([
        ('preprocessor', preprocessor_no_transform),
        ('rf', RandomForestClassifier())
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', preprocessor_no_transform),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
    ]),
    'LightGBM': Pipeline([
        ('preprocessor', preprocessor_no_transform),
        ('lgbm', LGBMClassifier())
    ])
}

# Perform RandomizedSearchCV for each model
best_params_initial = {}
for model_name, pipeline in pipelines.items():
    print(f"Tuning hyperparameters for {model_name}...")
    random_search = RandomizedSearchCV(pipeline, param_distributions=initial_param_grids[model_name], 
                                       n_iter=50, cv=5, scoring='roc_auc', n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)
    best_params_initial[model_name] = random_search.best_params_
    print(f"Best parameters for {model_name}: {random_search.best_params_}")
    print(f"Best AUC for {model_name}: {random_search.best_score_:.4f}")


# In[ ]:


from sklearn.model_selection import GridSearchCV

# Helper function to build a small search window around a tuned value. It keeps None
# untouched and clamps to min_value so that invalid settings (e.g. a negative C or
# learning rate) are never generated.
def get_fine_tuned_range(param, decrement=1, increment=1, min_value=None):
    if param is None:
        return [None]
    values = [param - decrement, param, param + increment]
    if min_value is not None:
        values = [max(value, min_value) for value in values]
    return sorted(set(values))

# Fine-tuned hyperparameter grids based on the initial tuning results
fine_tuned_param_grids = {
    'Logistic Regression': {
        'logreg__C': get_fine_tuned_range(best_params_initial['Logistic Regression']['logreg__C'], 0.5, 0.5, min_value=0.0001)
    },
    'SVM': {
        'svc__C': get_fine_tuned_range(best_params_initial['SVM']['svc__C'], 0.5, 0.5, min_value=0.01),
        'svc__gamma': [best_params_initial['SVM']['svc__gamma']]
    },
    'Random Forest': {
        'rf__n_estimators': get_fine_tuned_range(best_params_initial['Random Forest']['rf__n_estimators'], 50, 50),
        'rf__max_depth': get_fine_tuned_range(best_params_initial['Random Forest']['rf__max_depth'], 5, 5),
        'rf__min_samples_split': get_fine_tuned_range(best_params_initial['Random Forest']['rf__min_samples_split'], 1, 1, min_value=2),
        'rf__min_samples_leaf': get_fine_tuned_range(best_params_initial['Random Forest']['rf__min_samples_leaf'], 1, 1, min_value=1),
        'rf__bootstrap': [best_params_initial['Random Forest']['rf__bootstrap']]
    },
    'XGBoost': {
        'xgb__n_estimators': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__n_estimators'], 50, 50),
        'xgb__learning_rate': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__learning_rate'], 0.05, 0.05, min_value=0.01),
        'xgb__max_depth': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__max_depth'], 1, 1),
        'xgb__min_child_weight': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__min_child_weight'], 1, 1),
        'xgb__gamma': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__gamma'], 0.05, 0.05, min_value=0),
        'xgb__subsample': [best_params_initial['XGBoost']['xgb__subsample']],
        'xgb__colsample_bytree': [best_params_initial['XGBoost']['xgb__colsample_bytree']]
    },
    'LightGBM': {
        'lgbm__num_leaves': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__num_leaves'], 5, 5),
        'lgbm__max_depth': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__max_depth'], 1, 1),
        'lgbm__learning_rate': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__learning_rate'], 0.05, 0.05, min_value=0.01),
        'lgbm__n_estimators': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__n_estimators'], 50, 50),
        'lgbm__min_child_samples': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__min_child_samples'], 5, 5),
        'lgbm__subsample': [best_params_initial['LightGBM']['lgbm__subsample']]
    }
}

# Perform GridSearchCV for each model
best_params_fine_tuned = {}
for model_name, pipeline in pipelines.items():
    print(f"Fine-tuning hyperparameters for {model_name}...")
    grid_search = GridSearchCV(pipeline, param_grid=fine_tuned_param_grids[model_name], 
                               cv=5, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_params_fine_tuned[model_name] = grid_search.best_params_
    print(f"Fine-tuned best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Fine-tuned best AUC for {model_name}: {grid_search.best_score_:.4f}")

# Remove prefix from best parameters for each model
def remove_prefix(best_params, prefix):
    return {key.split(f'{prefix}__')[1]: value for key, value in best_params.items()}

best_params_processed = {
    'Logistic Regression': remove_prefix(best_params_fine_tuned['Logistic Regression'], 'logreg'),
    'SVM': remove_prefix(best_params_fine_tuned['SVM'], 'svc'),
    'Random Forest': remove_prefix(best_params_fine_tuned['Random Forest'], 'rf'),
    'XGBoost': remove_prefix(best_params_fine_tuned['XGBoost'], 'xgb'),
    'LightGBM': remove_prefix(best_params_fine_tuned['LightGBM'], 'lgbm')
}
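
# Optional (not in the original notebook): print the cleaned, prefix-free hyperparameters
# that will be used to build the final models below.
for model_name, params in best_params_processed.items():
    print(f"{model_name}: {params}")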


# In[ ]:


from IPython.display import display, HTML

# Define a function that fits each tuned model, plots its ROC curve and collects test-set metrics
def calculate_metrics(models, X_train, y_train, X_test, y_test):
    metrics = {'Model': [], 'AUC': [], 'Accuracy': [], 'Recall': [], 'Precision': []}
    plt.figure(figsize=(14, 10), dpi=300)
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_pred_proba)
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        
        plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.4f})')
        
        # Calculate and store metrics
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        
        metrics['Model'].append(name)
        metrics['AUC'].append(auc)
        metrics['Accuracy'].append(accuracy)
        metrics['Recall'].append(recall)
        metrics['Precision'].append(precision)
    
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('The AUC-ROC Curves of 5 ML Models')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    
    metrics_df = pd.DataFrame(metrics)
    return metrics_df

# Models with fine-tuned parameters
fine_tuned_models = {
    'Logistic Regression': Pipeline([
        ('preprocessor', preprocessor_standard),
        ('logreg', LogisticRegression(**best_params_processed['Logistic Regression'], max_iter=10000))
    ]),
    'SVM': Pipeline([
        ('preprocessor', preprocessor_standard),
        ('svc', SVC(**best_params_processed['SVM'], probability=True))
    ]),
    'Random Forest': Pipeline([
        ('preprocessor', preprocessor_no_transform),
        ('rf', RandomForestClassifier(**best_params_processed['Random Forest']))
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', preprocessor_no_transform),
        ('xgb', XGBClassifier(**best_params_processed['XGBoost'], use_label_encoder=False, eval_metric='logloss'))
    ]),
    'LightGBM': Pipeline([
        ('preprocessor', preprocessor_no_transform),
        ('lgbm', LGBMClassifier(**best_params_processed['LightGBM']))
    ])
}

# Calculate metrics and plot AUC-ROC curves
metrics_df = calculate_metrics(fine_tuned_models, X_train, y_train, X_test, y_test)

# Display the metrics in a table
metrics_df_display = metrics_df.set_index('Model')
display(HTML(metrics_df_display.to_html()))
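
# Optionally persist the metrics table for reporting; the file name here is illustrative.
metrics_df.to_csv('model_metrics.csv', index=False)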


# In[ ]:


# Install the SHAP and LightGBM packages if they are not already available
get_ipython().run_line_magic('pip', 'install shap lightgbm')


# In[ ]:


import shap
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd


# Retrain the LightGBM model with the fine-tuned parameters
best_params_lgbm = best_params_processed['LightGBM']

# Create and train the LightGBM model
lgbm_model = LGBMClassifier(**best_params_lgbm)
lgbm_model.fit(X_train, y_train)

# Create a SHAP explainer and compute SHAP values for the test set
explainer = shap.TreeExplainer(lgbm_model)
shap_values = explainer.shap_values(X_test)
# Older SHAP releases return a list [class 0, class 1] for binary classifiers;
# keep the positive-class values so both summary plots render as intended.
if isinstance(shap_values, list):
    shap_values = shap_values[1]

# Bar plot of mean |SHAP| values (global feature importance)
fig, ax = plt.subplots(figsize=(8, 8), dpi=300)
shap.summary_plot(shap_values, X_test, plot_type="bar", feature_names=X_test.columns, show=False)
plt.title('Feature Importance (Bar Plot)', fontsize=16)
plt.savefig('shap_bar_plot.png', bbox_inches='tight')
plt.close(fig)

fig, ax = plt.subplots(figsize=(8, 8), dpi=300)
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns, show=False)
plt.title('Feature Importance (Bee Swarm Plot)', fontsize=16)
plt.savefig('shap_bee_swarm_plot.png', bbox_inches='tight')
plt.close(fig)
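
# Optional tabular view (not in the original notebook): mean |SHAP| value per feature,
# mirroring the ranking shown in the bar plot above.
import numpy as np
mean_abs_shap = pd.Series(np.abs(shap_values).mean(axis=0), index=X_test.columns).sort_values(ascending=False)
print(mean_abs_shap)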


# In[ ]:


from PIL import Image, ImageDraw, ImageFont

# Load the saved SHAP plots
bar_plot = Image.open('shap_bar_plot.png')
bee_swarm_plot = Image.open('shap_bee_swarm_plot.png')

# Create a side-by-side composite image
total_width = bar_plot.width + bee_swarm_plot.width
max_height = max(bar_plot.height, bee_swarm_plot.height)

new_img = Image.new('RGB', (total_width, max_height), (255, 255, 255))
new_img.paste(bar_plot, (0, 0))
new_img.paste(bee_swarm_plot, (bar_plot.width, 0))

# Add panel labels A and B
draw = ImageDraw.Draw(new_img)
# "arial" may not be resolvable on every system; fall back to PIL's (small) default font if it is missing
try:
    font = ImageFont.truetype("arial", 100)
except OSError:
    font = ImageFont.load_default()

draw.text((20, 20), "A", fill="black", font=font)
draw.text((bar_plot.width + 20, 20), "B", fill="black", font=font)

# Save and display the final combined figure
new_img.save('shap_combined_plot.png')
new_img.show()

