#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import time
from joblib import dump
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble  import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import plot_partial_dependence
import shap
shap.initjs()
#Eliminate warnings
import warnings
warnings.filterwarnings('ignore')


# In[2]:


def preprocess_data(df, test_size=0.2, random_state=48):
    """Split a dataset into train/test parts and min-max scale the features.

    The ``entry_id`` and ``T(℃)`` columns are removed to form the feature
    matrix; ``T(℃)`` is used as the regression target.

    Args:
        df: DataFrame holding ``entry_id``, ``T(℃)`` and the feature columns.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(xtrain_normalized, xtest_normalized, ytrain, ytest, x)``:
        the scaled train/test feature arrays, the train/test targets and the
        unscaled feature DataFrame (callers read its column names).
    """
    x = df.drop(['entry_id', 'T(℃)'], axis=1)
    y = df['T(℃)']
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    minmax_scaler = MinMaxScaler()
    xtrain_normalized = minmax_scaler.fit_transform(xtrain)
    # Apply the min/max learned on the training split. Re-fitting on the test
    # split would put the two splits on different scales and leak test-set
    # statistics into the evaluation.
    xtest_normalized = minmax_scaler.transform(xtest)
    return xtrain_normalized, xtest_normalized, ytrain, ytest, x


# In[3]:


def plot_predictions(yTrue, yPredict, model_score,model_name,file_name):
    """Plot true and predicted values, ordered by the true value.

    Args:
        yTrue: True target values.
        yPredict: Predicted target values, paired with ``yTrue``.
        model_score: Indexable whose positions 0-3 are shown as R2, RMSE,
            MAE and EV respectively.
        model_name: Text placed at the start of the plot title.
        file_name: Dataset label placed in square brackets in the title.
    """
    paired = pd.DataFrame({'yTrue': yTrue, 'yPredict': yPredict})
    ordered = paired.sort_values(by='yTrue')

    # Both curves share the row order given by the sorted true values.
    true_curve = np.array(ordered['yTrue'])
    predicted_curve = np.array(ordered['yPredict'])
    plt.plot(true_curve, color='#1f77b4', label='yTrue')  # blue
    plt.plot(predicted_curve, color='#d62728', label='yPredict')  # red
    plt.legend()

    plt.title(model_name + " True and Predicted Values [" + file_name + "]")
    plt.xlabel('Index')
    plt.ylabel("Temperature (℃)")

    # Metric captions are stacked downwards in axes coordinates.
    captions = (
        (0.95, f'R2: {model_score[0]:.3f}'),
        (0.90, f'RMSE: {model_score[1]:.3f}'),
        (0.85, f'MAE: {model_score[2]:.3f}'),
        (0.80, f'EV: {model_score[3]:.3f}'),
    )
    for height, caption in captions:
        plt.text(0.4, height, caption, ha='left', va='top', transform=plt.gca().transAxes)
    plt.show()


# In[4]:


def calculate_r2_rmse_mae_ev(y_true, y_pred):
    """Compute, print and return R2, RMSE, MAE and explained variance.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(r2, rmse, mae, ev)``.
    """
    r2 = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    ev = explained_variance_score(y_true, y_pred)
    print("R2 Score:", r2)
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("EV:", ev)
    return r2, rmse, mae, ev


# In[5]:


def save_best_model(model, model_name, dataset_name):
    """Write ``model`` to ``<model_name>_<dataset_name>.pkl`` with joblib.

    Args:
        model: Object to serialize.
        model_name: First part of the output file name.
        dataset_name: Second part of the output file name.
    """
    destination = f"{model_name}_{dataset_name}.pkl"
    dump(model, destination)
    print("Best model saved to:", destination)


# In[6]:


def train_and_save_best_xgb_model(xtrain, ytrain, file_name):
    """Grid-search XGBoost hyper-parameters, then save and return the best model.

    A 5-fold cross-validated grid search (scored by negative mean squared
    error) is run over the grid below. The winning configuration, refitted on
    all of ``xtrain``/``ytrain``, is saved via ``save_best_model`` under the
    name ``xgb_best_<file_name>.pkl``.

    Args:
        xtrain: Training feature matrix.
        ytrain: Training targets.
        file_name: Dataset label used in the saved model's file name.

    Returns:
        The fitted best ``XGBRegressor``.
    """
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'colsample_bytree': [0.5, 0.7],
        'gamma': [0, 0.25, 1.0],
    }
    xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_jobs=-1)
    grid_search = GridSearchCV(
        estimator=xg_reg,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        refit=True,
    )
    grid_search.fit(xtrain, ytrain)
    best_params = grid_search.best_params_
    # With refit=True the search has already retrained the best configuration
    # on the full training data, so a second manual fit is unnecessary.
    xgb_best = grid_search.best_estimator_
    print('xgb best parameters:', best_params)
    print('xgb best estimator:', xgb_best)
    save_best_model(xgb_best, 'xgb_best', file_name)
    print('end\n')
    return xgb_best


# In[7]:


# Import data sets: one Excel file per temperature range, each with a short
# label that is reused in plot titles and saved-model file names.
file_name_lowT, file_name_middleT, file_name_highT = "LowT", "MiddleT", "HighT"
df_lowT = pd.read_excel('3lowT_886.xlsx')
df_middleT = pd.read_excel('4middleT_1180.xlsx')
df_highT = pd.read_excel('5highT_226.xlsx')

# Each pre_* value is the tuple returned by preprocess_data:
# (scaled train X, scaled test X, train y, test y, unscaled feature frame).
pre_lowT, pre_middleT, pre_highT = [
    preprocess_data(frame, test_size=0.2, random_state=48)
    for frame in (df_lowT, df_middleT, df_highT)
]


# # Main

# In[8]:


# Train model: tune, save and keep one XGBoost regressor per dataset, using
# the scaled training features (index 0) and training targets (index 2).
xgb_best_lowT, xgb_best_middleT, xgb_best_highT = [
    train_and_save_best_xgb_model(prepared[0], prepared[2], label)
    for prepared, label in (
        (pre_lowT, file_name_lowT),
        (pre_middleT, file_name_middleT),
        (pre_highT, file_name_highT),
    )
]


# In[ ]:


# xgb best parameters: {'colsample_bytree': 0.7, 'gamma': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
# xgb best estimator: XGBRegressor(base_score=None, booster=None, callbacks=None,
#              colsample_bylevel=None, colsample_bynode=None,
#              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
#              enable_categorical=False, eval_metric=None, feature_types=None,
#              gamma=1.0, grow_policy=None, importance_type=None,
#              interaction_constraints=None, learning_rate=0.1, max_bin=None,
#              max_cat_threshold=None, max_cat_to_onehot=None,
#              max_delta_step=None, max_depth=3, max_leaves=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              multi_strategy=None, n_estimators=300, n_jobs=-1,
#              num_parallel_tree=None, random_state=None, ...)
# Best model saved to: xgb_best_LowT.pkl
# end

# xgb best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
# xgb best estimator: XGBRegressor(base_score=None, booster=None, callbacks=None,
#              colsample_bylevel=None, colsample_bynode=None,
#              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
#              enable_categorical=False, eval_metric=None, feature_types=None,
#              gamma=0, grow_policy=None, importance_type=None,
#              interaction_constraints=None, learning_rate=0.1, max_bin=None,
#              max_cat_threshold=None, max_cat_to_onehot=None,
#              max_delta_step=None, max_depth=3, max_leaves=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              multi_strategy=None, n_estimators=300, n_jobs=-1,
#              num_parallel_tree=None, random_state=None, ...)
# Best model saved to: xgb_best_MiddleT.pkl
# end

# xgb best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300}
# xgb best estimator: XGBRegressor(base_score=None, booster=None, callbacks=None,
#              colsample_bylevel=None, colsample_bynode=None,
#              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
#              enable_categorical=False, eval_metric=None, feature_types=None,
#              gamma=0, grow_policy=None, importance_type=None,
#              interaction_constraints=None, learning_rate=0.05, max_bin=None,
#              max_cat_threshold=None, max_cat_to_onehot=None,
#              max_delta_step=None, max_depth=3, max_leaves=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              multi_strategy=None, n_estimators=300, n_jobs=-1,
#              num_parallel_tree=None, random_state=None, ...)
# Best model saved to: xgb_best_HighT.pkl
# end


# In[9]:


# # Load the previously saved models (file names as written by save_best_model)
# xgb_best_lowT=load('xgb_best_LowT.pkl')
# xgb_best_middleT=load('xgb_best_MiddleT.pkl')
# xgb_best_highT=load('xgb_best_HighT.pkl')


# In[10]:


# Evaluation: score and plot every tuned model on its training and test split.
def _score_and_plot(model, features, target, split_label, dataset_label):
    """Predict on ``features``, print/collect the metrics and plot the fit."""
    predicted = model.predict(features)
    scores = calculate_r2_rmse_mae_ev(target, predicted)
    plot_predictions(target, predicted, scores, split_label, dataset_label)
    return predicted, scores


# XGB - lowT
y_pred_xgb_xtrain_lowT, score_xgb_xtrain_lowT = _score_and_plot(
    xgb_best_lowT, pre_lowT[0], pre_lowT[2], "XGB (Train)", file_name_lowT
)
y_pred_xgb_xtest_lowT, score_xgb_xtest_lowT = _score_and_plot(
    xgb_best_lowT, pre_lowT[1], pre_lowT[3], "XGB (Test)", file_name_lowT
)
# XGB - middleT
y_pred_xgb_xtrain_middleT, score_xgb_xtrain_middleT = _score_and_plot(
    xgb_best_middleT, pre_middleT[0], pre_middleT[2], "XGB (Train)", file_name_middleT
)
y_pred_xgb_xtest_middleT, score_xgb_xtest_middleT = _score_and_plot(
    xgb_best_middleT, pre_middleT[1], pre_middleT[3], "XGB (Test)", file_name_middleT
)
# XGB - highT
y_pred_xgb_xtrain_highT, score_xgb_xtrain_highT = _score_and_plot(
    xgb_best_highT, pre_highT[0], pre_highT[2], "XGB (Train)", file_name_highT
)
y_pred_xgb_xtest_highT, score_xgb_xtest_highT = _score_and_plot(
    xgb_best_highT, pre_highT[1], pre_highT[3], "XGB (Test)", file_name_highT
)


# In[11]:


def plot_feature_importance(df_importance, include_ti_feature=True, ax=None, file_name=''):
    """Draw a horizontal bar chart of feature importances.

    Args:
        df_importance: DataFrame with ``Feature`` and ``Importance`` columns;
            bars are drawn in the row order given.
        include_ti_feature: When False, the ``ti_ppm`` row is left out of the
            chart (matched by name, case-insensitively).
        ax: Axes to draw on. When None, a new 10x6 figure is created and its
            current axes are used.
        file_name: Dataset label appended to the title in square brackets.

    Returns:
        The Axes that was drawn on.
    """
    if not include_ti_feature:
        # Remove Ti by name rather than by position, so the "Not Included"
        # title stays true even when Ti is not the first row.
        is_ti = df_importance['Feature'].astype(str).str.lower() == 'ti_ppm'
        df_importance = df_importance[~is_ti]
    if ax is None:
        plt.figure(figsize=(10, 6))
        ax = plt.gca()
    sns.barplot(x='Importance', y='Feature', data=df_importance, palette="muted", ax=ax)
    if include_ti_feature:
        ax.set_title('Importance Of Trace Elements [Ti_ppm Included] [' + file_name + ']')
    else:
        ax.set_title('Importance Of Trace Elements [Ti_ppm Not Included] [' + file_name + ']')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Trace Elements')
    ax.title.set_fontsize(12)
    # Write each bar's value (rounded to two decimals) at the end of the bar.
    for index, value in enumerate(df_importance['Importance']):
        ax.text(value, index, round(value, 2), va='center')
    sns.despine(ax=ax)
    return ax

# Feature-importance overview: a 2x3 grid with one column per dataset.
# Row 0 is drawn with include_ti_feature=True, row 1 with it set to False.
fig, axs = plt.subplots(2, 3, figsize=(12, 11))
columns = pre_lowT[4].columns[:]
feature_names = list(columns)
index = 50  # number of top-ranked rows kept per dataset


def _importance_tables(model):
    """Return the unsorted, descending-sorted and top-``index`` importance frames.

    Importances are the model's ``feature_importances_`` multiplied by 100.
    """
    table = pd.DataFrame({
        'Feature': columns,
        'Importance': model.feature_importances_ * 100,
    })
    ranked = table.sort_values('Importance', ascending=False)
    return table, ranked, ranked.head(index)


(feature_importance_df_lowT,
 feature_importance_df_lowT_sorted,
 feature_importance_lowT) = _importance_tables(xgb_best_lowT)
(feature_importance_df_middleT,
 feature_importance_df_middleT_sorted,
 feature_importance_middleT) = _importance_tables(xgb_best_middleT)
(feature_importance_df_highT,
 feature_importance_df_highT_sorted,
 feature_importance_highT) = _importance_tables(xgb_best_highT)

# (top rows, dataset label, grid column, title prefix row 0, title prefix row 1)
panel_specs = (
    (feature_importance_lowT, file_name_lowT, 0,
     '(a) Importance[Ti_ppm Included] [', '(b) Importance[Ti_ppm Not Included] ['),
    (feature_importance_middleT, file_name_middleT, 1,
     '(c) Importance[Ti_ppm Included] [', '(d) Importance[Ti_ppm Not Included] ['),
    (feature_importance_highT, file_name_highT, 2,
     '(e) Importance[Ti_ppm Included] [', '(f) Importance[Ti_ppm Not Included] ['),
)
for top_rows, dataset_label, grid_col, title_with_ti, title_without_ti in panel_specs:
    plot_feature_importance(top_rows, include_ti_feature=True, ax=axs[0, grid_col])
    axs[0, grid_col].set_title(title_with_ti + dataset_label + ']').set_fontsize(11)
    plot_feature_importance(top_rows, include_ti_feature=False, ax=axs[1, grid_col])
    axs[1, grid_col].set_title(title_without_ti + dataset_label + ']').set_fontsize(11)

plt.tight_layout()
plt.show()


# In[19]:


#lowT:
# SHAP values are computed once on the scaled training features and reused
# by every plot in this section.
ti_ppm_index = feature_names.index("ti_ppm")
xtrain_df = pd.DataFrame(pre_lowT[0], columns=feature_names)
explainer = shap.TreeExplainer(xgb_best_lowT)
shap_values = explainer.shap_values(xtrain_df)
# shap.summary_plot
plt.title('SHAP summary plot [ti_ppm included] [lowT] \n')
shap.summary_plot(shap_values, xtrain_df, plot_type="dot", max_display=15,plot_size="auto")
# Blank out the ti_ppm column by name on a copy, so the title is accurate
# and the original SHAP values stay intact for the decision plots below.
shap_values_masked = shap_values.copy()
shap_values_masked[:, ti_ppm_index] = np.nan
plt.title('(a) SHAP summary plot [ti_ppm not included] [lowT] \n')
shap.summary_plot(shap_values_masked, xtrain_df, plot_type="dot", max_display=15,plot_size="auto")
#shap.decision_plot
plt.title('SHAP Decision Plot [ti_ppm included] [lowT] \n')
shap.decision_plot(explainer.expected_value, shap_values, feature_names=feature_names)
# Same plot for the first training sample only.
shap.decision_plot(explainer.expected_value, shap_values[0], feature_names=feature_names)
plt.show()
# Remove the ti_ppm column and its name for the "not included" decision plots.
shap_values_without_ti_ppm = np.delete(shap_values, ti_ppm_index, axis=1)
feature_names_without_ti_ppm = feature_names[:ti_ppm_index] + feature_names[ti_ppm_index+1:]
plt.title('(a) SHAP Decision Plot superposition [ti_ppm not included] [lowT] \n')
shap.decision_plot(explainer.expected_value, shap_values_without_ti_ppm, feature_names=feature_names_without_ti_ppm)
plt.title('(b) SHAP Decision Plot [ti_ppm not included] [lowT] \n')
shap.decision_plot(explainer.expected_value, shap_values_without_ti_ppm[0], feature_names=feature_names_without_ti_ppm)
plt.show()


# In[20]:


#middle:
# SHAP values are computed once on the scaled training features and reused
# by every plot in this section.
ti_ppm_index = feature_names.index("ti_ppm")
xtrain_df = pd.DataFrame(pre_middleT[0], columns=feature_names)
explainer = shap.TreeExplainer(xgb_best_middleT)
shap_values = explainer.shap_values(xtrain_df)
# shap.summary_plot
plt.title('SHAP summary plot [ti_ppm included] [middleT] \n')
shap.summary_plot(shap_values, xtrain_df, plot_type="dot", max_display=15,plot_size="auto")
# Blank out the ti_ppm column by name on a copy, so the title is accurate
# and the original SHAP values stay intact for the decision plots below.
shap_values_masked = shap_values.copy()
shap_values_masked[:, ti_ppm_index] = np.nan
plt.title('(b) SHAP summary plot [ti_ppm not included] [middleT] \n')
shap.summary_plot(shap_values_masked, xtrain_df, plot_type="dot", max_display=15,plot_size="auto")
#shap.decision_plot
# The title previously said [lowT] here (copy-paste slip); this is middleT.
plt.title('SHAP Decision Plot [ti_ppm included] [middleT] \n')
shap.decision_plot(explainer.expected_value, shap_values, feature_names=feature_names)
# Same plot for the first training sample only.
shap.decision_plot(explainer.expected_value, shap_values[0], feature_names=feature_names)
plt.show()
# Remove the ti_ppm column and its name for the "not included" decision plots.
shap_values_without_ti_ppm = np.delete(shap_values, ti_ppm_index, axis=1)
feature_names_without_ti_ppm = feature_names[:ti_ppm_index] + feature_names[ti_ppm_index+1:]
plt.title('(c) SHAP Decision Plot superposition [ti_ppm not included] [middleT] \n')
shap.decision_plot(explainer.expected_value, shap_values_without_ti_ppm, feature_names=feature_names_without_ti_ppm)
plt.title('(d) SHAP Decision Plot [ti_ppm not included] [middleT] \n')
shap.decision_plot(explainer.expected_value, shap_values_without_ti_ppm[0], feature_names=feature_names_without_ti_ppm)
plt.show()


# In[21]:


#highT:
# SHAP values are computed once on the scaled training features and reused
# by every plot in this section.
ti_ppm_index = feature_names.index("ti_ppm")
xtrain_df = pd.DataFrame(pre_highT[0], columns=feature_names)
explainer = shap.TreeExplainer(xgb_best_highT)
shap_values = explainer.shap_values(xtrain_df)
# shap.summary_plot
plt.title('SHAP summary plot [ti_ppm included] [highT] \n')
shap.summary_plot(shap_values, xtrain_df, plot_type="dot", max_display=15,plot_size="auto")
# Blank out the ti_ppm column by name on a copy, so the title is accurate
# and the original SHAP values stay intact for the decision plots below.
shap_values_masked = shap_values.copy()
shap_values_masked[:, ti_ppm_index] = np.nan
plt.title('(c) SHAP summary plot [ti_ppm not included] [highT] \n')
shap.summary_plot(shap_values_masked, xtrain_df, plot_type="dot", max_display=15,plot_size="auto")
#shap.decision_plot
plt.title('SHAP Decision Plot [ti_ppm included] [highT] \n')
shap.decision_plot(explainer.expected_value, shap_values, feature_names=feature_names)
# Same plot for the first training sample only.
shap.decision_plot(explainer.expected_value, shap_values[0], feature_names=feature_names)
plt.show()
# Remove the ti_ppm column and its name for the "not included" decision plots.
shap_values_without_ti_ppm = np.delete(shap_values, ti_ppm_index, axis=1)
feature_names_without_ti_ppm = feature_names[:ti_ppm_index] + feature_names[ti_ppm_index+1:]
plt.title('(e) SHAP Decision Plot superposition [ti_ppm not included] [highT] \n')
shap.decision_plot(explainer.expected_value, shap_values_without_ti_ppm, feature_names=feature_names_without_ti_ppm)
plt.title('(f) SHAP Decision Plot [ti_ppm not included] [highT] \n')
shap.decision_plot(explainer.expected_value, shap_values_without_ti_ppm[0], feature_names=feature_names_without_ti_ppm)
plt.show()


# In[15]:


# Partial Dependence Plots
# lowT
# NOTE(review): plot_partial_dependence was removed in scikit-learn 1.2;
# PartialDependenceDisplay.from_estimator is its replacement.
feature_importance = xgb_best_lowT.feature_importances_
columns = pre_lowT[4].columns[:]
feature_names = list(columns)
df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
df_importance = df_importance.sort_values(by='Importance', ascending=False)
top_features = df_importance['Feature'].tolist()[:12]  # Choose the 12 most important features
fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(25, 15))
fig.suptitle('Partial Dependence Plots [top 12] [lowT]', fontsize=18)
for i, feature_name in enumerate(top_features):
    ax = axs[i//4, i%4]
    feature_index = feature_names.index(feature_name)
    plot_partial_dependence(xgb_best_lowT, pre_lowT[0], features=[feature_index], ax=ax)
    ax.set_title(feature_name)
    # Label each panel with its own feature; a separate loop afterwards would
    # only see the last value of feature_name.
    ax.set_xlabel(feature_name)
for ax in axs.flat:
    ax.set_ylabel('Model Response')
plt.subplots_adjust(top=0.9)
plt.show()
#middleT
# NOTE(review): plot_partial_dependence was removed in scikit-learn 1.2;
# PartialDependenceDisplay.from_estimator is its replacement.
feature_importance = xgb_best_middleT.feature_importances_
columns = pre_middleT[4].columns[:]
feature_names = list(columns)
df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
df_importance = df_importance.sort_values(by='Importance', ascending=False)
top_features = df_importance['Feature'].tolist()[:12]  # Choose the 12 most important features
fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(25, 15))
fig.suptitle('Partial Dependence Plots [top 12] [middleT]', fontsize=18)
for i, feature_name in enumerate(top_features):
    ax = axs[i//4, i%4]
    feature_index = feature_names.index(feature_name)
    plot_partial_dependence(xgb_best_middleT, pre_middleT[0], features=[feature_index], ax=ax)
    ax.set_title(feature_name)
    # Label each panel with its own feature; a separate loop afterwards would
    # only see the last value of feature_name.
    ax.set_xlabel(feature_name)
for ax in axs.flat:
    ax.set_ylabel('Model Response')
plt.subplots_adjust(top=0.9)
plt.show()
# highT
# NOTE(review): plot_partial_dependence was removed in scikit-learn 1.2;
# PartialDependenceDisplay.from_estimator is its replacement.
feature_importance = xgb_best_highT.feature_importances_
columns = pre_highT[4].columns[:]
feature_names = list(columns)
df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
df_importance = df_importance.sort_values(by='Importance', ascending=False)
top_features = df_importance['Feature'].tolist()[:12]  # Choose the 12 most important features
fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(25, 15))
fig.suptitle('Partial Dependence Plots [top 12] [highT]', fontsize=18)
for i, feature_name in enumerate(top_features):
    ax = axs[i//4, i%4]
    feature_index = feature_names.index(feature_name)
    plot_partial_dependence(xgb_best_highT, pre_highT[0], features=[feature_index], ax=ax)
    ax.set_title(feature_name)
    # Label each panel with its own feature; a separate loop afterwards would
    # only see the last value of feature_name.
    ax.set_xlabel(feature_name)
for ax in axs.flat:
    ax.set_ylabel('Model Response')
plt.subplots_adjust(top=0.9)
plt.show()

