# -*- coding: utf-8 -*-

import json
import os
import warnings
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             mean_squared_error, mean_absolute_error, r2_score)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

# Use a non-interactive backend so figures can be saved headlessly
matplotlib.use('Agg')

# Global plotting defaults for English-language labels
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'axes.unicode_minus': False,
    'font.size': 10,
})

# ============================================================================
# SECTION 1: DATA LOADING AND PREPROCESSING
# ============================================================================
print("="*80)
print("SECTION 1: DATA LOADING AND PREPROCESSING")
print("="*80)

# FIX: every later section writes CSVs/PNGs into 'result/', but neither
# pandas.to_csv nor plt.savefig creates missing directories -- the script
# would crash at the first save on a fresh checkout.  Create it up front.
os.makedirs('result', exist_ok=True)

# Load data
df = pd.read_csv('string.csv')
print("Successfully loaded 'string.csv'")
print(f"Dataset contains {df.shape[0]} targets and {df.shape[1]} features")
print(f"\nFirst few rows:\n{df.head()}")

# Data preprocessing: coerce boolean-ish flag columns (when present) to 0/1
df_processed = df.copy()
for col in ['IsSingleNode', 'selected']:
    if col in df_processed.columns:
        df_processed[col] = df_processed[col].astype(int)

# Min-max normalize the three centrality scores, then average them into a
# single Importance_Score used as the supervision signal downstream.
score_features = ['Degree', 'BetweennessCentrality', 'ClosenessCentrality']
scaler = MinMaxScaler()
df_processed[score_features] = scaler.fit_transform(df_processed[score_features])
df_processed['Importance_Score'] = df_processed[score_features].mean(axis=1)

# Prepare feature matrix: drop identifiers, the target, and the columns the
# target was derived from (they would leak the label), keep numeric columns.
drop_cols = ['name', 'shared name', 'Importance_Score'] + score_features
X = df_processed.drop(columns=drop_cols).select_dtypes(include=np.number)
feature_names = X.columns.tolist()

print(f"\nFeature matrix shape: {X.shape}")
print(f"Number of features: {len(feature_names)}")

# ============================================================================
# SECTION 2: CLASSIFICATION TASK WITH HYPERPARAMETER TUNING
# ============================================================================
print("\n" + "="*80)
print("SECTION 2: CLASSIFICATION TASK WITH HYPERPARAMETER TUNING")
print("="*80)

# Binary labels: targets in the top 30% of Importance_Score are positive
importance = df_processed['Importance_Score']
threshold = importance.quantile(0.7)
y_class = (importance >= threshold).astype(int)
print(f"Classification threshold (70th percentile): {threshold:.4f}")
print(f"Class distribution:\n{y_class.value_counts()}")

# Stratified 70/30 split keeps the class balance in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y_class, test_size=0.3, stratify=y_class, random_state=42
)
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# ============================================================================
# SECTION 2.1: RANDOM FOREST HYPERPARAMETER TUNING
# ============================================================================
print("\n" + "-"*80)
print("Random Forest Hyperparameter Tuning")
print("-"*80)

# Search space for the Random Forest classifier
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

print("\nHyperparameter grid for Random Forest:")
for name, grid_values in rf_param_grid.items():
    print(f"  {name}: {grid_values}")

# Exhaustive grid search with 5-fold CV, optimizing F1 on the positive class
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

print("\nPerforming Grid Search (5-fold cross-validation)...")
rf_grid_search.fit(X_train, y_train)

print(f"\nBest Random Forest parameters:")
for name, best_value in rf_grid_search.best_params_.items():
    print(f"  {name}: {best_value}")
print(f"Best cross-validation F1-score: {rf_grid_search.best_score_:.4f}")

# Keep the refitted best estimator for downstream evaluation
best_rf_clf = rf_grid_search.best_estimator_

# ============================================================================
# SECTION 2.2: XGBOOST HYPERPARAMETER TUNING
# ============================================================================
print("\n" + "-"*80)
print("XGBoost Hyperparameter Tuning")
print("-"*80)

# Define hyperparameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

print("\nHyperparameter grid for XGBoost:")
for param, values in xgb_param_grid.items():
    print(f"  {param}: {values}")

# FIX: 'use_label_encoder' was deprecated in XGBoost 1.3 and removed in 2.0;
# passing it triggers warnings (and is rejected on recent releases), so it is
# omitted.  eval_metric='logloss' keeps the evaluation metric explicit.
xgb_classifier = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
xgb_grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=xgb_param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

print("\nPerforming Grid Search (5-fold cross-validation)...")
xgb_grid_search.fit(X_train, y_train)

print(f"\nBest XGBoost parameters:")
for param, value in xgb_grid_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"Best cross-validation F1-score: {xgb_grid_search.best_score_:.4f}")

# Get best XGBoost model
best_xgb_clf = xgb_grid_search.best_estimator_

# ============================================================================
# SECTION 2.3: CROSS-VALIDATION RESULTS
# ============================================================================
print("\n" + "-"*80)
print("Cross-Validation Performance Comparison")
print("-"*80)

# 10-fold CV F1 scores for each tuned classifier on the training split
classifiers = {"Random Forest": best_rf_clf, "XGBoost": best_xgb_clf}
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1')
    cv_results[model_name] = scores
    print(f"\n{model_name} 10-Fold CV F1-scores:")
    print(f"  Mean: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
    print(f"  Min: {scores.min():.4f}, Max: {scores.max():.4f}")

# Persist the per-fold scores, one column per model
cv_df = pd.DataFrame(cv_results)
cv_df.to_csv('result/cross_validation_scores.csv', index=False)
print("\nCross-validation scores saved to 'result/cross_validation_scores.csv'")

# ============================================================================
# SECTION 2.4: OVERFITTING ANALYSIS
# ============================================================================
print("\n" + "-"*80)
print("Overfitting Analysis: Training vs Test Performance")
print("-"*80)

overfitting_results = []

for model_name, model in [("Random Forest", best_rf_clf), ("XGBoost", best_xgb_clf)]:
    # Score both splits with the already-fitted model
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    train_f1 = f1_score(y_train, train_pred)
    test_f1 = f1_score(y_test, test_pred)

    # Train-minus-test gaps: large positive gaps indicate overfitting
    acc_gap = train_acc - test_acc
    f1_gap = train_f1 - test_f1

    overfitting_results.append({
        'Model': model_name,
        'Train_Accuracy': train_acc,
        'Test_Accuracy': test_acc,
        'Accuracy_Diff': acc_gap,
        'Train_F1': train_f1,
        'Test_F1': test_f1,
        'F1_Diff': f1_gap
    })

    print(f"\n{model_name}:")
    print(f"  Training - Accuracy: {train_acc:.4f}, F1: {train_f1:.4f}")
    print(f"  Test     - Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}")
    print(f"  Difference - Accuracy: {acc_gap:.4f}, F1: {f1_gap:.4f}")

    # Qualitative assessment driven by the larger of the two gaps
    worst_gap = max(acc_gap, f1_gap)
    if worst_gap > 0.1:
        print(f"  WARNING: Potential overfitting detected!")
    elif worst_gap > 0.05:
        print(f"  NOTE: Mild overfitting, model is acceptable")
    else:
        print(f"  OK: No significant overfitting")

overfitting_df = pd.DataFrame(overfitting_results)
overfitting_df.to_csv('result/overfitting_analysis.csv', index=False)
print("\nOverfitting analysis saved to 'result/overfitting_analysis.csv'")

# ============================================================================
# SECTION 2.5: FINAL MODEL EVALUATION HEATMAP
# ============================================================================
print("\n" + "-"*80)
print("Generating Model Performance Heatmap")
print("-"*80)

# Full classification metric table: both models x both splits
final_results = []
for model_name, model in [("Random Forest", best_rf_clf), ("XGBoost", best_xgb_clf)]:
    for dataset_name, y_true, X_data in [("Training", y_train, X_train), ("Test", y_test, X_test)]:
        preds = model.predict(X_data)
        final_results.append({
            "Model": model_name,
            "Dataset": dataset_name,
            "Accuracy": accuracy_score(y_true, preds),
            "Precision": precision_score(y_true, preds, zero_division=0),
            "Recall": recall_score(y_true, preds, zero_division=0),
            "F1-Score": f1_score(y_true, preds, zero_division=0)
        })

final_results_df = pd.DataFrame(final_results)
print("\nFinal Model Performance:")
print(final_results_df.to_string(index=False))

# Save final results
final_results_df.to_csv('result/final_model_performance.csv', index=False)

# One heatmap panel per split: models on rows, metrics on columns
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for ax, dataset_name in zip(axes, ["Training", "Test"]):
    split_scores = (final_results_df[final_results_df['Dataset'] == dataset_name]
                    .set_index('Model')
                    .drop(columns=['Dataset']))
    sns.heatmap(split_scores, annot=True, cmap="YlOrRd", fmt=".3f",
                linewidths=.5, cbar_kws={"label": "Score"}, ax=ax)
    ax.set_title(f'Model Performance ({dataset_name} Set)', fontsize=14, pad=10)
    ax.set_xlabel("Metrics", fontsize=12)
    ax.set_ylabel("Model", fontsize=12)

plt.tight_layout()
plt.savefig('result/model_performance_heatmap.png', dpi=300, bbox_inches='tight')
print("\nModel performance heatmap saved to 'result/model_performance_heatmap.png'")
plt.close()

# ============================================================================
# SECTION 3: REGRESSION TASK FOR TARGET RANKING
# ============================================================================
print("\n" + "="*80)
print("SECTION 3: REGRESSION TASK FOR TARGET RANKING")
print("="*80)

# Regress directly on the continuous importance score; same 70/30 split
# seed as the classification task for comparability
y_reg = df_processed['Importance_Score']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.3, random_state=42
)

# ============================================================================
# SECTION 3.1: RANDOM FOREST REGRESSOR HYPERPARAMETER TUNING
# ============================================================================
print("\n" + "-"*80)
print("Random Forest Regressor Hyperparameter Tuning")
print("-"*80)

# Same search space as the RF classifier in Section 2.1
rf_reg_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

print("\nHyperparameter grid for Random Forest Regressor:")
for name, grid_values in rf_reg_param_grid.items():
    print(f"  {name}: {grid_values}")

# 5-fold CV grid search optimizing R^2
rf_reg_grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_reg_param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
)

print("\nPerforming Grid Search (5-fold cross-validation)...")
rf_reg_grid_search.fit(X_train_reg, y_train_reg)

print(f"\nBest Random Forest Regressor parameters:")
for name, best_value in rf_reg_grid_search.best_params_.items():
    print(f"  {name}: {best_value}")
print(f"Best cross-validation R2 score: {rf_reg_grid_search.best_score_:.4f}")

# Refitted best estimator for evaluation and final ranking
best_rf_reg = rf_reg_grid_search.best_estimator_

# ============================================================================
# SECTION 3.2: XGBOOST REGRESSOR HYPERPARAMETER TUNING
# ============================================================================
print("\n" + "-"*80)
print("XGBoost Regressor Hyperparameter Tuning")
print("-"*80)

# Same search space as the XGBoost classifier in Section 2.2
xgb_reg_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

print("\nHyperparameter grid for XGBoost Regressor:")
for name, grid_values in xgb_reg_param_grid.items():
    print(f"  {name}: {grid_values}")

# 5-fold CV grid search on squared-error objective, optimizing R^2
xgb_reg_grid_search = GridSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    param_grid=xgb_reg_param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
)

print("\nPerforming Grid Search (5-fold cross-validation)...")
xgb_reg_grid_search.fit(X_train_reg, y_train_reg)

print(f"\nBest XGBoost Regressor parameters:")
for name, best_value in xgb_reg_grid_search.best_params_.items():
    print(f"  {name}: {best_value}")
print(f"Best cross-validation R2 score: {xgb_reg_grid_search.best_score_:.4f}")

# Refitted best estimator for evaluation and final ranking
best_xgb_reg = xgb_reg_grid_search.best_estimator_

# ============================================================================
# SECTION 3.3: REGRESSION MODEL EVALUATION
# ============================================================================
print("\n" + "-"*80)
print("Regression Model Evaluation")
print("-"*80)

# R2 / RMSE / MAE for both tuned regressors on both splits
splits = [("Training", y_train_reg, X_train_reg), ("Test", y_test_reg, X_test_reg)]
regression_results = []
for model_name, model in [("Random Forest", best_rf_reg), ("XGBoost", best_xgb_reg)]:
    for dataset_name, y_true, X_data in splits:
        preds = model.predict(X_data)
        regression_results.append({
            "Model": model_name,
            "Dataset": dataset_name,
            "R2": r2_score(y_true, preds),
            "RMSE": np.sqrt(mean_squared_error(y_true, preds)),
            "MAE": mean_absolute_error(y_true, preds)
        })

regression_df = pd.DataFrame(regression_results)
print("\nRegression Model Performance:")
print(regression_df.to_string(index=False))

regression_df.to_csv('result/regression_performance.csv', index=False)
print("\nRegression performance saved to 'result/regression_performance.csv'")

# ============================================================================
# SECTION 3.4: FEATURE IMPORTANCE ANALYSIS
# ============================================================================
print("\n" + "-"*80)
print("Feature Importance Analysis")
print("-"*80)

# Per-model impurity-based importances, plus their unweighted average
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'RF_Importance': best_rf_reg.feature_importances_,
    'XGB_Importance': best_xgb_reg.feature_importances_
})
feature_importance_df['Ensemble_Importance'] = (
    feature_importance_df['RF_Importance'] + feature_importance_df['XGB_Importance']
) / 2

# Most important features first
feature_importance_df = (feature_importance_df
                         .sort_values('Ensemble_Importance', ascending=False)
                         .reset_index(drop=True))

print("\nTop 15 Most Important Features:")
print(feature_importance_df.head(15).to_string(index=False))

feature_importance_df.to_csv('result/feature_importance.csv', index=False)
print("\nFeature importance saved to 'result/feature_importance.csv'")

# Three side-by-side bar charts for the top 15 features
top_features = feature_importance_df.head(15)
panels = [
    ('RF_Importance', 'Random Forest Feature Importance'),
    ('XGB_Importance', 'XGBoost Feature Importance'),
    ('Ensemble_Importance', 'Ensemble Feature Importance'),
]

fig, axes = plt.subplots(1, 3, figsize=(20, 8))
for ax, (column, title) in zip(axes, panels):
    sns.barplot(x=column, y='Feature', data=top_features,
                palette='viridis', ax=ax)
    ax.set_title(title, fontsize=14, pad=10)
    ax.set_xlabel('Importance Score', fontsize=12)
    ax.set_ylabel('')
    ax.grid(axis='x', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.savefig('result/feature_importance_comparison.png', dpi=300, bbox_inches='tight')
print("\nFeature importance comparison saved to 'result/feature_importance_comparison.png'")
plt.close()

# ============================================================================
# SECTION 4: FINAL TARGET RANKING
# ============================================================================
print("\n" + "="*80)
print("SECTION 4: FINAL TARGET RANKING")
print("="*80)

# Refit both tuned regressors on every sample before scoring the full set
print("\nTraining final models on full dataset...")
best_rf_reg.fit(X, y_reg)
best_xgb_reg.fit(X, y_reg)

# Per-model predictions plus their unweighted average
ranking_results_df = pd.DataFrame({
    'Target': df['shared name'],
    'RF_Score': best_rf_reg.predict(X),
    'XGB_Score': best_xgb_reg.predict(X)
})
ranking_results_df['Ensemble_Score'] = (
    ranking_results_df['RF_Score'] + ranking_results_df['XGB_Score']
) / 2

# Rescale the ensemble score onto a 0-15 range for reporting
scaler_final = MinMaxScaler(feature_range=(0, 15))
ranking_results_df['ML_Score'] = scaler_final.fit_transform(
    ranking_results_df[['Ensemble_Score']]
)

# Highest-scoring targets first
ranked_targets = (ranking_results_df
                  .sort_values(by='ML_Score', ascending=False)
                  .reset_index(drop=True))

print("\nFinal Target Ranking (Top 20):")
print(ranked_targets.head(20).to_string(index=False))

# Save full ranking
ranked_targets.to_csv('result/final_target_ranking.csv', index=False)
print("\nFull ranking saved to 'result/final_target_ranking.csv'")

# ============================================================================
# SECTION 5: VISUALIZATION OF FINAL RANKING
# ============================================================================
print("\n" + "-"*80)
print("Generating Final Ranking Visualization")
print("-"*80)

top_n = 15
plot_data = ranked_targets.head(top_n)

plt.figure(figsize=(12, 10))

# FIX: the original passed a list of identical colors via `palette=` with no
# `hue` -- a usage seaborn has deprecated.  `color=` is the supported way to
# draw uniform bars and renders the same chart.
ax = sns.barplot(x='ML_Score', y='Target', data=plot_data,
                 color='#2ca02c', orient='h')

plt.title('Target Priority Ranking Based on Ensemble ML Model',
          fontsize=18, pad=20)
plt.xlabel('Machine Learning Score', fontsize=14)
plt.ylabel('')

# Annotate each bar with its score just past the bar end
for i, v in enumerate(plot_data['ML_Score']):
    ax.text(v + 0.1, i, f'{v:.2f}', color='black', va='center', fontsize=11)

plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()

plt.savefig('result/final_target_ranking_visualization.png', dpi=300, bbox_inches='tight')
print("\nFinal ranking visualization saved to 'result/final_target_ranking_visualization.png'")
plt.close()

# ============================================================================
# SECTION 6: HYPERPARAMETER SUMMARY REPORT
# ============================================================================
print("\n" + "="*80)
print("SECTION 6: EXPERIMENTAL SUMMARY")
print("="*80)

# Run metadata plus the best tuning results.  NumPy scalars (the quantile
# threshold and GridSearchCV best scores) are cast to built-in float so
# json.dump never trips over a non-serializable NumPy type.
summary_report = {
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'dataset_size': len(df),
    'n_features': len(feature_names),
    'training_size': len(X_train),
    'test_size': len(X_test),
    'classification_threshold': float(threshold),
    'best_rf_classifier_params': rf_grid_search.best_params_,
    'best_rf_classifier_cv_score': float(rf_grid_search.best_score_),
    'best_xgb_classifier_params': xgb_grid_search.best_params_,
    'best_xgb_classifier_cv_score': float(xgb_grid_search.best_score_),
    'best_rf_regressor_params': rf_reg_grid_search.best_params_,
    'best_rf_regressor_cv_score': float(rf_reg_grid_search.best_score_),
    'best_xgb_regressor_params': xgb_reg_grid_search.best_params_,
    'best_xgb_regressor_cv_score': float(xgb_reg_grid_search.best_score_),
}

print("\nExperimental Summary:")
print(f"  Timestamp: {summary_report['timestamp']}")
print(f"  Dataset size: {summary_report['dataset_size']}")
print(f"  Number of features: {summary_report['n_features']}")
print(f"  Training/Test split: {summary_report['training_size']}/{summary_report['test_size']}")
print(f"  Classification threshold: {summary_report['classification_threshold']:.4f}")
print(f"\n  Random Forest Classifier - Best CV F1: {summary_report['best_rf_classifier_cv_score']:.4f}")
print(f"  XGBoost Classifier - Best CV F1: {summary_report['best_xgb_classifier_cv_score']:.4f}")
print(f"  Random Forest Regressor - Best CV R2: {summary_report['best_rf_regressor_cv_score']:.4f}")
print(f"  XGBoost Regressor - Best CV R2: {summary_report['best_xgb_regressor_cv_score']:.4f}")

# Save summary report as JSON (json is imported at the top of the file)
with open('result/experimental_summary.json', 'w') as f:
    json.dump(summary_report, f, indent=2)

print("\n" + "="*80)
print("ALL EXPERIMENTS COMPLETED SUCCESSFULLY!")
print("="*80)
print("\nResults saved in 'result/' folder:")
print("  - cross_validation_scores.csv")
print("  - overfitting_analysis.csv")
print("  - final_model_performance.csv")
print("  - model_performance_heatmap.png")
print("  - regression_performance.csv")
print("  - feature_importance.csv")
print("  - feature_importance_comparison.png")
print("  - final_target_ranking.csv")
print("  - final_target_ranking_visualization.png")
print("  - experimental_summary.json")
print("="*80)
