#####################################################################################
# P. Sen and A. Dutta 2021
# 
# This script trains the RF regressor to predict the magnetic moment (per metal atom)
# of the magnetic compounds. The dataset 'data-long2.csv' containing 827 stable 2D
# magnetic materials is loaded, out of which 80% is used for 'GridSearchCV'-based
# 5-fold cross-validation and the remaining 20% is used as the test set.
# ===================================================================================
# The training and evaluation of the model performance is done for 100
# different splits of the original dataset and the results are stored in the
# file './stats/mag-regRF-sp0.2-stat2.txt'. The feature importances generated by
# RF are stored in the file './stats/mag-regRF-sp0.2-2.csv'.
#####################################################################################
import seaborn as sb
import numpy as np
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.inspection import plot_partial_dependence
import statistics
import pandas as pd
from math import sqrt

print("Magnetic regression using RF")

# Read the raw descriptor table; the shape is echoed before and after the
# derived target column is attached so the added column is visible in the log.
MAE_data = pd.read_csv('../../datasets/magnetic/data-long2.csv', sep=',')
print(MAE_data.shape)

# Derived target 'mom': magnitude of the magnetic moment per metal atom
# (total 'mag_mom' divided by 'n_metal', sign discarded).
moment_per_metal = MAE_data['mag_mom'] / MAE_data['n_metal']
MAE_data['mom'] = abs(moment_per_metal.values)
print(MAE_data.shape)

# Columns that must not be used as model inputs. 'mag_mom' and 'n_metal' are
# the ingredients of the target 'mom'; the remaining names are removed as in
# the original feature selection (their meaning is not documented here --
# TODO confirm against the dataset description).
_non_feature_cols = ['id', 'uu', 'dd', 'ud', 'du', 'mag_mom', 'n_metal', 'E_zx', 'E_zy']

# One column per chemical-element symbol, H through At, listed period by period.
_element_cols = (
    'H He Li Be B C N O F Ne '
    'Na Mg Al Si P S Cl Ar '
    'K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr '
    'Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe '
    'Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu '
    'Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At'
).split()

MAE_data.drop(columns=_non_feature_cols + _element_cols, inplace=True)

#Keep those features with importance > 0.01
#MAE_data=MAE_data[['del_val', 'mean_val', 'd_elect', 'del_group', 'mean_group', 'mode_val', 'smeig12', 'del_electroneg', 'mean_electroneg', 'smeig11', 'del_Z', 'smeig10', 'acsf46', 'acsf49', 'mean_Z',\
#'acsf48', 'cell_area', 'mode_group', 'acsf44', 'acsf43','acsf47', 'acsf45', 'smeig9','acsf215', 'acsf213','acsf216','acsf212', 'mode_electroneg', 'del_period', 'acsf42','acsf39','acsf174', 'mom']]#,'acsf173'

print('Shape = ', MAE_data.shape)

# Keep only the rows whose target reaches the lower cutoff. Rows where 'mom'
# is NaN fail the comparison and are therefore discarded as well.
y_min = 0.0
keep_row = MAE_data['mom'] >= y_min
MAE_data = MAE_data[keep_row]

# Number of samples that survive the cutoff
Nsamples = len(MAE_data)
print('No. of samples = ', Nsamples)

# Column labels of the statistics file: the six tuned hyper-parameters
# followed by the train/test scores, in the order the rows are written.
_stat_columns = [
    'ccp_alpha', 'max_depth', 'max_samples', 'min_samples_leaf',
    'min_samples_split', 'n_estimators',
    'r2_train', 'r2_test',
    'pearson_train', 'pearson_test',
    'exp_var_train', 'exp_var_test',
    'MAE_train', 'MAE_test',
    'MAE_train_inv', 'MAE_test_inv',
]

# The statistics file is opened in append mode, so every invocation of the
# script adds a fresh header line followed by its own result rows.
file1 = open("./stats/mag-regRF-sp0.2-stat2.txt", mode="a")
#file1 = open("./stats/mag-regRF-sp0.2-impgt0.01-stat2.txt", "a")
file1.write("# " + ",\t".join(_stat_columns) + " #\n\n")

# Table that collects one row of feature importances per run
imp = pd.DataFrame()

# Number of independent shuffle / split / fit repetitions
Nruns = 100
# Import function to create training and test set splits
from sklearn.model_selection import train_test_split
# Scale the features and the target
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored by GridSearchCV. It is identical for every
# run, so it is built once up front instead of inside the loop.
# NOTE: a wider grid (max_depth 11-25, ccp_alpha 0.0-0.3) was previously kept
# here as disabled code; the ranges below are the ones actually searched.
N_CV = 5  # number of cross-validation folds
parameters = {
    'n_estimators': [100, 125, 150, 175, 200, 225],
    'max_depth': [15, 16, 17, 18, 19, 20, 21, 22],
    'max_samples': np.linspace(0.6, 0.6, 1, endpoint=True),
    'min_samples_leaf': [1],
    'min_samples_split': np.linspace(2, 5, 4, dtype=int, endpoint=True),
    'ccp_alpha': [0.0],
}

# Each run: reshuffle the data, split 80/20, scale, grid-search a random
# forest on the training part, then log the scores and feature importances.
# The try/finally guarantees that the statistics file is flushed and closed
# even if one of the runs raises.
try:
    for cnt in range(Nruns):
        print("Run count : ", cnt+1)

        # Shuffle the rows. train_test_split below uses a fixed random_state,
        # so this unseeded shuffle is what makes the split differ between runs.
        MAE_data = MAE_data.sample(frac=1, axis=0).reset_index(drop=True)

        y = MAE_data[['mom']].values
        X = MAE_data.drop(labels=['mom'], axis=1, inplace=False).to_numpy()

        # Test/train split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
        Ntrain = np.shape(y_train)[0]  # no. of training examples

        # Scale the features. The transformer is fitted on the training part
        # only and then applied to both parts. (Original author's rationale
        # for the quantile transform: it handles outliers better.)
        Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X_train)
        X_train = Xscaler.transform(X_train)
        X_test = Xscaler.transform(X_test)

        # Scale the target the same way (fit on train, apply to both)
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)
        Yscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(y_train)
        #Yscaler = MinMaxScaler(feature_range=(-1, 1)).fit(y_train)
        y_train = Yscaler.transform(y_train)
        y_test = Yscaler.transform(y_test)

        # Grid-searched random forest regressor.
        # NOTE(review): criterion='mae' is the spelling used by older
        # scikit-learn releases; newer releases call it 'absolute_error'.
        # Confirm against the installed version before upgrading.
        rf = GridSearchCV(RandomForestRegressor(criterion='mae', max_features='sqrt'),
                          parameters, n_jobs=-1, cv=N_CV, verbose=0)
        opt = rf.fit(X_train, np.ravel(y_train))
        best_model = opt.best_estimator_

        # The output row starts with the selected hyper-parameter values
        row = []
        for k, v in opt.best_params_.items():
            print(k)
            row.append(v)

        y_pred = best_model.predict(X_test)
        y_pred_train = best_model.predict(X_train)

        # R^2 for training and test set
        row.append(r2_score(y_train, y_pred_train))
        row.append(r2_score(y_test, y_pred))

        # Pearson's correlation coefficient. The scaled targets are used for
        # both sets: the predictions live in the scaled space, and comparing
        # them with unscaled targets would distort the correlation through
        # the non-linear quantile map.
        row.append(np.corrcoef(y_train[:, 0], y_pred_train)[0, 1])
        row.append(np.corrcoef(y_test[:, 0], y_pred)[0, 1])

        # Explained variance
        row.append(explained_variance_score(y_train, y_pred_train))
        row.append(explained_variance_score(y_test, y_pred))

        # Mean absolute error in the scaled target space
        row.append(mean_absolute_error(y_train, y_pred_train))
        row.append(mean_absolute_error(y_test, y_pred))

        # Undo the target scaling to report errors in the original units
        y_test_inv = Yscaler.inverse_transform(y_test.reshape(-1, 1))
        y_train_inv = Yscaler.inverse_transform(y_train.reshape(-1, 1))
        y_pred_inv = Yscaler.inverse_transform(y_pred.reshape(-1, 1))
        y_pred_train_inv = Yscaler.inverse_transform(y_pred_train.reshape(-1, 1))

        # Mean absolute error in the original target units
        row.append(mean_absolute_error(y_train_inv, y_pred_train_inv))
        row.append(mean_absolute_error(y_test_inv, y_pred_inv))

        file1.write(','.join(map(str, row)))
        file1.write('\n\n')

        # Append this run's feature importances as one more row.
        # pd.concat is used because DataFrame.append no longer exists in
        # current pandas; the resulting table is the same.
        imp = pd.concat([imp, pd.DataFrame([best_model.feature_importances_])])
finally:
    file1.close()

# Name the importance columns after the model inputs, i.e. every column of
# the data table except the target 'mom'.
feature_names = MAE_data.drop(columns=['mom']).columns
imp.columns = feature_names

# Save the per-run feature importance table
imp.to_csv('./stats/mag-regRF-sp0.2-2.csv')
#imp.to_csv(r'./stats/mag-regRF-sp0.2-impgt0.01-2.csv')
