##############################################################################
# A. Dutta 2021
# 
# This script trains the Kernel Ridge Regressor (KRR) on heat of formation of 
# materials. The dataset 'cmr-stable-all-features.csv' containing 2846 
# energetically stable 2D materials is loaded, out of which 70% is used for 
# 'GridSearchCV' based 10-fold cross-validation and the remaining 30% is used
# as the test set.
# ===========================================================================
# The training and evaluation of the model performance are repeated for 100
# different random splits of the original dataset and the results are
# appended to the file './stats/hform-stable-KRR-stat.txt'.
##############################################################################
#import required packages
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

# Load the full feature table of the stable 2D materials.
hform_data = pd.read_csv('../../datasets/hform/cmr-stable-all-features.csv', sep=',')
print('Shape= ', hform_data.shape)

# Assemble the list of columns to keep, in the order they are fed to the model:
#   1. mean/del/mode statistics of each elemental property,
#   2. a few structural/compositional descriptors plus the target 'hform',
#   3. the twelve 'smeig' descriptors smeig1 ... smeig12.
elemental_props = ('Z', 'grp', 'period', 'valelec', 'electroneg')
stat_prefixes = ('mean', 'del', 'mode')
selected_columns = [
    prefix + '_' + prop
    for prop in elemental_props
    for prefix in stat_prefixes
]
selected_columns += ['stoichio_entropy', 'cell_area', 'L2_norm', 'L3_norm', 'hform']
selected_columns += ['smeig' + str(idx) for idx in range(1, 13)]

# Select desired features (the target column 'hform' is split off later)
hform_data = hform_data[selected_columns]
print('Shape= ', hform_data.shape)

Nsamples = hform_data.shape[0]
print('No. of samples = ', Nsamples)

# Number of independent shuffle/split/train/evaluate repetitions
Nruns = 100

# Names of the quantities written for every run, in output order
stat_columns = [
    'alpha', 'gamma',
    'r2_train', 'r2_test',
    'pearsoni_train', 'pearson_test',
    'exp_var_train', 'exp_var_test',
    'MAE_train', 'MAE_test',
    'MAE_train_inv', 'MAE_test_inv',
]

# Open the statistics file in append mode (existing results are kept) and
# write a commented header line followed by a blank line.
file1 = open("./stats/hform-stable-KRR-stat.txt", "a")
file1.write("# " + ",\t".join(stat_columns) + " #\n\n")
# Import function to create training and test set splits
from sklearn.model_selection import train_test_split
# Scale the features and the target
from sklearn.preprocessing import QuantileTransformer, RobustScaler, MinMaxScaler
# Kernel ridge regressor and GridSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for the RBF kernel ridge regressor. It does not
# depend on the data split, so it is built once instead of on every run.
N_CV = 10  # number of cross-validation folds used by GridSearchCV
alpha_vals = np.logspace(-2, 3, 10, base=2, endpoint=True)
gamma_vals = np.logspace(-3, 0, 12, base=2, endpoint=True)
parameters = {'kernel': ['rbf'], 'alpha': alpha_vals, 'gamma': gamma_vals}

# Each run: shuffle -> 70/30 split -> scale -> grid search -> score -> append
# one comma-separated row to the statistics file. The try/finally guarantees
# the file is closed (and buffered rows are written) even if a run fails.
try:
    for cnt in range(Nruns):
        print("Run count : ", cnt+1)

        # Shuffle rows randomly. train_test_split below uses a fixed
        # random_state, so this shuffle is what makes the runs differ.
        hform_data = hform_data.sample(frac=1, axis=0).reset_index(drop=True)

        # Target as an (n, 1) column, features as a plain 2-D array
        y = hform_data[['hform']].values
        X = hform_data.drop(labels=['hform'], axis=1, inplace=False).to_numpy()

        # Test/train split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
        Ntrain = np.shape(y_train)[0]  # no. of training examples

        # Scaling feature data: the transformer is fitted on the training set
        # only and then applied to both sets.
        # (Author's note: the quantile transform handles outliers better;
        # MinMaxScaler(feature_range=(-1, 1)) was the alternative considered.)
        Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X_train)
        X_train = Xscaler.transform(X_train)
        X_test = Xscaler.transform(X_test)

        # Scaling target data (the scaler expects 2-D input)
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)
        Yscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(y_train)
        y_train = Yscaler.transform(y_train)
        y_test = Yscaler.transform(y_test)

        # Create the kernel ridge regressor and perform a parallelised grid
        # search over the parameter space (n_jobs=-1 uses all available cores)
        reg = GridSearchCV(KernelRidge(), parameters, scoring='neg_mean_absolute_error', cv=N_CV, n_jobs=-1, verbose=0)
        opt = reg.fit(X_train, y_train.ravel())
        print(opt.best_params_.items())

        # Predictions of the best model, in the scaled target space
        best_model = opt.best_estimator_
        y_pred = best_model.predict(X_test)
        y_pred_train = best_model.predict(X_train)

        # Row to append to the file; the columns follow the header order.
        # The parameters are picked by key so the column order does not
        # depend on the iteration order of best_params_.
        row = [opt.best_params_['alpha'], opt.best_params_['gamma']]

        # R^2 for training set and test set
        row.append(r2_score(y_train, y_pred_train))
        row.append(r2_score(y_test, y_pred))

        # Pearson's correlation coefficient. Both target vectors are taken
        # after scaling so that they live in the same space as the
        # predictions they are correlated with.
        y_train_data = y_train[:, 0]
        y_test_data = y_test[:, 0]
        row.append(np.corrcoef(y_train_data, y_pred_train)[0, 1])
        row.append(np.corrcoef(y_test_data, y_pred)[0, 1])

        # Explained variance
        row.append(explained_variance_score(y_train, y_pred_train))
        row.append(explained_variance_score(y_test, y_pred))

        # Mean absolute error in the scaled target space
        row.append(mean_absolute_error(y_train, y_pred_train))
        row.append(mean_absolute_error(y_test, y_pred))

        # Perform the inverse scaling transformations to get back to the
        # original units of 'hform'
        y_test_inv = Yscaler.inverse_transform(y_test.reshape(-1, 1))
        y_train_inv = Yscaler.inverse_transform(y_train.reshape(-1, 1))
        y_pred_inv = Yscaler.inverse_transform(y_pred.reshape(-1, 1))
        y_pred_train_inv = Yscaler.inverse_transform(y_pred_train.reshape(-1, 1))

        # Mean absolute error in the original (unscaled) target space
        row.append(mean_absolute_error(y_train_inv, y_pred_train_inv))
        row.append(mean_absolute_error(y_test_inv, y_pred_inv))

        file1.write(','.join(map(str, row)))
        file1.write('\n')
        # Persist each finished run immediately; a full sweep is long-running
        file1.flush()
finally:
    file1.close()
