#####################################################################################
# A. Dutta 2021
# 
# This script trains an SVM classifier with the 'rbf' kernel on the heat of formation
# of materials. A material is classified as unstable (class 0) if its heat of
# formation is > 0, and as stable (class 1) otherwise. The dataset
# 'cmr-combined-all-features.csv' containing 3515 2D materials is loaded, out of
# which 70% is used for 'GridSearchCV' based 10-fold cross-validation and the
# remaining 30% is used as the test set.
# ===================================================================================
# The training and evaluation of the model performance are done for 100
# different splits of the original dataset and the results are appended to the
# file './stats/hform-combined-rbfSVC-stat.txt'.
#####################################################################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.inspection import plot_partial_dependence
from sklearn.metrics import classification_report
from sklearn.preprocessing import QuantileTransformer, RobustScaler, MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Read the complete feature table from disk
hform_data = pd.read_csv('./datasets/cmr-combined-all-features.csv', sep=',')
print('Shape= ', hform_data.shape)

# Columns kept for the model: the descriptors plus the target 'hform'.
# The order below is the column order of the resulting table.
selected_columns = [
    # statistics (mean / del / mode) of elemental properties
    'mean_Z', 'del_Z', 'mode_Z',
    'mean_grp', 'del_grp', 'mode_grp',
    'mean_period', 'del_period', 'mode_period',
    'mean_valelec', 'del_valelec', 'mode_valelec',
    'mean_electroneg', 'del_electroneg', 'mode_electroneg',
    # remaining scalar descriptors and the target column
    'stoichio_entropy', 'cell_area', 'L2_norm', 'L3_norm', 'hform',
    # 'smeig' descriptors 1 to 12
    'smeig1', 'smeig2', 'smeig3', 'smeig4', 'smeig5', 'smeig6',
    'smeig7', 'smeig8', 'smeig9', 'smeig10', 'smeig11', 'smeig12',
]
hform_data = hform_data[selected_columns]
print('Shape= ', hform_data.shape)

# One row per material
Nsamples = len(hform_data.index)
print('No. of samples = ', Nsamples)

# Import function to create training and test set splits
from sklearn.model_selection import train_test_split

Nruns = 100  # number of independent shuffle / split / train / evaluate repetitions

# Hyper-parameter search space for the SVM classifier. It does not depend on the
# data split, so it is built once instead of on every run.
N_CV = 10  # number of cross-validation folds used by GridSearchCV
C_vals = np.logspace(0, 9, 10, base=2)  # 2^0 ... 2^9
gamma_vals = np.logspace(-13, -4, 10, base=2)  # 2^-13 ... 2^-4
kernels = ['rbf']
param_grid = dict(kernel=kernels, C=C_vals, gamma=gamma_vals)

# Open the statistics file in append mode; the 'with' block guarantees that it is
# closed (and its buffer written out) even if one of the runs raises.
with open("./stats/hform-combined-rbfSVC-stat.txt", "a") as file1:
	# Column header. NOTE(review): the column labelled 'f1_train' holds the mean
	# cross-validated f1 of the best parameter set (best_score_), not the f1
	# evaluated on the full training set.
	file1.write("# C,\tgamma,\ttn_train,\tfp_train,\tfn_train,\ttp_train,\tf1_train,\ttn_test,\tfp_test,\tfn_test,\ttp_test,\tf1_test #\n\n")

	for cnt in range(Nruns):
		print("Run count : ", cnt+1)

		# Shuffle rows randomly. The shuffle is unseeded, so each run produces a
		# different train/test split even though random_state is fixed below.
		hform_data = hform_data.sample(frac=1, axis=0).reset_index(drop=True)

		y = hform_data[['hform']].values
		X = hform_data.drop(labels=['hform'], axis=1, inplace=False).to_numpy()

		# Test/train split (70% / 30%)
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
		Ntrain = np.shape(y_train)[0]  # no. of training examples

		# Scale the features. The transformer is fitted on the training set only
		# and then applied to both sets.
		Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X_train)  # Handles outliers better
		X_train = Xscaler.transform(X_train)
		X_test = Xscaler.transform(X_test)

		# Digitise the target variable: hform > 0 -> class 0, otherwise class 1.
		# The target itself is not scaled; it is only thresholded.
		y_train = np.where(y_train[:, 0] > 0, 0, 1)
		y_test = np.where(y_test[:, 0] > 0, 0, 1)

		# Perform parallelised grid search over parameter space (uses all available
		# cores); the best parameter set is refitted on the whole training set.
		svc = GridSearchCV(SVC(), param_grid=param_grid, cv=N_CV, scoring='f1', refit=True, n_jobs=-1, verbose=0)
		opt = svc.fit(X_train, y_train)

		# Row to append to the file. The parameters are looked up by name so that
		# the column order never depends on the ordering of best_params_.
		row = [opt.best_params_['C'], opt.best_params_['gamma']]

		print('Best parameters : ', opt.best_params_)
		print('Scorer f1 :\t', 'best score : ', opt.best_score_)

		print('Training results:\n')
		y_train_pred = opt.best_estimator_.predict(X_train)
		# labels=[0, 1] forces a 2x2 matrix (tn, fp, fn, tp when flattened) even if
		# one class is absent, keeping the columns of the output file aligned.
		cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
		print(cm_train)
		row.extend(cm_train.flatten())
		row.append(opt.best_score_)

		print('\nTest results:\n')
		y_pred = opt.best_estimator_.predict(X_test)
		cm_test = confusion_matrix(y_test, y_pred, labels=[0, 1])
		print(cm_test)
		row.extend(cm_test.flatten())

		f1_test = f1_score(y_test, y_pred, pos_label=1)
		print('Precision=', precision_score(y_test, y_pred, pos_label=1))
		print('Recall=', recall_score(y_test, y_pred, pos_label=1))
		print('f1=', f1_test)
		row.append(f1_test)

		file1.write(','.join(map(str, row)) + '\n')
		# Flush after every run so that finished results survive an interruption
		# of this long-running job.
		file1.flush()
