#####################################################################################
# A. Dutta 2021
# 
# This script trains the SVM classifier with the 'rbf' kernel to classify magnetic vs
# non-magnetic compounds. The dataset 'cmr-mag-classification.csv' containing 2759 
# stable 2D materials is loaded, out of which 80% is used for 'GridSearchCV' based 
# 10-fold cross-validation and the rest 20% is used as the test set.
# ===================================================================================
# The training and evaluation of the model performance is done for 100
# different splits of the original dataset and the results are appended to the
# file './stats/mag-rbfSVC-sp0.2-stat.txt'.
#####################################################################################
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.inspection import plot_partial_dependence
from sklearn.metrics import classification_report
from sklearn.preprocessing import QuantileTransformer, RobustScaler, MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Load the complete table; the first shape printed is that of the raw file.
Mag_data = pd.read_csv('../../datasets/magnetic/cmr-mag-classification.csv', sep=',')
print(Mag_data.shape)

# Build the list of columns to keep, in this exact order:
#   d_elect,
#   (mean, del, mode) of each elemental property,
#   cell_area,
#   smeig1 ... smeig12,
#   mag_state (kept in the same frame as the feature columns).
# Further descriptor names (acsf1..acsf32, entropy, L2_norm, L3_norm) used to
# appear in a commented-out part of this selection and are not selected.
elemental_properties = ('Z', 'group', 'period', 'val', 'electroneg')
statistics = ('mean', 'del', 'mode')

selected_columns = ['d_elect']
selected_columns += [
    stat + '_' + prop
    for prop in elemental_properties
    for stat in statistics
]
selected_columns.append('cell_area')
selected_columns += ['smeig' + str(idx) for idx in range(1, 13)]
selected_columns.append('mag_state')

# Restrict the frame to the chosen columns; the second shape printed is the
# reduced one.
Mag_data = Mag_data[selected_columns]
print(Mag_data.shape)

# One row per sample.
Nsamples = len(Mag_data)
print('No. of samples = ', Nsamples)

# Import function to create training and test set splits
from sklearn.model_selection import train_test_split

# Number of independent shuffle / split / train / evaluate repetitions.
Nruns = 100

# Hyper-parameter search space for the RBF-kernel SVC. Nothing here depends on
# the data split, so it is built once instead of on every run.
N_CV = 10                                      # folds used by GridSearchCV
C_vals = np.logspace(0, 9, 10, base=2)         # 2^0 ... 2^9
gamma_vals = np.logspace(-13, -4, 10, base=2)  # 2^-13 ... 2^-4
kernels = ['rbf']
param_grid = dict(kernel=kernels, C=C_vals, gamma=gamma_vals)

# Open the statistics file in append mode. The 'with' block guarantees that the
# handle is flushed and closed even if one of the runs raises.
with open("./stats/mag-rbfSVC-sp0.2-stat.txt", "a") as file1:
    # Column header; every subsequent row follows this order.
    file1.write("# C,\tgamma,\ttn_train,\tfp_train,\tfn_train,\ttp_train,\tf1_train,\ttn_test,\tfp_test,\tfn_test,\ttp_test,\tf1_test #\n\n")

    for cnt in range(Nruns):
        print("Run count : ", cnt+1)

        # Shuffle rows randomly. The shuffle is unseeded, so although the
        # train/test split below uses a fixed random_state, each run ends up
        # with a different train/test membership.
        Mag_data = Mag_data.sample(frac=1, axis=0).reset_index(drop=True)

        # Binary target: 0 where the label equals ' NM', 1 for anything else.
        # The literal (including its leading space) is compared exactly.
        labels = Mag_data['mag_state'].to_numpy()
        y = np.array([0 if label == ' NM' else 1 for label in labels])

        # Feature matrix: every selected column except the label.
        X = Mag_data.drop(columns=['mag_state']).to_numpy()

        # Test/train split (80% train, 20% test)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

        Ntrain = np.shape(y_train)[0]  # no. of training examples

        # Scale the features: the quantile transform is fitted on the training
        # set only and then applied to both sets (handles outliers better).
        # NOTE(review): the scaler is fitted before cross-validation, so the
        # CV folds share scaling statistics; a Pipeline would avoid that.
        Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X_train)
        X_train = Xscaler.transform(X_train)
        X_test = Xscaler.transform(X_test)

        # Perform parallelised grid search over parameter space (uses all
        # available cores); refit=True retrains the best model on X_train.
        svc = GridSearchCV(SVC(), param_grid=param_grid, cv=N_CV, scoring='f1', refit=True, n_jobs=-1, verbose=0)
        opt = svc.fit(X_train, y_train)

        print('Best parameters : ', opt.best_params_)
        print('Scorer f1 :\t', 'best score : ', opt.best_score_)

        # Start the output row with the selected hyper-parameters, read by key
        # so the order always matches the header (C, gamma).
        row = [opt.best_params_['C'], opt.best_params_['gamma']]

        print('Training results:\n')
        y_train_pred = opt.best_estimator_.predict(X_train)
        # labels=[0, 1] pins the matrix to 2x2, so flattening always yields
        # tn, fp, fn, tp even if a class is missing from a split.
        cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
        print(cm_train)
        row.extend(cm_train.flatten())
        # NOTE(review): the column is labelled 'f1_train' in the header, but
        # the value written is the mean cross-validated f1 of the best
        # parameters (best_score_), not f1 computed on the training set.
        row.append(opt.best_score_)

        print('\nTest results:\n')
        y_pred = opt.best_estimator_.predict(X_test)
        cm_test = confusion_matrix(y_test, y_pred, labels=[0, 1])
        print(cm_test)
        row.extend(cm_test.flatten())

        f1_test = f1_score(y_test, y_pred, pos_label=1)
        print('Precision=',precision_score(y_test, y_pred, pos_label=1))
        print('Recall=',recall_score(y_test, y_pred, pos_label=1))
        print('f1=',f1_test)
        row.append(f1_test)

        # Append this run's row. Each run is expensive, so flush right away
        # to keep finished results on disk if a later run is interrupted.
        file1.write(','.join(map(str, row)))
        file1.write('\n\n')
        file1.flush()
