#######################################
# Arijit Dutta
#
# This script contains an SVM classifier 
# with the 'rbf' kernel that classifies 
# stable, magnetic compounds into
# two classes: 'HIGH' - 1 or 'LOW' - 0 
# magnetic anisotropy energy (MAE). The cutoff 
# for LOW-HIGH classification is taken as the 
# median value of the training set MAE.
#########################################

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.inspection import plot_partial_dependence
import statistics
import pandas as pd
from sklearn.svm import SVC

# ---------------------------------------------------------------------------
# Data preparation: load the dataset, derive the log-MAE target, keep the
# selected feature columns and restrict the samples to a log(MAE) window.
# ---------------------------------------------------------------------------
MAE_data = pd.read_csv('../../datasets/anisotropy/data-long1.csv', sep=',')
print(MAE_data.shape)

# Add derived features: MAE is the absolute value of the row-wise minimum
# of the 'E_zx' and 'E_zy' columns.
MAE_data['MAE'] = MAE_data[['E_zx', 'E_zy']].min(axis=1).abs()
print(MAE_data.shape)

# Scale MAE to per metal atom
MAE_data['MAE'] = MAE_data['MAE'] / MAE_data['n_metal']
# Optional derived features that are currently disabled:
#   'mom'     = mag_mom / n_metal                (moment per metal atom)
#   'gap'     = row-wise min of uu, dd, ud, du
#   'gap_inv' = 1 / gap

# Rows with MAE == 0 produce -inf here (NumPy emits a RuntimeWarning); such
# rows, like NaN rows, are removed by the [y_min, y_max] window filter below.
MAE_data['log_MAE'] = np.log(MAE_data['MAE'])
print(MAE_data.shape)

# Keep those features with importance > 0.01 (plus the target 'log_MAE').
# Each column is listed exactly once: the earlier list named 'mean_val' and
# 'acsf45' twice, which created duplicate DataFrame columns and therefore
# counted those two features twice in the RBF kernel distance.
FEATURE_COLUMNS = [
    'log_MAE',
    'smeig12', 'mean_val', 'mean_Z', 'smeig11', 'smeig10', 'del_Z',
    'mode_group', 'del_group', 'mean_group', 'del_electroneg',
    'acsf48', 'acsf46', 'd_elect', 'acsf43', 'mode_val', 'acsf47',
    'acsf49', 'acsf45', 'acsf44', 'cell_area', 'mode_Z', 'acsf215',
    'acsf212', 'smeig9', 'acsf216', 'mean_period', 'acsf213',
    'mean_electroneg', 'mode_electroneg', 'smeig8', 'del_period', 'smeig7',
]  # optionally also: 'mom', 'gap'
MAE_data = MAE_data[FEATURE_COLUMNS]

print(MAE_data.shape)

# To inspect the target distribution:
#   sb.distplot(MAE_data['log_MAE']); plt.show()

# Select rows after setting cutoff for the target variable
y_min = -6  # -6 for MAE, -10 for MAE/area, -7.5 for MAE/#metal
y_max = 6
# Inclusive on both ends; NaN and +/-inf targets fall outside and are dropped.
MAE_data = MAE_data.loc[MAE_data['log_MAE'].between(y_min, y_max)]

Nsamples = MAE_data.shape[0]
print('log(MAE) selected between [', y_min, ':', y_max, ']')
print('No. of samples = ', Nsamples)

# ---------------------------------------------------------------------------
# Repeated train/test experiments.
#
# For each of Nruns runs: shuffle the samples, split 80/20, binarise the
# target at the median log(MAE) of the TRAINING part, quantile-scale the
# features (fitted on the training part only), grid-search an RBF SVC with
# cross-validated F1 as the selection score, and append one CSV row of
# parameters and scores to the stats file.
# ---------------------------------------------------------------------------

# Import function to create training and test set splits
from sklearn.model_selection import train_test_split
# For scaling the features and the target
from sklearn.preprocessing import QuantileTransformer, RobustScaler, MinMaxScaler

Nruns = 100

# Grid-search settings are identical for every run, so build them once.
N_CV = 10
C_vals = np.logspace(-2, 5, 10, base=2)
gamma_vals = np.logspace(-5, 2, 10, base=2)
kernels = ['rbf']
param_grid = dict(kernel=kernels, C=C_vals, gamma=gamma_vals)

# Open the stats file in append mode; the context manager guarantees it is
# closed (and its buffer written) even if a run raises.
with open("./stats/MAE-rbfSVC-sp0.2-impgt0.01-stat.txt", "a") as file1:
    # NOTE: the value written under 'f1_train' is the mean cross-validated
    # F1 of the best parameter set (GridSearchCV.best_score_), not an F1
    # computed on the full training set.
    file1.write("# C,\tgamma,\ttn_train,\tfp_train,\tfn_train,\ttp_train,\tf1_train,\ttn_test,\tfp_test,\tfn_test,\ttp_test,\tf1_test #\n\n")

    for cnt in range(Nruns):
        print("Run count : ", cnt+1)
        # Shuffle rows randomly. The split below uses a fixed random_state,
        # so this unseeded shuffle is what makes each run different.
        MAE_data = MAE_data.sample(frac=1, axis=0).reset_index(drop=True)

        av_area = np.mean(MAE_data['cell_area'].values)
        print('Average cell area ',av_area)

        MAE_log = MAE_data['log_MAE'].to_numpy()
        X = MAE_data.drop(labels=['log_MAE'], axis=1).to_numpy()

        # Split features and continuous targets together, once. The class
        # labels are derived afterwards from the split targets, so the
        # threshold is computed from training samples only and the label
        # arrays cannot get out of step with the feature arrays.
        X_train, X_test, log_train, log_test = train_test_split(
            X, MAE_log, test_size=0.2, random_state=1)

        # LOW/HIGH cutoff: median log(MAE) of the training set.
        med = float(np.median(log_train))
        print('Median log MAE = ',med)

        # 0 = 'LOW' (log MAE <= median), 1 = 'HIGH' (log MAE > median)
        y_train = (log_train > med).astype(int)
        y_test = (log_test > med).astype(int)

        print('Shape ',X_train.shape)

        Ntrain = y_train.shape[0]  # no. of training examples

        # Scale feature data; the transformer is fitted on the training
        # features only and then applied to both parts.
        Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X_train)  # Handles outliers better
        # Alternative: MinMaxScaler(feature_range=(-1,1)).fit(X_train)
        X_train = Xscaler.transform(X_train)
        X_test = Xscaler.transform(X_test)

        #######################

        # Create SVM classifier.
        # Perform parallelised grid search over parameter space (uses all available cores)
        svc = GridSearchCV(SVC(), param_grid=param_grid, cv=N_CV, scoring='f1', refit=True, n_jobs=-1, verbose=0)
        opt = svc.fit(X_train, y_train)
        best_model = opt.best_estimator_

        # Row layout follows the header: C, gamma, train confusion matrix,
        # CV F1, test confusion matrix, test F1.
        row = [opt.best_params_['C'], opt.best_params_['gamma']]

        print('Best parameters : ', opt.best_params_)
        print('Scorer f1 :\t', 'best score : ', opt.best_score_)

        print('Training results:\n')
        y_train_pred = best_model.predict(X_train)
        # labels=[0, 1] pins the matrix to 2x2 (tn, fp, fn, tp when
        # flattened) even if one class is absent from a set.
        cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
        print(cm_train)
        row.extend(cm_train.flatten())
        row.append(opt.best_score_)

        print('\nTest results:\n')
        y_pred = best_model.predict(X_test)
        cm_test = confusion_matrix(y_test, y_pred, labels=[0, 1])
        print(cm_test)
        row.extend(cm_test.flatten())

        f1_test = f1_score(y_test, y_pred, pos_label=1)
        print('Precision=',precision_score(y_test, y_pred, pos_label=1))
        print('Recall=',recall_score(y_test, y_pred, pos_label=1))
        print('f1=',f1_test)
        print('\n\n')
        row.append(f1_test)

        file1.write(','.join(map(str, row)))
        file1.write('\n')
        # Each run is expensive; flush so completed runs survive an
        # interruption of the script.
        file1.flush()
