#############################################
# P. Sen 2021
#
# This script first trains a RF classifier to classify materials into 
# high and low magnetic anisotropy sets. This model is then used to classify the 10
# stable high magnetic moment (> 2.5 \mu_B/TM) materials into high and low
# anisotropy classes.
#
###############################################

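# NOTE(review): the fragment "into magnetic and non-magnetic classes" that stood here
# looks like a leftover; this script classifies into HIGH and LOW anisotropy classes.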
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.inspection import plot_partial_dependence
import statistics

import pandas as pd
from math import sqrt

# Load the training table from disk and report its size.
MAE_data = pd.read_csv('./data-long2.csv', sep=',')
print(MAE_data.shape)

# --- Derived columns ---------------------------------------------------
# Both the anisotropy energy and the moment are normalised by n_metal.
per_metal = MAE_data['n_metal']

# MAE: magnitude of the smaller of E_zx and E_zy, per metal atom.
MAE_data['MAE'] = MAE_data[['E_zx', 'E_zy']].min(axis=1).abs() / per_metal
# mom: magnetic moment per metal atom.
MAE_data['mom'] = MAE_data['mag_mom'] / per_metal

# Natural log of the per-metal MAE; this is the quantity thresholded later.
MAE_data['log_MAE'] = np.log(MAE_data['MAE'])

# gap: row-wise minimum over the uu, dd, ud, du columns; gap_inv: its reciprocal.
MAE_data['gap'] = MAE_data[['uu', 'dd', 'ud', 'du']].min(axis=1)
MAE_data['gap_inv'] = 1.0 / MAE_data['gap']
print(MAE_data.shape)

# Select desired features
#MAE_data = MAE_data[['gap', 'mom', 'd_elect', 'mean_Z','del_Z','mode_Z','mean_group','del_group','mode_group',\
#'mean_period','del_period','mode_period','mean_val','del_val','mode_val','mean_electroneg','del_electroneg',\
#'mode_electroneg', 'entropy', 'cell_area', 'L2_norm', 'L3_norm', 'log_MAE',\
#'acsf1','acsf2','acsf3','acsf4','acsf5','acsf6','acsf7','acsf8','acsf9','acsf10','acsf11','acsf12','acsf13','acsf14','acsf15','acsf16',\
#'acsf17','acsf18','acsf19','acsf20','acsf21','acsf22','acsf23','acsf24','acsf25','acsf26','acsf27','acsf28','acsf29','acsf30','acsf31','acsf32',\
#'smeig1','smeig2','smeig3','smeig4','smeig5','smeig6','smeig7','smeig8','smeig9','smeig10','smeig11',\
#]]

# Columns removed before training: the row id, the raw quantities that the
# derived columns were built from, the un-logged MAE and gap_inv.
# ('gap', 'mom' and 'log_MAE' are kept.)
non_feature_cols = [
    'id', 'uu', 'dd', 'ud', 'du', 'MAE', 'gap_inv', 'mag_mom', 'E_zx', 'E_zy', 'n_metal',
]
# Columns named after chemical element symbols, H through At.
element_cols = [
    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar',
    'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
    'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr',
    'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd',
    'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
    'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy',
    'Ho', 'Er', 'Tm', 'Yb', 'Lu',
    'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At',
]
MAE_data.drop(columns=non_feature_cols + element_cols, inplace=True)
print(MAE_data.shape)

#sb.distplot(MAE_data['log_MAE'])
#plt.show()

# Keep only the rows whose target lies in the closed window [y_min, y_max].
y_min = -6 # -6 for MAE, -10 for MAE/area, -7.5 for MAE/#metal
y_max = 6
in_window = MAE_data['log_MAE'].between(y_min, y_max)  # inclusive on both ends
MAE_data = MAE_data.loc[in_window]

Nsamples = len(MAE_data)
print('log(MAE) selected between [', y_min, ':', y_max, ']')
print('No. of samples = ', Nsamples)

# Random permutation of the rows (no seed is set), then renumber the index from 0.
MAE_data = MAE_data.sample(frac=1, axis=0).reset_index(drop=True)

# Import function to create training and test set splits
from sklearn.model_selection import train_test_split


# Pull the continuous target out as an (N, 1) array; it is only used to
# derive the two class labels below. Then remove it from the frame so that
# MAE_data holds nothing but feature columns.
MAE_log = MAE_data[['log_MAE']].values
MAE_data.drop(labels=['log_MAE'], axis=1, inplace=True)

# Every remaining column is a feature and every row is used for training;
# the hold-out split below is disabled.
X_train = MAE_data.to_numpy()
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

if MAE_log.size == 0:
    raise ValueError('No samples left after applying the log(MAE) cutoff')

# Median of the target as a plain float. np.median works on the numeric
# array directly, whereas statistics.median would sort the one-element rows
# and hand back a one-element array.
med = float(np.median(MAE_log))
print('Median log MAE = ',med)

# Binary labels: at or below the median -> 'LOW', above -> 'HIGH'.
y_train = np.where(MAE_log[:, 0] <= med, 'LOW', 'HIGH').tolist()
#print(y)


#sb.distplot(y)
#plt.show()

#MAE_data.drop(labels=['log_MAE'], axis=1, inplace=True)
#X = MAE_data.to_numpy()
# (Disabled) Test/train split for the classification model. If it is re-enabled,
# use the same random_state as the earlier split so that both splits are identical;
# otherwise this would lead to data leakage.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

Ntrain = len(y_train) # no. of training examples

# Scale the features (the class labels are left untouched)
from sklearn.preprocessing import QuantileTransformer, RobustScaler, MinMaxScaler
# Quantile-map every feature onto a normal distribution, using one quantile
# per training example. Original author's note: handles outliers better.
Xscaler = QuantileTransformer(output_distribution='normal', n_quantiles=Ntrain)
Xscaler.fit(X_train)
#Xscaler = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
# Replace the raw feature matrix with its scaled version.
X_train = Xscaler.transform(X_train)

#######################

#Create random forest classifier
# Hyper-parameter grid for the random forest.
estimators = np.arange(100, 226, 25)  # 100, 125, ..., 225
depth = np.arange(15, 26)             # 15, 16, ..., 25
min_split = np.arange(5, 16)          # 5, 6, ..., 15
# Single-valued entries: kept in the grid but not actually searched.
#min_leaf = np.linspace(1,10,9, dtype=int, endpoint=True) # 'min_samples_leaf':min_leaf
min_leaf = [1]
#max_sample = np.linspace(0.6,0.6,1, endpoint=True)  # 'max_samples':max_sample
max_sample = [0.6]
#alphas = np.linspace(0.0,0.3,4, endpoint=True)
alphas = [0]
# Fixed (not searched): features considered at each split.
max_features = 'sqrt'
#max_leaf = np.linspace(50,600,10, dtype=int, endpoint=True) # max_leaf_nodes

parameters = dict(
    n_estimators=estimators,
    max_depth=depth,
    max_samples=max_sample,
    min_samples_leaf=min_leaf,
    min_samples_split=min_split,
    ccp_alpha=alphas,
)
# Exhaustive 5-fold cross-validated search over the grid, using all cores.
rf = GridSearchCV(
    RandomForestClassifier(criterion='gini', max_features=max_features),
    parameters,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

#krr.fit(X_train,y_train)
# Run the grid search; fit() returns the fitted search object itself.
opt = rf.fit(X_train,y_train)

print('Best parameters : ', opt.best_params_)
# best_score_ is the mean cross-validated score of the best candidate. No
# `scoring` was passed to GridSearchCV, so the classifier's default score
# (accuracy) is what is reported; 'gini' is only the split criterion.
print('Scorer accuracy :\t', 'best score : ', opt.best_score_)
y_pred_train = opt.best_estimator_.predict(X_train)

# Perform the inverse scaling transformations
#X_test_inv = Xscaler.inverse_transform(X_test)

# NOTE: the metrics below are computed on the same rows the model was fitted
# on, so they describe the fit to the training data, not generalisation.
class_labels = ['HIGH', 'LOW']  # row/column order of the confusion matrix
print()
print(confusion_matrix(y_train, y_pred_train, labels=class_labels))
print('Precision=',precision_score(y_train, y_pred_train, pos_label='HIGH'))
print('Recall=',recall_score(y_train, y_pred_train, pos_label='HIGH'))
print('f1=',f1_score(y_train, y_pred_train, pos_label='HIGH'))
print()

# Now read the data for new compounds
new_MAE_data = pd.read_csv('./3d5d-data-MAE.csv',sep=',')
print('Shape= ',new_MAE_data.shape)

# Derived features, built with the same formulas as for the training table:
# moment per metal atom, and the row-wise minimum of uu, dd, ud, du.
new_MAE_data['mom'] = new_MAE_data['mag_mom']/new_MAE_data['n_metal']
new_MAE_data['gap'] = new_MAE_data[['uu','dd','ud','du']].min(axis=1)

# Keep the ids for the report, then drop the id and the raw columns.
ids = new_MAE_data[['id']].values
new_MAE_data.drop(labels=['id','uu','dd','ud','du','mag_mom','n_metal'], axis=1, inplace=True)
print(new_MAE_data.shape)

# The scaler and the forest were fitted on MAE_data's columns in their
# current order (X_train was taken from MAE_data after 'log_MAE' was
# dropped). Align the new table to that order by name so that a different
# column order in the CSV cannot silently feed features into the wrong slots.
train_features = list(MAE_data.columns)
missing = [col for col in train_features if col not in new_MAE_data.columns]
if missing:
    # Names do not match: keep the positional behaviour, but say so loudly.
    print('WARNING: new data lacks training feature columns', missing,
          '- falling back to positional column order')
else:
    new_MAE_data = new_MAE_data[train_features]

print('features selected')

# Apply the scaler fitted on the training features, then classify.
X_test = new_MAE_data.to_numpy()
X_test = Xscaler.transform(X_test)

y_pred = opt.best_estimator_.predict(X_test)

# One line per compound: running index, id, predicted class.
for i, (compound_id, label) in enumerate(zip(ids[:, 0], y_pred)):
    print(i, compound_id, label)
