#############################################
# P. Sen 2021
#
# This script first trains a RF regressor for magnetic moment per metal atom.
# This model is then used to predict moment per TM atom
# of the set of 278 combined 3d5d TM compounds.
#
##############################################
#import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
#rc('text', usetex=True) # If latex causes problems, comment this line 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

# Print NumPy float arrays in full (never summarised), each entry in a
# 9-character field with 5 decimals.
np.set_printoptions(
    threshold=np.inf,
    formatter={'float': "{0:9.5f}".format},
)

# Training table, read from a comma-separated file next to the script.
mag_data = pd.read_csv('./data-long2.csv', sep=',')

# Derived target 'mom': absolute value of mag_mom divided by n_metal.
moment_per_metal = mag_data['mag_mom'] / mag_data['n_metal']
mag_data['mom'] = moment_per_metal.abs()
print('Shape= ', mag_data.shape)

# Plot the hform distributions before and after scaling
#import seaborn as sb
#fig, ax = plt.subplots(2,2,figsize=(6,6))
#sb.distplot(hform_data['hform'])
#sb.distplot(MAE_data['log_MAE'], ax=ax[0,1])
#plt.xlabel('Heat of formation per atom', fontsize=18)
#ax[0,1].set_xlabel('$log(MAE)$', fontsize=18)
#ax[0,0].tick_params(direction='in', labelsize=18)
#ax[0,1].tick_params(direction='in', labelsize=18)

# Non-element columns that are excluded from the feature matrix.
non_feature_columns = [
    'id', 'uu', 'dd', 'ud', 'du', 'mag_mom', 'cell_area', 'E_zx', 'E_zy', 'n_metal',
]

# Columns named after chemical element symbols (H through At), one list per
# row of the original label list.
element_columns = [
    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar',
    'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
    'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr',
    'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd',
    'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
    'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy',
    'Ho', 'Er', 'Tm', 'Yb', 'Lu',
    'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi',
    'Po', 'At',
]

# Remove all of them in place; what is left is the features plus 'mom'.
mag_data.drop(columns=non_feature_columns + element_columns, inplace=True)

print('Shape= ', mag_data.shape)
# Select rows after setting cutoff for the target variable 
#y_min = 0.0
#y_max = -0.25
#MAE_data = MAE_data.loc[MAE_data['mom'] >= y_min]

# Number of rows (samples) in the table at this point.
Nsamples = len(mag_data)
#print('log(MAE) selected between [', y_min, ':', y_max, ']')
print('No. of samples = ', Nsamples)


# Import function to create training and test set splits
from sklearn.model_selection import train_test_split

# Target: the 'mom' column as a 2-D (n_samples, 1) array.
y_train = mag_data[['mom']].to_numpy()

# Remove the target in place so that only feature columns remain, then
# take the remaining table as the feature matrix.
del mag_data['mom']
X_train = mag_data.to_numpy()
print('Shape= ', mag_data.shape)
print()


# Number of training examples.
Ntrain = y_train.shape[0]

# Scale the features and the target
from sklearn.preprocessing import QuantileTransformer, RobustScaler, MinMaxScaler
# Scaling feature data
# Quantile-transform every feature to a normal output distribution, using
# one quantile per training sample (original note: handles outliers better).
Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal')
Xscaler.fit(X_train)
#Xscaler = MinMaxScaler(feature_range=(-1,1)).fit(X_train)

# Replace the raw feature matrix by its scaled version; Xscaler is kept so
# that other data can be mapped with the same fitted transform.
X_train = Xscaler.transform(X_train)

# Scaling target data
#y_train = y_train.reshape(-1,1)
#y_test = y_test.reshape(-1,1)

#Yscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(y_train)
#Yscaler = MinMaxScaler(feature_range=(-1, 1)).fit(y_train)
#y_train = Yscaler.transform(y_train)
#y_test = Yscaler.transform(y_test)

# Plot scaled target distributions
#fig, ax = plt.subplots(1,2,figsize=(6,6))
#sb.distplot(y_train, ax=ax[0])
#sb.distplot(y_test, ax=ax[1])
#ax[0].set_xlabel('$scaled\,y_{train}$', fontsize=18)
#ax[1].set_xlabel('$scaled\, y_{test}$', fontsize=18)
#ax[0].tick_params(direction='in', labelsize=18)
#ax[1].tick_params(direction='in', labelsize=18)
#fig.tight_layout()
#plt.savefig('target_dist.png')
#plt.show()

from sklearn.model_selection import GridSearchCV

# Number of cross-validation folds for the (currently disabled) grid search.
N_CV = 5

# Hyperparameter grid. Every axis is pinned to a single value; the
# commented-out linspace calls are the ranges that can be scanned instead.
#estimators = np.linspace(90,110,21, dtype=int, endpoint=True)
estimators = [125]
depth = [19]
min_split = [2]
#min_leaf = np.linspace(1,10,9, dtype=int, endpoint=True) # 'min_samples_leaf':min_leaf
min_leaf = [1]
max_sample = np.linspace(0.6, 0.6, 1, endpoint=True)  # 'max_samples':max_sample
max_features = 'sqrt'
#max_leaf = np.linspace(50,600,10, dtype=int, endpoint=True) # max_leaf_nodes
#alphas = np.linspace(0.0,0.3,4, endpoint=True)
alphas = [0.0]

parameters = {
    'n_estimators': estimators,
    'max_depth': depth,
    'max_samples': max_sample,
    'min_samples_leaf': min_leaf,
    'min_samples_split': min_split,
    'ccp_alpha': alphas,
}
#parameters = {'max_depth':depth, 'max_samples':max_sample}

# The mean-absolute-error split criterion was called 'mae' before
# scikit-learn 1.0; it was renamed to 'absolute_error' in 1.0 and the old
# spelling was removed in 1.2. Pick the name the installed release accepts.
from sklearn import __version__ as sklearn_version
try:
    sklearn_major = int(sklearn_version.split('.')[0])
except ValueError:
    # Unparseable version string: assume a current release.
    sklearn_major = 1
mae_criterion = 'absolute_error' if sklearn_major >= 1 else 'mae'

#rf = GridSearchCV(RandomForestRegressor(criterion=mae_criterion,max_features='sqrt'), parameters, n_jobs=-1, cv=N_CV, verbose=0)

# Single forest built from the pinned grid values above, so the grid and the
# estimator cannot drift apart. random_state fixes the bootstrap/feature
# sampling for reproducibility.
rf = RandomForestRegressor(
    n_estimators=estimators[0],
    criterion=mae_criterion,
    max_depth=depth[0],
    min_samples_split=min_split[0],
    min_samples_leaf=min_leaf[0],
    max_features=max_features,
    n_jobs=-1,
    verbose=0,
    ccp_alpha=alphas[0],
    max_samples=float(max_sample[0]),  # plain float rather than np.float64
    random_state=1,
)

#rf.fit(X_train,y_train)
# Fit the forest on the scaled features. The (n_samples, 1) target is
# flattened once and reused for the fit and for every metric below.
y_train_data = np.ravel(y_train)
opt = rf.fit(X_train, y_train_data)
#print('Best parameters : ', opt.get_params)

#y_pred = opt.best_estimator_.predict(X_test)
y_pred_train = opt.predict(X_train)

# Mean absolute error on the training set. (The original printed the bound
# method `opt.score` here instead of a number.)
print('Scorer MAE :\t', 'training score : ',
      mean_absolute_error(y_train_data, y_pred_train))

# R^2 for training set
print('R2 score for training y: %.4f'
      % r2_score(y_train_data, y_pred_train))

# Pearson's correlation coefficient: np.corrcoef returns the 2x2 correlation
# matrix, whose off-diagonal entry is the coefficient between the two inputs.
r = np.corrcoef(y_train_data, y_pred_train)
print('Test Pearson correlation on train data', r[0, 1])


# Explained variance
print('Explained variance for training set %.4f'
      % explained_variance_score(y_train_data, y_pred_train))
#print('Explained variance for test set %.4f'  % explained_variance_score(y_test,y_pred))
print()

# The mean squared error
#print('RMSE for train y: %.4f'
#        % np.sqrt(mean_squared_error(y_train, y_pred_train)))
#print('RMSE for test y: %.4f '
#        % np.sqrt(mean_squared_error(y_test, y_pred)))
#print()

#Mean absolute error
#print('MAE for test y: %.4f' % mean_absolute_error(y_test, y_pred))
print()

# Perform the inverse scaling transformations
#X_test_inv = Xscaler.inverse_transform(X_test)
#y_test_inv = Yscaler.inverse_transform(y_test.reshape(-1,1))
#y_train_inv = Yscaler.inverse_transform(y_train.reshape(-1,1))
#y_pred_inv = Yscaler.inverse_transform(y_pred.reshape(-1,1))
#y_pred_train_inv = Yscaler.inverse_transform(y_pred_train.reshape(-1,1))

#Mean absolute error
#print('MAE for train hform: %.4f' % mean_absolute_error(y_train_inv, y_pred_train_inv))
#print('MAE for test hform: %.4f' % mean_absolute_error(y_test_inv, y_pred_inv))
#print()


# Print the feature ranking

#importance = opt.best_estimator_.feature_importances_
#print("Feature ranking:")

#for x in range(len(importance)):
#	print(x,'\t',importance[x])

#ax.bar([x for x in range(len(importance))], importance)
#plt.show()

#for f in range(X.shape[1]):
#    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Now read the data for new compounds
# Load the table of candidate compounds to be screened.
new_data_mag = pd.read_csv('./3d5d-data-mag.csv', sep=',')
print('Shape= ', new_data_mag.shape)

# Remove the columns that are not model inputs.
new_data_mag.drop(labels=['id', 'cell_area', 'n_metal'], axis=1, inplace=True)
print('Shape= ', new_data_mag.shape)

print('features selected')

# NOTE(review): this assumes the remaining columns match the training
# features in number AND order; nothing here checks the column names against
# the training table -- confirm against the CSV headers.
X_test = new_data_mag.to_numpy()
# Apply the scaler fitted on the training features.
X_test = Xscaler.transform(X_test)

y_pred = opt.predict(X_test)

# Row indices whose predicted moment per metal atom reaches the cutoff.
high_moment_cutoff = 2.5
index_highmag = np.flatnonzero(y_pred >= high_moment_cutoff).tolist()
cnt_high = len(index_highmag)

# Format explicitly: np.printoptions(precision=...) only affects arrays, so
# the original context manager did not shorten the printed scalars.
for i in index_highmag:
    print(i, '%.5f' % y_pred[i])
print()
print('Number of predicted compounds with magnetization >= %.1f/atom'
      % high_moment_cutoff, cnt_high)

