#############################################
# P. Sen 2021
#
# This script first trains a RF classifier to classify materials into stable and unstable
# classes based on their heats of formation values. It is then used to classify
# the set of 278 combined 3d5d TM compounds as stable or unstable
#
###########################################
import numpy as np
import pandas as pd

# Load the complete feature table of the compounds used to train the classifier
hform_data = pd.read_csv('./cmr-combined-all-features.csv', sep=',')
print('Shape= ', hform_data.shape)

# Columns kept for the model, in this order:
#   (mean, del, mode) of each elemental property,
#   'stoichio_entropy', 'cell_area', 'L2_norm', 'L3_norm',
#   'hform' (used as the classification target further down),
#   'smeig1' ... 'smeig12'
element_props = ('Z', 'grp', 'period', 'valelec', 'electroneg')
prop_stats = ('mean', 'del', 'mode')
selected_columns = [stat + '_' + prop for prop in element_props for stat in prop_stats]
selected_columns += ['stoichio_entropy', 'cell_area', 'L2_norm', 'L3_norm', 'hform']
selected_columns += ['smeig' + str(k) for k in range(1, 13)]
hform_data = hform_data[selected_columns]

print('Shape= ', hform_data.shape)

Nsamples, _ = hform_data.shape
print('No. of samples = ', Nsamples)

# Optional statistics log (currently disabled): open a file to append
# parameter values and scores
#file1 = open("./stats/classify-RF-stat.txt", "a")
#file1.write("# ccp_alpha,\tmax_depth,\tmax_samples,\tmin_samples_leaf,\tmin_samples_split,\tn_estimators,\ttn_train,\tfp_train,\tfn_train,\ttp_train,\tf1_train,\ttn_test,\tfp_test,\tfn_test,\ttp_test,\tf1_test #\n\n")

# Empty frame meant to collect per-run feature importances
# (the code that fills it is commented out further down)
imp = pd.DataFrame()

# Number of shuffle + grid-search repetitions performed by the loop below
Nruns = 2
# Import function to create training and test set splits
from sklearn.model_selection import train_test_split
# Feature scalers (only the features are scaled; the target is digitised instead)
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Load the compounds to be classified once, up front: this input is the same
# for every run, so re-reading the CSV inside the loop would be wasted I/O.
new_hform_data = pd.read_csv('./3d5d-data-mag.csv', sep=',')
ids = new_hform_data[['id']].values

# Use exactly the training feature columns (everything except the target) so
# that the training and prediction matrices always share the same layout.
feature_columns = [col for col in hform_data.columns if col != 'hform']
new_hform_data = new_hform_data[feature_columns]
X_new = new_hform_data.to_numpy()

for cnt in range(Nruns):
    print()
    print("Run count : ", cnt+1)

    # Shuffle rows randomly
    hform_data = hform_data.sample(frac=1, axis=0).reset_index(drop=True)

    X = hform_data[feature_columns].to_numpy()

    # No train/test split is made: every sample is used for fitting and model
    # selection relies on the cross-validation inside GridSearchCV.
    Ntrain = X.shape[0]  # no. of training examples

    # Scale the feature data (the fitted scaler is reused for the new compounds)
    Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X)  # Handles outliers better
    X = Xscaler.transform(X)

    # Digitise the target variable: 0 where hform > -0.5, otherwise 1.
    # Label 1 is the class reported as "stable" at the end of the run.
    y = np.where(hform_data['hform'].to_numpy() > -0.5, 0, 1)

    # Create the random forest classifier and its hyper-parameter grid
    N_CV = 5
    estimators = [100, 125, 150, 175, 200, 225]
    depth = [15, 16, 17, 18, 19, 20, 21, 22]
    min_split = np.linspace(2,5,4, dtype=int, endpoint=True)
    #min_leaf = np.linspace(1,10,9, dtype=int, endpoint=True) # 'min_samples_leaf':min_leaf
    min_leaf = [1]
    max_sample = np.linspace(0.6,0.6,1, endpoint=True)  # 'max_samples':max_sample
    max_features = 'sqrt'
    #max_leaf = np.linspace(50,600,10, dtype=int, endpoint=True) # max_leaf_nodes
    #alphas = np.linspace(0.0,0.3,4, endpoint=True)
    alphas = [0.0]

    parameters = {'n_estimators':estimators, 'max_depth':depth, 'max_samples': max_sample,
                  'min_samples_leaf':min_leaf, 'min_samples_split':min_split, 'ccp_alpha':alphas}

    rf = GridSearchCV(RandomForestClassifier(criterion='gini', max_features=max_features),
                      parameters, n_jobs=-1, cv=N_CV, scoring='f1', verbose=0)

    opt = rf.fit(X, y)

    # Row of statistics for this run: best parameter values first
    row = list(opt.best_params_.values())

    print('Best parameters : ', opt.best_params_)
    # NOTE: 'gini' is the split criterion; with scoring='f1' the value printed
    # here is the mean cross-validated F1 of the best parameter set.
    print('Scorer gini :\t', 'best score : ', opt.best_score_)

    # Metrics below are computed on the same data the model was fitted on
    # (training results), so they are optimistic compared with best_score_.
    y_pred = opt.best_estimator_.predict(X)
    cm = confusion_matrix(y, y_pred)
    print(cm)
    print('Precision=',precision_score(y, y_pred, pos_label=1))
    print('Recall=',recall_score(y, y_pred, pos_label=1))
    print('f1=',f1_score(y, y_pred, pos_label=1))
    row.extend(cm.flatten())
    row.append(opt.best_score_)

    # Disabled: append this run's row to the statistics file
    #file1.write(','.join(map(str, row)))
    #file1.write('\n')

    # Disabled: append the feature rankings to a dataframe
    #imp = imp.append([opt.best_estimator_.feature_importances_])

    # Classify the new compounds with the best model of this run
    print(new_hform_data.shape)

    print('features selected')

    X_test = Xscaler.transform(X_new)
    y_test = opt.best_estimator_.predict(X_test)

    # Report every compound predicted to belong to class 1
    for i, (compound_id, label) in enumerate(zip(ids[:, 0], y_test)):
        if label == 1:
            print(i, compound_id, label, ' is stable')

# Disabled: label and write the feature importance table to a file
#imp.columns = hform_data.drop('hform', axis=1, inplace=False).columns
#imp.to_csv(r'./stats/classify-RF-imp.csv')
