
# coding: utf-8

# # This Jupyter notebook makes it possible to reproduce the final model of the current publication
# # The model building is based on Keras/Tensorflow
# # The model performance is assessed via scikit-learn functions
# # RDKit is used to compute molecular fingerprints
# # The module versions used at the time of the study can be found in the accompanying .yml file

# In[1]:


import sys, os, math
#from pandas import *
import keras as keras
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.metrics import Metric
import keras.backend as K
from sklearn import metrics
from sklearn import preprocessing

from keras.models import load_model

from scipy.spatial import distance
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdmolops import RDKFingerprint
from rdkit import DataStructs
from keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.metrics import recall_score, classification_report
from keras.models import Sequential
from keras.optimizers import rmsprop
from keras.layers import Dense, Dropout
from keras.callbacks import ReduceLROnPlateau
from keras.layers import Flatten, Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, accuracy_score, roc_auc_score, cohen_kappa_score, make_scorer, f1_score, precision_score, recall_score, confusion_matrix
#from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


# # Functions used to compute molecular and reaction fingerprints from an input reaction (RXN) SMILES

# In[2]:


def reaction_smiles_to_reaction_fp_product_oeselma(r_smiles, FeaturesMethod, Rfp_flag, Numbits, Numbits_product=0):
    """
    Build the feature vector of one reaction from its reaction SMILES.

    The reaction SMILES is split into reactant and product SMILES with
    ``smi_from_rxn``, each SMILES is parsed into an RDKit molecule, and the
    actual fingerprint assembly is delegated to
    ``mols_to_reaction_fp_product_oeselma``.

    :param r_smiles: the SMILES of the reaction
    :type r_smiles: str
    :param FeaturesMethod: forwarded unchanged to the fingerprint builder
    :param Rfp_flag: forwarded unchanged to the fingerprint builder
    :param Numbits: fingerprint length, forwarded to the fingerprint builder
    :type Numbits: int
    :param Numbits_product: forwarded unchanged to the fingerprint builder
    :type Numbits_product: int, optional
    :return: the fingerprint returned by the builder
    :rtype: numpy.ndarray
    """
    lhs_smiles, rhs_smiles = smi_from_rxn(r_smiles)
    reactant_mols = list(map(Chem.MolFromSmiles, lhs_smiles))
    product_mol = Chem.MolFromSmiles(rhs_smiles)
    return mols_to_reaction_fp_product_oeselma(
        lhs_smiles, reactant_mols, product_mol, FeaturesMethod, Rfp_flag, Numbits, Numbits_product
    )


def mols_to_reaction_fp_product_oeselma(reactants_smiles, reactants, product, FeaturesMethod, Rfp_flag, Numbits, Numbits_product=0):
    """Assemble the reaction feature vector from the reactants and the product.

    The returned vector is the concatenation, in this order, of:

    1. the difference fingerprint (product 'fp_mix' fingerprint minus the
       'fp_mix' fingerprint of every reactant),
    2. the product 'fp_mix' fingerprint,
    3. the 'oeselma' descriptors of the first reactant,
    4. the 'oeselma' descriptors of the second reactant.

    :param reactants_smiles: SMILES of the reactants; the first two entries are
        used as lookup keys for the 'oeselma' descriptors
    :type reactants_smiles: list of str
    :param reactants: the reactant molecules
    :type reactants: list of rdkit.Chem.rdchem.Mol
    :param product: the product molecule
    :type product: rdkit.Chem.rdchem.Mol
    :param FeaturesMethod: not used by this function (kept for the call signature)
    :param Rfp_flag: not used by this function (kept for the call signature)
    :param Numbits: total length requested for each 'fp_mix' fingerprint
    :type Numbits: int
    :param Numbits_product: not used by this function (kept for the call signature)
    :type Numbits_product: int, optional
    :return: the constructed fingerprint
    :rtype: numpy.ndarray
    :raises IndexError: if fewer than two reactant SMILES are given
    :raises ValueError: if a reactant has no 'oeselma' descriptor entry
    """
    # The product fingerprint is needed twice (difference + stand-alone part);
    # compute it once. The subtraction below allocates new arrays, so
    # ``product_fp`` itself is never modified.
    product_fp = mol_to_fp_method(product, Numbits, 'fp_mix')

    rfp = product_fp
    for reactant in reactants:
        rfp = rfp - mol_to_fp_method(reactant, Numbits, 'fp_mix')

    descriptor_blocks = []
    for reactant_smiles in (reactants_smiles[0], reactants_smiles[1]):
        descriptors = mol_to_fp_method(reactant_smiles, Numbits, 'oeselma')
        if descriptors is None:
            # Without this check np.concatenate fails with an unrelated
            # "zero-dimensional arrays cannot be concatenated" ValueError.
            raise ValueError(
                "No 'oeselma' descriptors found for reactant SMILES %r" % (reactant_smiles,)
            )
        descriptor_blocks.append(descriptors)

    return np.concatenate([rfp, product_fp] + descriptor_blocks)


def smi_from_rxn(r_smiles):
    """Split a reaction SMILES into its reactant SMILES and its product SMILES.

    Everything before the first ">" is taken as the reactant side and split on
    "."; everything after the last ">" is returned as a single product string
    (only one product is assumed, so it is not split on ".").

        :parameter r_smiles: Reaction SMILES string
        :returns: tuple ``(reactants, product)`` with ``reactants`` a list of
                  SMILES strings and ``product`` a SMILES string
    """
    reactant_side = r_smiles.partition(">")[0]
    product_side = r_smiles.rpartition(">")[2]
    return reactant_side.split("."), product_side


def mol_to_fp_method(mol, Numbits, FeaturesMethod, oeselma_path=None):
    """Compute the features of one molecule with the requested method.

        :parameter mol: for 'oeselma' the lookup key (compared with the first
                        whitespace-separated field of each line of the
                        descriptor file); otherwise an RDKit molecule
        :parameter Numbits: fingerprint length; for 'fp_mix' each of the two
                        concatenated fingerprints gets int(Numbits/2) bits
        :parameter FeaturesMethod: 'oeselma' (descriptor-file lookup), 'fp_mix'
                        (Morgan + RDKit fingerprints concatenated) or anything
                        else (Morgan fingerprint only)
        :parameter oeselma_path: path of the descriptor file; when None the
                        module-level ``oeselma_file`` variable is used
        :returns: for 'oeselma', a numpy array holding the remaining fields of
                  the first matching line as strings, or None when no line
                  matches; otherwise the fingerprint as a numpy array
    """
    if FeaturesMethod == 'oeselma':
        path = oeselma_file if oeselma_path is None else oeselma_path
        # The context manager guarantees the file is closed, including on the
        # early return below.
        with open(path, 'r') as handle:
            for line in handle:
                fields = line.split()
                # Blank lines give an empty list and are skipped.
                if fields and fields[0] == mol:
                    return np.array(fields[1:])
        # No entry for this key.
        return None

    if FeaturesMethod == "fp_mix":
        Numbits = int(Numbits / 2)

    # FP1 is Morgan
    fp1 = AllChem.GetHashedMorganFingerprint(mol, 3, nBits=Numbits)
    array1 = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp1, array1)  # array1 is updated with the FP content

    if FeaturesMethod != "fp_mix":
        return array1

    # FP2 is RDKit (~Daylight); RDK is only bit based
    fp2 = RDKFingerprint(mol, minPath=1, maxPath=7, fpSize=Numbits, useHs=True, branchedPaths=True, useBondOrder=True)
    array2 = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp2, array2)  # array2 is updated with the FP bits
    return np.concatenate([array1, array2])



        


# # Functions related to the model settings and the calculation of performance metrics

# In[3]:


################################################################################
# Model setting and optimization process routines 
################################################################################
def classification_process(row_value):
    """Turn a conversion value into a binary label.

    A conversion of 10 is used as the threshold that distinguishes a
    successful reaction (label 1, value >= 10) from a failed one (label 0).
    """
    return 1 if row_value >= 10 else 0


def build_keras_model(inputshape, outputsize, hidden_size, num_hidden_layers, dropout, lr):
    """Create and compile the fully connected classifier.

    Layout: one input Dense(relu)+Dropout pair, followed by
    ``num_hidden_layers`` further Dense(relu)+Dropout pairs, followed by a
    sigmoid Dense output layer of ``outputsize`` units. The model is compiled
    with binary cross-entropy, the rmsprop optimizer at learning rate ``lr``
    and the 'binary_accuracy' metric.

    :return: the compiled Sequential model
    """
    # Layers are instantiated in the same order in which they are stacked.
    stack = [Dense(hidden_size, activation='relu', input_shape=inputshape), Dropout(dropout)]
    for _ in range(num_hidden_layers):
        stack.append(Dense(hidden_size, activation='relu'))
        stack.append(Dropout(dropout))
    stack.append(Dense(outputsize, activation='sigmoid'))

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(loss='binary_crossentropy', optimizer=rmsprop(lr=lr), metrics=['binary_accuracy'])
    return network

def model_performance(best_params, X_train, Y_train, X_test, Y_test, N_train, N_test, outputsize):
    """Train a fresh model with the given hyper-parameters and report its performance.

    :param best_params: dict with the keys 'hidden_size', 'num_hidden_layers',
        'dropout', 'lr', 'batch_size' and 'numepochs'
    :param X_train: training feature matrix; its second dimension sets the input size
    :param Y_train: training labels
    :param X_test: test feature matrix, also passed to ``fit`` as validation data
    :param Y_test: test labels
    :param N_train: identifiers of the training samples (used for reporting)
    :param N_test: identifiers of the test samples (used for reporting)
    :param outputsize: number of output units of the model
    :return: None; results are printed/saved by ``model_performance_measures``
    """
    model = build_keras_model(
        inputshape=(X_train.shape[1],),
        outputsize=outputsize,
        hidden_size=int(best_params['hidden_size']),
        num_hidden_layers=int(best_params['num_hidden_layers']),
        dropout=best_params['dropout'],
        lr=best_params['lr'],
    )
    # Keras documents validation_data as a tuple (x_val, y_val); a list can be
    # misread as a list of inputs by some versions.
    model.fit(
        X_train,
        Y_train,
        validation_data=(X_test, Y_test),
        epochs=int(best_params['numepochs']),
        batch_size=int(best_params['batch_size']),
        verbose=0,
    )

    model_performance_measures(model, X_train, Y_train, N_train, X_test, Y_test, N_test)

def model_performance_measures(model,X_train,Y_train,N_train,X_test,Y_test,N_test):
    """Report the model on both data sets.

    First the per-sample predictions are printed for the training set and then
    for the test set; afterwards the confusion-matrix based scores are
    produced for the training set and then for the test set.
    """
    data_sets = (
        (X_train, Y_train, N_train, "Training"),
        (X_test, Y_test, N_test, "Test"),
    )
    for report in (print_prediction, confusion_matrix_scores):
        for features, labels, names, set_label in data_sets:
            report(model, features, labels, names, set_label)

def print_prediction(model,X_set,Y_set,N_set,nameset):
    """Print one line per sample with its actual label and the model prediction.

    :param model: object with a ``predict(X)`` method returning, per sample, a
        sequence whose first element is the predicted probability
    :param X_set: feature matrix
    :param Y_set: actual labels, aligned with ``X_set``
    :param N_set: sample identifiers, aligned with ``X_set``
    :param nameset: name of the data set, printed on every line
    :return: None

    A sample is assigned to class 1 when its predicted probability is >= 0.50.
    """
    # Only ``predict`` is needed; the former extra ``predict_proba`` call was
    # unused, doubled the inference work and does not exist on newer Keras.
    Y_pred = model.predict(X_set)

    for sample_name, actual, prediction in zip(N_set, Y_set, Y_pred):
        probability = prediction[0]
        predict_class = 1 if probability >= 0.50 else 0
        print('Set:', nameset, sample_name, "Actual:", actual, "Predicted:", predict_class, probability)
            

def confusion_matrix_scores(model, X_set, Y_set, N_set, setname):
    """Compute, plot and print the binary classification scores of one data set.

    :param model: trained model whose ``predict`` returns one probability per sample
    :param X_set: feature matrix
    :param Y_set: actual binary labels (0/1)
    :param N_set: sample identifiers (not used here, kept for the call signature)
    :param setname: name of the data set; used in the printout and in the name
        of the saved ROC figure ("rocs" + setname + ".png")
    :return: None
    """
    # A single forward pass provides the probabilities. The class labels use
    # the strict "> 0.5" rule that Sequential.predict_classes applied for a
    # single sigmoid output; predict_classes / predict_proba themselves are no
    # longer available in recent Keras releases.
    y_prob = model.predict(X_set)
    y_pred = (y_prob > 0.5).astype("int32")

    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent.
    cm = confusion_matrix(Y_set, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    ac_score = accuracy_score(Y_set, y_pred)
    ck_score = cohen_kappa_score(Y_set, y_pred)
    re_score = recall_score(Y_set, y_pred)
    f_score = f1_score(Y_set, y_pred)
    pr_score = precision_score(Y_set, y_pred)

    # ROC curve and area under the curve, both from the probabilities
    au_score = roc_auc_score(Y_set, y_prob)
    fpr, tpr, _ = metrics.roc_curve(Y_set, y_prob)

    plt.figure()
    plt.plot(fpr, tpr, label="data 1, auc=" + str(au_score))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    name = "rocs" + str(setname) + ".png"
    plt.savefig(name)
    plt.close()

    # Print confusion matrix and some performance parameters
    print("\nSet:", setname)
    print("TP:", tp, "TN:", tn, "FP:", fp, "FN:", fn)
    print("Accuracy:", ac_score)
    print("ROC_AUC:", au_score)
    print("Kappa:", ck_score)
    print("Recall:", re_score)
    print("F1score:", f_score)
    print("Precision:", pr_score)
    
    


# # Main
# # How to use it: to reproduce the final model discussed in the publication,
# # set load_final_model to '1'.
# # When set to '0', the model is re-built with the same hyper-parameters (note that it will then differ slightly from the final model)

# In[8]:


def _build_split(df, set_name):
    """Assemble the model inputs for the rows of ``df`` whose 'Set' column equals ``set_name``.

    :param df: dataframe holding the 'Set', 'fp', 'Vector_Condition',
        'Vector_Temp', 'Vector_Time', 'label' and 'Exp_name' columns
    :param set_name: value of the 'Set' column to select (e.g. 'Train')
    :return: tuple ``(subset, X, Y, N)``: the selected rows (with an added
        'Concat' column holding each feature row), the feature matrix, the
        label vector and the experiment names
    """
    # Work on a copy so that adding 'Concat' does not write into a slice of
    # the full dataframe (pandas SettingWithCopyWarning).
    subset = df.loc[df['Set'] == set_name].copy()
    feature_columns = ("fp", "Vector_Condition", "Vector_Temp", "Vector_Time")
    X = np.concatenate([np.stack(subset[col].values) for col in feature_columns], axis=1)
    subset['Concat'] = list(X)
    # np.stack returns new arrays, so shuffling Y later leaves ``subset`` untouched.
    Y = np.stack(subset['label'].values)
    N = np.stack(subset['Exp_name'].values)
    return subset, X, Y, N


if __name__ == "__main__":

    # Parameters that must be changed:
    # location of the model/data files
#    PATH_DATA = "/path_to_data/"
    PATH_DATA = "/projects/mai/users/ktbf533_Thierry/projects/ReactionInformatics/Ilab_amidation/Data_Samuel_Claudio/HTE_Claudio/Publication_data/"

    # Parameter to change if needed:
    #   1 -> load the stored weights of the final model discussed in the publication
    #   0 -> re-train a model with the same hyper-parameters
    # It is 0 here, so the script re-trains.
    load_final_model = 0

    # Only relevant when load_final_model is 0:
    # parameters to tune in order to build different models using the same type
    # of features but varying the fingerprint size
    ################################################################################
    Numbits = 512
    Numbits_product = 512
    # Modelling or random model: in the publication, the models are also compared
    # with models trained on a shuffled dataset
    Randomize = 0  # set to 1 to get random modelling: shuffles the Y vector

    # do not change anything below this line

    # Fixed parameters, cannot be tuned in this code version
    ################################################################################
    FeaturesMethod = 'rfp_product_oeselma'
    Rfp_flag = 1
    outputsize = 1  # do not change in any case

    # Datasets, data for the oeselma descriptors and fixed parameters
    ################################################################################
    # oeselma_file must stay a module-level name: the descriptor lookup in
    # mol_to_fp_method reads it as a global.
    oeselma_file = PATH_DATA + 'Reactants.properties.txt'
    File = PATH_DATA + 'HTE_Final.txt'

    ####################################################################################
    # Dataset loading
    ####################################################################################
    # sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True
    df = pd.read_csv(File, sep=r"\s+")

    ####################################################################################
    # Compute features
    ####################################################################################
    df['label'] = df['Success'].apply(classification_process)
    df['fp'] = df['RXN'].apply(
        lambda rxn: reaction_smiles_to_reaction_fp_product_oeselma(rxn, FeaturesMethod, Rfp_flag, Numbits, Numbits_product)
    )
    df["fp_str"] = df.fp.apply(str)

    # One-hot encode the condition ('CA'), the temperature and the time on the
    # full dataset, so that train and test share the same encoding, and store
    # each encoded row as a numpy array in its own column.
    # dtype is pinned to uint8 (the historical pandas default) so the result
    # does not depend on the pandas version.
    for source_column, prefix, vector_column in (
        ('CA', 'Vector', 'Vector_Condition'),
        ('Temperature', 'Temp', 'Vector_Temp'),
        ('Time', 'Time', 'Vector_Time'),
    ):
        one_hot = pd.get_dummies(df[source_column], prefix=prefix, dtype=np.uint8)
        df[vector_column] = list(one_hot.values)

    ####################################################################################
    # Prepare X_train, Y_train
    ####################################################################################
    # NOTE(review): train and test are standardised independently, each with its
    # own mean/std. Kept unchanged because the stored weights were presumably
    # obtained with this preprocessing - confirm before changing.
    df_train, X_train, Y_train, N_train = _build_split(df, 'Train')
    X_train_scaled = preprocessing.scale(X_train)

    if Randomize:
        seed = np.random.randint(0, 10000)
        np.random.seed(seed)
        np.random.shuffle(Y_train)

    ####################################################################################
    # Prepare X_test, Y_test
    ####################################################################################
    df_test, X_test, Y_test, N_test = _build_split(df, 'Test')
    X_test_scaled = preprocessing.scale(X_test)

    best_params = {'batch_size': 32, 'lr': 0.00030645969762385465, 'numepochs': 30, 'num_hidden_layers': 1, 'dropout': 0.0004887962267077486, 'hidden_size': 1024}

    if load_final_model:
        model = build_keras_model(
            inputshape=(X_train_scaled.shape[1],),
            outputsize=outputsize,
            hidden_size=int(best_params['hidden_size']),
            num_hidden_layers=int(best_params['num_hidden_layers']),
            dropout=best_params['dropout'],
            lr=best_params['lr'],
        )

        # The weights file lives next to the data files.
        model.load_weights(PATH_DATA + 'Model_CV.h5')
        model_performance_measures(model, X_train_scaled, Y_train, N_train, X_test_scaled, Y_test, N_test)
    else:
        model_performance(best_params, X_train_scaled, Y_train, X_test_scaled, Y_test, N_train, N_test, outputsize)







