"""
Supporting Information for Prediction of Activity Coefficients by Similarity-Based Imputation using Quantum-Chemical Descriptors
N. Hayer, T. Specht, J. Arweiler, D. Gond, H. Hasse, F. Jirasek

This code calculates the similarity score between two components based on the information from .sigma files.
In our work, we used the .sigma files provided by Bell et al. (2020), https://doi.org/10.1021/acs.jctc.9b01016
"""
import numpy as np
import re


##############################################
# Supporting Functions
##############################################

def find_all_indices_in_string(input_string, target_char):
    """
    Collecting the start positions of every occurrence of a literal target in a string

    The target is escaped before it is handed to the regex engine, so characters such as
    '.' or '"' are matched literally. Occurrences are reported from left to right and do
    not overlap.

    Parameters:
    - input_string (str): string that is scanned
    - target_char (str): literal character (or substring) to look for

    Returns:
    - list[int]: start index of each occurrence; empty list if there is none
    """
    positions = []
    for hit in re.finditer(re.escape(target_char), input_string):
        positions.append(hit.start())
    return positions

def moving_average(array):
    """
    Smoothing a 1-D sequence with a two-point moving average

    Every output value is the mean of the current and the preceding input value. The
    convolution is evaluated with mode='same', i.e. the input is treated as zero before
    its first entry, so the first output value is half of the first input value.

    Parameters:
    - array (array-like): 1-D sequence of numbers to be smoothed

    Returns:
    - np.ndarray: smoothed sequence (same length as the input for inputs with at least two entries)
    """
    half = 0.5
    weights = np.array([half, half])  # equal weights for a window of size 2
    smoothed = np.convolve(array, weights, mode='same')
    return smoothed


##############################################
# Functions for Similarity Calculation
##############################################

def read_sigma_files(file: str, w_P: float):
    """
    Reading the content of a .sigma file and returning a list containing all information for further processing

    Parameters:
    - file (str): content (text) of the .sigma file to be processed, not its path
    - w_P (float): exponent that controls the weight on the polar regions in the sigma-profile

    Returns:
    - Set (list): [A, V, NHB, OH, OT, p_ges, Profile] with
        A, V       (float): values parsed from the "# meta: {...}" header
        NHB, OH, OT (np.ndarray): the three consecutive 51-point blocks of the second data column
        p_ges      (np.ndarray): (NHB + OH + OT) / A
        Profile    (np.ndarray): summed profile, weighted with |sigma|**w_P, normalized and
                                 smoothed with moving_average

    Raises:
    - IndexError: if no "# meta: {...}" header is found, or a data row has fewer than two columns
    - ValueError: if the number of data rows is not 3 * 51, or a value cannot be converted to float
    """
    n_points = 51  # sigma grid points per profile block (NHB, OH and OT each)

    # Header: everything inside the braces of the "# meta: {...}" entry
    ID = re.findall(r"# meta: {(.*?)}", file)[0]
    indices = find_all_indices_in_string(ID, '"')
    # A and V are located purely by position: the text between the 6th and 7th, and
    # between the 8th and 9th double quote (minus the surrounding ': ' and ', ').
    # NOTE(review): this depends on the exact order and types of the meta entries -- confirm
    # against the .sigma files in use before switching to key-based parsing.
    A = float(ID[indices[5]+3:indices[6]-2])
    V = float(ID[indices[7]+3:indices[8]-2])

    # Data rows: skip the three header lines and ignore blank lines, so that a missing or
    # repeated trailing newline neither drops the last row nor produces an empty row
    rows = [line for line in file.split("\n")[3:] if line.strip()]
    if len(rows) != 3 * n_points:
        raise ValueError(
            f"Expected {3 * n_points} data rows (NHB, OH, OT with {n_points} points each) "
            f"in the .sigma file, found {len(rows)}"
        )
    NHB_OH_OT = [row.split()[1] for row in rows]  # second column of every row
    NHB = np.array(NHB_OH_OT[:n_points]).astype(np.float64)
    OH = np.array(NHB_OH_OT[n_points:2 * n_points]).astype(np.float64)
    OT = np.array(NHB_OH_OT[2 * n_points:]).astype(np.float64)

    total = NHB + OH + OT
    p_ges = total / A
    # Builtin sum is kept on purpose: it keeps the result bit-identical to earlier versions
    ProfileSum = total / sum(total)

    # Weight the normalized profile with |sigma|**w_P, renormalize and smooth
    sigma = np.arange(-0.025, 0.026, 0.001)  # must yield n_points values to match the profiles
    weighted = ProfileSum * (abs(sigma))**w_P
    Profile = moving_average(weighted / np.sum(weighted))

    Set = [A, V, NHB, OH, OT, p_ges, Profile]
    return Set


def calcSimilarity(w_sigma:float,Comp1:list,Comp2:list):
    """
    Calculating the similarity score between two components

    The score is a weighted mean of two contributions:
    - size similarity: smaller first entry (index 0) divided by the larger one
    - profile similarity: sum over the element-wise minimum of the two profiles (index 6)

    Parameters:
    - w_sigma (float): weight of the profile similarity; the size similarity is weighted with 1 - w_sigma
    - Comp1 (list): information from the .sigma file of component 1 (as returned by read_sigma_files)
    - Comp2 (list): information from the .sigma file of component 2 (as returned by read_sigma_files)

    Returns:
    - float: the calculated similarity score
    """
    # Size contribution
    size_1, size_2 = Comp1[0], Comp2[0]
    size_score = min(size_1, size_2) / max(size_1, size_2)

    # Profile contribution: overlap of the two profiles
    stacked = np.array([Comp1[6], Comp2[6]])
    overlap = np.minimum(stacked[0, :], stacked[1, :])
    profile_score = np.sum(overlap)

    return size_score * (1 - w_sigma) + profile_score * w_sigma



##############################################
# Main
##############################################

# Hyperparameters of the "Best" Model
w_sigma = 0.6  # weight of the sigma-profile similarity, between 0 and 1
w_P = 2        # exponent of the polarity weighting, 0 or 2

# Paths to the two .sigma files that are compared
# (in this work, the .sigma files were taken from the database of Bell et al. 2020)
# Example: Water + Ethanol
sigma_profile_1 = 'water.sigma'    # .sigma file of water
sigma_profile_2 = 'ethanol.sigma'  # .sigma file of ethanol

# Component 1: read the raw text, then extract the profile information
with open(sigma_profile_1, "r") as f:
    content = f.read()
Set_1 = read_sigma_files(file=content, w_P=w_P)

# Component 2: same procedure
with open(sigma_profile_2, "r") as f:
    content = f.read()
Set_2 = read_sigma_files(file=content, w_P=w_P)

# Similarity score of the pair, printed to stdout
S_12 = calcSimilarity(w_sigma, Set_1, Set_2)
print(S_12)