#BOR QMRA -- run whole file. results are in "data_mc". Takes ~1hr to run.
#---Files needed:----
# Dose-Responses-all-SI.R
# hypergeometric1F1.R
# Traveltimes_QMRA_v2.xlsx
# Parameters for QMRA.xlsx

setwd("C:/Users/crankk/LVWATER - Las Vegas Valley Water District/Emily Clements - BOR_QMRA")#--- Set working directory
source("R Code/Dose-Responses-all-SI.R")                                                    #--- Get Dose Response Models
source("R Code/hypergeometric1F1.R")  

packages<-c(                                                                                #--- List of required packages
  "ggplot2",
  "tidyr",
  "dplyr",
  "readxl",
  "truncnorm",
  "purrr",
  "vtable", #for summary statistics table
  "beepr" #just for dinging when code finishes running since it takes a while
)

lapply(packages, library, character.only=TRUE)                                              #--- Load required packages

                                                                                            #--- Number of samples
n<-10950 #30*365, equivalent to one sampling per day for 30 years
n_samples <- 10950 
vol <- 2.5 # Liters, 1 ingestion event per day                                              #--- Exposure volumes
set.seed(1)                                                                                 #--- From here down, all code must be run in sequence to maintain replicability

                                                                                            #--- Defined Functions

##--define function to auto sample from parameters table in excel

sample_values <- function(dist_type, n, mean_or_value, stdev, alpha, beta, min_val, max_val) {
  # Handle missing or invalid parameters gracefully
  if (is.na(dist_type) || is.na(n)) {
    return(rep(NA, n))  # Return NA if any key parameter is missing
  }
  # Sampling based on the distribution type
   else if (dist_type == "normal") {
    # Normal distribution
    return(rnorm(n, mean = as.numeric(mean_or_value), sd = as.numeric(stdev)))
  } else if (dist_type == "point") {
    # Point distribution, replicate the value n times
    return(rep(as.numeric(mean_or_value), n))
  } else if (dist_type == "uniform") {
    # Uniform distribution
    return(runif(n, min = as.numeric(min_val), max = as.numeric(max_val)))
  } else if (dist_type == "empirical") {
    # Empirical distribution (define later) ##get the other excel file DFR% at intake to sample from 
    return(rep(NA, n))
  } else {
    # If no valid distribution type is provided, return NA
    return(rep(NA, n))
  }
}

##----Define function to 'sum' probabilities-----

sumrisk <- function(probabilities) {
  # Validate input
  if (!is.numeric(probabilities)) {
    stop("Input must be numeric.")
  }
  if (any(is.na(probabilities))) {
    stop("Input contains missing values (NA).")
  }

  # Convert probabilities to mpfr for high precision
  probabilities <- mpfr(probabilities, precBits = 512)  # Set precision to 512 bits

  # Compute union probability for independent events (non-mutually exclusive)
  union_probability <- 1 - prod(1 - probabilities)

  # Convert the result back to numeric and return it
  return(as.numeric(union_probability))
}

### Function for summing by years

yearly_sumrisk_vector <- function(vec) {
  split_vec <- split(vec, ceiling(seq_along(vec) / 365))
  unname(map_dbl(split_vec, sumrisk))  # strip names from the result
}

###----Bring in Parameters from "Parameters for QMRA" File-----

data <- read_excel("Parameters/Parameters for QMRA.xlsx", sheet = 1, range = cell_cols(1:15))#--Only loading in the first 15 columns, edit here if this changes

head(data)
suppressWarnings(data[, c(3:7, 11:12)] <- lapply(data[, c(3:7, 11:12)], as.numeric))         #--- Convert columns 3 through 7, 10, and 11 to numeric
head(data)

##Add monte carlo samplings to each stochastic parameter in the initial list
data_mc <- data %>%
  filter(parameter != "dr") %>% ##- we don't distribute the dose-response  parameters
  rowwise() %>%
  mutate(values = mapply(
    sample_values,
    dist_type = distribution,
    n = n,
    mean_or_value = mean_or_value,
    stdev = stdev,
    min_val = min,
    max_val = max,
    SIMPLIFY = FALSE
  )) %>%
  ungroup()

##Add monte carlo samplings to each stochastic parameter in the initial list
data_mc <- data %>%
  filter(parameter != "dr") %>% ##- we don't distribute the dose-response  parameters
  rowwise() %>%
  mutate(values = mapply(
    sample_values,
    dist_type = distribution,
    n = n,
    mean_or_value = mean_or_value,
    stdev = stdev,
    min_val = min,
    max_val = max,
    SIMPLIFY = FALSE
  )) %>%
  ungroup()

## Lake Mead travel times 

X975ft <- read_excel("Parameters/Traveltimes_QMRA_v2.xlsx", sheet = "297m")
X1025ft <- read_excel("Parameters/Traveltimes_QMRA_v2.xlsx", sheet = "312m")
X1080ft <- read_excel("Parameters/Traveltimes_QMRA_v2.xlsx", sheet = "329m")
Xfall <- read_excel("Parameters/Traveltimes_QMRA_v2.xlsx", sheet = "fall329")
Xspring <- read_excel("Parameters/Traveltimes_QMRA_v2.xlsx", sheet = "spring329")
Xsummer <- read_excel("Parameters/Traveltimes_QMRA_v2.xlsx", sheet = "summer329")
Xwinter <- read_excel("Parameters/Traveltimes_QMRA_v2.xlsx", sheet = "winter329")

hr <- X975ft$hours
hr1080 <- X1080ft$hours
prob975 <- X975ft$concentration / sum(X975ft$concentration)
prob1025 <- X1025ft$concentration / sum(X1025ft$concentration)
prob1080 <- X1080ft$concentration / sum(X1080ft$concentration)


  
set.seed(1)
tt975 <- sample(hr, 10000, prob = prob975, replace = TRUE)
tt1025 <- sample(hr, 10000, prob = prob1025, replace = TRUE)
tt1080 <- sample(hr1080, 10000, prob = prob1080, replace = TRUE)
ttfall <-sample(Xfall$hours, 10000, prob=Xfall$concentration / sum(Xfall$concentration),
                replace = TRUE)
ttwinter <-sample(Xwinter$hours, 10000, prob=Xwinter$concentration / sum(Xwinter$concentration),
                replace = TRUE)
ttspring <-sample(Xspring$hours, 10000, prob=Xspring$concentration / sum(Xspring$concentration),
                replace = TRUE)
ttsummer <-sample(Xsummer$hours, 10000, prob=Xsummer$concentration / sum(Xsummer$concentration),
                replace = TRUE)



##add travel times to data_mc
data_mc <- data_mc %>%
  rowwise() %>%
  mutate(values = case_when(
    parameter == "t_lm" & season == "all" & lakelevel == 297 ~ list(c(tt975)),
    parameter == "t_lm" & season == "all" & lakelevel == 312 ~ list(c(tt1025)),
    parameter == "t_lm" & season == "all" & lakelevel == 329 ~ list(c(tt1080)),
    parameter == "t_lm" & season == "fall" & lakelevel == 329 ~ list(c(ttfall)),
    parameter == "t_lm" & season == "summer" & lakelevel == 329 ~ list(c(ttsummer)),
    parameter == "t_lm" & season == "spring" & lakelevel == 329 ~ list(c(ttspring)),
    parameter == "t_lm" & season == "winter" & lakelevel == 329 ~ list(c(ttwinter)),
    TRUE ~ list(values) # Wrap original values in a list
  )) %>%
  ungroup()

#### GC:IU Ratio 

data_mc$values_GCIU <- lapply(1:nrow(data_mc), function(i) {
  if (data_mc$parameter[i] == "C") {
    # Find matching lrv_ww row, only works crypto and giardia
    gciu_row <- data_mc[data_mc$parameter == "GC:IU" &
                         data_mc$pathogen == data_mc$pathogen[i], ]
    #unlist
    gciu=unlist(gciu_row$values)

    # Subtract values if a match is found
    if (nrow(gciu_row) == 1) {
      gc_values=10^data_mc$values[[i]] / 10^gciu

      return(log10(gc_values))
    } else {
      data_mc$values[[i]]
    }
  } else {
    data_mc$values[[i]]
  }
})


### WWTP 

## BELOW---MANUALLY CODED LRVS FOR SECONDARY TREATMENT---THESE ARE IN THE PARAMETERS SHEET BUT MANUALLY CODED RATHER THAN AUTOMATICALLY LOADED

Z_samples <- rnorm(n_samples)
LRV_Crypto <- 2.0962 + Z_samples * 0.1085
LRV_Giardia <- 2.5156 + Z_samples * 0.1070
LRV_AdV <- rtruncnorm(n, a = 0.6, b = 3.6, mean = 2.3, sd = 0.8)
LRV_EnV <- rtruncnorm(n, a = 0.3, b = 3.5, mean = 1.3, sd = 0.8)
LRV_NoV <- rtruncnorm(n, a = 0, b = 2.7, mean = 1.1, sd = 0.7)

### The rest of WWTP LRVs are automatically brought in from the parameters sheet

data_mc$values_WWTP <- lapply(1:nrow(data_mc), function(i) {
  if (data_mc$parameter[i] == "C") {
    # Find matching lrv_ww (lrvs from ww WITHOUT secondary treatment) row, only works crypto and giardia 
    lrv_row <- data_mc[data_mc$parameter == "lrv_ww" &
                         data_mc$pathogen == data_mc$pathogen[i] &
                         data_mc$facility == data_mc$facility[i], ]
    
    # If no match, pretend pathogen is "Virus"
    if (nrow(lrv_row) == 0) {
      lrv_row <- data_mc[data_mc$parameter == "lrv_ww" &
                           data_mc$pathogen == "Virus" &
                           data_mc$facility == data_mc$facility[i], ]
    }
    
    # Subtract values if a match is found
    value <- if (nrow(lrv_row) == 1) {
      data_mc$values_GCIU[[i]] - unlist(lrv_row$values)
    } else {
      data_mc$values_GCIU[[i]]
    }
    ##SECONDARY TREATMENT (MANUAL)
    if (data_mc$facility[i] != 1) { ### Facility 1 has no secondary treatment
      if (data_mc$pathogen[i] == "AdV" || data_mc$pathogen[i] == "AdV.culture") {
        value <- value - LRV_AdV
      } else if (data_mc$pathogen[i] == "EnV" || data_mc$pathogen[i] == "EnV.culture") {
        value <- value - LRV_EnV
      } else if (data_mc$pathogen[i] == "NoV") {
        value <- value - LRV_NoV
      } else if (data_mc$pathogen[i] == "Crypto") {
        value <- value - LRV_Crypto
      } else if (data_mc$pathogen[i] == "Giardia") {
        value <- value - LRV_Giardia
      }
    }
    value
    
  } else {
    data_mc$values[[i]]
  }
})

### LVW tt and flow fraction 

data_mc$values_LVW <- lapply(1:nrow(data_mc), function(i) {
  if (data_mc$parameter[i] == "C") {
    # Find the matching 'k' value based on pathogen
    k_row <- data_mc[data_mc$parameter == "k" &
                       (data_mc$pathogen == data_mc$pathogen[i] |
                          (data_mc$pathogen == "AdV" & data_mc$pathogen[i] == "AdV.culture") |
                          (data_mc$pathogen == "EnV" & data_mc$pathogen[i] == "EnV.culture")), ]
    
    # Find the matching 't_lvw' value based on facility
    t_lvw_row <- data_mc[data_mc$parameter == "t_lvw" &
                           data_mc$facility == data_mc$facility[i], ]
    
    # percentages 
    percFlow <- data_mc[data_mc$parameter == "LVW_perc" &
                          data_mc$facility == data_mc$facility[i], ]
    
    # Ensure both k and t_lvw rows are found
    if (nrow(k_row) == 1 && nrow(t_lvw_row) == 1 && nrow(percFlow) == 1) {
      k_value <- unlist(k_row$values_WWTP)
      t_lvw_value <- unlist(t_lvw_row$values_WWTP)
      LVW_perc_value <- unlist(percFlow$values)
      

      # decay 
      original_values <- 10^data_mc$values_WWTP[[i]]
      decayed_values <- original_values * exp(-10^k_value * t_lvw_value/24)*LVW_perc_value
      
    
      return(log10(decayed_values))
    } else {
      # If no k or t_lvw , return original 
      return(data_mc$values_WWTP[[i]])
      
    }
  } else {return(data_mc$values_WWTP[[i]])}
})



### tt in LM 

lakelevels <- c(329, 312, 297)

# Initialize new columns for each lakelevel
new_columns <- paste0("values_LMintake_", lakelevels)
for (col in new_columns) {
  data_mc[[col]] <- NA
}

for (j in seq_along(lakelevels)) {
  lakelevel <- lakelevels[j]
  new_column <- new_columns[j]
  
  data_mc[[new_column]] <- lapply(1:nrow(data_mc), function(i) {
    if (data_mc$parameter[i] == "C") {
      # Find the matching 'k' value based on pathogen
      k_row <- data_mc[data_mc$parameter == "k" &
                         (data_mc$pathogen == data_mc$pathogen[i] |
                            (data_mc$pathogen == "AdV" & data_mc$pathogen[i] == "AdV.culture") |
                            (data_mc$pathogen == "EnV" & data_mc$pathogen[i] == "EnV.culture")), ]
      
      # Find the matching 't_lm' value based on lakelevel
      t_lm_row <- data_mc[data_mc$parameter == "t_lm" & 
                            data_mc$season == data_mc$season[i] &
                            data_mc$lakelevel == lakelevel, ]
      
      # Ensure both k and t_lm rows are found
      if (nrow(k_row) == 1 && nrow(t_lm_row) == 1) {
        k_value <- unlist(k_row$values_LVW)
        t_lm_value <- unlist(t_lm_row$values_LVW)
        
      
        #Decay calculation
        original_values2 <- 10^data_mc$values_LVW[[i]]
        decayed_values2 <- original_values2 * exp(-10^k_value * t_lm_value / 24)
        
       
        
        # Return numeric value
   
       return(log10(decayed_values2)) # Leave as decayed values for this column
      } else {
        return(NA) # Assign NA if no matching rows are found
      }
    } else {
      return(NA) # Assign NA for non-"C" parameters
    }
  })
}


### RWC 
# Initialize new columns for each lake level
new_columns <- paste0("values_RWC_", lakelevels)
for (col in new_columns) {
  data_mc[[col]] <- NA
}

for (j in seq_along(lakelevels)) {
  lakelevel <- lakelevels[j]
  new_column <- new_columns[j]
  
  data_mc[[new_column]] <- lapply(1:nrow(data_mc), function(i) {
    if (data_mc$parameter[i] == "C") {
      # Find matching rwc row
      rwc_row <- data_mc[data_mc$parameter == "rwc" & data_mc$season == data_mc$season[i] & data_mc$lakelevel == lakelevel, ]
      
      # Multiply by rwc
      if (nrow(rwc_row) == 1) {

        
        conc <- 10^data_mc[[paste0("values_LMintake_", lakelevel)]][[i]] * unlist(rwc_row$values) / 100
        
       
        # Convert result back to log10 scale
        log_conc <- log10(conc)
        
        return(log_conc)
     
      } else {
        return(NA)  # Return NA if no matching rwc row is found
      }
    } else {
      return(NA)  # Return NA for non-"C" parameters
    }
  })
}


### DW LRV 

new_columns <- paste0("values_DW_", lakelevels)

# Initialize new columns for DW results
for (col in new_columns) {
  data_mc[[col]] <- NA
}

for (j in seq_along(lakelevels)) {
  lakelevel <- lakelevels[j]
  rwc_column <- paste0("values_RWC_", lakelevel)
  dw_column <- new_columns[j]
  
  data_mc[[dw_column]] <- lapply(1:nrow(data_mc), function(i) {
    if (data_mc$parameter[i] == "C") {
      # Find matching lrv_dw row
      lrv_dw_row <- data_mc[data_mc$parameter == "lrv_dw" &
                              data_mc$pathogen == data_mc$pathogen[i], ]
      
      # If no match, default to "Virus"
      if (nrow(lrv_dw_row) == 0) {
        lrv_dw_row <- data_mc[data_mc$parameter == "lrv_dw" &
                                data_mc$pathogen == "Virus", ]
      }
      
      # Perform subtraction if a match is found
      if (nrow(lrv_dw_row) == 1) {
        rwc_values <- unlist(data_mc[[rwc_column]][i])  # concentration with RWC 
        lrv_values <- unlist(lrv_dw_row$values) # Extract LRV values
        result <- rwc_values - lrv_values
        return(result)
      } else {
        return(data_mc[[rwc_column]][i])  # Return original values if no match
      }
    } else {
      return(NA)  # Assign NA for non-"C" parameters
    }
  })
}

### Dose Responses 

risk_columns <- paste0("Risk_", lakelevels)
dw_columns <- paste0("values_DW_", lakelevels)  # These columns already exist
start.time <- Sys.time() #starting a timer here, because this takes a while. 
beep_on_error(
# For each lake level

for (j in seq_along(lakelevels)) {
  lakelevel <- lakelevels[j]
  dw_column <- dw_columns[j]
  risk_column <- risk_columns[j]
  
  data_mc[[risk_column]] <- lapply(1:nrow(data_mc), function(i) {
    if (data_mc$parameter[i] == "C") {
     
      values_DW <- unlist(data_mc[[dw_column]][i])  
      if (all(is.na(values_DW))) return(NA)
      
      # Apply functions based on pathogen. "hp" suffix indicates high precision version of dose-response function
      #DAILY RISK
      if (data_mc$pathogen[i] == "NoV") {
        return(nov_dr(vol * 10^values_DW)) # fp= fractional poisson
        #  return(1 - (1 - (sapply(values_DW, function(v) nov_dr_hp(vol * 10^v))))^365))
      } else if (data_mc$pathogen[i] %in% c("AdV", "AdV.culture")) {
        # return(sapply(values_DW, function(v) adv_dr_hp(vol * 10^v)))
        return(adv_dr(vol * 10^values_DW))
      } else if (data_mc$pathogen[i] == "Crypto") {
        return(cry_dr_hp(vol * 10^values_DW)) #exponential is the non- precise one, hp is the regular one
      } else if (data_mc$pathogen[i] %in% c("EnV", "EnV.culture")) {
        return(env_dr_hp(vol * 10^values_DW))
      } else if (data_mc$pathogen[i] == "Giardia") {
        return(gia_dr_hp(vol * 10^values_DW))
      } else {
        return(NA)  # Default case for non-matching pathogens
      }
      
    } else {
      return(NA)  # Default case for non-"C" rows
    }
  })
}
)


###fix up the high precision printing
data_mc <- data_mc %>%
  mutate(across(starts_with("Risk_"), 
                ~purrr::map(.x, ~as.numeric(.x)))) # %>%  # Convert mpfr to numeric

### NOTE ----- THE HIGH PRECISION DRs NEED ADDITIONAL HANDLING. BELOW IS MANUALLY CALCULATING THE HIGH PRECISION RISKS----

### Emily precision fix

calculate_risk <- function(df, lake_col, result_col_name, pathogen_set, dr_function) {
  df[[result_col_name]] <- vector("list", nrow(df)) 
  
  for (i in seq_len(nrow(df))) {
    if (df$pathogen[i] %in% pathogen_set) {
      values <- df[[lake_col]][[i]]
      result_list <- numeric(length(values))
      
      for (j in seq_along(values)) {
        value_DW <- as.numeric(values[j])
        if (!is.na(value_DW)) {
          result <- mpfr(dr_function(vol * 10^value_DW), precBits = 512)
          result_list[j] <- as.numeric(result)
        }
      }
      
      df[[result_col_name]][[i]] <- result_list
    }
  }
  
  return(df)
}

env_pathogens <- c("EnV", "EnV.culture")
adv_pathogens <- c("AdV", "AdV.culture")
nov_pathogens <- c("NoV")
lake_levels <- c("297", "312", "329")

for (level in lake_levels) {
  lake_col <- paste0("values_DW_", level)
  
  # Calc risks
  data_mc <- calculate_risk(data_mc, lake_col, paste0("EnV_risk_DR_", level), env_pathogens, env_dr_hp)
  data_mc <- calculate_risk(data_mc, lake_col, paste0("AdV_risk_DR_", level), adv_pathogens, adv_dr_hp)
  data_mc <- calculate_risk(data_mc, lake_col, paste0("NoV_risk_DR_", level), nov_pathogens, nov_dr_hp)
  
  # Create subset for merge
  risk_cols <- paste0(c("EnV", "AdV", "NoV"), "_risk_DR_", level)
  data_mc_advenv <- data_mc[, c("pathogen", "facility", "season", risk_cols), drop = FALSE]
  
  # Copy back 
  for (i in seq_len(nrow(data_mc))) {
    if (data_mc$parameter[i] == "C") {
      match_idx <- which(data_mc_advenv$pathogen == data_mc$pathogen[i] &
                           data_mc_advenv$facility == data_mc$facility[i] &
                           data_mc_advenv$season == data_mc$season[i])
      
      if (length(match_idx) > 0) {
        if (data_mc$pathogen[i] %in% env_pathogens) {
          data_mc[[paste0("Risk_", level)]][i] <- data_mc_advenv[[paste0("EnV_risk_DR_", level)]][match_idx[1]]
        } else if (data_mc$pathogen[i] %in% adv_pathogens) {
          data_mc[[paste0("Risk_", level)]][i] <- data_mc_advenv[[paste0("AdV_risk_DR_", level)]][match_idx[1]]
        } else if (data_mc$pathogen[i] %in% nov_pathogens) {
          data_mc[[paste0("Risk_", level)]][i] <- data_mc_advenv[[paste0("NoV_risk_DR_", level)]][match_idx[1]]
        }
      } else {
        data_mc[[paste0("Risk_", level)]][i] <- NA
      }
    }
  }
}



#####---end 
end.time <- Sys.time()
time.taken <- round(end.time - start.time,2)
time.taken
beep()

##FINAL RESULTS: 
View(data_mc)
#save(data_mc, file = "data_mc_wGCIU.RData")

#post-processing is required to sum risks and combine for various scenarios. 


