## ----setup, include = FALSE-----------------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6,
  fig.align = "center",
  out.width = "95%",
  dpi = 90,
  message = FALSE,
  warning = FALSE
)
options(width = 100)

## -------------------------------------------------------------------------------------------------
# Load required packages
library(missoNet)
library(ggplot2)
library(reshape2)
library(gridExtra)

# Set ggplot theme
theme_set(theme_minimal(base_size = 11))

## -------------------------------------------------------------------------------------------------
# Set dimensions mimicking a genomic region
n <- 600   # Number of samples
p <- 80   # Number of SNPs in the region
q <- 20    # Number of CpG sites

# Create realistic correlation structure
# SNPs: Linkage disequilibrium (LD) blocks
create_ld_structure <- function(p, n_blocks = 4, within_block_cor = 0.7) {
  Sigma <- matrix(0, p, p)
  block_size <- p / n_blocks
  
  for (b in 1:n_blocks) {
    idx <- ((b-1)*block_size + 1):(b*block_size)
    for (i in idx) {
      for (j in idx) {
        Sigma[i, j] <- within_block_cor^abs(i - j)
      }
    }
  }
  diag(Sigma) <- 1
  return(Sigma)
}

Sigma.X <- create_ld_structure(p, n_blocks = 4)

# Variable missing rates (technical dropout patterns)
# Higher missingness for CpG sites with extreme GC content
gc_content <- runif(q, 0.3, 0.7)  # Simulated GC content
rho_vec <- 0.01 + 0.3 * (abs(gc_content - 0.5) / 0.2)^2  # U-shaped missing pattern

# Generate the data
sim <- generateData(
  n = n,
  p = p,
  q = q,
  rho = rho_vec,  # Missing rates
  missing.type = "MAR",  # Missing depends on technical factors
  Sigma.X = Sigma.X,
  Beta.row.sparsity = 0.15,  # 15% of SNPs are meQTLs
  Beta.elm.sparsity = 0.4,     # Each meQTL affects 40% of CpGs
  seed = 100
)

# Add meaningful variable names
colnames(sim$X) <- sprintf("rs%d", 1000000 + 1:p)  # SNP IDs
colnames(sim$Y) <- sprintf("cg%d", 2000000 + 1:q)  # CpG IDs
colnames(sim$Z) <- colnames(sim$Y)
rownames(sim$X) <- rownames(sim$Y) <- rownames(sim$Z) <- paste0("Sample", 1:n)

{
  cat("\nDataset Summary:\n")
  cat("================\n")
  cat("Samples:", n, "\n")
  cat("SNPs:", p, "\n")
  cat("CpG sites:", q, "\n")
  cat("Overall missing rate:", sprintf("%.1f%%", mean(is.na(sim$Z)) * 100), "\n")
  cat("True meQTLs:", sum(rowSums(abs(sim$Beta)) > 0), "\n")
}

## -------------------------------------------------------------------------------------------------
# Analyze missing patterns
miss_by_cpg <- colMeans(is.na(sim$Z))
miss_by_sample <- rowMeans(is.na(sim$Z))

# Create visualization
par(mfrow = c(2, 2))

# 1. Missing rate by CpG
plot(miss_by_cpg, type = "h", lwd = 2, col = "steelblue",
     xlab = "CpG Site", ylab = "Missing Rate",
     main = "Missing Data by CpG Site")
abline(h = mean(miss_by_cpg), col = "red", lty = 2)

# 2. Missing rate by sample
hist(miss_by_sample, breaks = 20, col = "lightblue",
     xlab = "Missing Rate", main = "Distribution of Missing Rates (Samples)")
abline(v = mean(miss_by_sample), col = "red", lwd = 2)

# 3. Heatmap of missingness
image(t(is.na(sim$Z[1:100, ])), col = c("white", "darkred"),
      xlab = "CpG Site", ylab = "Sample (first 100)",
      main = "Missing Data Pattern")

# 4. Correlation of missingness with GC content
plot(gc_content, miss_by_cpg, pch = 19, col = "darkblue",
     xlab = "GC Content", ylab = "Missing Rate",
     main = "Technical Dropout vs GC Content")
lines(lowess(gc_content, miss_by_cpg), col = "red", lwd = 2)

## -------------------------------------------------------------------------------------------------
# Use complete data for visualization
Y_complete <- sim$Y
cor_cpg <- cor(Y_complete)

# Create enhanced heatmap
library(RColorBrewer)
colors <- colorRampPalette(brewer.pal(9, "RdBu"))(100)

heatmap(cor_cpg, 
        col = rev(colors),
        symm = TRUE,
        main = "CpG Methylation Correlation Structure",
        xlab = "CpG Sites", ylab = "CpG Sites",
        margins = c(8, 8))

# Identify CpG modules
hc <- hclust(as.dist(1 - abs(cor_cpg)))
modules <- cutree(hc, k = 3)

cat("\nCpG modules identified:", table(modules), "\n")

## -------------------------------------------------------------------------------------------------
fit_initial <- missoNet(
  X = sim$X,
  Y = sim$Z,
  GoF = "BIC",
  adaptive.search = TRUE,  # Fast exploration
  verbose = 1
)

# Examine initial selection
{
  cat("\nStep 1: Initial parameter exploration\n")
  cat("=====================================\n")
  cat("  Lambda.beta:", fit_initial$est.min$lambda.beta, "\n")
  cat("  Lambda.theta:", fit_initial$est.min$lambda.theta, "\n")
  cat("  Active SNPs:", sum(rowSums(abs(fit_initial$est.min$Beta)) > 1e-8), "\n")
  cat("  Network edges:", 
      sum(abs(fit_initial$est.min$Theta[upper.tri(fit_initial$est.min$Theta)]) > 1e-8), "\n")
}

## -------------------------------------------------------------------------------------------------
# Define refined grid based on initial exploration
lambda.beta.refined <- 10^(seq(
  log10(min(max(fit_initial$lambda.beta.seq) * 0.9, fit_initial$est.min$lambda.beta * 50)),
  log10(max(max(fit_initial$lambda.beta.seq) * 0.005, fit_initial$est.min$lambda.beta / 20)),
  length.out = 25
))

lambda.theta.refined <- 10^(seq(
  log10(min(max(fit_initial$lambda.theta.seq) * 0.9, fit_initial$est.min$lambda.theta * 50)),
  log10(max(max(fit_initial$lambda.theta.seq) * 0.005, fit_initial$est.min$lambda.theta / 20)),
  length.out = 25
))

# Perform 5-fold cross-validation
cvfit <- cv.missoNet(
  X = sim$X,
  Y = sim$Z,
  kfold = 5,
  lambda.beta = lambda.beta.refined,
  lambda.theta = lambda.theta.refined,
  compute.1se = TRUE,
  verbose = 0,
  seed = 1000
)

## -------------------------------------------------------------------------------------------------
# Compare different model choices
models <- list(
  "CV Minimum" = cvfit$est.min,
  "1SE Beta" = cvfit$est.1se.beta,
  "1SE Theta" = cvfit$est.1se.theta,
  "Initial BIC" = fit_initial$est.min
)

if (!is.null(models$`1SE Beta`) & !is.null(models$`1SE Theta`)) {  # Ensure models exist
  comparison <- data.frame(
    Model = names(models),
    Lambda.Beta = sapply(models, function(x) x$lambda.beta),
    Lambda.Theta = sapply(models, function(x) x$lambda.theta),
    Active.SNPs = sapply(models, function(x) 
      sum(rowSums(abs(x$Beta)) > 1e-8)),
    Total.Effects = sapply(models, function(x)
      sum(abs(x$Beta) > 1e-8)),
    Network.Edges = sapply(models, function(x)
      sum(abs(x$Theta[upper.tri(x$Theta)]) > 1e-8))
  )
  print(comparison, digits = 4)
}


# Select the more regularized model, fallback if NULL
if (!is.null(models$`1SE Beta`)) {
  final_model <- cvfit$est.1se.beta
} else final_model <- fit_initial$est.min

## -------------------------------------------------------------------------------------------------
# Extract coefficients
Beta <- final_model$Beta
rownames(Beta) <- colnames(sim$X)
colnames(Beta) <- colnames(sim$Z)

# Identify significant associations
threshold <- 1e-3
sig_meqtls <- which(abs(Beta) > threshold, arr.ind = TRUE)

if (nrow(sig_meqtls) > 0) {
  meqtl_df <- data.frame(
    SNP = rownames(Beta)[sig_meqtls[,1]],
    CpG = colnames(Beta)[sig_meqtls[,2]],
    Effect = Beta[sig_meqtls],
    AbsEffect = abs(Beta[sig_meqtls])
  )
  meqtl_df <- meqtl_df[order(meqtl_df$AbsEffect, decreasing = TRUE), ]
  
  cat("Top 15 meQTL associations:\n")
  print(head(meqtl_df, 15), digits = 3)
  
  # Visualization
  top_snps <- unique(meqtl_df$SNP[1:min(30, nrow(meqtl_df))])
  Beta_subset <- Beta[top_snps, , drop = FALSE]
  
  # Create heatmap
  par(mfrow = c(1, 1))
  colors <- colorRampPalette(c("blue", "white", "red"))(100)
  heatmap(as.matrix(Beta_subset), 
          col = colors,
          scale = "none",
          main = "Top meQTL Effects",
          xlab = "CpG Sites", 
          ylab = "SNPs",
          margins = c(8, 8))
}

## -------------------------------------------------------------------------------------------------
# Extract precision matrix and convert to partial correlations
Theta <- final_model$Theta
rownames(Theta) <- colnames(Theta) <- colnames(sim$Z)

# Compute partial correlations
partial_cor <- -cov2cor(Theta)
diag(partial_cor) <- 0

# Network statistics
edge_threshold <- 0.1
n_edges <- sum(abs(partial_cor[upper.tri(partial_cor)]) > edge_threshold)
{
  cat("\nNetwork Statistics:\n")
  cat("  Total possible edges:", q * (q-1) / 2, "\n")
  cat("  Selected edges (|r| >", edge_threshold, "):", n_edges, "\n")
  cat("  Network density:", sprintf("%.1f%%", 100 * n_edges / (q * (q-1) / 2)), "\n")
}

# Identify hub CpGs
degree <- colSums(abs(partial_cor) > edge_threshold)
hub_cpgs <- names(sort(degree, decreasing = TRUE)[1:5])
cat("\nHub CpG sites (highest connectivity):\n")
for (cpg in hub_cpgs) {
  cat("  ", cpg, ": degree =", degree[cpg], "\n")
}

# Visualize network
if (requireNamespace("igraph", quietly = TRUE)) {
  library(igraph)
  
  # Create network from significant edges
  edges <- which(abs(partial_cor) > 0.15 & upper.tri(partial_cor), arr.ind = TRUE)
  if (nrow(edges) > 0) {
    edge_list <- data.frame(
      from = rownames(partial_cor)[edges[,1]],
      to = colnames(partial_cor)[edges[,2]],
      weight = abs(partial_cor[edges])
    )
    
    g <- graph_from_data_frame(edge_list, directed = FALSE)
    
    # Node properties
    V(g)$size <- 5 + sqrt(degree[V(g)$name]) * 3
    V(g)$color <- ifelse(V(g)$name %in% hub_cpgs, "red", "lightblue")
    
    # Plot
    par(mfrow = c(1, 1))
    plot(g, 
         layout = layout_with_fr(g),
         vertex.label.cex = 0.7,
         edge.width = E(g)$weight * 3,
         main = "CpG Conditional Dependency Network")
    legend("topright", legend = c("Hub CpG", "Regular CpG"),
           pch = 21, pt.bg = c("red", "lightblue"), pt.cex = 2)
  }
}

## -------------------------------------------------------------------------------------------------
# Analyze how meQTLs relate to network structure
active_snps <- which(rowSums(abs(Beta)) > threshold)

if (length(active_snps) > 0) {
  cat("\nIntegration Analysis:\n")
  cat("=====================\n")
  # For each active SNP, check which CpGs it affects
  for (i in active_snps[1:min(5, length(active_snps))]) {
    affected_cpgs <- which(abs(Beta[i,]) > threshold)
    if (length(affected_cpgs) > 1) {
      # Check if affected CpGs are connected in the network
      subnet_partial <- partial_cor[affected_cpgs, affected_cpgs]
      mean_connection <- mean(abs(subnet_partial[upper.tri(subnet_partial)]))
      
      cat("\n", rownames(Beta)[i], "affects", length(affected_cpgs), "CpGs\n")
      cat("Mean network connection among affected CpGs:", 
          round(mean_connection, 3), "\n")
      cat("Affected CpGs:", paste(colnames(Beta)[affected_cpgs], collapse = ", "), "\n")
    }
  }
}

## -------------------------------------------------------------------------------------------------
# Split data for validation
n_train <- round(0.75 * n)
train_idx <- sample(n, n_train)
test_idx <- setdiff(1:n, train_idx)

# Refit on training data
cvfit_train <- cv.missoNet(
  X = sim$X[train_idx, ],
  Y = sim$Z[train_idx, ],
  kfold = 5,
  lambda.beta = lambda.beta.refined,
  lambda.theta = lambda.theta.refined,
  verbose = 0
)

# Predictions
Y_pred <- predict(cvfit_train, newx = sim$X[test_idx, ])
Y_test <- sim$Y[test_idx, ]  # True complete values

# Calculate performance metrics
mse_per_cpg <- colMeans((Y_pred - Y_test)^2)
cor_per_cpg <- sapply(1:q, function(j) cor(Y_pred[,j], Y_test[,j]))

# Visualization
par(mfrow = c(1, 2))

# MSE vs missing rate
plot(miss_by_cpg, mse_per_cpg, pch = 19, col = "darkblue",
     xlab = "Missing Rate", ylab = "Prediction MSE",
     main = "Prediction Error vs Missing Rate")
lines(lowess(miss_by_cpg, mse_per_cpg), col = "red", lwd = 2)

# Correlation vs missing rate
plot(miss_by_cpg, cor_per_cpg, pch = 19, col = "darkgreen",
     xlab = "Missing Rate", ylab = "Prediction Correlation",
     main = "Prediction Accuracy vs Missing Rate")
lines(lowess(miss_by_cpg, cor_per_cpg), col = "red", lwd = 2)

{
  cat("\nPrediction Performance:\n")
  cat("  Mean MSE:", round(mean(mse_per_cpg), 4), "\n")
  cat("  Mean correlation:", round(mean(cor_per_cpg), 3), "\n")
  cat("  Worst CpG MSE:", round(max(mse_per_cpg), 4), "\n")
  cat("  Best CpG correlation:", round(max(cor_per_cpg), 3), "\n")
}

## -------------------------------------------------------------------------------------------------
# Bootstrap stability (simplified for demonstration)
n_boot <- 10
selection_freq_beta <- matrix(0, p, q)
selection_freq_theta <- matrix(0, q, q)

for (b in 1:n_boot) {
  # Bootstrap sample
  boot_idx <- sample(n, replace = TRUE)
  
  # Fit model
  fit_boot <- missoNet(
    X = sim$X[boot_idx, ],
    Y = sim$Z[boot_idx, ],
    lambda.beta = final_model$lambda.beta,
    lambda.theta = final_model$lambda.theta,
    verbose = 0
  )
  
  # Track selections
  selection_freq_beta <- selection_freq_beta + (abs(fit_boot$est.min$Beta) > threshold)
  selection_freq_theta <- selection_freq_theta + 
    (abs(fit_boot$est.min$Theta) > edge_threshold)
}

# Normalize
selection_freq_beta <- selection_freq_beta / n_boot
selection_freq_theta <- selection_freq_theta / n_boot

# Identify stable features
stable_meqtls <- which(selection_freq_beta > 0.8, arr.ind = TRUE)
stable_edges <- which(selection_freq_theta > 0.8 & upper.tri(selection_freq_theta), 
                     arr.ind = TRUE)

{
  cat("\nStability Results:\n")
  cat("  Stable meQTLs (>80% selection):", nrow(stable_meqtls), "\n")
  cat("  Stable network edges (>80% selection):", nrow(stable_edges), "\n")
}

if (nrow(stable_meqtls) > 0) {
  cat("\nMost stable meQTL associations:\n")
  stable_df <- data.frame(
    SNP = colnames(sim$X)[stable_meqtls[,1]],
    CpG = colnames(sim$Z)[stable_meqtls[,2]],
    Frequency = selection_freq_beta[stable_meqtls]
  )
  print(head(stable_df[order(stable_df$Frequency, decreasing = TRUE), ], 10))
}

## -------------------------------------------------------------------------------------------------
# Annotate SNPs with genes (simulated)
active_snp_ids <- which(rowSums(abs(Beta)) > threshold)
if (length(active_snp_ids) > 0) {
  gene_names <- paste0("GENE", sample(1:50, length(active_snp_ids), replace = TRUE))
  
  snp_annotation <- data.frame(
    SNP = colnames(sim$X)[active_snp_ids],
    Gene = gene_names,
    Effect_Size = rowSums(abs(Beta[active_snp_ids, ]))
  )
  
  cat("Genes with meQTLs:\n")
  gene_summary <- aggregate(Effect_Size ~ Gene, snp_annotation, sum)
  gene_summary <- gene_summary[order(gene_summary$Effect_Size, decreasing = TRUE), ]
  print(head(gene_summary, 10))
}

# CpG annotation (simulated)
cpg_annotation <- data.frame(
  CpG = colnames(sim$Z),
  Region = sample(c("Promoter", "Gene Body", "Enhancer", "Intergenic"), 
                  q, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)),
  Island = sample(c("Island", "Shore", "Shelf", "Open Sea"),
                  q, replace = TRUE, prob = c(0.3, 0.2, 0.2, 0.3))
)

{
  cat("\nCpG distribution by genomic region:\n")
  print(table(cpg_annotation$Region))
}

{
  cat("\nCpG distribution by island status:\n")
  print(table(cpg_annotation$Island))
}

## ----echo = FALSE, include = TRUE-----------------------------------------------------------------
cat("\n========================================\n")
cat("       ANALYSIS SUMMARY REPORT\n")
cat("========================================\n\n")

cat("\nDATA CHARACTERISTICS:\n")
cat("--------------------\n")
cat("• Samples analyzed:", n, "\n")
cat("• SNPs tested:", p, "\n")
cat("• CpG sites measured:", q, "\n")
cat("• Missing data rate:", sprintf("%.1f%%", mean(is.na(sim$Z)) * 100), "\n")
cat("• Missing pattern: MAR (technical dropout)\n\n")

cat("\nMODEL SELECTION:\n")
cat("---------------\n")
cat("• Method: 5-fold cross-validation\n")
cat("• Selection criterion: 1-SE.Beta CV error\n")
cat("• Lambda (Beta):", sprintf("%.4f", final_model$lambda.beta), "\n")
cat("• Lambda (Theta):", sprintf("%.4f", final_model$lambda.theta), "\n\n")

cat("\nKEY FINDINGS:\n")
cat("------------\n")
cat("• meQTLs identified:", sum(rowSums(abs(Beta)) > threshold), "/", p, "SNPs\n")
cat("• SNP-CpG associations:", sum(abs(Beta) > threshold), "\n")
cat("• CpG network edges:", n_edges, "/", q*(q-1)/2, "possible\n")
cat("• Hub CpGs identified:", length(hub_cpgs), "\n\n")

cat("\nMODEL PERFORMANCE:\n")
cat("-----------------\n")
cat("• Mean prediction correlation:", sprintf("%.3f", mean(cor_per_cpg)), "\n")
cat("• Mean prediction MSE:", sprintf("%.4f", mean(mse_per_cpg)), "\n")
cat("• Stability (bootstrap):", sprintf("%.0f%%", 
    100 * nrow(stable_meqtls) / max(sum(abs(Beta) > threshold), 1)), 
    "of associations stable\n\n")

cat("\nBIOLOGICAL INSIGHTS (SIMULATED):\n")
cat("-------------------\n")
cat("• Primary affected regions: Promoters and gene bodies\n")
cat("• Network structure suggests co-regulated CpG modules\n")
cat("• Hub CpGs may represent key regulatory sites\n")