## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  eval = FALSE  # Set to TRUE when frameworks are available
)

## ----setup--------------------------------------------------------------------
# library(leakr)

## ----caret_basic--------------------------------------------------------------
# # Load required libraries
# library(caret)
# 
# # Prepare iris data for caret
# data(iris)
# set.seed(123)
# 
# # Create train/test split using caret
# train_index <- createDataPartition(iris$Species, p = 0.8, list = FALSE)
# train_data <- iris[train_index, ]
# test_data <- iris[-train_index, ]
# 
# # Train a model using caret
# model_fit <- train(
#   Species ~ .,
#   data = train_data,
#   method = "rf",
#   trControl = trainControl(method = "cv", number = 5)
# )
# 
# # Use leakr to audit the caret model
# caret_audit <- leakr_from_caret(
#   train_obj = model_fit,
#   original_data = iris,
#   target_name = "Species"
# )
# 
# print(caret_audit)

## ----caret_preprocessing------------------------------------------------------
# # Example with preprocessing steps that might introduce leakage
# set.seed(456)
# 
# # Create a more complex dataset
# complex_data <- data.frame(
#   feature1 = rnorm(200),
#   feature2 = rnorm(200, 50, 10),
#   feature3 = sample(c("A", "B", "C"), 200, replace = TRUE),
#   target = factor(sample(c("positive", "negative"), 200, replace = TRUE))
# )
# 
# # Add missing values to demonstrate preprocessing
# complex_data$feature1[sample(1:200, 20)] <- NA
# complex_data$feature2[sample(1:200, 15)] <- NA
# 
# # Create train/test split
# train_idx <- createDataPartition(complex_data$target, p = 0.7, list = FALSE)
# train_complex <- complex_data[train_idx, ]
# test_complex <- complex_data[-train_idx, ]
# 
# # Define preprocessing with potential leakage risks
# preprocess_recipe <- preProcess(
#   train_complex[, -4],  # Exclude target
#   method = c("center", "scale", "medianImpute")
# )
# 
# # Train model with preprocessing
# model_complex <- train(
#   target ~ .,
#   data = train_complex,
#   method = "glm",
#   preProcess = c("center", "scale", "medianImpute"),
#   trControl = trainControl(method = "cv", number = 3)
# )
# 
# # Audit the complex workflow
# complex_audit <- leakr_from_caret(
#   train_obj = model_complex,
#   original_data = complex_data,
#   target_name = "target"
# )
# 
# # Generate detailed summary
# caret_summary <- leakr_summarise(complex_audit, show_config = TRUE)
# print(caret_summary)

## ----mlr3_basic---------------------------------------------------------------
# # Load mlr3 components
# library(mlr3)
# library(mlr3learners)
# 
# # Create an mlr3 task
# iris_task <- TaskClassif$new(
#   id = "iris_classification",
#   backend = iris,
#   target = "Species"
# )
# 
# # Use leakr to audit the mlr3 task
# mlr3_audit <- leakr_from_mlr3(
#   task = iris_task,
#   include_target = TRUE
# )
# 
# print(mlr3_audit)

## ----mlr3_advanced------------------------------------------------------------
# library(mlr3pipelines)
# 
# # Create a more complex dataset for demonstration
# titanic_like <- data.frame(
#   age = c(rnorm(100, 35, 10), rep(NA, 20)),
#   fare = c(rnorm(100, 50, 20), rep(NA, 20)),
#   sex = sample(c("male", "female"), 120, replace = TRUE),
#   class = sample(c("1st", "2nd", "3rd"), 120, replace = TRUE),
#   survived = factor(sample(c("yes", "no"), 120, replace = TRUE)),
#   stringsAsFactors = TRUE
# )
# 
# # Create task
# survival_task <- TaskClassif$new(
#   id = "survival_prediction",
#   backend = titanic_like,
#   target = "survived"
# )
# 
# # Create preprocessing pipeline that might introduce leakage
# preprocessing_pipeline <- po("imputehist") %>>%  # Imputation
#   po("scale") %>>%                               # Scaling
#   po("encode")                                   # Factor encoding
# 
# # Create full pipeline with learner
# full_pipeline <- preprocessing_pipeline %>>%
#   po("learner", lrn("classif.rpart"))
# 
# # Convert to learner
# pipeline_learner <- as_learner(full_pipeline)
# 
# # Audit the mlr3 pipeline
# pipeline_audit <- leakr_from_mlr3(
#   task = survival_task,
#   include_target = TRUE
# )
# 
# # Detailed analysis
# mlr3_summary <- leakr_summarise(pipeline_audit, top_n = 8)
# print(mlr3_summary)

## ----tidymodels_basic---------------------------------------------------------
# # Load tidymodels components
# library(tidymodels)
# 
# # Create initial split
# data(iris)
# set.seed(789)
# iris_split <- initial_split(iris, prop = 0.8, strata = Species)
# 
# # Create recipe
# iris_recipe <- recipe(Species ~ ., data = training(iris_split)) %>%
#   step_normalize(all_numeric_predictors()) %>%
#   step_dummy(all_nominal_predictors())
# 
# # Create model specification
# iris_model <- rand_forest(mode = "classification") %>%
#   set_engine("ranger")
# 
# # Create workflow
# iris_workflow <- workflow() %>%
#   add_recipe(iris_recipe) %>%
#   add_model(iris_model)
# 
# # Use leakr to audit the tidymodels workflow
# tidymodels_audit <- leakr_from_tidymodels(
#   workflow = iris_workflow,
#   data = iris
# )
# 
# print(tidymodels_audit)

## ----tidymodels_advanced------------------------------------------------------
# # Create a dataset with potential feature engineering leakage
# set.seed(987)
# engineering_data <- data.frame(
#   customer_id = 1:300,
#   purchase_amount = rlnorm(300, 3, 1),
#   days_since_last = rpois(300, 30),
#   category = sample(c("electronics", "clothing", "books"), 300, replace = TRUE),
#   month = sample(1:12, 300, replace = TRUE),
#   will_return = factor(sample(c("yes", "no"), 300, replace = TRUE, prob = c(0.3, 0.7)))
# )
# 
# # Add potential leakage: customer_lifetime_value (future information)
# engineering_data$customer_lifetime_value <-
#   ifelse(engineering_data$will_return == "yes",
#          engineering_data$purchase_amount * runif(300, 2, 5),
#          engineering_data$purchase_amount * runif(300, 0.5, 1.5))
# 
# # Create data split
# engineering_split <- initial_split(engineering_data, prop = 0.8, strata = will_return)
# 
# # Create comprehensive recipe with potential leakage sources
# engineering_recipe <- recipe(will_return ~ ., data = training(engineering_split)) %>%
#   update_role(customer_id, new_role = "ID") %>%
#   step_log(purchase_amount, customer_lifetime_value) %>%
#   step_normalize(all_numeric_predictors()) %>%
#   step_dummy(all_nominal_predictors()) %>%
#   step_interact(terms = ~ purchase_amount:days_since_last) %>%
#   step_pca(all_numeric_predictors(), num_comp = 5)
# 
# # Create model specification
# engineering_model <- logistic_reg() %>%
#   set_engine("glm")
# 
# # Create workflow
# engineering_workflow <- workflow() %>%
#   add_recipe(engineering_recipe) %>%
#   add_model(engineering_model)
# 
# # Audit the complex tidymodels workflow
# complex_tidymodels_audit <- leakr_from_tidymodels(
#   workflow = engineering_workflow,
#   data = engineering_data
# )
# 
# # Generate detailed summary
# tidymodels_summary <- leakr_summarise(
#   complex_tidymodels_audit,
#   top_n = 10,
#   show_config = TRUE
# )
# 
# print(tidymodels_summary)

## ----import_integration-------------------------------------------------------
# # Import data with automatic leakage checking
# # This would typically be used with real files
# example_data <- data.frame(
#   id = 1:100,
#   feature1 = rnorm(100),
#   feature2 = sample(letters[1:5], 100, replace = TRUE),
#   target = factor(sample(c("A", "B"), 100, replace = TRUE))
# )
# 
# # Simulate importing and auditing in one step
# imported_audit <- leakr_audit(
#   data = example_data,
#   target = "target",
#   id = "id"
# )
# 
# # Quick import function (simulated)
# leakr_quick_audit <- function(data_path, target, ...) {
#   # In practice, this would use leakr_import() followed by leakr_audit()
#   # data <- leakr_import(data_path, ...)
#   # audit <- leakr_audit(data, target = target)
#   # return(list(data = data, audit = audit))
# 
#   # For demonstration
#   return(imported_audit)
# }

## ----export_integration-------------------------------------------------------
# # Export data along with audit reports
# export_config <- list(
#   include_audit_report = TRUE,
#   format = "comprehensive",
#   generate_summary = TRUE
# )
# 
# # This would export both data and audit results
# # leakr_export_data(
# #   data = example_data,
# #   file_path = "audited_dataset",
# #   audit_report = imported_audit,
# #   config = export_config
# # )

## ----snapshots----------------------------------------------------------------
# # Create snapshot of current data state
# snapshot_info <- leakr_create_snapshot(
#   data = example_data,
#   name = "baseline_data",
#   metadata = list(
#     created_by = "data_scientist",
#     purpose = "baseline_analysis",
#     version = "1.0"
#   )
# )
# 
# # List available snapshots
# available_snapshots <- leakr_list_snapshots()
# print(available_snapshots)
# 
# # Load previous snapshot for comparison
# # previous_data <- leakr_load_snapshot("baseline_data")

## ----pre_training-------------------------------------------------------------
# # Complete pre-training validation workflow
# validate_before_training <- function(data, target, test_split = 0.2) {
#   # Step 1: Basic data validation
#   validated_data <- validate_and_preprocess_data(
#     data = data,
#     target = target,
#     split = NULL,
#     id = NULL,
#     config = list(remove_empty_cols = TRUE)
#   )
# 
#   # Step 2: Create train/test split
#   set.seed(42)
#   n <- nrow(validated_data)
#   train_indices <- sample(1:n, (1 - test_split) * n)
#   split_vector <- rep("test", n)
#   split_vector[train_indices] <- "train"
# 
#   # Step 3: Comprehensive leakage audit
#   audit_report <- leakr_audit(
#     data = validated_data,
#     target = target,
#     split = split_vector
#   )
# 
#   # Step 4: Check for blocking issues
#   critical_issues <- length(audit_report$issues[
#     sapply(audit_report$issues, function(x) x$severity == "high")
#   ])
# 
#   if (critical_issues > 0) {
#     warning(paste("Found", critical_issues, "critical issues. Review before training."))
#   }
# 
#   return(list(
#     data = validated_data,
#     split = split_vector,
#     audit = audit_report,
#     safe_to_train = critical_issues == 0
#   ))
# }
# 
# # Example usage
# # validation_result <- validate_before_training(your_data, "target_column")
# # if (validation_result$safe_to_train) {
# #   # Proceed with model training
# # }

## ----post_training------------------------------------------------------------
# # Post-training comprehensive audit
# post_training_audit <- function(model_object, framework = "caret") {
# 
#   audit_result <- switch(framework,
#     "caret" = leakr_from_caret(model_object),
#     "mlr3" = leakr_from_mlr3(model_object),
#     "tidymodels" = leakr_from_tidymodels(model_object),
#     stop("Unsupported framework")
#   )
# 
#   # Generate comprehensive summary
#   summary_report <- leakr_summarise(
#     audit_result,
#     top_n = 15,
#     show_config = TRUE
#   )
# 
#   return(list(
#     audit = audit_result,
#     summary = summary_report
#   ))
# }

## ----continuous_monitoring----------------------------------------------------
# # Set up continuous monitoring for production data
# setup_leakage_monitoring <- function(data_source, target, schedule = "daily") {
# 
#   monitor_config <- list(
#     alert_threshold = "medium",  # Alert on medium+ severity issues
#     notification_email = "data-team@company.com",
#     generate_plots = TRUE,
#     archive_reports = TRUE
#   )
# 
#   # This would typically integrate with a scheduler like cron
#   monitoring_function <- function() {
#     # Load current data
#     current_data <- data_source()  # Function to fetch current data
# 
#     # Run audit
#     current_audit <- leakr_audit(
#       data = current_data,
#       target = target,
#       config = monitor_config
#     )
# 
#     # Check for issues requiring attention
#     medium_high_issues <- length(current_audit$issues[
#       sapply(current_audit$issues, function(x) x$severity %in% c("medium", "high"))
#     ])
# 
#     if (medium_high_issues > 0) {
#       # Send alert (implementation would depend on your notification system)
#       message(paste("Leakage monitoring alert:", medium_high_issues, "issues detected"))
#     }
# 
#     # Archive report with timestamp
#     timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
#     # save(current_audit, file = paste0("audit_", timestamp, ".RData"))
# 
#     return(current_audit)
#   }
# 
#   return(monitoring_function)
# }
# 
# # Example setup
# # monitor <- setup_leakage_monitoring(
# #   data_source = function() { read.csv("daily_data.csv") },
# #   target = "outcome"
# # )
# # daily_audit <- monitor()

## ----memory_efficient---------------------------------------------------------
# # Configuration for large-scale processing
# large_scale_config <- list(
#   sample_size = 10000,           # Limit memory usage
#   chunk_processing = TRUE,        # Process in chunks
#   parallel_detectors = FALSE,     # Disable if memory constrained
#   save_intermediate = TRUE,       # Save intermediate results
#   cleanup_temp = TRUE            # Clean up temporary objects
# )
# 
# # Process very large dataset efficiently
# process_large_dataset <- function(data_path, target, config = large_scale_config) {
# 
#   # Process in chunks if dataset is too large to fit in memory
#   if (file.size(data_path) > 1e9) {  # > 1GB
#     # Implement chunked processing
#     message("Large dataset detected, using chunked processing")
# 
#     # This would implement actual chunked reading and processing
#     # chunk_results <- process_in_chunks(data_path, target, config)
#     # combined_audit <- combine_audit_results(chunk_results)
#     # return(combined_audit)
#   } else {
#     # Standard processing for manageable datasets
#     data <- read.csv(data_path)
#     return(leakr_audit(data, target = target, config = config))
#   }
# }

## ----config_management--------------------------------------------------------
# # Create environment-specific configurations
# development_config <- list(
#   sample_size = 1000,
#   generate_plots = TRUE,
#   detailed_logging = TRUE
# )
# 
# production_config <- list(
#   sample_size = 10000,
#   generate_plots = FALSE,
#   detailed_logging = FALSE,
#   alert_on_issues = TRUE
# )
# 
# testing_config <- list(
#   sample_size = 500,
#   run_all_detectors = TRUE,
#   strict_thresholds = TRUE
# )

## ----error_handling-----------------------------------------------------------
# # Robust error handling for production environments
# safe_audit <- function(data, target, ...) {
#   tryCatch({
#     audit_result <- leakr_audit(data, target = target, ...)
# 
#     # Log successful audit
#     message(paste("Audit completed successfully at", Sys.time()))
# 
#     return(list(
#       success = TRUE,
#       audit = audit_result,
#       timestamp = Sys.time()
#     ))
# 
#   }, error = function(e) {
#     # Log error details
#     error_msg <- paste("Audit failed:", e$message)
#     warning(error_msg)
# 
#     return(list(
#       success = FALSE,
#       error = error_msg,
#       timestamp = Sys.time()
#     ))
#   })
# }

