---
title: "Advanced Features"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Advanced Features}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6
)
```

```{r setup}
library(manureshed)
```

## Overview

This vignette covers advanced features for power users:

- Custom thresholds and parameters
- Parallel processing for multiple years
- State-specific analysis
- Performance optimization
- Custom workflows

## Custom Thresholds

### Understanding Thresholds

The package excludes areas with little cropland from analysis. The default is 500 hectares (1,234 acres), but you can change this:

```{r custom_thresholds, eval=FALSE}
# More restrictive (exclude more areas)
results_conservative <- run_builtin_analysis(
  scale = "huc8",
  year = 2016,
  nutrients = "nitrogen",
  cropland_threshold = 2000  # 2000 acres instead of 1234
)

# Less restrictive (include more areas)
results_liberal <- run_builtin_analysis(
  scale = "huc8", 
  year = 2016,
  nutrients = "nitrogen",
  cropland_threshold = 500   # 500 acres
)

# Compare the difference
conservative_excluded <- sum(results_conservative$agricultural$N_class == "Excluded")
liberal_excluded <- sum(results_liberal$agricultural$N_class == "Excluded")

print(paste("Conservative:", conservative_excluded, "excluded"))
print(paste("Liberal:", liberal_excluded, "excluded"))
```

### Automatic Threshold Calculation

For HUC8 and HUC2 scales, the package calculates thresholds automatically based on county data:

```{r auto_threshold, eval=FALSE}
# Load data for threshold calculation
county_data <- load_builtin_nugis("county", 2016)
huc8_data <- load_builtin_nugis("huc8", 2016)

# Calculate appropriate threshold for HUC8
custom_threshold <- get_cropland_threshold(
  scale = "huc8",
  county_data = county_data,
  target_data = huc8_data,
  baseline_ha = 750  # Use 750 ha instead of 500 ha baseline
)

print(paste("Custom threshold:", round(custom_threshold, 2), "acres"))

# Use in analysis
results <- run_builtin_analysis(
  scale = "huc8",
  year = 2016,
  nutrients = "nitrogen",
  cropland_threshold = custom_threshold
)
```

## State-Specific Analysis

### Single State Analysis

```{r state_analysis, eval=FALSE}
# Analyze specific states
iowa_results <- run_state_analysis(
  state = "IA",
  scale = "county", 
  year = 2016,
  nutrients = c("nitrogen", "phosphorus"),
  include_wwtp = TRUE
)

# Quick state analysis with maps
texas_results <- quick_state_analysis(
  state = "TX",
  scale = "huc8",
  year = 2015,
  nutrients = "nitrogen",
  create_maps = TRUE,
  create_networks = TRUE
)
```

### Multi-State Comparison

```{r multi_state, eval=FALSE}
# Compare agricultural states
corn_belt_states <- c("IA", "IL", "IN", "NE", "OH")
state_results <- list()

for (state in corn_belt_states) {
  state_results[[state]] <- run_state_analysis(
    state = state,
    scale = "county",
    year = 2016, 
    nutrients = "nitrogen",
    include_wwtp = TRUE,
    verbose = FALSE  # Reduce output
  )
}

# Compare state summaries
state_summary <- do.call(rbind, lapply(names(state_results), function(state) {
  result <- state_results[[state]]
  classes <- table(result$agricultural$N_class)
  data.frame(
    State = state,
    Total_Counties = sum(classes),
    Source_Counties = classes["Source"] %||% 0,
    Sink_Counties = sum(classes[c("Sink_Deficit", "Sink_Fertilizer")], na.rm = TRUE),
    Source_Percent = round((classes["Source"] %||% 0) / sum(classes) * 100, 1)
  )
}))

print(state_summary)
```

## Batch Processing

### Multiple Years

```{r batch_years, eval=FALSE}
# Analyze multiple years efficiently
batch_results <- batch_analysis_years(
  years = 2012:2016,
  scale = "huc8",
  nutrients = "nitrogen", 
  include_wwtp = TRUE,
  create_comparative_plots = TRUE
)

# Check which years succeeded
successful_years <- names(batch_results$results)
print(paste("Successful years:", paste(successful_years, collapse = ", ")))
```

### Parallel Processing

For faster processing of multiple analyses:

```{r parallel, eval=FALSE}
# Use multiple CPU cores
parallel_results <- batch_analysis_parallel(
  years = 2014:2016,
  n_cores = 2,  # Use 2 cores
  scale = "county",
  nutrients = "nitrogen",
  include_wwtp = TRUE
)

# Check results
successful <- sum(!sapply(parallel_results, function(x) "error" %in% names(x)))
print(paste("Successful analyses:", successful, "out of", length(parallel_results)))
```

### Enhanced Batch Analysis

For comprehensive batch analysis with full visualizations:

```{r enhanced_batch, eval=FALSE}
# Full batch analysis with all visualizations
enhanced_results <- batch_analysis_enhanced(
  years = 2015:2016,  # Use fewer years for demonstration
  scale = "county",
  nutrients = "nitrogen",
  create_all_visualizations = TRUE,
  create_comparative_plots = TRUE
)
```

## Performance Optimization

### Benchmarking

Test analysis performance:

```{r benchmarking, eval=FALSE}
# Benchmark different configurations
benchmark_results <- benchmark_analysis(
  scale = "county",
  year = 2016,
  nutrients = "nitrogen",
  n_runs = 3,
  include_wwtp = TRUE
)

print(benchmark_results)

# Compare scales
scales_to_test <- c("county", "huc8", "huc2")
benchmark_comparison <- list()

for (scale in scales_to_test) {
  benchmark_comparison[[scale]] <- benchmark_analysis(
    scale = scale,
    year = 2016,
    nutrients = "nitrogen", 
    n_runs = 2,
    include_wwtp = TRUE
  )
}

# Compare timing
timing_comparison <- data.frame(
  Scale = scales_to_test,
  Mean_Time_Seconds = sapply(benchmark_comparison, function(x) x$timing$mean),
  Memory_MB = sapply(benchmark_comparison, function(x) x$memory_mb$mean)
)

print(timing_comparison)
```

### Memory Management

```{r memory_management, eval=FALSE}
# Clear cache to free up space
clear_data_cache()

# Check package health
health_status <- health_check(verbose = TRUE)

# Test OSF connection
connection_ok <- test_osf_connection()
print(paste("OSF connection:", ifelse(connection_ok, "OK", "Failed")))
```

## Advanced Spatial Analysis

### Transition Probabilities

Analyze how nutrient classifications change across space:

```{r transition_analysis, eval=FALSE}
# Run analysis
results <- run_builtin_analysis(
  scale = "huc8",
  year = 2016,
  nutrients = "nitrogen",
  include_wwtp = TRUE
)

# Calculate centroids for spatial analysis
centroids <- add_centroid_coordinates(results$integrated$nitrogen)

# Calculate transition probabilities
agri_transitions <- calculate_transition_probabilities(
  centroids, "N_class"
)

combined_transitions <- calculate_transition_probabilities(
  centroids, "combined_N_class"
)

# Compare transition matrices
print("Agricultural transitions:")
print(agri_transitions)

print("Combined (WWTP + Agricultural) transitions:")
print(combined_transitions)

# Create network plots
create_network_plot(
  agri_transitions, "nitrogen", "Agricultural",
  "network_agricultural.png"
)

create_network_plot(
  combined_transitions, "nitrogen", "Combined", 
  "network_combined.png"
)
```

### Spatial Statistics

```{r spatial_stats, eval=FALSE}
# Calculate spatial statistics
library(sf)

# Get results as spatial data
spatial_results <- results$integrated$nitrogen

# Calculate areas of different classifications
class_areas <- spatial_results %>%
  mutate(area_km2 = as.numeric(st_area(.)) / 1000000) %>%
  st_drop_geometry() %>%
  group_by(combined_N_class) %>%
  summarise(
    count = n(),
    total_area_km2 = sum(area_km2),
    mean_area_km2 = mean(area_km2),
    .groups = 'drop'
  )

print(class_areas)

# Find neighboring relationships
neighbors <- st_touches(spatial_results)
neighbor_summary <- data.frame(
  ID = spatial_results$ID,
  N_neighbors = lengths(neighbors),
  Class = spatial_results$combined_N_class
)

print("Average neighbors by classification:")
neighbor_summary %>%
  group_by(Class) %>%
  summarise(mean_neighbors = mean(N_neighbors), .groups = 'drop')
```

## Custom Analysis Workflows

### Research-Specific Analysis

```{r custom_workflow, eval=FALSE}
# Example: Livestock-intensive regions analysis
analyze_livestock_regions <- function(states, year = 2016) {
  
  results <- list()
  
  for (state in states) {
    # Custom threshold for livestock regions
    county_data <- load_builtin_nugis("county", year)
    
    # Calculate state-specific threshold
    state_county <- county_data[substr(county_data$ID, 1, 2) == get_state_fips(state), ]
    custom_threshold <- quantile(state_county$cropland, 0.25)  # 25th percentile
    
    # Run analysis
    state_result <- run_state_analysis(
      state = state,
      scale = "county",
      year = year,
      nutrients = "nitrogen",
      include_wwtp = TRUE,
      cropland_threshold = custom_threshold,
      verbose = FALSE
    )
    
    results[[state]] <- state_result
  }
  
  return(results)
}

# Apply to livestock states
livestock_states <- c("IA", "NE", "NC", "MN")
livestock_results <- analyze_livestock_regions(livestock_states, 2016)
```

### Time Series Analysis

```{r time_series, eval=FALSE}
# Custom time series analysis
analyze_trends <- function(scale, years, nutrient = "nitrogen") {
  
  trend_data <- list()
  
  for (year in years) {
    # Check if WWTP data available
    use_wwtp <- year >= 2007 && year <= 2016
    
    result <- run_builtin_analysis(
      scale = scale,
      year = year,
      nutrients = nutrient,
      include_wwtp = use_wwtp,
      save_outputs = FALSE,
      verbose = FALSE
    )
    
    # Extract classification summary
    if (use_wwtp && "integrated" %in% names(result)) {
      classes <- table(result$integrated[[nutrient]][[paste0("combined_", toupper(substr(nutrient, 1, 1)), "_class")]])
    } else {
      classes <- table(result$agricultural[[paste0(toupper(substr(nutrient, 1, 1)), "_class")]])
    }
    
    trend_data[[as.character(year)]] <- list(
      year = year,
      wwtp_included = use_wwtp,
      classes = classes,
      source_count = classes["Source"] %||% 0,
      sink_count = sum(classes[c("Sink_Deficit", "Sink_Fertilizer")], na.rm = TRUE)
    )
  }
  
  return(trend_data)
}

# Analyze nitrogen trends
nitrogen_trends <- analyze_trends("huc8", 2008:2016, "nitrogen")

# Convert to data frame for plotting
trend_df <- do.call(rbind, lapply(nitrogen_trends, function(x) {
  data.frame(
    Year = x$year,
    WWTP_Included = x$wwtp_included,
    Source_Count = x$source_count,
    Sink_Count = x$sink_count,
    Total_Units = sum(x$classes)
  )
}))

print(trend_df)
```

## Data Export and Integration

### Export for Other Software

```{r data_export, eval=FALSE}
# Export for GIS software
gis_files <- export_for_gis(
  results,
  output_dir = "gis_export",
  formats = c("shapefile", "geojson")
)

# Export for publication
pub_files <- export_for_publication(
  results,
  output_dir = "publication_export",
  dpi = 600
)

# Export for policy makers
policy_files <- export_for_policy(
  results,
  output_dir = "policy_export"
)

print("Created files:")
print(c(gis_files, pub_files, policy_files))
```

### Integration with Other Packages

```{r package_integration, eval=FALSE}
# Example: Using with tigris for custom boundaries
if (requireNamespace("tigris", quietly = TRUE)) {
  # Get state boundary
  ohio_boundary <- tigris::states() %>%
    filter(STUSPS == "OH") %>%
    st_transform(5070)  # Transform to analysis CRS
  
  # Spatial filter results
  ohio_watersheds <- st_filter(results$integrated$nitrogen, ohio_boundary)
  
  print(paste("Ohio watersheds:", nrow(ohio_watersheds)))
}

# Example: Using with nhdplusTools for watershed delineation
if (requireNamespace("nhdplusTools", quietly = TRUE)) {
  # Find watershed for a specific point
  point <- st_sfc(st_point(c(-83.0458, 42.3314)), crs = 4326)  # Detroit
  watershed_info <- nhdplusTools::discover_nhdplus_id(point)
  
  # Filter results to this watershed
  if (!is.null(watershed_info$huc8)) {
    watershed_result <- results$integrated$nitrogen %>%
      filter(ID == watershed_info$huc8)
    print(watershed_result)
  }
}
```

## Quality Control

### Advanced Validation

```{r advanced_validation, eval=FALSE}
# Comprehensive quality check
validation_report <- list()

# Check data completeness
validation_report$completeness <- list(
  total_units = nrow(results$agricultural),
  missing_n_class = sum(is.na(results$agricultural$N_class)),
  excluded_percent = sum(results$agricultural$N_class == "Excluded", na.rm = TRUE) / 
                    nrow(results$agricultural) * 100
)

# Check balance calculations
if ("integrated" %in% names(results)) {
  n_data <- results$integrated$nitrogen
  validation_report$balance_check <- list(
    impossible_sources = sum(n_data$combined_N_surplus <= 0 & 
                            n_data$combined_N_class == "Source", na.rm = TRUE),
    impossible_sinks = sum(n_data$combined_N_surplus > 0 & 
                          n_data$combined_N_class %in% c("Sink_Deficit", "Sink_Fertilizer"), na.rm = TRUE)
  )
}

# Check spatial validity
if (inherits(results$agricultural, "sf")) {
  validation_report$spatial_check <- list(
    invalid_geometries = sum(!st_is_valid(results$agricultural)),
    crs = st_crs(results$agricultural)$input
  )
}

print("Validation Report:")
print(str(validation_report))
```

## Tips for Advanced Users

### Performance Tips

```{r performance_tips, eval=FALSE}
# 1. Use appropriate scales
# County: ~3000 units, good for policy analysis
# HUC8: ~2000 units, good for watershed analysis  
# HUC2: ~18 units, good for regional analysis

# 2. Manage memory for large analyses
gc()  # Garbage collection
clear_data_cache()  # Clear package cache

# 3. Use parallel processing for multiple years
# But be careful not to use too many cores

# 4. Save intermediate results
save_spatial_data(results$agricultural, "intermediate_results.rds")
```

### Reproducibility

```{r reproducibility, eval=FALSE}
# Always document your analysis parameters
analysis_params <- list(
  scale = "huc8",
  year = 2016,
  nutrients = "nitrogen", 
  cropland_threshold = 1234,
  include_wwtp = TRUE,
  analysis_date = Sys.Date(),
  package_version = packageVersion("manureshed")
)

# Save parameters with results
results$analysis_params <- analysis_params
save_analysis_summary(results, "analysis_summary.json", format = "json")
```

This covers the main advanced features. The package is designed to be flexible for power users while remaining accessible for basic analyses.