## ----include = FALSE----------------------------------------------------------
# Hidden setup chunk: chunk options applied to the rest of the vignette.
knitr::opts_chunk$set(
  # How code and its printed output are laid out in the rendered document.
  collapse = TRUE, comment = "#>",
  # Default figure dimensions.
  fig.width = 7, fig.height = 5,
  # Suppress messages and warnings in the rendered output.
  message = FALSE, warning = FALSE
)

## -----------------------------------------------------------------------------
library(quickOutlier)
library(ggplot2)

## -----------------------------------------------------------------------------
# Build a one-column data frame: 50 standard-normal draws (seeded for
# reproducibility) with a single extreme value, 100, appended as the last row.
set.seed(123)
normal_draws <- rnorm(50)
df <- data.frame(val = c(normal_draws, 100))

# Run the package's detector on column `val` with method "zscore" and a
# threshold of 3, then print the first rows of the result.
outliers <- detect_outliers(
  df,
  "val",
  method = "zscore",
  threshold = 3
)
print(head(outliers, n = 6L))

## -----------------------------------------------------------------------------
# Plot column `val` of `df` using the "zscore" method. The call is a bare
# top-level expression, so whatever it returns is auto-printed (rendered) when
# the chunk runs.
# NOTE(review): no `threshold` is passed here, so the function's default
# applies -- confirm it matches the 3 used in the detection call.
plot_outliers(df, "val", method = "zscore")

## -----------------------------------------------------------------------------
# Scan the entire dataframe
# Runs scan_data() on the built-in `mtcars` dataset with method = "iqr"; the
# result is auto-printed.
# NOTE(review): presumably every numeric column is checked -- confirm against
# the scan_data() documentation.
scan_data(mtcars, method = "iqr")

## -----------------------------------------------------------------------------
# Twenty points lying exactly on the line y = x ...
baseline <- data.frame(x = seq_len(20), y = seq_len(20))
# ... plus one appended point, (5, 20), that sits far off that line.
anomaly <- data.frame(x = 5, y = 20)
df_multi <- rbind(baseline, anomaly)

# Run the multivariate detector on both columns and show the last three rows
# of its result (the anomalous point was appended as row 21 of the input).
res_multi <- detect_multivariate(df_multi, c("x", "y"))
tail(res_multi, n = 3)

## -----------------------------------------------------------------------------
# Interactive plot of `x` against `y` from `df_multi`; the return value is
# auto-printed so it renders in the document.
# Lower confidence level to make it more sensitive for the demo
# NOTE(review): 0.99 is only "lower" if the function's default is higher --
# confirm against the plot_interactive() documentation.
plot_interactive(df_multi, "x", "y", confidence_level = 0.99)

## -----------------------------------------------------------------------------
# Use the same multi-dimensional data
# k = number of neighbors to consider
# threshold = 1.5 is the cutoff passed to the detector.
# NOTE(review): presumably applied to a local-outlier-factor style score (the
# result is named `res_lof`) -- confirm against the function's documentation.
res_lof <- detect_density(df_multi, k = 5, threshold = 1.5)
# Auto-print the full result.
res_lof

## -----------------------------------------------------------------------------
# Simulate a 2D cloud of 100 rows from two standard-normal features. No new
# seed is set here, so the draws continue the random stream seeded earlier in
# the script.
n_obs <- 100
data_ml <- data.frame(feat1 = rnorm(n_obs), feat2 = rnorm(n_obs))
# Overwrite the first row with a point far away from the cloud.
data_ml[1, ] <- c(10, 10)

# Run Isolation Forest
# ntrees = 100 is standard. contamination = 0.05 means we expect ~5% outliers.
res_if <- detect_iforest(
  data_ml,
  ntrees = 100,
  contamination = 0.05
)

# Keep only the rows flagged as outliers and show the first few of them.
flagged_if <- subset(res_if, Is_Outlier == TRUE)
head(flagged_if)

## -----------------------------------------------------------------------------
# Synthetic series of 60 points: a sine wave over [1, 10] plus Gaussian noise
# (sd = 0.1), with one injected spike.
n_points <- 60
time_idx <- seq(1, 10, length.out = n_points)
y <- sin(time_idx) + rnorm(n_points, sd = 0.1)
y[30] <- 5 # Spike (Outlier) at position 30

# Detect using STL Decomposition
res_ts <- detect_ts_outliers(y, frequency = 12)

# Show only the observations flagged as outliers.
flagged_ts <- subset(res_ts, Is_Outlier == TRUE)
flagged_ts

## -----------------------------------------------------------------------------
# Toy categorical vector of 13 entries: "Madrid" appears 10 times (about 77%),
# while "Barcalona", "Barcelona" and "MAdrid" appear once each (about 7.7%).
# NOTE(review): the misspelled / mis-cased variants look deliberate -- they are
# the rare levels this demo is meant to surface -- so do not "correct" them.
cities <- c(rep("Madrid", 10), "Barcalona", "Barcelona", "MAdrid")
# min_freq = 0.1; presumably levels whose relative frequency falls below 10%
# are reported -- confirm against the function's documentation.
detect_categorical_outliers(cities, min_freq = 0.1)

## -----------------------------------------------------------------------------
# Work on a copy of mtcars and overwrite its first row with wt = 10 and
# mpg = 50, both beyond the values found in the original dataset, to create a
# high leverage point.
cars_df <- mtcars
cars_df[1, "wt"] <- 10
cars_df[1, "mpg"] <- 50

# Run the influence diagnostics with "mpg" and "wt" as the two variables, then
# list the first rows marked as influential.
infl <- diagnose_influence(cars_df, "mpg", "wt")
influential_rows <- subset(infl, Is_Influential == TRUE)
head(influential_rows)

## -----------------------------------------------------------------------------
# Five small values followed by one extreme value (100).
raw_values <- c(1, 2, 3, 2, 1, 100)
df_treat <- data.frame(val = raw_values)

# Cap values at 1.5 * IQR
df_clean <- treat_outliers(
  df_treat,
  "val",
  method = "iqr",
  threshold = 1.5
)
# Print the treated column.
print(df_clean$val)

