Skip to contents

4 years, 2 regions and 2 outliers (28 provinces)

In this toy example, we filter the dataset on Vietnam temperature to consider only the two northern regions and two provinces from the south, from 2013 to 2016, for a total of 112 observations. The aim is to study the influence of the preprocessing parameters.

Data presentation

data(vietnam_provinces)
data(vietnam_regions)

vietnam_provinces |>
  mutate(
    province = if_else(region %in% c("NMM", "RRD") | province %in% c("ANGIANG", "BACLIEU"), province, NA),
    region = if_else(region %in% c("NMM", "RRD", "MDR"), region, NA)
  ) |>
  left_join(vietnam_regions, by = c("region" = "code")) |>
  select(!region) |>
  rename(region = name) |>
  ggplot(aes(fill = region, label = province)) +
  geom_sf() +
  geom_sf_label(fill = "white", size = 3) +
  scale_fill_discrete(na.translate = FALSE) +
  scale_x_continuous(breaks = seq(from = -180, to = 180, by = 2)) +
  labs(x = "longitude", y = "latitude")
Map of Vietnam showing the 63 provinces, color-coded by region. The 28 provinces included in the toy example are labeled.

Map of Vietnam showing the 63 provinces, color-coded by region. The 28 provinces included in the toy example are labeled.

# Filter the data set
data(vietnam_temperature)
selected_years <- 2013:2016

vnt <- vietnam_temperature |>
  filter(year %in% selected_years) |>
  filter(region %in% c("NMM", "RRD") | province %in% c("ANGIANG", "BACLIEU")) |>
  left_join(vietnam_regions, by = c("region" = "code")) |>
  select(!region) |>
  rename(region = name) |>
  arrange(region, province)

vnts <- vnt |>
  mutate(t_max = as_dd(t_max,
    lambda = 1, nbasis = 12, mc.cores = 12
  ))

vnts |>
  plot_funs(t_max, color = region) +
  theme_legend_inside +
  labs(x = "temperature (deg. Celsius)", y = "density")
Densities of maximum temperature over 28 provinces of Vietnam and during the years 2013-2016, color-coded by region.

Densities of maximum temperature over 28 provinces of Vietnam and during the years 2013-2016, color-coded by region.


vnts |>
  mutate(t_max = as.fd(center(t_max))) |>
  plot_funs(t_max, color = region) +
  theme(legend.position = "none") +
  labs(x = "temperature (deg. Celsius)", y = "centred clr(density)")
Centred log-ratio transforms of the centred densities of maximum temperature over 28 provinces of Vietnam and during the years 2013-2016, color-coded by region.

Centred log-ratio transforms of the centred densities of maximum temperature over 28 provinces of Vietnam and during the years 2013-2016, color-coded by region.


vnts |>
  mutate(
    t_max = as.fd(t_max),
    year = ifelse(province == "LAICHAU", year, "not LAICHAU"),
    alpha = ifelse(province == "LAICHAU", 1, 0.85)
  ) |>
  plot_funs(t_max, color = year, alpha = alpha) +
  labs(x = "temperature (deg. Celsius)", y = "clr(density)") +
  guides(alpha = "none")
Centred log-ratio transforms of the densities of maximum temperature over 28 provinces of Vietnam and during the years 2013-2016. The case of LAICHAU.

Centred log-ratio transforms of the densities of maximum temperature over 28 provinces of Vietnam and during the years 2013-2016. The case of LAICHAU.

Influence of the preprocessing parameters

Let us create a grid of parameters and define the function to call for each parameter.

smooth_ICS <- function(vnt, knots_pos, nknots, lambda, index) {
  vnts <- vnt |>
    mutate(t_max = as_dd(t_max,
      lambda = lambda, nbasis = 5 + nknots,
      mc.cores = parallel::detectCores() - 1, knots_pos = knots_pos
    ))
  knots_pos_str <- ifelse(knots_pos == "quantiles",
    "Knots at quantiles", "Equally spaced knots"
  )
  index_str <- ifelse(is.na(index), "Auto selection of components",
    paste("# of components:", index)
  )

  print(paste(
    "# of knots:", nknots, "|",
    "log10(lambda):", log10(lambda), "|",
    knots_pos_str, "|",
    index_str
  ))

  if (is.na(index)) {
    list(
      dd = vnts$t_max,
      ics = tryCatch(ICS(as.fd(vnts$t_max)), error = function(e) e),
      ics_out = tryCatch(ICS_outlier(as.fd(vnts$t_max),
        n_cores = parallel::detectCores() - 1
      ), error = function(e) e)
    )
  } else {
    list(
      dd = vnts$t_max,
      ics = tryCatch(ICS(as.fd(vnts$t_max)), error = function(e) e),
      ics_out = tryCatch(ICS_outlier(as.fd(vnts$t_max),
        n_cores = parallel::detectCores() - 1, index = seq_len(index)
      ), error = function(e) e)
    )
  }
}
knots_pos_seq <- c("quantiles", "equally spaced")
nknots_seq <- c(5:19, seq(20, 40, 10)) - 5
lambda_seq <- 10^(-8:8)
index_seq <- 4

param_grid <- expand.grid(
  knots_pos = knots_pos_seq,
  nknots = nknots_seq,
  lambda = lambda_seq,
  index = index_seq
) |>
  arrange(nknots, knots_pos, lambda)

knitr::kable(
  tibble(
    `# of knots positions` = length(knots_pos_seq),
    `# of # of knots` = length(nknots_seq),
    `# of values for lambda` = length(lambda_seq),
    `# of # of components` = length(lambda_seq),
    `# of values for index` = length(index_seq),
    `Total grid size` = nrow(param_grid)
  )
)
# of knots positions # of # of knots # of values for lambda # of # of components # of values for index Total grid size
2 18 17 17 1 612

if (!file.exists("ICS_north_south_outliers.RData") && !file.exists("ICS_north_south.RData")) {
  system.time(
    ics_grid <- param_grid |>
      mutate(id = seq_len(n())) |>
      rowwise() |>
      mutate(res = list(smooth_ICS(vnt, knots_pos, nknots, lambda, index)))
  )
  save(ics_grid, file = "ICS_north_south.RData")
}

if (!file.exists("ICS_north_south_outliers.RData")) {
  load("ICS_north_south.RData")

  ics_grid_out <- ics_grid |>
    rowwise() |>
    filter(!(any(class(res$ics_out) == "error"))) |>
    mutate(res = list(data.frame(
      obs_id = seq_len(length((res$ics_out$outliers))),
      outlying = res$ics_out$outliers == 1,
      distance = res$ics_out$ics_distances
    ))) |>
    unnest(res)

  save(ics_grid_out, file = "ICS_north_south_outliers.RData")
}

load("ICS_north_south_outliers.RData")
ics_grid_out |>
  left_join(vnt |> mutate(obs_id = row_number()), by = c("obs_id")) |>
  filter(nknots %% 2 == 0 | nknots > 20) |>
  mutate(region = str_wrap(region, 25)) |>
  ggplot(aes(log10(lambda), obs_id, fill = region, alpha = outlying)) +
  geom_tile() +
  facet_grid(cols = vars(nknots), rows = vars(knots_pos)) +
  labs(y = "observation index")
Outlier detection using ICS on 28 provinces of Vietnam, during the years 2013-2016, over a grid of preprocessing parameters. Columns correspond to the number of inside knots (0-35).

Outlier detection using ICS on 28 provinces of Vietnam, during the years 2013-2016, over a grid of preprocessing parameters. Columns correspond to the number of inside knots (0-35).

ics_grid_summary <- ics_grid_out |>
  mutate(outlying = ifelse(outlying, 1, 0)) |>
  group_by(obs_id) |>
  summarise(n_outlying = sum(outlying), mean_distance = mean(distance)) |>
  left_join(vnt |> mutate(obs_id = row_number()), by = c("obs_id")) |>
  select(obs_id, region, province, year, n_outlying, mean_distance)

rmarkdown::paged_table(ics_grid_summary)
ics_grid_summary |>
  ggplot(aes(obs_id, n_outlying, fill = region)) +
  geom_bar(stat = "identity") +
  theme_legend_inside_right +
  labs(x = "observation index", y = "outlier detection frequency")
Summary of outlier detection using ICS on 28 provinces of Vietnam, during the years 2013-2016, over a grid of preprocessing parameters.

Summary of outlier detection using ICS on 28 provinces of Vietnam, during the years 2013-2016, over a grid of preprocessing parameters.


ics_grid_summary |>
  ggplot(aes(obs_id, mean_distance, fill = region)) +
  geom_bar(stat = "identity") +
  theme_legend_inside_right +
  labs(x = "observation index", y = "mean ICS distance")
Summary of outlier detection using ICS on 28 provinces of Vietnam, during the years 2013-2016, over a grid of preprocessing parameters.

Summary of outlier detection using ICS on 28 provinces of Vietnam, during the years 2013-2016, over a grid of preprocessing parameters.

30 years, 1 climate region (13 provinces)

Data presentation

vietnam_provinces <- vietnam_provinces |>
  arrange(region, province) |>
  mutate(climate_region = case_when(
    province %in% c("LAICHAU", "DIENBIEN", "SONLA") ~ "S1",
    province %in% c(
      "LAOCAI", "YENBAI", "HAGIANG", "CAOBANG", "TUYENQUANG",
      "BACKAN", "THAINGUYEN", "LANGSON", "BACGIANG", "QUANGNINH"
    ) ~ "S2",
    region %in% c("NMM", "RRD") ~ "S3",
    .default = NA
  ))

vietnam_provinces |>
  filter(region %in% c("RRD", "NMM")) |>
  ggplot(aes(fill = climate_region, label = province)) +
  geom_sf() +
  geom_sf_label(fill = "white") +
  scale_fill_viridis_d() +
  labs(x = "longitude", y = "latitude", fill = "climate region")
The three climate regions of northern Vietnam.

The three climate regions of northern Vietnam.

library(tidyverse)
library(sf)
data(vietnam_temperature)
data(vietnam_regions)
data(vietnam_provinces)

vntcr <- vietnam_temperature |>
  filter(region %in% c("RRD", "NMM")) |>
  dplyr::select(year, province, t_max) |>
  left_join(vietnam_provinces, by = "province") |>
  st_as_sf() |>
  dplyr::select(year, climate_region, province, t_max) |>
  filter(climate_region == "S3") |>
  arrange(year, province)

vnt <- vntcr |>
  mutate(t_max = as_dd(t_max, lambda = 10, nbasis = 10, mc.cores = 15))
class(vnt$t_max) <- c("ddl", "fdl", "list")

vnt |> plot_funs(t_max, color = province) +
  facet_wrap(vars(year)) +
  labs(x = "temperature (deg. Celsius)", y = "density")
Densities of maximum temperatures in the 13 provinces of the S3 climate region of Vietnam during the years 1987-2016.

Densities of maximum temperatures in the 13 provinces of the S3 climate region of Vietnam during the years 1987-2016.

Outlier detection using ICS

icsout <- ICS_outlier(vnt$t_max, index = 1:2, n_cores = 14)

icsout$result |> ggplot(aes(index, gen_kurtosis, group = 1)) +
  geom_line(alpha = 0.5) +
  geom_point(aes(color = index, alpha = selected), size = 3) +
  theme_legend_inside_right +
  labs(x = "", y = "ICS eigenvalues") +
  guides(color = "none")
Scree plot of ICS eigenvalues.

Scree plot of ICS eigenvalues.


icsout$result |>
  filter(selected) |>
  plot_funs(H_dual, color = index) +
  geom_hline(yintercept = 1 / diff(icsout$result$H_dual[[1]]$basis$rangeval), linetype = "dotted") +
  labs(x = "temperature (deg. Celsius)", y = "density") +
  scale_color_manual(values = scales::pal_hue()(9)[1:2]) +
  theme_legend_inside
Dual ICS eigendensities.

Dual ICS eigendensities.


h_dual_shift <- c(3 * icsout$result$H_dual)
h_dual_shift$coefs <- sweep(h_dual_shift$coefs, 1, -c(mean(vnt$t_max))$coefs)
icsout$result |>
  mutate(H_dual_shift = as.list(h_dual_shift)) |>
  filter(selected) |>
  plot_funs(H_dual_shift, color = index) +
  geom_function(fun = function(x) eval(c(mean(vnt$t_max)), x), color = "black", linetype = "dotted") +
  scale_color_manual(values = scales::pal_hue()(9)[1:2]) +
  labs(x = "temperature (deg. Celsius)", y = "density") +
  theme_legend_inside
Shifting the mean in the direction of each dual ICS eigendensity.

Shifting the mean in the direction of each dual ICS eigendensity.

vnt <- vnt |>
  mutate(
    index = seq_len(nrow(icsout$scores)),
    outlying = as.factor(icsout$outliers),
    distances = icsout$ics_distances
  ) |>
  cbind(icsout$scores)

vnt |>
  mutate(year = as.factor(ifelse(outlying == 1, year, NA))) |>
  arrange(outlying) |>
  plot_funs(t_max, color = year, alpha = outlying) +
  theme_legend_inside +
  labs(
    x = "temperature (deg. Celsius)", y = "density",
    color = "outlying\nyears"
  ) +
  scale_color_discrete(limits = function(x) ifelse(is.na(x), "", x)) +
  guides(alpha = "none")
Outlying densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

Outlying densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.


vnt |>
  mutate(
    t_max = as.fd(center(t_max)),
    year = as.factor(ifelse(outlying == 1, year, NA))
  ) |>
  arrange(outlying) |>
  plot_funs(t_max, color = year, alpha = outlying) +
  theme(legend.position = "none") +
  labs(
    x = "temperature (deg. Celsius)", y = "centred clr(density)",
    color = "outlying\nyears"
  )
Centred log-ratio transforms of the outlying centred densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

Centred log-ratio transforms of the outlying centred densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

vnt |>
  mutate(province = ifelse(outlying == 1, province, NA)) |>
  arrange(outlying) |>
  plot_funs(t_max, color = province, alpha = outlying) +
  facet_wrap(vars(year)) +
  labs(
    x = "temperature (deg. Celsius)", y = "density",
    color = "outlying\nprovinces"
  ) +
  scale_color_discrete(limits = function(x) ifelse(is.na(x), "", x)) +
  guides(alpha = "none")
Outlying densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

Outlying densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.


vnt |>
  mutate(
    t_max = as.fd(center(t_max)),
    province = ifelse(outlying == 1, province, NA)
  ) |>
  arrange(outlying) |>
  plot_funs(t_max, color = province, alpha = outlying) +
  facet_wrap(vars(year)) +
  labs(
    x = "temperature (deg. Celsius)", y = "centred clr(density)",
    color = "outlying\nprovinces"
  ) +
  scale_color_discrete(limits = function(x) ifelse(is.na(x), "", x)) +
  guides(alpha = "none")
Centred log-ratio transforms of the outlying centred densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

Centred log-ratio transforms of the outlying centred densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

vnt |>
  mutate(province = ifelse(outlying == 1, province, NA)) |>
  ggplot(aes(year, distances, color = province, label = year)) +
  geom_text() +
  scale_color_discrete(limits = function(x) ifelse(is.na(x), "", x)) +
  geom_hline(yintercept = icsout$ics_dist_cutoff) +
  labs(color = "outlying\nprovinces")
ICS distances of the densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

ICS distances of the densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

vnt |>
  mutate(province = ifelse(outlying == 1, province, NA)) |>
  ggplot(aes(IC.1, IC.2)) +
  scale_color_discrete(limits = function(x) ifelse(is.na(x), "", x)) +
  geom_text(aes(color = province, label = year)) +
  labs(color = "outlying\nprovinces")
Selected invariant coordinates of the densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

Selected invariant coordinates of the densities of maximum temperature in the provinces of the S3 climate region of Vietnam during the years 1987-2016.

Influence of the preprocessing parameters

Now let us apply ICS for outlier detection on the same dataset but with different smoothing parameters.

knots_pos_seq <- "quantiles"
nknots_seq <- 5 * 1:5
lambda_seq <- 10^(-2:2)
index_seq <- c(2, NA)

param_grid <- expand.grid(
  knots_pos = knots_pos_seq,
  nknots = nknots_seq,
  lambda = lambda_seq,
  index = index_seq
) |>
  arrange(nknots, knots_pos, lambda)

knitr::kable(
  tibble(
    `# of knots positions` = length(knots_pos_seq),
    `# of # of knots` = length(nknots_seq),
    `# of values for lambda` = length(lambda_seq),
    `# of # of components` = length(lambda_seq),
    `# of values for index` = length(index_seq),
    `Total grid size` = nrow(param_grid)
  )
)
# of knots positions # of # of knots # of values for lambda # of # of components # of values for index Total grid size
1 5 5 5 2 50

if (!file.exists("ICS_climate_regions.RData")) {
  system.time(
    ics_grid <- param_grid |>
      mutate(id = seq_len(n())) |>
      rowwise() |>
      mutate(res = list(smooth_ICS(vntcr, knots_pos, nknots, lambda, index)))
  )
  save(ics_grid, file = "ICS_climate_regions.RData")
}

load("ICS_climate_regions.RData")

ics_grid_out <- ics_grid |>
  rowwise() |>
  filter(!(any(class(res$ics_out) == "error"))) |>
  mutate(res = list(data.frame(
    obs_id = seq_len(length((res$ics_out$outliers))),
    outlying = res$ics_out$outliers == 1
  ))) |>
  unnest(res)
ics_grid_out |>
  mutate(index = ifelse(is.na(index), "auto. selection of ICs",
    paste(index, "ICs selected")
  )) |>
  ggplot(aes(log10(lambda), obs_id, fill = outlying, alpha = outlying)) +
  geom_tile() +
  scale_fill_grey(start = 1, end = 0) +
  facet_grid(cols = vars(nknots), rows = vars(index)) +
  labs(y = "observation index")
Outlier detection using ICS on the S3 climate region of Vietnam, during the years 1987-2016, over a grid of preprocessing parameters. Top: 2 invariant components selected; bottom: automatic selection through D’Agostino tests. Columns correspond to the number of inside knots (5-25).

Outlier detection using ICS on the S3 climate region of Vietnam, during the years 1987-2016, over a grid of preprocessing parameters. Top: 2 invariant components selected; bottom: automatic selection through D’Agostino tests. Columns correspond to the number of inside knots (5-25).


ics_grid_out |>
  mutate(index = ifelse(is.na(index), "auto. selection of ICs",
    paste(index, "ICs selected")
  )) |>
  rename(index_select = index) |>
  left_join(vnt |> mutate(obs_id = row_number()) |> select(!outlying), by = c("obs_id")) |>
  group_by(nknots, lambda, index_select, year) |>
  mutate(n_outlying = sum(outlying) / 13) |>
  ggplot(aes(log10(lambda), year, fill = n_outlying, alpha = n_outlying)) +
  geom_tile() +
  scale_alpha_continuous(range = c(0, 1)) +
  scale_fill_gradient(low = "white", high = "black") +
  facet_grid(cols = vars(nknots), rows = vars(index_select)) +
  labs(y = "year", fill = "outlier\nfrequency", alpha = "") +
  guides(alpha = "none")
Outlier detection using ICS on the S3 climate region of Vietnam, during the years 1987-2016, over a grid of preprocessing parameters. Top: 2 invariant components selected; bottom: automatic selection through D’Agostino tests. Columns correspond to the number of inside knots (5-25).

Outlier detection using ICS on the S3 climate region of Vietnam, during the years 1987-2016, over a grid of preprocessing parameters. Top: 2 invariant components selected; bottom: automatic selection through D’Agostino tests. Columns correspond to the number of inside knots (5-25).


ics_grid |>
  filter(is.na(index)) |>
  mutate(`number of selected ICs` = max(res$ics_out$index)) |>
  ggplot(aes(log10(lambda), `number of selected ICs`)) +
  geom_bar(stat = "identity") +
  facet_grid(cols = vars(nknots))
Outlier detection using ICS on the S3 climate region of Vietnam, during the years 1987-2016, over a grid of preprocessing parameters. Top: 2 invariant components selected; bottom: automatic selection through D’Agostino tests. Columns correspond to the number of inside knots (5-25).

Outlier detection using ICS on the S3 climate region of Vietnam, during the years 1987-2016, over a grid of preprocessing parameters. Top: 2 invariant components selected; bottom: automatic selection through D’Agostino tests. Columns correspond to the number of inside knots (5-25).

ics_grid_summary <- ics_grid_out |>
  mutate(outlying = ifelse(outlying, 1, 0)) |>
  group_by(obs_id) |>
  summarise(n_outlying = sum(outlying)) |>
  left_join(vnt |> mutate(obs_id = row_number()), by = c("obs_id")) |>
  select(obs_id, province, year, n_outlying)

rmarkdown::paged_table(ics_grid_summary)
ics_grid_summary |>
  ggplot(aes(year, n_outlying)) +
  geom_bar(stat = "identity") +
  labs(y = "outlier detection frequency")
Outlier detection using ICS on the S3 climate region of Vietnam, during the years 1987-2016, over a grid of preprocessing parameters.

Outlier detection using ICS on the S3 climate region of Vietnam, during the years 1987-2016, over a grid of preprocessing parameters.

Constraining the centred log-ratio transforms to zero on the boundaries

If we force to 0 the first and last ZB-spline coordinates of the \(\operatorname{clr}\), we project them onto the space of splines constrained to \(0\) on the boundaries of the interval.

boundary_to_zero <- function(x) {
  b <- x$basis
  p <- b$nbasis
  zbsp <- fd(to_zbsplines(inv = TRUE, coefs = diag(p - 1), basis = b), b)
  plot(as.list(zbsp))

  y <- to_zbsplines(x)
  y[c(1, p - 1), ] <- 0
  yfd <- fd(to_zbsplines(inv = TRUE, coefs = y, basis = b), b)

  list(dd = yfd, coefs = t(y[2:(p - 2), ]))
}
icsout <- ICS_outlier(boundary_to_zero(c(vnt$t_max))$coefs)

vnt |>
  mutate(
    t_max = as.list(as.fd(boundary_to_zero(c(t_max))$dd)),
    outlying = icsout$outliers == 1,
    province = ifelse(outlying == 1, province, NA)
  ) |>
  plot_funs(t_max, color = province, alpha = outlying) +
  facet_wrap(vars(year)) +
  scale_color_discrete(limits = function(x) ifelse(is.na(x), "", x)) +
  labs(
    x = "temperature (deg. Celsius)", y = "clr(density)",
    color = "outlying\nprovinces"
  )
Outlier detection using ICS on the S3 climate region of Vietnam, with centred log-ratio transforms constrained to 0 on the boundaries.

Outlier detection using ICS on the S3 climate region of Vietnam, with centred log-ratio transforms constrained to 0 on the boundaries.

Comparison with other methods

See this vignette from fdaoutlier for more details about simulation models.

schemes <- c("GP_clr", "Gumbel", "GP_margin")

simulation_gp_clr <- function(n, p) {
  sim <- fdaoutlier::simulation_model4(n, p, mu = 15, outlier_rate = 0.02)
  y <- exp(tfd(sim$data, arg = seq(0, 1, length.out = p)))

  list(
    data = as.matrix(y / tf_integrate(y)),
    true_outliers = replace(integer(n), sim$true_outliers, 1)
  )
}

dgumbel <- function(x, mu = 0, beta = 1) {
  z <- (x - mu) / beta
  exp(-z - exp(-z)) / beta
}

simulation_gumbel <- function(n, p, outlier_rate = 0.02, deterministic = TRUE) {
  tt <- seq(0, 1, length.out = p)
  a <- matrix(rnorm(n, sd = 0.01), nrow = p, ncol = n, byrow = TRUE)
  b <- matrix(rnorm(n, sd = 0.01), nrow = p, ncol = n, byrow = TRUE)
  y <- dgumbel(tt, mu = 0.25 + a, beta = 0.05 + b)
  dtt <- fdaoutlier:::determine(deterministic, n, outlier_rate)
  n_outliers <- dtt$n_outliers
  true_outliers <- dtt$true_outliers
  if (n_outliers > 0) {
    a <- matrix(rnorm(n_outliers, sd = 0.01),
      nrow = p, ncol = n_outliers, byrow = TRUE
    )
    b <- matrix(rnorm(n_outliers, sd = 0.01),
      nrow = p, ncol = n_outliers, byrow = TRUE
    )
    y[, true_outliers] <- dgumbel(tt, mu = 0.32 + a, beta = 0.05 + b)
  }
  rownames(y) <- tt
  return(list(
    data = t(y),
    true_outliers = replace(integer(n), true_outliers, 1)
  ))
}

simulation_gp_margin <- function(n, p) {
  sim <- fdaoutlier::simulation_model5(n, p, outlier_rate = 0.02)
  y <- tibble(f = split(sim$data, row(sim$data))) |>
    mutate(f = as_dd(f, lambda = 1, nbasis = 10, mc.cores = 14))
  tt <- seq(min(sim$data), max(sim$data), length.out = p)
  densities_matrix <- t(eval(c(y$f), tt))
  colnames(densities_matrix) <- tt
  list(
    data = densities_matrix,
    true_outliers = replace(integer(n), sim$true_outliers, 1)
  )
}

generate_data <- function(scheme, n = 200, p = 100) {
  print(paste("Generating", scheme, "data"))
  switch(scheme,
    "GP_clr" = simulation_gp_clr(n, p),
    "Gumbel" = simulation_gumbel(n, p),
    "GP_margin" = simulation_gp_margin(n, p)
  )
}

tibble(Scheme = schemes) |>
  mutate(sim = map(Scheme, \(x) generate_data(x, p = 200))) |>
  mutate(sim = map(sim, function(x) {
    tibble(
      data = tfd(x$data),
      `true outlier` = as.character(x$true_outliers == 1)
    )
  })) |>
  unnest(sim) |>
  arrange(`true outlier`) |>
  ggplot() +
  scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey")) +
  geom_spaghetti(aes(y = data, color = `true outlier`, linetype = `true outlier`)) +
  facet_wrap(~Scheme, scales = "free") +
  labs(y = "density")
#> [1] "Generating GP_clr data"
#> [1] "Generating Gumbel data"
#> [1] "Generating GP_margin data"
Generation schemes for density data with outliers

Generation schemes for density data with outliers

methods <- c(
  "ICS_B2",
  "ICS_L2",
  "Median_LQD",
  "Median_Wasserstein",
  "MBD_L2",
  "MBD_LQD",
  "MBD_QF",
  "MUOD_L2",
  "MUOD_LQD",
  "MUOD_QF"
)
N <- 200 # Number of repetitions per combination

method_ics_b2 <- function(data) {
  u <- fda.usc::fdata(log(pmax(data, 1e-8)), argvals = as.numeric(colnames(data))) |>
    fda.usc::fdata2fd(nbasis = 10)
  for (i in integer(3)) {
    u$coefs <- sweep(u$coefs, 2, fda::inprod(u) / diff(u$basis$rangeval))
  }
  if (any(is.nan(u$coefs))) stop("ICS preprocessing has failed for at least one density")
  ICS_outlier(u, n_cores = 14)$ics_distances
}

method_ics_l2 <- function(data) {
  f <- fda.usc::fdata(data, argvals = as.numeric(colnames(data))) |>
    fda.usc::fdata2fd(nbasis = 10)
  ICS_outlier(f, n_cores = 14)$ics_distances
}

method_deboinr <- function(data, distance) {
  density_order <- deboinr(
    x_grid = seq(0, 1, length.out = ncol(data)),
    densities_matrix = data,
    distance = distance,
    num_cores = 14
  )$density_order
  order(density_order)
}

pdf_to_quant <- function(pdf, x_grid) {
  interp_fn <- approxfun(x_grid, pdf)
  cdf <- DeBoinR:::pdf_to_cdf(pdf, x_grid, norm = TRUE)
  quant <- DeBoinR:::cdf_to_quant(cdf, x_grid)
  return(quant)
}

pdf_transform <- function(data, transform = c("id", "QF", "LQD")) {
  switch(transform,
    "id" = data,
    "QF" = t(apply(
      data, 1,
      function(f) pdf_to_quant(f, seq(0, 1, length.out = ncol(data)))
    )),
    "LQD" = t(apply(
      data, 1,
      function(f) {
        DeBoinR:::pdf_to_lqd(
          DeBoinR:::alpha_mix(f, 0.1),
          seq(0, 1, length.out = ncol(data))
        )
      }
    ))
  )
}

method_muod <- function(data) {
  res <- muod(data)$indices
  res <- apply(res, 2, function(x) (x - quantile(x, 0.75)) / IQR(x))
  apply(res, 1, max)
}

outlier_score <- function(method, data) {
  print(paste("Computing outlier scores with", method))
  tryCatch(switch(method,
    "ICS_B2" = method_ics_b2(data),
    "ICS_L2" = method_ics_l2(data),
    "Median_LQD" = method_deboinr(data, distance = "nLQD"),
    "Median_Wasserstein" = method_deboinr(data, distance = "wasserstein"),
    "MBD_L2" = -modified_band_depth(data),
    "MBD_LQD" = -modified_band_depth(pdf_transform(data, "LQD")),
    "MBD_QF" = -modified_band_depth(pdf_transform(data, "QF")),
    "MUOD_L2" = method_muod(data),
    "MUOD_LQD" = method_muod(pdf_transform(data, "LQD")),
    "MUOD_QF" = method_muod(pdf_transform(data, "QF"))
  ), error = function(e) NA)
}

compute_roc <- function(scores, true_outliers) {
  thresholds <- c(scores, max(scores) + 1)
  tibble(threshold = thresholds) |>
    mutate(
      TP = map_int(threshold, ~ sum(scores >= .x & true_outliers)),
      FP = map_int(threshold, ~ sum(scores >= .x & !true_outliers)),
      FN = map_int(threshold, ~ sum(scores < .x & true_outliers)),
      TN = map_int(threshold, ~ sum(scores < .x & !true_outliers)),
      PP = map_int(threshold, ~ sum(scores >= .x)),
      TPR = TP / (TP + FN),
      FPR = FP / (FP + TN),
    ) |>
    select(TPR, FPR, PP)
}

if (!(file.exists("ICS_comparison.RData")) & !(file.exists("ICS_comparison_data.RData"))) {
  comparison_data <- crossing(Simulation = 1:N, Scheme = schemes) |>
    mutate(sim = map(Scheme, generate_data)) |>
    crossing(Method = methods)
  save(comparison_data, file = "ICS_comparison_data.RData")
}

if (!(file.exists("ICS_comparison.RData"))) {
  load("ICS_comparison_data.RData")
  comparison <- comparison_data |>
    filter(Method %in% methods & Scheme %in% schemes) |>
    unnest_wider(sim) |>
    mutate(
      scores = map2(Method, data, outlier_score),
      roc = map2(scores, true_outliers, compute_roc)
    ) |>
    unnest(roc) |>
    select(!data)
  save(comparison, file = "ICS_comparison.RData")
}

load("ICS_comparison.RData")

# Failed score computations
comparison_failed <- comparison |>
  group_by(Scheme, Method) |>
  summarize(
    missing = sum(is.na(scores)),
    .groups = "drop"
  ) |>
  pivot_wider(names_from = Scheme, values_from = missing)

knitr::kable(comparison_failed)
Method GP_clr GP_margin Gumbel
ICS_B2 0 0 0
ICS_L2 0 0 0
MBD_L2 0 0 0
MBD_LQD 0 0 0
MBD_QF 0 0 0
MUOD_L2 0 0 0
MUOD_LQD 0 0 0
MUOD_QF 0 0 0
Median_LQD 0 0 0
Median_Wasserstein 0 0 0

# Aggregate results (averaging over simulations)
comparison_avg <- comparison |>
  separate_wider_delim(Method, "_", names = c("Method", "Transf")) |>
  group_by(Scheme, Transf, Method, PP) |>
  summarize(
    TPR_avg = mean(TPR),
    n = n(),
    TPR_conf = 1.96 * sd(TPR) / sqrt(n()),
    FPR_avg = mean(FPR),
    .groups = "drop"
  )

comparison_auc <- comparison_avg |>
  group_by(Scheme, Method, Transf) |>
  summarise(AUC = integrate(approxfun(FPR_avg, TPR_avg), 0, 1)$value)

ggplot(comparison_avg, aes(FPR_avg, TPR_avg, color = Scheme, group = Scheme)) +
  geom_line() +
  geom_ribbon(aes(
    ymin = pmax(0, TPR_avg - TPR_conf),
    ymax = pmin(1, TPR_avg + TPR_conf),
    fill = Scheme
  ), alpha = 0.1, linetype = "dotted") +
  geom_function(fun = identity, linetype = "dotted") +
  facet_grid(Transf ~ Method) +
  coord_fixed() +
  labs(
    x = "False Positive Rate (FPR)",
    y = "True Positive Rate (TPR)",
  )
ROC Curves for Outlier Detection Methods

ROC Curves for Outlier Detection Methods

10 periods, 2 regions (26 provinces)

We compute an average density by province and by period of 3 years. There are 10 periods total, and only 26 provinces in the two northern regions of Vietnam, so 260 observations total.

Data presentation

data(vietnam_temperature)
data(vietnam_regions)

vietnam_temperature <- vietnam_temperature |>
  select(!t_min) |>
  mutate(period = year - (year - min(year)) %% 3)

if (file.exists("ICS_ten_periods.RData")) {
  load("ICS_ten_periods.RData")
} else {
  vnt <- vietnam_temperature |>
    filter(region %in% c("RRD", "NMM")) |>
    mutate(t_max = as_dd(t_max, lambda = 10, nbasis = 10, mc.cores = 15)) |>
    group_by(region, province, period) |>
    summarise(t_max = list(c(mean(t_max)))) |>
    ungroup() |>
    left_join(vietnam_regions, by = c("region" = "code")) |>
    rename(region_name = name) |>
    arrange(region, province, period)
  class(vnt$t_max) <- c("ddl", "fdl", "list")
}
vnt |> plot_funs(t_max, color = province) +
  facet_wrap(vars(period)) +
  labs(x = "temperature (deg. Celsius)", y = "density")
Densities of maximum temperature in the two northern regions of Vietnam averaged over three-year periods.

Densities of maximum temperature in the two northern regions of Vietnam averaged over three-year periods.

Outlier detection using ICS

if (!(file.exists("ICS_ten_periods.RData"))) {
  icsout <- ICS_outlier(vnt$t_max, index = 1:4, n_cores = 14)

  vnt <- vnt |>
    mutate(outlying = as.factor(icsout$outliers)) |>
    cbind(icsout$scores)
  save(icsout, vnt, file = "ICS_ten_periods.RData")
}

vnt |> plot_matrix(paste0("IC.", 1:4),
  label = period,
  color = ifelse(outlying == 1, province, NA),
  alpha = outlying == 1
) +
  labs(color = "outlying\nprovinces", x = "", y = "") +
  scale_color_discrete(limits = function(x) ifelse(is.na(x), "", x)) +
  guides(alpha = "none")
Selected invariant components of the densities of maximum temperature in the two northern regions of Vietnam averaged over three-year periods.

Selected invariant components of the densities of maximum temperature in the two northern regions of Vietnam averaged over three-year periods.


vnt |>
  mutate(province = ifelse(outlying == 1, province, NA)) |>
  arrange(outlying) |>
  plot_funs(t_max, color = province, alpha = outlying) +
  facet_wrap(vars(period)) +
  labs(
    x = "temperature (deg. Celsius)", y = "density",
    color = "outlying\nprovinces"
  ) +
  scale_color_discrete(limits = function(x) ifelse(is.na(x), "", x)) +
  guides(alpha = "none")
Outlying densities of maximum temperature in the two northern regions of Vietnam averaged over three-year periods.

Outlying densities of maximum temperature in the two northern regions of Vietnam averaged over three-year periods.