This file provides all the code permitting the reproduction of examples in the article submitted More specifically, we display here the maps and curves of the data, and we seek to detect multi-pollutant clusters: 1) on the data from which we have removed the temporal component by averaging over time, 2) on the NO2 pollutant only but taking into account the temporal variability, and 3) on all pollutants by taking into account the temporal variability.
As the computation time of some spatial scan statistics can be a bit long in file “frevent.R”, we provide here a version with fewer permutations for the p-value estimation, that is 99 instead of the default value which is 999. In the generated html file, the time taken by each method is indicated at the end of each section.
library(HDSpatialScan)
set.seed(123)
####################
# Load the data #
####################
data("map_sites")
data("multi_data")
data("funi_data")
data("fmulti_data")
#########################
# Visualize the data #
#########################
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RColorBrewer)
library(sp)
mean_data_no2 <- data.frame(canton = map_sites$identifiant, mean_no2 = multi_data[,1])
mean_data_o3 <- data.frame(canton = map_sites$identifiant, mean_o3 = multi_data[,2])
mean_data_pm10 <- data.frame(canton = map_sites$identifiant, mean_pm10 = multi_data[,3])
mean_data_pm2.5 <- data.frame(canton = map_sites$identifiant, mean_pm2.5 = multi_data[,4])
map_sites@data <- left_join(map_sites@data, mean_data_no2, by = c("identifiant" = "canton"))
map_sites@data <- left_join(map_sites@data, mean_data_o3, by = c("identifiant" = "canton"))
map_sites@data <- left_join(map_sites@data, mean_data_pm10, by = c("identifiant" = "canton"))
map_sites@data <- left_join(map_sites@data, mean_data_pm2.5, by = c("identifiant" = "canton"))
# Cut the mean concentrations in 5 classes
classes_no2 <- quantile(multi_data[,1], probs = c(0,0.2,0.4,0.6,0.8,1))
classes_o3 <- quantile(multi_data[,2], probs = c(0,0.2,0.4,0.6,0.8,1))
classes_pm10 <- quantile(multi_data[,3], probs = c(0,0.2,0.4,0.6,0.8,1))
classes_pm2.5 <- quantile(multi_data[,4], probs = c(0,0.2,0.4,0.6,0.8,1))
# Choice of colors
palette <- brewer.pal(n = 5, name = "YlOrRd")
map_sites@data$classe_no2 <- as.character(cut(map_sites@data$mean_no2, breaks = classes_no2, labels = palette, include.lowest = TRUE))
map_sites@data$classe_o3 <- as.character(cut(map_sites@data$mean_o3, breaks = classes_o3, labels = palette, include.lowest = TRUE))
map_sites@data$classe_pm10 <- as.character(cut(map_sites@data$mean_pm10, breaks = classes_pm10, labels = palette, include.lowest = TRUE))
map_sites@data$classe_pm2.5 <- as.character(cut(map_sites@data$mean_pm2.5, breaks = classes_pm2.5, labels = palette, include.lowest = TRUE))
legend_no2 <- as.character(levels(cut(map_sites@data$mean_no2, breaks = classes_no2, include.lowest = TRUE)))
legend_o3 <- as.character(levels(cut(map_sites@data$mean_o3, breaks = classes_o3, include.lowest = TRUE)))
legend_pm10 <- as.character(levels(cut(map_sites@data$mean_pm10, breaks = classes_pm10, include.lowest = TRUE)))
legend_pm2.5 <- as.character(levels(cut(map_sites@data$mean_pm2.5, breaks = classes_pm2.5, include.lowest = TRUE)))
plot(map_sites, col = map_sites@data$classe_no2, border = "black", lwd = 0.5)
legend("topleft", legend = legend_no2, fill = palette, cex=1,
title = expression(paste("Average ", NO[2], " concentration in ", mu, "g/", m^3)),
bty = "n")
plot(map_sites, col = map_sites@data$classe_o3, border = "black", lwd = 0.5)
legend("topleft", legend = legend_o3, fill = palette, cex=1,
title = expression(paste("Average ", O[3], " concentration in ", mu, "g/", m^3)),
bty = "n")
plot(map_sites, col = map_sites@data$classe_pm10, border = "black", lwd = 0.5)
legend("topleft", legend = legend_pm10, fill = palette, cex=1,
title = expression(paste("Average ", PM[10], " concentration in ", mu, "g/", m^3)),
bty = "n")
plot(map_sites, col = map_sites@data$classe_pm2.5, border = "black", lwd = 0.5)
legend("topleft", legend = legend_pm2.5, fill = palette, cex=1,
title = expression(paste("Average ", PM[2.5], " concentration in ", mu, "g/", m^3)),
bty = "n")
# The curves
times <- c("2020-05-01", "2020-05-02", "2020-05-03", "2020-05-04","2020-05-05","2020-05-06","2020-05-07","2020-05-08","2020-05-09","2020-05-10","2020-05-11","2020-05-12","2020-05-13","2020-05-14","2020-05-15","2020-05-16","2020-05-17","2020-05-18","2020-05-19","2020-05-20","2020-05-21","2020-05-22","2020-05-23","2020-05-24","2020-05-25","2020-05-26","2020-05-27","2020-05-28","2020-05-29","2020-05-30","2020-05-31","2020-06-01","2020-06-02","2020-06-03","2020-06-04","2020-06-05","2020-06-06","2020-06-07","2020-06-08","2020-06-09","2020-06-10","2020-06-11","2020-06-12","2020-06-13","2020-06-14","2020-06-15","2020-06-16","2020-06-17","2020-06-18","2020-06-19","2020-06-20","2020-06-21","2020-06-22","2020-06-23","2020-06-24","2020-06-25")
temp_no2 <- matrix(nrow = length(fmulti_data), ncol = length(times))
temp_o3 <- matrix(nrow = length(fmulti_data), ncol = length(times))
temp_pm10 <- matrix(nrow = length(fmulti_data), ncol = length(times))
temp_pm2.5 <- matrix(nrow = length(fmulti_data), ncol = length(times))
for(i in 1:length(fmulti_data)){
temp_no2[i,] <- fmulti_data[[i]][1,]
temp_o3[i,] <- fmulti_data[[i]][2,]
temp_pm10[i,] <- fmulti_data[[i]][3,]
temp_pm2.5[i,] <- fmulti_data[[i]][4,]
}
mini <- min(temp_no2)
maxi <- max(temp_no2)
plot(NULL, xlim = c(1,ncol(temp_no2)), ylim = c(mini, maxi), xlab = "", xaxt = 'n', ylab = expression(paste(NO[2], " concentration in ", mu, "g/", m^3)), cex.axis = 0.85, cex.lab = 0.85)
axis(1, at = c(1:ncol(temp_no2))[which(c(1:ncol(temp_no2))%%5==1)], labels = times[which(c(1:ncol(temp_no2))%%5==1)], cex.axis = 0.85, cex.lab = 0.85, las = 3)
for(i in 1:nrow(temp_no2)){
lines(x = c(1:ncol(temp_no2)), y = temp_no2[i,], col = "black", lwd = 1)
}
mini <- min(temp_o3)
maxi <- max(temp_o3)
plot(NULL, xlim = c(1,ncol(temp_o3)), ylim = c(mini, maxi), xlab = "", xaxt = 'n', ylab = expression(paste(O[3], " concentration in ", mu, "g/", m^3)), cex.axis = 0.85, cex.lab = 0.85)
axis(1, at = c(1:ncol(temp_o3))[which(c(1:ncol(temp_o3))%%5==1)], labels = times[which(c(1:ncol(temp_o3))%%5==1)], cex.axis = 0.85, cex.lab = 0.85, las = 3)
for(i in 1:nrow(temp_o3)){
lines(x = c(1:ncol(temp_o3)), y = temp_o3[i,], col = "black", lwd = 1)
}
mini <- min(temp_pm10)
maxi <- max(temp_pm10)
plot(NULL, xlim = c(1,ncol(temp_pm10)), ylim = c(mini, maxi), xlab = "", xaxt = 'n', ylab = expression(paste(PM[10], " concentration in ", mu, "g/", m^3)), cex.axis = 0.85, cex.lab = 0.85)
axis(1, at = c(1:ncol(temp_pm10))[which(c(1:ncol(temp_pm10))%%5==1)], labels = times[which(c(1:ncol(temp_pm10))%%5==1)], cex.axis = 0.85, cex.lab = 0.85, las = 3)
for(i in 1:nrow(temp_pm10)){
lines(x = c(1:ncol(temp_pm10)), y = temp_pm10[i,], col = "black", lwd = 1)
}
mini <- min(temp_pm2.5)
maxi <- max(temp_pm2.5)
plot(NULL, xlim = c(1,ncol(temp_pm2.5)), ylim = c(mini, maxi), xlab = "", xaxt = 'n', ylab = expression(paste(PM[2.5], " concentration in ", mu, "g/", m^3)), cex.axis = 0.85, cex.lab = 0.85)
axis(1, at = c(1:ncol(temp_pm2.5))[which(c(1:ncol(temp_pm2.5))%%5==1)], labels = times[which(c(1:ncol(temp_pm2.5))%%5==1)], cex.axis = 0.85, cex.lab = 0.85, las = 3)
for(i in 1:nrow(temp_pm2.5)){
lines(x = c(1:ncol(temp_pm2.5)), y = temp_pm2.5[i,], col = "black", lwd = 1)
}
##################################################
# Apply a multivariate spatial scan statistic #
##################################################
Is the data gaussian at least for each component ?
hist(multi_data[,1])
hist(multi_data[,2])
hist(multi_data[,3])
hist(multi_data[,4])
qqnorm(multi_data[,1])
qqline(multi_data[,1], col = "red")
qqnorm(multi_data[,2])
qqline(multi_data[,2], col = "red")
qqnorm(multi_data[,3])
qqline(multi_data[,3], col = "red")
qqnorm(multi_data[,4])
qqline(multi_data[,4], col = "red")
not really : we apply the MNP
coords <- coordinates(map_sites)
time_start <- Sys.time()
res_mnp <- SpatialScan(method = "MNP", data = multi_data, sites_coord = coords, system = "WGS84", mini = 1, maxi = nrow(coords)/2, type_minimaxi = "sites/indiv", mini_post = 0, maxi_post = 10, type_minimaxi_post = "radius", nbCPU = 7, MC = 99, variable_names = c("NO2", "O3", "PM10", "PM2.5"))$MNP
## ##### Multivariate nonparametric scan statistic #####
## Computation of the scan statistic
## ---Done---
## Estimation of the statistical significance
## ---Done---
## Finalization
## ======================================================
time_stop <- Sys.time()
# print
print(res_mnp)
## MNP scan procedure
## ####################
## 8 significant clusters have been detected by the scan procedure with p-values of 0.01, 0.01, 0.01, 0.02, 0.02, 0.02, 0.02, 0.02
# see the location of the most likely cluster
plot(x = res_mnp, type = "schema", system_conv = "+init=epsg:2154", only.MLC = TRUE)
plot(x = res_mnp, type = "map", spobject = map_sites, only.MLC = TRUE)
plot(x = res_mnp, type = "map2", spobject = map_sites, only.MLC = TRUE)
# characteristics
summary(res_mnp, type_summ = "nparam", only.MLC = TRUE)
## $basic_summary
## Cluster 1
## p-value 0.010
## Radius 9.999
##
## $complete_summary
## Overall Inside cluster 1 Outside cluster 1
## Number of sites 169.000 12.000 157.000
## Q25 NO2 8.673 11.327 8.635
## Median NO2 9.183 11.721 9.075
## Q75 NO2 9.848 12.382 9.692
## Q25 O3 66.778 67.527 66.721
## Median O3 67.895 67.609 67.961
## Q75 O3 68.564 67.922 68.658
## Q25 PM10 16.397 17.483 16.205
## Median PM10 16.970 17.877 16.933
## Q75 PM10 17.372 17.962 17.266
## Q25 PM2.5 9.132 10.584 9.113
## Median PM2.5 9.833 10.678 9.790
## Q75 PM2.5 10.213 10.919 10.107
plotSummary(res_mnp, type = "median", only.MLC = TRUE)
The computation of this spatial scan statistic (MNP) took 0.23 min
###########################################################
# Apply a functional univariate spatial scan statistic #
###########################################################
time_start <- Sys.time()
res_urbfss <- SpatialScan(method = "URBFSS", data = funi_data, sites_coord = coords, system = "WGS84", mini = 1, maxi = nrow(coords)/2, type_minimaxi = "sites/indiv", mini_post = 0, maxi_post = 10, type_minimaxi_post = "radius", nbCPU = 7, MC = 99)$URBFSS
## ##### Univariate rank-based functional scan statistic #####
## Computation of the scan statistic
## ---Done---
## Estimation of the statistical significance
## ---Done---
## Finalization
## ======================================================
time_stop <- Sys.time()
# print
print(res_urbfss)
## URBFSS scan procedure
## #######################
## 3 significant clusters have been detected by the scan procedure with p-values of 0.01, 0.01, 0.01
# see the location of the most likely cluster
plot(res_urbfss, type = "map2", spobject = map_sites, only.MLC = TRUE)
# characteristics
summary(res_urbfss, type_summ = "nparam", only.MLC = TRUE)
## $basic_summary
## Cluster 1
## p-value 0.010
## Radius 9.999
##
## $complete_summary
## Overall Inside cluster 1 Outside cluster 1
## Number of sites 169.000 15.000 154.000
## Q25 6.909 8.742 6.811
## Median 9.036 11.194 8.844
## Q75 11.575 13.981 11.323
plotCurves(res_urbfss, add_median = TRUE, only.MLC = TRUE)
plotSummary(res_urbfss, type = "median", only.MLC = TRUE)
The computation of this spatial scan statistic (URBFSS) took 0.31 min
#############################################################
# Apply a functional multivariate spatial scan statistic #
#############################################################
time_start <- Sys.time()
res_mrbfss <- SpatialScan(method = "MRBFSS", data = fmulti_data, sites_coord = coords, system = "WGS84", mini = 1, maxi = nrow(coords)/2, type_minimaxi = "sites/indiv", mini_post = 0, maxi_post = 10, type_minimaxi_post = "radius", nbCPU = 7, MC = 99, variable_names = c("NO2", "O3", "PM10", "PM2.5"))$MRBFSS
## ##### Multivariate rank-based functional scan statistic #####
## Computation of the scan statistic
## ---Done---
## Estimation of the statistical significance
## ---Done---
## Finalization
## ======================================================
time_stop <- Sys.time()
# print
print(res_mrbfss)
## MRBFSS scan procedure
## #######################
## 10 significant clusters have been detected by the scan procedure with p-values of 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.03, 0.03, 0.04
# see the location of the most likely cluster
plot(res_mrbfss, type = "map2", spobject = map_sites, only.MLC = TRUE)
# characteristics
summary(res_mrbfss, type_summ = "nparam", only.MLC = TRUE)
## $basic_summary
## Cluster 1
## p-value 0.010
## Radius 9.999
##
## $complete_summary
## Overall Inside cluster 1 Outside cluster 1
## Number of sites 169.000 15.000 154.000
## Q25 NO2 6.909 8.742 6.811
## Median NO2 9.036 11.194 8.844
## Q75 NO2 11.575 13.981 11.323
## Q25 O3 58.800 57.459 58.905
## Median O3 66.787 66.247 66.853
## Q75 O3 74.723 74.924 74.698
## Q25 PM10 12.131 13.788 12.070
## Median PM10 16.092 17.565 16.014
## Q75 PM10 19.775 20.754 19.632
## Q25 PM2.5 6.212 7.566 6.129
## Median PM2.5 8.308 9.069 8.211
## Q75 PM2.5 10.926 12.140 10.829
plotCurves(res_mrbfss, add_median = TRUE, only.MLC = TRUE)
plotSummary(res_mrbfss, type = "median", only.MLC = TRUE)
The computation of this spatial scan statistic (MRBFSS) took 3.93 min
sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Gentoo/Linux
##
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.10.0
## LAPACK: /usr/lib64/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=C
## [4] LC_COLLATE=C LC_MONETARY=C LC_MESSAGES=C
## [7] LC_PAPER=C LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] sp_1.4-6 RColorBrewer_1.1-2 dplyr_1.0.9
## [4] HDSpatialScan_1.0.3
##
## loaded via a namespace (and not attached):
## [1] xfun_0.31 tidyselect_1.1.1 purrr_0.3.4
## [4] pbapply_1.5-0 sf_1.0-4 lattice_0.20-41
## [7] vctrs_0.4.1 generics_0.1.1 testthat_3.1.0
## [10] htmltools_0.5.2 utf8_1.2.2 blob_1.2.2
## [13] rlang_1.0.3 e1071_1.7-9 pillar_1.6.4
## [16] glue_1.6.2 withr_2.5.0 DBI_1.1.1
## [19] fmsb_0.7.0 matrixStats_0.61.0 lifecycle_1.0.1
## [22] stringr_1.4.0 rgeos_0.5-8 htmlwidgets_1.5.3
## [25] SpatialNP_1.1-4 evaluate_0.15 knitr_1.39
## [28] fastmap_1.1.0 parallel_4.0.5 class_7.3-18
## [31] fansi_0.5.0 highr_0.9 Rcpp_1.0.7
## [34] KernSmooth_2.23-18 DT_0.16 classInt_0.4-3
## [37] plotrix_3.7-8 desc_1.4.0 pkgload_1.1.0
## [40] swfscMisc_1.4.3 mapdata_2.3.0 TeachingDemos_2.12
## [43] digest_0.6.28 stringi_1.7.5 grid_4.0.5
## [46] rprojroot_2.0.2 rgdal_1.5-28 cli_3.3.0
## [49] tools_4.0.5 magrittr_2.0.3 maps_3.4.0
## [52] proxy_0.4-26 tibble_3.1.6 crayon_1.4.2
## [55] pkgconfig_2.0.3 ellipsis_0.3.2 assertthat_0.2.1
## [58] rstudioapi_0.13 R6_2.5.1 units_0.7-2
## [61] compiler_4.0.5