---
title: "bio-oracle-5"
format: html
---
## Libraries
```{r}
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(terra)
library(biooracler)
library(sf)
library(ggplot2)
library(ggcorrplot)
library(usdm)
library(catboost)
library(DALEX)
library(pdp)
# library(ggspatial)
# library(rnaturalearth)
# library(tidyterra)
# source("./scripts/degree_labels.R")
```
## Range and study area
Load the species range from IUCN and 5° buffer to define an area of the study.
```{r}
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
bbox = ext(seal_range) |> extend(5)
bbox_vect = bbox |> as.lines(crs="EPSG:4326")
# land = ne_download(scale=110, type="land", category = "physical", returnclass = "sv")
land = vect("land.geojson")
lon_range = c(bbox$xmin, bbox$xmax)
lat_range = c(bbox$ymin, bbox$ymax)
constraints_geo = list(
longitude = lon_range,
latitude = lat_range
)
saveRDS(constraints_geo, file="constraints_geo.Rda")
```
```{r}
plot(seal_range, col="#bcbddc", xlim=c(-170, 170), ylim=c(90, -80))
plot(land, col="#f0f0f0", add=T)
lines(bbox, col="#756bb1")
```
## Bio-ORACLE
> Environmental predictors were sourced from the Bio-ORACLE v3.0 database, providing standardized global marine rasters for present-day conditions and future climate projections under CMIP6 Shared Socioeconomic Pathways (SSPs).
```{r}
all_layers = list_layers()
ids_to_remove = c(
# no projection data
# the database flaws (?)
"par_mean_baseline_2000_2020_depthsurf",
"kdpar_mean_baseline_2000_2020_depthsurf",
"chl_baseline_2000_2018_depthmax",
"chl_baseline_2000_2018_depthmean",
"chl_baseline_2000_2018_depthmin",
# nature of the variable
"terrain_characteristics"
)
layers = all_layers |>
filter(! dataset_id %in% c(ids_to_remove)) |>
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE) |>
mutate(
var_depth = paste0(var, "_", depth),
var_depth_humane = str_extract(title, ".*]") |> str_remove("Bio-Oracle ")
)
# aware that not all variables have ssp126
saveRDS(layers, "layers.Rda")
layers |> select(var_depth_humane, var_depth) |> distinct() |> print.data.frame()
```
```{r}
download_slice = function(scenario_value, decade_start, layers_to_filter) {
scenario_layers = layers_to_filter |>
filter(scenario == scenario_value)
time_point = paste0(decade_start, "-01-01T00:00:00Z")
slice_constraints = list(
time = c(time_point, time_point),
longitude = constraints_geo$longitude,
latitude = constraints_geo$latitude
)
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
slice_rasters = sapply(
scenario_layers$dataset_id,
function(id) download_layers(
id,
constraints = slice_constraints,
directory = download_dir
),
simplify = TRUE
)
return(slice_rasters)
}
```
```{r}
slice_to_brick = function(list_of_rasters) {
brick = rast(list_of_rasters)
depths = brick |> names() |> str_extract("depth[:letter:]+")
var_stat = brick |>
varnames() |>
as_tibble() |>
separate_wider_delim("value",delim="_", names=c("var", "stat"))
prev_longnames = longnames(brick)
longnames(brick) = paste0(prev_longnames, " [", depths ,"]")
names(brick) = paste(var_stat$var, depths, var_stat$stat, sep = "_")
return(brick)
}
```
## Data exploration
Feel free to skip this step as it shows the logic behind the layers selected for
analysis.
### Download
```{r eval=FALSE}
baseline_rasters = download_slice("baseline", 2010, layers)
```
```{r}
baseline_brick = slice_to_brick(baseline_rasters)
```
300 hundred layers seem too many for a controlled analysis
```{r}
nlyr(baseline_brick)
```
### Filter by ecology
Knowing smth about the species lets clean up variables before any formal analysis of variables releations
```{r}
filtered_layers = tibble(
names = names(baseline_brick),
longnames = longnames(baseline_brick)
) |>
separate_wider_delim(
"names",
delim="_",
names=c("var", "depth", "stat"),
cols_remove=F
) |>
filter(
!(
depth %in% c("depthmax", "depthmean") |
var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") |
stat %in% c("ltmin", "ltmax", "range")
)
)
baseline_brick_subset = baseline_brick |>
subset(filtered_layers$names)
filtered_layers |> select(longnames) |> print.data.frame()
```
### Sample for correlation analysis
```{r}
ten_percent_cells = nrow(baseline_brick_subset) * ncol(baseline_brick_subset) * 0.1
baseline_brick_subset_sample = baseline_brick_subset |>
spatSample(size=ten_percent_cells, method="regular", na.rm=TRUE)
```
#### Initial correlation
```{r}
corr_matrix = cor(baseline_brick_subset_sample)
```
```{r}
ggcorrplot(corr_matrix,
type = "lower", # Only show half (it's symmetrical anyway)
outline.col = "white",
colors = c("#6D9EC1", "white", "#E46726"),
lab = FALSE) + # don't label values
theme(axis.text.x = element_text(size = 7, angle = 90),
axis.text.y = element_text(size = 7))
```
```{r}
high_cor_pairs <- corr_matrix |>
as.data.frame() |>
rownames_to_column("Var1") |>
pivot_longer(-Var1, names_to = "Var2", values_to = "value") |> # 900 total pairs
# Var1 < Var2 removes self-correlation AND picks only one of the AB/BA pairs
filter(abs(value) > 0.8 & Var1 < Var2) |>
mutate(value = round(value, 3)) |>
arrange(desc(abs(value)))
# 59 are highly correlated
print.data.frame(high_cor_pairs)
```
#### Variance Inflation Factor
> It calculates how much one variable can be predicted by a linear combination of all other variables.
```{r}
vif_results = vifstep(baseline_brick_subset_sample, th = 10)
```
```{r}
vars_to_keep = vif_results@results$Variables
vif_results@results
```
Then check the correlations of variables filtered by VIF step
```{r}
baseline_brick_subset_sample_vif = baseline_brick_subset_sample |>
select(all_of(vars_to_keep))
corr_matrix_vif = cor(baseline_brick_subset_sample_vif)
ggcorrplot(corr_matrix_vif,
outline.col = "white",
colors = c("#6D9EC1", "white", "#E46726"),
lab = FALSE) + # don't label values
theme(axis.text.x = element_text(size = 7, angle = 90),
axis.text.y = element_text(size = 7))
```
```{r}
high_cor_pairs_vif <- corr_matrix_vif |>
as.data.frame() |>
rownames_to_column("Var1") |>
pivot_longer(-Var1, names_to = "Var2", values_to = "value") |> # 144 total pairs
# Var1 < Var2 removes self-correlation AND picks only one of the AB/BA pairs
filter(abs(value) > 0.8 & Var1 < Var2) |>
mutate(value = round(value, 3)) |>
arrange(desc(abs(value)))
# 3 are highly correlated
print.data.frame(high_cor_pairs_vif)
```
Having high correlation pairs and VIF values we manually select variables we can interpret
```{r}
manually_selected_vars = c(
"siconc_depthsurf_min",
"thetao_depthsurf_min",
"thetao_depthmin_max",
"chl_depthsurf_mean",
"phyc_depthmin_max"
)
```
Then again check correlation
```{r}
baseline_brick_subset_sample_manual = baseline_brick_subset_sample |>
select(all_of(manually_selected_vars))
corr_matrix_manual = cor(baseline_brick_subset_sample_manual)
ggcorrplot(corr_matrix_manual,
outline.col = "white",
colors = c("#6D9EC1", "white", "#E46726"),
lab = FALSE) + # don't label values
theme(axis.text.x = element_text(size = 7, angle = 90),
axis.text.y = element_text(size = 7))
```
```{r}
selected_layers = filtered_layers |>
filter(names %in% manually_selected_vars)
saveRDS(selected_layers, file="selected_layers.Rda")
```
## Learning
### Input layers
Filter layers based on selected layers info.
```{r}
layers = readRDS("layers.Rda")
selected_layers = readRDS("selected_layers.Rda")
constraints_geo = readRDS("constraints_geo.Rda")
features_layers = inner_join(selected_layers, layers, by=c("var", "depth"))
```
```{r}
baseline_features_rasters = download_slice("baseline", 2010, features_layers)
```
Set up features raster brick
```{r}
baseline_features_brick = slice_to_brick(baseline_features_rasters) |>
subset(c(selected_layers$names))
```
And target raster
```{r}
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
ocean_mask = ifel(is.na(baseline_features_brick[[1]]), NA, 1)
```
```{r}
seal_range_raster = rasterize(
seal_range,
baseline_features_brick[[1]],
field = "",
background = 0
) |>
mask(ocean_mask)
```
```{r}
plot(c(seal_range_raster, baseline_features_brick))
```
### Spatial blocks
```{r}
ROWS = 3
COLUMNS = 5
nblocks = ROWS * COLUMNS
all_blocks = 1:(nblocks)
set.seed(321) # For reproducibility
test_blocks = seq(2, nblocks, by = 2)
train_blocks = setdiff(all_blocks, test_blocks)
```
```{r}
block_grid = seal_range_raster |>
ext() |>
st_bbox() |>
st_make_grid(n = c(COLUMNS, ROWS)) |>
st_sf() |>
mutate(block_id = row_number()) |>
mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train"))
```
```{r}
block_raster = block_grid |>
rasterize(seal_range_raster, field = "block_id")
```
```{r}
plot(seal_range_raster$layer)
plot(vect(block_grid), add = TRUE, border = "black", lwd = 1)
plot(
vect(block_grid |> filter(type == "Test (Hold-out)")),
add = TRUE,
border = "red",
lwd = 3)
```
```{r}
seal_range_raster$block_id = block_raster$block_id
```
### Catboost
#### Prep
Set up the dataframe for machine learning
```{r}
seal_range_df = seal_range_raster |>
as.data.frame(cells = TRUE, na.rm=TRUE) |>
rename(target = layer)
```
```{r}
baseline_features_df = baseline_features_brick |>
as.data.frame(cells = TRUE, na.rm=TRUE)
```
```{r}
target_features = left_join(seal_range_df, baseline_features_df, by = "cell")
```
Divide training and testing pools
```{r}
train_df = target_features %>% filter(block_id %in% train_blocks)
test_df = target_features %>% filter(block_id %in% test_blocks)
train_pool <- catboost.load_pool(
data = train_df |> select(-cell, -block_id, -target),
label = train_df$target
)
test_pool = catboost.load_pool(
data = test_df |> select(-cell, -block_id, -target),
label = test_df$target
)
```
#### Learning
```{r}
params = list(
loss_function = 'Logloss',
eval_metric = 'AUC',
iterations = 200, # Plenty of trees for a smooth fit
depth = 2, # Standard depth to prevent overfitting
learning_rate = 0.02, # Lower learning rate is better for high ROC data
l2_leaf_reg = 15, # Stronger regularization to handle that 0.998 ROC
random_seed = 42,
rsm = 0.5,
verbose = 10,
od_type = "Iter",
od_wait = 20
)
```
```{r}
cat_model = catboost.train(train_pool, test_pool = test_pool, params = params)
saveRDS(cat_model, "cat_model.Rda")
```
#### Catboost Result
```{r}
whole_pool = catboost.load_pool(
data = target_features |> select(-cell, -block_id, -target)
)
```
```{r}
preds_prob = catboost.predict(cat_model, whole_pool, prediction_type = 'Probability')
```
```{r}
baseline_prediction_raster = seal_range_raster
baseline_prediction_raster[target_features$cell] = preds_prob
```
```{r}
plot(baseline_prediction_raster$layer)
plot(seal_range, col=NA, border="cyan", lwd=1, add=T)
```
#### Post-processing
```{r}
plot(seal_range_raster)
```
```{r}
range_distance = gridDist(seal_range_raster, target=1)
distance_decay = exp(-0.000001 * range_distance) |>
subst(NA, 0)
```
```{r}
plot(distance_decay)
```
```{r}
baseline_prediction_raster$layer_dist = baseline_prediction_raster$layer * distance_decay
```
```{r}
plot(baseline_prediction_raster$layer_dist)
plot(seal_range, col=NA, border="cyan", lwd=1, add=T)
```
```{r}
plot(ifel(baseline_prediction_raster$layer_dist > 0.7, 1, 0))
plot(seal_range, col=NA, border="cyan", lwd=1, add=T)
```
#### Interpret
Maps
Plots
```{r}
explainer_cat = explain(
model = cat_model,
data = train_df |> select(-cell, -block_id, -target),
y = train_df$target,
label = "CatBoost Harp Seal Model",
predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability")
)
```
```{r}
vi_cat = model_parts(explainer_cat)
plot(vi_cat)
```
```{r}
pdp_cat = lapply(selected_layers$names, function(X) model_profile(explainer_cat, variables = X))
plot(pdp_cat)
```
```{r}
mp_cat = model_performance(explainer_cat)
plot(mp_cat, geom = "boxplot")
plot(mp_cat, geom = "roc") # Or geom = "boxplot" for residuals
```
```{r}
# Pick a specific "wrong" pixel from your dataframe
caspian_pixel = train_df[train_df$cell == 120363, ]
bd_cat = predict_parts(explainer_cat, new_observation = caspian_pixel)
plot(bd_cat)
```