You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
568 lines
13 KiB
568 lines
13 KiB
---
|
|
title: "bio-oracle-5"
|
|
format: html
|
|
---
|
|
|
|
## Libraries
|
|
|
|
```{r}
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(tibble)
|
|
library(stringr)
|
|
|
|
library(terra)
|
|
|
|
library(biooracler)
|
|
library(sf)
|
|
library(ggplot2)
|
|
library(ggcorrplot)
|
|
|
|
library(usdm)
|
|
|
|
library(catboost)
|
|
|
|
library(DALEX)
|
|
library(pdp)
|
|
# library(ggspatial)
|
|
# library(rnaturalearth)
|
|
# library(tidyterra)
|
|
# source("./scripts/degree_labels.R")
|
|
```
|
|
|
|
## Range and study area
|
|
|
|
Load the species range from IUCN and 5° buffer to define an area of the study.
|
|
|
|
```{r}
|
|
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
|
|
bbox = ext(seal_range) |> extend(5)
|
|
bbox_vect = bbox |> as.lines(crs="EPSG:4326")
|
|
# land = ne_download(scale=110, type="land", category = "physical", returnclass = "sv")
|
|
land = vect("land.geojson")
|
|
|
|
lon_range = c(bbox$xmin, bbox$xmax)
|
|
lat_range = c(bbox$ymin, bbox$ymax)
|
|
|
|
constraints_geo = list(
|
|
longitude = lon_range,
|
|
latitude = lat_range
|
|
)
|
|
|
|
saveRDS(constraints_geo, file="constraints_geo.Rda")
|
|
```
|
|
|
|
```{r}
|
|
plot(seal_range, col="#bcbddc", xlim=c(-170, 170), ylim=c(90, -80))
|
|
plot(land, col="#f0f0f0", add=T)
|
|
lines(bbox, col="#756bb1")
|
|
```
|
|
|
|
## Bio-ORACLE
|
|
|
|
> Environmental predictors were sourced from the Bio-ORACLE v3.0 database, providing standardized global marine rasters for present-day conditions and future climate projections under CMIP6 Shared Socioeconomic Pathways (SSPs).
|
|
|
|
```{r}
|
|
all_layers = list_layers()
|
|
|
|
ids_to_remove = c(
|
|
# no projection data
|
|
# the database flaws (?)
|
|
"par_mean_baseline_2000_2020_depthsurf",
|
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
|
"chl_baseline_2000_2018_depthmax",
|
|
"chl_baseline_2000_2018_depthmean",
|
|
"chl_baseline_2000_2018_depthmin",
|
|
# nature of the variable
|
|
"terrain_characteristics"
|
|
)
|
|
|
|
layers = all_layers |>
|
|
filter(! dataset_id %in% c(ids_to_remove)) |>
|
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE) |>
|
|
mutate(
|
|
var_depth = paste0(var, "_", depth),
|
|
var_depth_humane = str_extract(title, ".*]") |> str_remove("Bio-Oracle ")
|
|
)
|
|
# aware that not all variables have ssp126
|
|
|
|
saveRDS(layers, "layers.Rda")
|
|
|
|
layers |> select(var_depth_humane, var_depth) |> distinct() |> print.data.frame()
|
|
```
|
|
|
|
```{r}
|
|
download_slice = function(scenario_value, decade_start, layers_to_filter) {
|
|
scenario_layers = layers_to_filter |>
|
|
filter(scenario == scenario_value)
|
|
|
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
|
|
|
slice_constraints = list(
|
|
time = c(time_point, time_point),
|
|
longitude = constraints_geo$longitude,
|
|
latitude = constraints_geo$latitude
|
|
)
|
|
|
|
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
|
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
slice_rasters = sapply(
|
|
scenario_layers$dataset_id,
|
|
function(id) download_layers(
|
|
id,
|
|
constraints = slice_constraints,
|
|
directory = download_dir
|
|
),
|
|
simplify = TRUE
|
|
)
|
|
|
|
return(slice_rasters)
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
slice_to_brick = function(list_of_rasters) {
|
|
brick = rast(list_of_rasters)
|
|
depths = brick |> names() |> str_extract("depth[:letter:]+")
|
|
var_stat = brick |>
|
|
varnames() |>
|
|
as_tibble() |>
|
|
separate_wider_delim("value",delim="_", names=c("var", "stat"))
|
|
|
|
prev_longnames = longnames(brick)
|
|
longnames(brick) = paste0(prev_longnames, " [", depths ,"]")
|
|
names(brick) = paste(var_stat$var, depths, var_stat$stat, sep = "_")
|
|
return(brick)
|
|
}
|
|
```
|
|
|
|
|
|
|
|
## Data exploration
|
|
|
|
Feel free to skip this step as it shows the logic behind the layers selected for
|
|
analysis.
|
|
|
|
### Download
|
|
|
|
```{r eval=FALSE}
|
|
baseline_rasters = download_slice("baseline", 2010, layers)
|
|
```
|
|
|
|
```{r}
|
|
baseline_brick = slice_to_brick(baseline_rasters)
|
|
```
|
|
|
|
300 hundred layers seem too many for a controlled analysis
|
|
|
|
```{r}
|
|
nlyr(baseline_brick)
|
|
```
|
|
### Filter by ecology
|
|
|
|
Knowing smth about the species lets clean up variables before any formal analysis of variables releations
|
|
|
|
```{r}
|
|
filtered_layers = tibble(
|
|
names = names(baseline_brick),
|
|
longnames = longnames(baseline_brick)
|
|
) |>
|
|
separate_wider_delim(
|
|
"names",
|
|
delim="_",
|
|
names=c("var", "depth", "stat"),
|
|
cols_remove=F
|
|
) |>
|
|
filter(
|
|
!(
|
|
depth %in% c("depthmax", "depthmean") |
|
|
var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") |
|
|
stat %in% c("ltmin", "ltmax", "range")
|
|
)
|
|
)
|
|
|
|
baseline_brick_subset = baseline_brick |>
|
|
subset(filtered_layers$names)
|
|
|
|
filtered_layers |> select(longnames) |> print.data.frame()
|
|
```
|
|
|
|
### Sample for correlation analysis
|
|
|
|
```{r}
|
|
ten_percent_cells = nrow(baseline_brick_subset) * ncol(baseline_brick_subset) * 0.1
|
|
|
|
baseline_brick_subset_sample = baseline_brick_subset |>
|
|
spatSample(size=ten_percent_cells, method="regular", na.rm=TRUE)
|
|
```
|
|
|
|
#### Initial correlation
|
|
|
|
```{r}
|
|
corr_matrix = cor(baseline_brick_subset_sample)
|
|
```
|
|
|
|
```{r}
|
|
ggcorrplot(corr_matrix,
|
|
type = "lower", # Only show half (it's symmetrical anyway)
|
|
outline.col = "white",
|
|
colors = c("#6D9EC1", "white", "#E46726"),
|
|
lab = FALSE) + # don't label values
|
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
|
axis.text.y = element_text(size = 7))
|
|
```
|
|
|
|
```{r}
|
|
high_cor_pairs <- corr_matrix |>
|
|
as.data.frame() |>
|
|
rownames_to_column("Var1") |>
|
|
pivot_longer(-Var1, names_to = "Var2", values_to = "value") |> # 900 total pairs
|
|
# Var1 < Var2 removes self-correlation AND picks only one of the AB/BA pairs
|
|
filter(abs(value) > 0.8 & Var1 < Var2) |>
|
|
mutate(value = round(value, 3)) |>
|
|
arrange(desc(abs(value)))
|
|
|
|
# 59 are highly correlated
|
|
print.data.frame(high_cor_pairs)
|
|
```
|
|
|
|
#### Variance Inflation Factor
|
|
|
|
> It calculates how much one variable can be predicted by a linear combination of all other variables.
|
|
|
|
```{r}
|
|
vif_results = vifstep(baseline_brick_subset_sample, th = 10)
|
|
```
|
|
|
|
```{r}
|
|
vars_to_keep = vif_results@results$Variables
|
|
vif_results@results
|
|
```
|
|
|
|
Then check the correlations of variables filtered by VIF step
|
|
|
|
```{r}
|
|
baseline_brick_subset_sample_vif = baseline_brick_subset_sample |>
|
|
select(all_of(vars_to_keep))
|
|
|
|
corr_matrix_vif = cor(baseline_brick_subset_sample_vif)
|
|
|
|
ggcorrplot(corr_matrix_vif,
|
|
outline.col = "white",
|
|
colors = c("#6D9EC1", "white", "#E46726"),
|
|
lab = FALSE) + # don't label values
|
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
|
axis.text.y = element_text(size = 7))
|
|
```
|
|
|
|
```{r}
|
|
high_cor_pairs_vif <- corr_matrix_vif |>
|
|
as.data.frame() |>
|
|
rownames_to_column("Var1") |>
|
|
pivot_longer(-Var1, names_to = "Var2", values_to = "value") |> # 144 total pairs
|
|
# Var1 < Var2 removes self-correlation AND picks only one of the AB/BA pairs
|
|
filter(abs(value) > 0.8 & Var1 < Var2) |>
|
|
mutate(value = round(value, 3)) |>
|
|
arrange(desc(abs(value)))
|
|
|
|
# 3 are highly correlated
|
|
print.data.frame(high_cor_pairs_vif)
|
|
```
|
|
Having high correlation pairs and VIF values we manually select variables we can interpret
|
|
|
|
```{r}
|
|
manually_selected_vars = c(
|
|
"siconc_depthsurf_min",
|
|
"thetao_depthsurf_min",
|
|
"thetao_depthmin_max",
|
|
"chl_depthsurf_mean",
|
|
"phyc_depthmin_max"
|
|
)
|
|
```
|
|
|
|
Then again check correlation
|
|
|
|
```{r}
|
|
baseline_brick_subset_sample_manual = baseline_brick_subset_sample |>
|
|
select(all_of(manually_selected_vars))
|
|
|
|
corr_matrix_manual = cor(baseline_brick_subset_sample_manual)
|
|
|
|
ggcorrplot(corr_matrix_manual,
|
|
outline.col = "white",
|
|
colors = c("#6D9EC1", "white", "#E46726"),
|
|
lab = FALSE) + # don't label values
|
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
|
axis.text.y = element_text(size = 7))
|
|
```
|
|
|
|
```{r}
|
|
selected_layers = filtered_layers |>
|
|
filter(names %in% manually_selected_vars)
|
|
|
|
saveRDS(selected_layers, file="selected_layers.Rda")
|
|
```
|
|
|
|
|
|
## Learning
|
|
|
|
### Input layers
|
|
|
|
Filter layers based on selected layers info.
|
|
|
|
```{r}
|
|
layers = readRDS("layers.Rda")
|
|
selected_layers = readRDS("selected_layers.Rda")
|
|
constraints_geo = readRDS("constraints_geo.Rda")
|
|
|
|
features_layers = inner_join(selected_layers, layers, by=c("var", "depth"))
|
|
```
|
|
|
|
```{r}
|
|
baseline_features_rasters = download_slice("baseline", 2010, features_layers)
|
|
```
|
|
|
|
Set up features raster brick
|
|
|
|
```{r}
|
|
baseline_features_brick = slice_to_brick(baseline_features_rasters) |>
|
|
subset(c(selected_layers$names))
|
|
```
|
|
|
|
And target raster
|
|
|
|
```{r}
|
|
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
|
|
ocean_mask = ifel(is.na(baseline_features_brick[[1]]), NA, 1)
|
|
```
|
|
|
|
```{r}
|
|
seal_range_raster = rasterize(
|
|
seal_range,
|
|
baseline_features_brick[[1]],
|
|
field = "",
|
|
background = 0
|
|
) |>
|
|
mask(ocean_mask)
|
|
```
|
|
|
|
```{r}
|
|
plot(c(seal_range_raster, baseline_features_brick))
|
|
```
|
|
|
|
### Spatial blocks
|
|
|
|
```{r}
|
|
ROWS = 3
|
|
COLUMNS = 5
|
|
nblocks = ROWS * COLUMNS
|
|
|
|
all_blocks = 1:(nblocks)
|
|
set.seed(321) # For reproducibility
|
|
|
|
test_blocks = seq(2, nblocks, by = 2)
|
|
train_blocks = setdiff(all_blocks, test_blocks)
|
|
```
|
|
|
|
```{r}
|
|
block_grid = seal_range_raster |>
|
|
ext() |>
|
|
st_bbox() |>
|
|
st_make_grid(n = c(COLUMNS, ROWS)) |>
|
|
st_sf() |>
|
|
mutate(block_id = row_number()) |>
|
|
mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train"))
|
|
```
|
|
|
|
```{r}
|
|
block_raster = block_grid |>
|
|
rasterize(seal_range_raster, field = "block_id")
|
|
```
|
|
|
|
```{r}
|
|
plot(seal_range_raster$layer)
|
|
plot(vect(block_grid), add = TRUE, border = "black", lwd = 1)
|
|
plot(
|
|
vect(block_grid |> filter(type == "Test (Hold-out)")),
|
|
add = TRUE,
|
|
border = "red",
|
|
lwd = 3)
|
|
```
|
|
|
|
```{r}
|
|
seal_range_raster$block_id = block_raster$block_id
|
|
```
|
|
|
|
### Catboost
|
|
|
|
#### Prep
|
|
|
|
Set up the dataframe for machine learning
|
|
|
|
```{r}
|
|
seal_range_df = seal_range_raster |>
|
|
as.data.frame(cells = TRUE, na.rm=TRUE) |>
|
|
rename(target = layer)
|
|
```
|
|
|
|
```{r}
|
|
baseline_features_df = baseline_features_brick |>
|
|
as.data.frame(cells = TRUE, na.rm=TRUE)
|
|
```
|
|
|
|
```{r}
|
|
target_features = left_join(seal_range_df, baseline_features_df, by = "cell")
|
|
```
|
|
|
|
Divide training and testing pools
|
|
|
|
```{r}
|
|
train_df = target_features %>% filter(block_id %in% train_blocks)
|
|
test_df = target_features %>% filter(block_id %in% test_blocks)
|
|
|
|
train_pool <- catboost.load_pool(
|
|
data = train_df |> select(-cell, -block_id, -target),
|
|
label = train_df$target
|
|
)
|
|
test_pool = catboost.load_pool(
|
|
data = test_df |> select(-cell, -block_id, -target),
|
|
label = test_df$target
|
|
)
|
|
```
|
|
|
|
#### Learning
|
|
|
|
```{r}
|
|
params = list(
|
|
loss_function = 'Logloss',
|
|
eval_metric = 'AUC',
|
|
iterations = 200, # Plenty of trees for a smooth fit
|
|
depth = 2, # Standard depth to prevent overfitting
|
|
learning_rate = 0.02, # Lower learning rate is better for high ROC data
|
|
l2_leaf_reg = 15, # Stronger regularization to handle that 0.998 ROC
|
|
random_seed = 42,
|
|
rsm = 0.5,
|
|
verbose = 10,
|
|
od_type = "Iter",
|
|
od_wait = 20
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
cat_model = catboost.train(train_pool, test_pool = test_pool, params = params)
|
|
saveRDS(cat_model, "cat_model.Rda")
|
|
```
|
|
#### Catboost Result
|
|
|
|
```{r}
|
|
whole_pool = catboost.load_pool(
|
|
data = target_features |> select(-cell, -block_id, -target)
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
preds_prob = catboost.predict(cat_model, whole_pool, prediction_type = 'Probability')
|
|
```
|
|
|
|
```{r}
|
|
baseline_prediction_raster = seal_range_raster
|
|
baseline_prediction_raster[target_features$cell] = preds_prob
|
|
```
|
|
|
|
```{r}
|
|
plot(baseline_prediction_raster$layer)
|
|
plot(seal_range, col=NA, border="cyan", lwd=1, add=T)
|
|
```
|
|
|
|
#### Post-processing
|
|
|
|
```{r}
|
|
plot(seal_range_raster)
|
|
```
|
|
|
|
|
|
```{r}
|
|
range_distance = gridDist(seal_range_raster, target=1)
|
|
distance_decay = exp(-0.000001 * range_distance) |>
|
|
subst(NA, 0)
|
|
```
|
|
|
|
```{r}
|
|
plot(distance_decay)
|
|
```
|
|
|
|
```{r}
|
|
baseline_prediction_raster$layer_dist = baseline_prediction_raster$layer * distance_decay
|
|
```
|
|
|
|
```{r}
|
|
plot(baseline_prediction_raster$layer_dist)
|
|
plot(seal_range, col=NA, border="cyan", lwd=1, add=T)
|
|
```
|
|
|
|
```{r}
|
|
plot(ifel(baseline_prediction_raster$layer_dist > 0.7, 1, 0))
|
|
plot(seal_range, col=NA, border="cyan", lwd=1, add=T)
|
|
```
|
|
|
|
|
|
#### Interpret
|
|
|
|
Maps
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Plots
|
|
|
|
```{r}
|
|
explainer_cat = explain(
|
|
model = cat_model,
|
|
data = train_df |> select(-cell, -block_id, -target),
|
|
y = train_df$target,
|
|
label = "CatBoost Harp Seal Model",
|
|
predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability")
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
vi_cat = model_parts(explainer_cat)
|
|
plot(vi_cat)
|
|
```
|
|
|
|
```{r}
|
|
pdp_cat = lapply(selected_layers$names, function(X) model_profile(explainer_cat, variables = X))
|
|
plot(pdp_cat)
|
|
```
|
|
|
|
```{r}
|
|
mp_cat = model_performance(explainer_cat)
|
|
plot(mp_cat, geom = "boxplot")
|
|
plot(mp_cat, geom = "roc") # Or geom = "boxplot" for residuals
|
|
```
|
|
|
|
```{r}
|
|
# Pick a specific "wrong" pixel from your dataframe
|
|
caspian_pixel = train_df[train_df$cell == 120363, ]
|
|
|
|
bd_cat = predict_parts(explainer_cat, new_observation = caspian_pixel)
|
|
plot(bd_cat)
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|