You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
605 lines
16 KiB
605 lines
16 KiB
## Load required R packages
|
|
These libraries provide spatial handling, machine learning, and model explainability tools used throughout the workflow.
|
|
|
|
```{r}
|
|
library(tidyr)
|
|
library(dplyr)
|
|
library(terra)
|
|
library(mregions2)
|
|
library(biooracler)
|
|
library(stringr)
|
|
library(tibble)
|
|
library(catboost)
|
|
library(caret)
|
|
library(blockCV)
|
|
library(sf)
|
|
library(usdm)
|
|
library(ggcorrplot)
|
|
library(reshape2)
|
|
library(tidygraph)
|
|
library(ggraph)
|
|
library(CAST)
|
|
library(pdp)
|
|
library(ggplot2)
|
|
library(DALEX)
|
|
```
|
|
|
|
## Define study area and spatial constraints
|
|
Here we load the harp seal range shapefile and derive the longitude/latitude bounds used to constrain Bio-ORACLE downloads.
|
|
|
|
```{r}
|
|
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
|
|
bbox = ext(seal_range) |> extend(5)
|
|
|
|
lon_range = c(bbox$xmin, bbox$xmax)
|
|
lat_range = c(bbox$ymin, bbox$ymax)
|
|
|
|
constraints = list(
|
|
longitude = lon_range,
|
|
latitude = lat_range
|
|
)
|
|
```
|
|
|
|
## List and filter Bio-ORACLE layers
|
|
We list available Bio-ORACLE layers, manually remove unsupported ones, and separate constant (terrain) layers from dynamic variables.
|
|
|
|
```{r}
|
|
layers = list_layers()
|
|
|
|
# Нет прогнозных данных :/
|
|
removed_layers_ids = c(
|
|
"par_mean_baseline_2000_2020_depthsurf",
|
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
|
"chl_baseline_2000_2018_depthmax",
|
|
"chl_baseline_2000_2018_depthmean",
|
|
"chl_baseline_2000_2018_depthmin"
|
|
)
|
|
|
|
constant_layers_ids = c("terrain_characteristics")
|
|
|
|
constant_layers = layers |>
|
|
filter(dataset_id %in% constant_layers_ids)
|
|
```
|
|
|
|
## Prepare dynamic layers metadata
|
|
We keep only dynamic environmental variables and parse dataset IDs into variable, scenario, time, and depth components.
|
|
|
|
```{r}
|
|
dynamic_layers = layers |>
|
|
filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |>
|
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE)
|
|
|
|
saveRDS(dynamic_layers, 'dynamic_layers.rds')
|
|
```
|
|
|
|
## Helper to download a single temporal slice
|
|
This function downloads all dynamic layers for a given scenario and decade within the spatial and temporal constraints.
|
|
|
|
```{r}
|
|
download_slice = function(scenario_value, decade_start) {
|
|
scenario_layers = dynamic_layers |>
|
|
filter(scenario == scenario_value)
|
|
|
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
|
|
|
slice_constraints = list(
|
|
time = c(time_point, time_point),
|
|
longitude = lon_range,
|
|
latitude = lat_range
|
|
)
|
|
|
|
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
|
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
slice_rasters = sapply(
|
|
scenario_layers$dataset_id,
|
|
function(id) download_layers(
|
|
id,
|
|
constraints = slice_constraints,
|
|
directory = download_dir
|
|
),
|
|
simplify = TRUE
|
|
)
|
|
|
|
return(slice_rasters)
|
|
}
|
|
```
|
|
|
|
## Download baseline environmental slice
|
|
We obtain baseline (historical) rasters for the 2010 decade over the study area.
|
|
|
|
```{r}
|
|
baseline_rasters = download_slice("baseline", 2010)
|
|
```
|
|
|
|
## Build baseline raster brick and extract metadata
|
|
We combine downloaded rasters into a brick and extract depth, long names, and variable names for later filtering.
|
|
|
|
```{r}
|
|
baseline_brick = rast(baseline_rasters)
|
|
baseline_brick_depths = baseline_brick |>
|
|
names() |>
|
|
str_extract("depth[:alpha:]+")
|
|
baseline_brick_longnames = baseline_brick |> longnames()
|
|
baseline_brick_varnames = baseline_brick |> varnames()
|
|
names(baseline_brick) = paste(baseline_brick_longnames, baseline_brick_depths)
|
|
```
|
|
|
|
## Select ecologically relevant baseline variables
|
|
We filter out less relevant or redundant variables and keep a focused subset of candidate predictors.
|
|
|
|
```{r}
|
|
suitable_baseline_layer_names = tibble(
|
|
name = names(baseline_brick),
|
|
longname = baseline_brick_longnames,
|
|
varname = baseline_brick_varnames,
|
|
depth = baseline_brick_depths,
|
|
) |>
|
|
separate_wider_delim(
|
|
varname,
|
|
delim = "_",
|
|
names = c("var", "type")
|
|
) |>
|
|
filter(
|
|
!(
|
|
depth == "depthmax" |
|
|
var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") |
|
|
type %in% c("ltmin", "ltmax", "range")
|
|
)
|
|
)
|
|
|
|
subset_baseline_layer_names = suitable_baseline_layer_names |>
|
|
filter(
|
|
name %in% c(
|
|
"Minimum SeaIceCover depthsurf",
|
|
"Minimum OceanTemperature depthsurf",
|
|
"Average SeaIceThickness depthsurf",
|
|
"Average Chlorophyll depthsurf",
|
|
"Maximum OceanTemperature depthmin"
|
|
)
|
|
)
|
|
```
|
|
## Inspect chosen baseline variables
|
|
We preview the table of selected variables to confirm that only the intended layers remain.
|
|
|
|
```{r}
|
|
subset_baseline_layer_names
|
|
saveRDS(subset_baseline_layer_names, file = "subset_baseline_layer_names.rds")
|
|
```
|
|
|
|
## Build subset raster brick
|
|
We subset the baseline raster brick to include only the selected variables.
|
|
|
|
```{r}
|
|
subset_baseline_brick = baseline_brick |>
|
|
subset(subset_baseline_layer_names$name)
|
|
```
|
|
|
|
## Combine features into a single brick
|
|
The final feature brick contains the chosen environmental predictors (terrain can be added later if needed).
|
|
|
|
```{r}
|
|
# features_brick = c(baseline_brick, terrain_raster)
|
|
features_brick = c(subset_baseline_brick)
|
|
```
|
|
|
|
## (Optional) Crop feature brick to a subregion
|
|
This chunk shows how to restrict the analysis to a smaller bounding box if desired (currently commented out).
|
|
|
|
```{r}
|
|
# cropped_bbox = ext(
|
|
# -20,
|
|
# 72,
|
|
# 60,
|
|
# 85
|
|
# )
|
|
|
|
# cropped_features_brick = features_brick |>
|
|
# crop(cropped_bbox)
|
|
```
|
|
|
|
## Convert raster brick to data frame
|
|
We convert the environmental rasters to a tidy data frame with cell indices and coordinates.
|
|
|
|
```{r}
|
|
baseline_df = features_brick |>
|
|
as.data.frame(cells = TRUE, xy = TRUE)
|
|
```
|
|
|
|
```{r}
|
|
# vif_input_df <- baseline_df |>
|
|
# select(-cell) |>
|
|
# drop_na()
|
|
```
|
|
|
|
```{r}
|
|
# vif_sample = vif_input_df |>
|
|
# sample_n(10000)
|
|
|
|
# vif_sample = vif_sample[, sapply(vif_sample, function(x) var(x) > 0)]
|
|
```
|
|
|
|
## Explore correlations among predictors
|
|
We randomly sample cells, compute a correlation matrix, and visualize pairwise correlations.
|
|
|
|
```{r}
|
|
sample = baseline_df |>
|
|
sample_n(10000) |>
|
|
select(-cell, -x, -y) |>
|
|
drop_na()
|
|
corr_matrix <- cor(sample)
|
|
|
|
ggcorrplot(corr_matrix,
|
|
hc.order = TRUE, # Clusters similar variables together
|
|
type = "lower", # Only show half (it's symmetrical anyway)
|
|
outline.col = "white",
|
|
colors = c("#6D9EC1", "white", "#E46726"),
|
|
lab = FALSE) + # Set to TRUE only if you have <20 variables
|
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
|
axis.text.y = element_text(size = 7))
|
|
```
|
|
|
|
## Identify highly correlated variable pairs
|
|
We list variable pairs with strong correlations to better understand redundancy among predictors.
|
|
|
|
```{r}
|
|
high_cor_pairs <- melt(corr_matrix) |>
|
|
filter(abs(value) > 0.8) |>
|
|
filter(Var1 != Var2) |> # Remove self-correlations (1.0 on diagonal)
|
|
distinct(value, .keep_all = TRUE) |> # Remove duplicates (A-B and B-A)
|
|
arrange(desc(abs(value))) |>
|
|
mutate(Var1 = as.character(Var1),
|
|
Var2 = as.character(Var2))
|
|
```
|
|
|
|
## Perform VIF-based variable selection
|
|
Variance Inflation Factor (VIF) is used to remove collinear predictors and retain a stable subset.
|
|
|
|
```{r}
|
|
vif_results <- vifstep(sample, th = 10)
|
|
```
|
|
|
|
## Extract retained predictor names
|
|
We pull out the names of variables that passed the VIF threshold.
|
|
|
|
```{r}
|
|
keeper_vars <- vif_results@results$Variables
|
|
```
|
|
|
|
## Subset baseline data frame to VIF-selected variables
|
|
We keep only the selected predictors along with cell indices and coordinates.
|
|
|
|
```{r}
|
|
baseline_df_subset = baseline_df |>
|
|
select(cell, x, y, all_of(keeper_vars))
|
|
```
|
|
## Rasterize harp seal range
|
|
We convert the harp seal polygon range into a raster aligned with the environmental brick (presence/absence mask).
|
|
|
|
```{r}
|
|
seal_range_raster = seal_range |>
|
|
rasterize(features_brick[[1]], field="", background=0)
|
|
```
|
|
|
|
```{r}
|
|
writeRaster(seal_range_raster, "seal_range_raster.tif")
|
|
```
|
|
|
|
|
|
## Define spatial blocks for cross-validation
|
|
We create block IDs over the study area and split them into train and test sets.
|
|
|
|
```{r}
|
|
all_blocks <- 1:15
|
|
set.seed(321) # For reproducibility
|
|
|
|
test_blocks <- sample(all_blocks, 5) # Randomly pick 5 blocks for testing
|
|
train_blocks <- setdiff(all_blocks, test_blocks)
|
|
```
|
|
|
|
```{r}
|
|
block_grid = seal_range_raster |>
|
|
ext() |>
|
|
st_bbox() |>
|
|
st_make_grid(n = c(5, 3)) |>
|
|
st_sf() |>
|
|
mutate(block_id = row_number()) |>
|
|
mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train"))
|
|
```
|
|
|
|
```{r}
|
|
block_raster = block_grid |>
|
|
rasterize(seal_range_raster, field = "block_id")
|
|
```
|
|
|
|
```{r}
|
|
seal_range_raster$block_id = block_raster$block_id
|
|
```
|
|
|
|
```{r}
|
|
plot(seal_range_raster$layer)
|
|
plot(vect(block_grid), add = TRUE, border = "black", lwd = 1)
|
|
plot(
|
|
vect(block_grid |> filter(type == "Test (Hold-out)")),
|
|
add = TRUE,
|
|
border = "red",
|
|
lwd = 3)
|
|
```
|
|
|
|
```{r}
|
|
seal_range_df = seal_range_raster |>
|
|
as.data.frame(cells = TRUE) |>
|
|
rename(target = layer)
|
|
|
|
saveRDS(seal_range_df, file = "seal_range_df.rds")
|
|
```
|
|
|
|
```{r}
|
|
seal_baseline = dplyr::left_join(baseline_df_subset, seal_range_df, by = "cell")
|
|
```
|
|
|
|
|
|
```{r}
|
|
# 3. Create the dataframes based on the blocks
|
|
train_df <- seal_baseline %>% filter(block_id %in% train_blocks)
|
|
test_df <- seal_baseline %>% filter(block_id %in% test_blocks)
|
|
```
|
|
|
|
```{r}
|
|
train_pool <- catboost.load_pool(
|
|
data = train_df |> select(-cell, -x, -y, -block_id, -target),
|
|
label = train_df$target
|
|
)
|
|
|
|
test_pool = catboost.load_pool(
|
|
data = test_df |> select(-cell, -x, -y, -block_id, -target),
|
|
label = test_df$target
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
params <- list(
|
|
loss_function = 'Logloss',
|
|
eval_metric = 'AUC',
|
|
iterations = 200, # Plenty of trees for a smooth fit
|
|
depth = 2, # Standard depth to prevent overfitting
|
|
learning_rate = 0.02, # Lower learning rate is better for high ROC data
|
|
l2_leaf_reg = 15, # Stronger regularization to handle that 0.998 ROC
|
|
random_seed = 42,
|
|
rsm = 0.5,
|
|
verbose = 10,
|
|
od_type = "Iter",
|
|
od_wait = 20
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
cat_model <- catboost.train(train_pool, test_pool = test_pool, params = params)
|
|
```
|
|
|
|
```{r}
|
|
explainer_cat <- explain(
|
|
model = cat_model,
|
|
data = train_df |> select(-cell, -x, -y, -block_id, -target),
|
|
y = train_df$target,
|
|
label = "CatBoost Harp Seal Model",
|
|
predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability")
|
|
)
|
|
```
|
|
```{r}
|
|
pdp_temp <- model_profile(
|
|
explainer = explainer_cat,
|
|
variables = "Average Chlorophyll depthsurf"
|
|
)
|
|
|
|
# 3. Plot it
|
|
plot(pdp_temp)
|
|
```
|
|
|
|
```{r}
|
|
importanc2e <- catboost.get_feature_importance(cat_model, train_pool) |>
|
|
enframe()
|
|
```
|
|
|
|
|
|
```{r}
|
|
catboost.save_model(cat_model, "cat_model.cbm")
|
|
```
|
|
|
|
|
|
## Make a prediction
|
|
|
|
|
|
```{r}
|
|
download_slice_subset = function(scenario_value, decade_start, layers_to_download) {
|
|
scenario_layers = dynamic_layers |>
|
|
filter(
|
|
scenario == scenario_value &
|
|
var %in% layers_to_download$var &
|
|
depth %in% layers_to_download$depth
|
|
)
|
|
|
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
|
|
|
slice_constraints = list(
|
|
time = c(time_point, time_point),
|
|
longitude = lon_range,
|
|
latitude = lat_range
|
|
)
|
|
|
|
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
|
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
slice_rasters = sapply(
|
|
scenario_layers$dataset_id,
|
|
function(id) download_layers(
|
|
id,
|
|
constraints = slice_constraints,
|
|
directory = download_dir
|
|
),
|
|
simplify = TRUE
|
|
)
|
|
|
|
return(slice_rasters)
|
|
}
|
|
```
|
|
|
|
|
|
```{r}
|
|
ssp585 = download_slice_subset("ssp585", 2090, subset_baseline_layer_names)
|
|
```
|
|
|
|
```{r}
|
|
cat_model <- catboost.load_model('cat_model.cbm')
|
|
subset_baseline_layer_names = readRDS('subset_baseline_layer_names.rds')
|
|
seal_range_df = readRDS('seal_range_df.rds')
|
|
seal_range_raster = rast('seal_range_raster.tif')
|
|
dynamic_layers = readRDS('dynamic_layers.rds')
|
|
```
|
|
|
|
```{r}
|
|
get_prediction = function(ssp_code, decade) {
|
|
ssp_slice = download_slice_subset(ssp_code, decade, subset_baseline_layer_names)
|
|
|
|
ssp_slice_brick = rast(ssp_slice)
|
|
ssp_slice_brick_depths = ssp_slice_brick |>
|
|
names() |>
|
|
str_extract("depth[:alpha:]+")
|
|
ssp_slice_brick_longnames = ssp_slice_brick |> longnames()
|
|
# baseline_brick_varnames = baseline_brick |> varnames() // коды longnames
|
|
|
|
names(ssp_slice_brick) = paste(ssp_slice_brick_longnames, ssp_slice_brick_depths)
|
|
|
|
|
|
|
|
ssp_slice_df = ssp_slice_brick |>
|
|
as.data.frame(cells = TRUE, xy = TRUE)
|
|
|
|
ssp_slice_features = ssp_slice_df |> select(-cell, -x, -y)
|
|
|
|
ssp_slice_pool <- catboost.load_pool(data = ssp_slice_features)
|
|
|
|
preds_prob <- catboost.predict(cat_model, ssp_slice_pool, prediction_type = 'Probability')
|
|
preds_class <- ifelse(preds_prob > 0.5, 1, 0)
|
|
|
|
ssp_slice_prediction = ssp_slice_df |>
|
|
mutate(prediction = preds_class) |>
|
|
select(cell, prediction)
|
|
|
|
ssp_slice_diff = seal_range_df |>
|
|
left_join(ssp_slice_prediction, by = "cell") |>
|
|
mutate(diff = 2*target + prediction)
|
|
|
|
r = rast(ssp_slice_brick)
|
|
r[ssp_slice_diff$cell] = ssp_slice_diff$diff
|
|
|
|
writeRaster(r[[1]], paste0(ssp_code, "-", decade, ".tif"))
|
|
|
|
png(filename = paste0(ssp_code, "-", decade, ".png"), width = 800, height = 800)
|
|
plot(
|
|
r[[1]],
|
|
type="classes",
|
|
col=c("grey", "green", "red", "purple"),
|
|
# col=c("grey", "#7fc97f", "#fdc086", "#beaed4"),
|
|
# levels=c("0 → 0", "0 → 1", "1 → 0", "1 → 1"),
|
|
levels=c("00", "01", "10", "11"),
|
|
main=paste0(ssp_code, "-", decade)
|
|
)
|
|
dev.off()
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
get_prediction("ssp585", 2050)
|
|
```
|
|
|
|
```{r}
|
|
sapply(seq(2050, 2050, by=10), function(decade) {
|
|
get_prediction("ssp585", decade)
|
|
})
|
|
```
|
|
|
|
|
|
```{r}
|
|
ssp585_2090_brick = rast(ssp585)
|
|
ssp585_2090_brick_depths = ssp585_2090_brick |>
|
|
names() |>
|
|
str_extract("depth[:alpha:]+")
|
|
ssp585_2090_brick_longnames = ssp585_2090_brick |> longnames()
|
|
# baseline_brick_varnames = baseline_brick |> varnames() // коды longnames
|
|
|
|
names(ssp585_2090_brick) = paste(ssp585_2090_brick_longnames, ssp585_2090_brick_depths)
|
|
```
|
|
|
|
|
|
```{r}
|
|
ssp585_2090_df = ssp585_2090_brick |>
|
|
as.data.frame(cells = TRUE, xy = TRUE)
|
|
```
|
|
|
|
|
|
```{r}
|
|
ssp585_2090_features = ssp585_2090_df |> select(-cell, -x, -y)
|
|
```
|
|
|
|
|
|
```{r}
|
|
ssp585_2090_pool <- catboost.load_pool(data = ssp585_2090_features)
|
|
```
|
|
|
|
|
|
```{r}
|
|
preds_prob <- catboost.predict(cat_model, ssp585_2090_pool, prediction_type = 'Probability')
|
|
preds_class <- ifelse(preds_prob > 0.5, 1, 0)
|
|
```
|
|
|
|
|
|
```{r}
|
|
ssp585_2090_prediction = ssp585_2090_df |>
|
|
mutate(prediction = preds_class) |>
|
|
select(cell, prediction)
|
|
```
|
|
|
|
|
|
```{r}
|
|
ssp585_2090_diff = seal_range_df |>
|
|
left_join(ssp585_2090_prediction, by = "cell") |>
|
|
mutate(diff = 2*target + prediction)
|
|
```
|
|
|
|
|
|
```{r}
|
|
hist(ssp585_2090_diff$diff)
|
|
```
|
|
|
|
|
|
```{r}
|
|
r = rast(baseline_brick)
|
|
r[ssp585_2090_diff$cell] = ssp585_2090_diff$diff
|
|
```
|
|
|
|
```{r}
|
|
writeRaster(r[[1]], "ssp585-2090.tif")
|
|
```
|
|
|
|
|
|
```{r}
|
|
plot(
|
|
r[[1]],
|
|
type="classes",
|
|
col=c("grey", "green", "red", "purple"),
|
|
# col=c("grey", "#7fc97f", "#fdc086", "#beaed4"),
|
|
levels=c("0 → 0", "0 → 1", "1 → 0", "1 → 1"),
|
|
main="SSP 585 - 2090"
|
|
)
|
|
```
|
|
|
|
|
|
```{r}
|
|
a = rast('ssp585-2090.tif')
|
|
```
|
|
|
|
|
|
```{r}
|
|
plot(a)
|
|
``` |