You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
297 lines
8.4 KiB
297 lines
8.4 KiB
## Learning Pipeline (Training)
|
|
This file contains the learning stage of the prototype: data download, feature filtering, spatial split, and CatBoost model training.
|
|
|
|
## Load required R packages
|
|
```{r}
|
|
library(tidyr)
|
|
library(dplyr)
|
|
library(terra)
|
|
library(mregions2)
|
|
library(biooracler)
|
|
library(stringr)
|
|
library(tibble)
|
|
library(catboost)
|
|
library(caret)
|
|
library(blockCV)
|
|
library(sf)
|
|
library(usdm)
|
|
library(ggcorrplot)
|
|
library(reshape2)
|
|
library(tidygraph)
|
|
library(ggraph)
|
|
library(CAST)
|
|
library(pdp)
|
|
library(ggplot2)
|
|
library(DALEX)
|
|
```
|
|
|
|
## Load shared helpers and define run configuration
|
|
```{r}
|
|
source("R/shared-utils.R")
|
|
|
|
config = list(
|
|
range_shapefile = "data/iucn/Pagophilus_groenlandicus.shp",
|
|
bbox_expand_degrees = 5,
|
|
baseline_scenario = "baseline",
|
|
baseline_decade = 2010,
|
|
n_corr_sample = 10000,
|
|
n_blocks_total = 15,
|
|
n_blocks_test = 5,
|
|
seed_blocks = 321,
|
|
vif_threshold = 10,
|
|
artifacts = list(
|
|
dynamic_layers = "dynamic_layers.rds",
|
|
subset_layer_names = "subset_baseline_layer_names.rds",
|
|
seal_range_df = "seal_range_df.rds",
|
|
seal_range_raster = "seal_range_raster.tif",
|
|
model = "cat_model.cbm",
|
|
manifest = "artifacts-manifest-learning.csv",
|
|
session_info = "session-info-learning.txt"
|
|
)
|
|
)
|
|
```
|
|
|
|
## Define study area and spatial constraints
|
|
|
|
First we get a target species range from IUNC
|
|
```{r}
|
|
study_bounds = make_study_bounds(
|
|
range_shapefile = config$range_shapefile,
|
|
expand_degrees = config$bbox_expand_degrees
|
|
)
|
|
seal_range = study_bounds$seal_range
|
|
lon_range = study_bounds$lon_range
|
|
lat_range = study_bounds$lat_range
|
|
```
|
|
|
|
## List and filter Bio-ORACLE layers
|
|
|
|
Load Bio-ORACLE layers. Remove layers without forecast data: terrain characteristics are constant and some layers doesn't have the forcast data as a matter of fact.
|
|
|
|
```{r}
|
|
layers = list_layers()
|
|
|
|
# Нет прогнозных данных :/
|
|
removed_layers_ids = c(
|
|
"par_mean_baseline_2000_2020_depthsurf",
|
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
|
"chl_baseline_2000_2018_depthmax",
|
|
"chl_baseline_2000_2018_depthmean",
|
|
"chl_baseline_2000_2018_depthmin"
|
|
)
|
|
|
|
constant_layers_ids = c("terrain_characteristics")
|
|
|
|
constant_layers = layers |>
|
|
filter(dataset_id %in% constant_layers_ids)
|
|
|
|
dynamic_layers = layers |>
|
|
filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |>
|
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE)
|
|
```
|
|
|
|
## Download baseline and prepare predictor brick
|
|
|
|
We download the data for current time slice as it will be the learning data.
|
|
|
|
```{r}
|
|
baseline_rasters = download_biooracle_slice(
|
|
dynamic_layers = dynamic_layers,
|
|
scenario_value = config$baseline_scenario,
|
|
decade_start = config$baseline_decade,
|
|
lon_range = lon_range,
|
|
lat_range = lat_range
|
|
)
|
|
```
|
|
|
|
And construct a raster brick from all context layers.
|
|
|
|
```{r}
|
|
baseline_brick = rast(baseline_rasters)
|
|
baseline_brick = set_brick_names_with_depth(baseline_brick)
|
|
baseline_brick_depths = names(baseline_brick) |> str_extract("depth[:alpha:]+")
|
|
baseline_brick_longnames = baseline_brick |> longnames()
|
|
baseline_brick_varnames = baseline_brick |> varnames()
|
|
```
|
|
|
|
## Select baseline variables
|
|
|
|
Next filter layers matters based on our knowledge about the species.
|
|
|
|
```{r}
|
|
suitable_baseline_layer_names = tibble(
|
|
name = names(baseline_brick),
|
|
longname = baseline_brick_longnames,
|
|
varname = baseline_brick_varnames,
|
|
depth = baseline_brick_depths
|
|
) |>
|
|
separate_wider_delim(
|
|
varname,
|
|
delim = "_",
|
|
names = c("var", "type")
|
|
) |>
|
|
filter(
|
|
!(
|
|
depth == "depthmax" |
|
|
var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") |
|
|
type %in% c("ltmin", "ltmax", "range")
|
|
)
|
|
)
|
|
|
|
subset_baseline_layer_names = suitable_baseline_layer_names |>
|
|
filter(
|
|
name %in% c(
|
|
"Minimum SeaIceCover depthsurf",
|
|
"Minimum OceanTemperature depthsurf",
|
|
"Average SeaIceThickness depthsurf",
|
|
"Average Chlorophyll depthsurf",
|
|
"Maximum OceanTemperature depthmin"
|
|
)
|
|
)
|
|
```
|
|
|
|
## Build feature table
|
|
```{r}
|
|
subset_baseline_brick = baseline_brick |>
|
|
subset(subset_baseline_layer_names$name)
|
|
|
|
features_brick = c(subset_baseline_brick)
|
|
|
|
baseline_df = features_brick |>
|
|
as.data.frame(cells = TRUE, xy = TRUE)
|
|
```
|
|
|
|
## Correlation and VIF-based selection
|
|
```{r}
|
|
sample = baseline_df |>
|
|
sample_n(config$n_corr_sample) |>
|
|
select(-cell, -x, -y) |>
|
|
drop_na()
|
|
|
|
corr_matrix = cor(sample)
|
|
|
|
ggcorrplot(
|
|
corr_matrix,
|
|
hc.order = TRUE,
|
|
type = "lower",
|
|
outline.col = "white",
|
|
colors = c("#6D9EC1", "white", "#E46726"),
|
|
lab = FALSE
|
|
) +
|
|
theme(
|
|
axis.text.x = element_text(size = 7, angle = 90),
|
|
axis.text.y = element_text(size = 7)
|
|
)
|
|
|
|
high_cor_pairs = melt(corr_matrix) |>
|
|
filter(abs(value) > 0.8) |>
|
|
filter(Var1 != Var2) |>
|
|
distinct(value, .keep_all = TRUE) |>
|
|
arrange(desc(abs(value))) |>
|
|
mutate(Var1 = as.character(Var1), Var2 = as.character(Var2))
|
|
|
|
vif_results = vifstep(sample, th = config$vif_threshold)
|
|
keeper_vars = vif_results@results$Variables
|
|
|
|
baseline_df_subset = baseline_df |>
|
|
select(cell, x, y, all_of(keeper_vars))
|
|
```
|
|
|
|
## Build target and spatial blocks
|
|
```{r}
|
|
seal_range_raster = seal_range |>
|
|
rasterize(features_brick[[1]], field = "", background = 0)
|
|
|
|
all_blocks = seq_len(config$n_blocks_total)
|
|
set.seed(config$seed_blocks)
|
|
test_blocks = sample(all_blocks, config$n_blocks_test)
|
|
train_blocks = setdiff(all_blocks, test_blocks)
|
|
|
|
block_grid = seal_range_raster |>
|
|
ext() |>
|
|
st_bbox() |>
|
|
st_make_grid(n = c(5, 3)) |>
|
|
st_sf() |>
|
|
mutate(block_id = row_number()) |>
|
|
mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train"))
|
|
|
|
block_raster = block_grid |>
|
|
rasterize(seal_range_raster, field = "block_id")
|
|
|
|
seal_range_raster$block_id = block_raster$block_id
|
|
seal_range_df = seal_range_raster |>
|
|
as.data.frame(cells = TRUE) |>
|
|
rename(target = layer)
|
|
|
|
seal_baseline = dplyr::left_join(baseline_df_subset, seal_range_df, by = "cell")
|
|
train_df = seal_baseline %>% filter(block_id %in% train_blocks)
|
|
test_df = seal_baseline %>% filter(block_id %in% test_blocks)
|
|
```
|
|
|
|
## Train CatBoost model
|
|
```{r}
|
|
train_pool = catboost.load_pool(
|
|
data = train_df |> select(-cell, -x, -y, -block_id, -target),
|
|
label = train_df$target
|
|
)
|
|
|
|
test_pool = catboost.load_pool(
|
|
data = test_df |> select(-cell, -x, -y, -block_id, -target),
|
|
label = test_df$target
|
|
)
|
|
|
|
params = list(
|
|
loss_function = "Logloss",
|
|
eval_metric = "AUC",
|
|
iterations = 200,
|
|
depth = 2,
|
|
learning_rate = 0.02,
|
|
l2_leaf_reg = 15,
|
|
random_seed = 42,
|
|
rsm = 0.5,
|
|
verbose = 10,
|
|
od_type = "Iter",
|
|
od_wait = 20
|
|
)
|
|
|
|
cat_model = catboost.train(train_pool, test_pool = test_pool, params = params)
|
|
```
|
|
|
|
## Model interpretation outputs
|
|
```{r}
|
|
explainer_cat = explain(
|
|
model = cat_model,
|
|
data = train_df |> select(-cell, -x, -y, -block_id, -target),
|
|
y = train_df$target,
|
|
label = "CatBoost Harp Seal Model",
|
|
predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability")
|
|
)
|
|
|
|
pdp_temp = model_profile(
|
|
explainer = explainer_cat,
|
|
variables = "Average Chlorophyll depthsurf"
|
|
)
|
|
plot(pdp_temp)
|
|
|
|
importanc2e = catboost.get_feature_importance(cat_model, train_pool) |>
|
|
enframe()
|
|
```
|
|
|
|
## Shared artifacts for prediction stage
|
|
These files are the explicit interface between learning and prediction.
|
|
|
|
```{r}
|
|
saveRDS(dynamic_layers, config$artifacts$dynamic_layers)
|
|
saveRDS(subset_baseline_layer_names, config$artifacts$subset_layer_names)
|
|
saveRDS(seal_range_df, config$artifacts$seal_range_df)
|
|
writeRaster(seal_range_raster, config$artifacts$seal_range_raster, overwrite = TRUE)
|
|
catboost.save_model(cat_model, config$artifacts$model)
|
|
|
|
artifact_manifest = tibble::tibble(
|
|
artifact = names(config$artifacts)[1:5],
|
|
path = unlist(config$artifacts[1:5])
|
|
)
|
|
utils::write.csv(artifact_manifest, config$artifacts$manifest, row.names = FALSE)
|
|
utils::capture.output(utils::sessionInfo(), file = config$artifacts$session_info)
|
|
```
|