commit
f491305d6b
@ -0,0 +1,47 @@
|
|||||||
|
*.pptx
|
||||||
|
*.tif
|
||||||
|
data/
|
||||||
|
*.Rda
|
||||||
|
*.rds
|
||||||
|
*.xlsx
|
||||||
|
ssp585-png
|
||||||
|
catboost_info/
|
||||||
|
cat_model.cbm
|
||||||
|
|
||||||
|
.Rproj.user
|
||||||
|
.positai
|
||||||
|
# History files
|
||||||
|
.Rhistory
|
||||||
|
.Rapp.history
|
||||||
|
# Session Data files
|
||||||
|
.RData
|
||||||
|
.RDataTmp
|
||||||
|
# User-specific files
|
||||||
|
.Ruserdata
|
||||||
|
# Example code in package build process
|
||||||
|
*-Ex.R
|
||||||
|
# Output files from R CMD build
|
||||||
|
/*.tar.gz
|
||||||
|
# Output files from R CMD check
|
||||||
|
/*.Rcheck/
|
||||||
|
# RStudio files
|
||||||
|
.Rproj.user/
|
||||||
|
# produced vignettes
|
||||||
|
vignettes/*.html
|
||||||
|
vignettes/*.pdf
|
||||||
|
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
|
||||||
|
.httr-oauth
|
||||||
|
# knitr and R markdown default cache directories
|
||||||
|
*_cache/
|
||||||
|
/cache/
|
||||||
|
# Temporary files created by R markdown
|
||||||
|
*.utf8.md
|
||||||
|
*.knit.md
|
||||||
|
# R Environment Variables
|
||||||
|
.Renviron
|
||||||
|
# pkgdown site
|
||||||
|
docs/
|
||||||
|
# translation temp files
|
||||||
|
po/*~
|
||||||
|
# RStudio Connect folder
|
||||||
|
rsconnect/
|
||||||
@ -0,0 +1,84 @@
|
|||||||
|
make_study_bounds <- function(range_shapefile, expand_degrees = 5) {
|
||||||
|
seal_range <- terra::vect(range_shapefile)
|
||||||
|
bbox <- terra::ext(seal_range) |> terra::extend(expand_degrees)
|
||||||
|
list(
|
||||||
|
seal_range = seal_range,
|
||||||
|
lon_range = c(bbox$xmin, bbox$xmax),
|
||||||
|
lat_range = c(bbox$ymin, bbox$ymax)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
download_biooracle_slice <- function(dynamic_layers, scenario_value, decade_start, lon_range, lat_range, download_root = "./data/bio-oracle-2") {
|
||||||
|
scenario_layers <- dynamic_layers |>
|
||||||
|
dplyr::filter(scenario == scenario_value)
|
||||||
|
|
||||||
|
time_point <- paste0(decade_start, "-01-01T00:00:00Z")
|
||||||
|
slice_constraints <- list(
|
||||||
|
time = c(time_point, time_point),
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
|
||||||
|
download_dir <- file.path(download_root, scenario_value, decade_start)
|
||||||
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
|
||||||
|
sapply(
|
||||||
|
scenario_layers$dataset_id,
|
||||||
|
function(id) biooracler::download_layers(
|
||||||
|
id,
|
||||||
|
constraints = slice_constraints,
|
||||||
|
directory = download_dir
|
||||||
|
),
|
||||||
|
simplify = TRUE
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
download_biooracle_slice_subset <- function(dynamic_layers, scenario_value, decade_start, layers_to_download, lon_range, lat_range, download_root = "./data/bio-oracle-2") {
|
||||||
|
scenario_layers <- dynamic_layers |>
|
||||||
|
dplyr::filter(
|
||||||
|
scenario == scenario_value &
|
||||||
|
var %in% layers_to_download$var &
|
||||||
|
depth %in% layers_to_download$depth
|
||||||
|
)
|
||||||
|
|
||||||
|
time_point <- paste0(decade_start, "-01-01T00:00:00Z")
|
||||||
|
slice_constraints <- list(
|
||||||
|
time = c(time_point, time_point),
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
|
||||||
|
download_dir <- file.path(download_root, scenario_value, decade_start)
|
||||||
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
|
||||||
|
sapply(
|
||||||
|
scenario_layers$dataset_id,
|
||||||
|
function(id) biooracler::download_layers(
|
||||||
|
id,
|
||||||
|
constraints = slice_constraints,
|
||||||
|
directory = download_dir
|
||||||
|
),
|
||||||
|
simplify = TRUE
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
set_brick_names_with_depth <- function(r) {
|
||||||
|
r_depths <- names(r) |> stringr::str_extract("depth[:alpha:]+")
|
||||||
|
r_longnames <- terra::longnames(r)
|
||||||
|
names(r) <- paste(r_longnames, r_depths)
|
||||||
|
r
|
||||||
|
}
|
||||||
|
|
||||||
|
assert_required_files <- function(paths) {
|
||||||
|
missing_paths <- paths[!file.exists(paths)]
|
||||||
|
if (length(missing_paths) > 0) {
|
||||||
|
stop(
|
||||||
|
paste0(
|
||||||
|
"Missing required file(s): ",
|
||||||
|
paste(missing_paths, collapse = ", "),
|
||||||
|
". Run learning pipeline first."
|
||||||
|
),
|
||||||
|
call. = FALSE
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -0,0 +1,250 @@
|
|||||||
|
```{r}
|
||||||
|
library(tidyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(terra)
|
||||||
|
library(mregions2)
|
||||||
|
library(biooracler)
|
||||||
|
library(stringr)
|
||||||
|
library(tibble)
|
||||||
|
library(catboost)
|
||||||
|
library(caret)
|
||||||
|
library(blockCV)
|
||||||
|
library(sf)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# mammals_range = vect("data/iucn/MAMMALS_MARINE_ONLY.shp")
|
||||||
|
# seal_range = mammals_range |>
|
||||||
|
# subset(mammals_range$sci_name == "Pagophilus groenlandicus")
|
||||||
|
# writeVector(seal_range, "data/iucn/Pagophilus_groenlandicus.shp")
|
||||||
|
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
bbox = ext(seal_range) |> extend(5)
|
||||||
|
|
||||||
|
lon_range = c(bbox$xmin, bbox$xmax)
|
||||||
|
lat_range = c(bbox$ymin, bbox$ymax)
|
||||||
|
|
||||||
|
constraints = list(
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Bio-Oracle
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
layers = list_layers()
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
Фиксируем слои, на которые нет прогнозных данных. Их мы не будем использовать в обучении и предсказании.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# Нет прогнозных данных :/
|
||||||
|
removed_layers_ids = c(
|
||||||
|
"par_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"chl_baseline_2000_2018_depthmax",
|
||||||
|
"chl_baseline_2000_2018_depthmean",
|
||||||
|
"chl_baseline_2000_2018_depthmin"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
constant_layers_ids = c("terrain_characteristics")
|
||||||
|
|
||||||
|
constant_layers = layers |>
|
||||||
|
filter(dataset_id %in% constant_layers_ids)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
terrain_raster = download_layers(
|
||||||
|
constant_layers$dataset_id[1],
|
||||||
|
constraints = constraints,
|
||||||
|
directory = "data/bio-oracle-2/terrain_characteristics"
|
||||||
|
)
|
||||||
|
# Переименовываем слои в полные названия, чтобы потом поля понятно назывались
|
||||||
|
names(terrain_raster) = longnames(terrain_raster)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
dynamic_layers = layers |>
|
||||||
|
filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |>
|
||||||
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
download_slice = function(scenario_value, decade_start) {
|
||||||
|
scenario_layers = dynamic_layers |>
|
||||||
|
filter(scenario == scenario_value)
|
||||||
|
|
||||||
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
||||||
|
|
||||||
|
slice_constraints = list(
|
||||||
|
time = c(time_point, time_point),
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
|
||||||
|
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
|
||||||
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
|
||||||
|
slice_rasters = sapply(
|
||||||
|
scenario_layers$dataset_id,
|
||||||
|
function(id) download_layers(
|
||||||
|
id,
|
||||||
|
constraints = slice_constraints,
|
||||||
|
directory = download_dir
|
||||||
|
),
|
||||||
|
simplify = TRUE
|
||||||
|
)
|
||||||
|
|
||||||
|
return(slice_rasters)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_rasters = download_slice("baseline", 2010)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_brick = rast(baseline_rasters)
|
||||||
|
baseline_brick_depths = baseline_brick |>
|
||||||
|
names() |>
|
||||||
|
str_extract("depth[:alpha:]+")
|
||||||
|
baseline_brick_longnames = baseline_brick |> longnames()
|
||||||
|
# baseline_brick_varnames = baseline_brick |> varnames() // краткая запись называний слоёв
|
||||||
|
# Человекочитаемые названия слоёв -> полей в датафрейме (см. следующий блок)
|
||||||
|
names(baseline_brick) = paste(baseline_brick_longnames, baseline_brick_depths)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# features_brick = c(baseline_brick, terrain_raster)
|
||||||
|
features_brick = c(baseline_brick)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# cropped_bbox = ext(
|
||||||
|
# bbox$xmin + 80,
|
||||||
|
# bbox$xmax - 90,
|
||||||
|
# bbox$ymin + 20,
|
||||||
|
# bbox$ymax
|
||||||
|
# )
|
||||||
|
|
||||||
|
cropped_bbox = ext(
|
||||||
|
-20,
|
||||||
|
72,
|
||||||
|
60,
|
||||||
|
85
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cropped_features_brick = features_brick |>
|
||||||
|
crop(cropped_bbox)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_df = cropped_features_brick |>
|
||||||
|
as.data.frame(cells = TRUE)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cropped_seal_range_raster = seal_range |>
|
||||||
|
rasterize(cropped_features_brick[[1]], field="", background=0)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(cropped_seal_range_raster)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cropped_seal_range_df = cropped_seal_range_raster |>
|
||||||
|
as.data.frame(xy = TRUE, cells = TRUE) |>
|
||||||
|
rename(target = layer)
|
||||||
|
#st_as_sf(coords = c("x", "y"), crs = 4326)
|
||||||
|
|
||||||
|
block_size = 12
|
||||||
|
|
||||||
|
cropped_seal_range_df_index = cropped_seal_range_df |>
|
||||||
|
mutate(
|
||||||
|
# Create grid indices based on coordinates
|
||||||
|
grid_x = floor(x / block_size),
|
||||||
|
grid_y = floor(y / block_size),
|
||||||
|
# Assign to "A" or "B" in a checkerboard pattern
|
||||||
|
block_id = (grid_x + grid_y) %% 2
|
||||||
|
)
|
||||||
|
|
||||||
|
train_cells = cropped_seal_range_df_index |>
|
||||||
|
filter(block_id == 0) |>
|
||||||
|
pull(cell)
|
||||||
|
test_cells = cropped_seal_range_df_index |>
|
||||||
|
filter(block_id == 1) |>
|
||||||
|
pull(cell)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
learn_orca = function(hyperparam) {
|
||||||
|
baseline_species_df = baseline_df |>
|
||||||
|
left_join(cropped_seal_range_df, by = "cell") |>
|
||||||
|
filter(!is.na(target))
|
||||||
|
|
||||||
|
train_df = baseline_species_df |>
|
||||||
|
filter(cell %in% train_cells)
|
||||||
|
train_features = train_df %>% select(-cell, -target, -x, -y) # параметры
|
||||||
|
train_labels = train_df$target # наличие ареала
|
||||||
|
train_pool = catboost.load_pool(data = train_features, label = train_labels)
|
||||||
|
|
||||||
|
test_df = baseline_species_df |>
|
||||||
|
filter(cell %in% test_cells)
|
||||||
|
test_features = test_df %>% select(-cell, -target, -x, -y)
|
||||||
|
test_labels = test_df$target
|
||||||
|
test_pool = catboost.load_pool(data = test_features, label = test_labels)
|
||||||
|
|
||||||
|
|
||||||
|
# Обучение
|
||||||
|
model = catboost.train(train_pool, test_pool = test_pool, params = hyperparam)
|
||||||
|
|
||||||
|
return(model)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
fit_params <- list(
|
||||||
|
iterations = 100,
|
||||||
|
learning_rate = 0.01,
|
||||||
|
depth = 6,
|
||||||
|
loss_function = 'Logloss',
|
||||||
|
eval_metric = 'AUC',
|
||||||
|
random_seed = 42,
|
||||||
|
verbose = 10, # Print progress every 100 iterations
|
||||||
|
od_type = "Iter", # Optional: Early stopping
|
||||||
|
od_wait = 20
|
||||||
|
)
|
||||||
|
```
|
||||||
|
попробовать разделить по диагонали
|
||||||
|
```{r}
|
||||||
|
m_seal = learn_orca(fit_params)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
i_seal = catboost.get_feature_importance(m_seal) |> as.data.frame() |> tibble::rownames_to_column("VALUE")
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
sf_use_s2(FALSE)
|
||||||
|
target_test_blocks = cv_spatial(
|
||||||
|
x = cropped_seal_range_sf,
|
||||||
|
column = "layer",
|
||||||
|
size = 1e+06,
|
||||||
|
k = 2
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(cropped_seal_range_raster)
|
||||||
|
```
|
||||||
@ -0,0 +1,414 @@
|
|||||||
|
```{r}
|
||||||
|
library(tidyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(terra)
|
||||||
|
library(mregions2)
|
||||||
|
library(biooracler)
|
||||||
|
library(stringr)
|
||||||
|
library(tibble)
|
||||||
|
library(catboost)
|
||||||
|
library(caret)
|
||||||
|
library(blockCV)
|
||||||
|
library(sf)
|
||||||
|
library(usdm)
|
||||||
|
library(ggcorrplot)
|
||||||
|
library(reshape2)
|
||||||
|
library(tidygraph)
|
||||||
|
library(ggraph)
|
||||||
|
library(CAST)
|
||||||
|
library(pdp)
|
||||||
|
library(ggplot2)
|
||||||
|
library(DALEX)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
|
||||||
|
bbox = ext(seal_range) |> extend(5)
|
||||||
|
|
||||||
|
lon_range = c(bbox$xmin, bbox$xmax)
|
||||||
|
lat_range = c(bbox$ymin, bbox$ymax)
|
||||||
|
|
||||||
|
constraints = list(
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
layers = list_layers()
|
||||||
|
|
||||||
|
# Нет прогнозных данных :/
|
||||||
|
removed_layers_ids = c(
|
||||||
|
"par_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"chl_baseline_2000_2018_depthmax",
|
||||||
|
"chl_baseline_2000_2018_depthmean",
|
||||||
|
"chl_baseline_2000_2018_depthmin"
|
||||||
|
)
|
||||||
|
|
||||||
|
constant_layers_ids = c("terrain_characteristics")
|
||||||
|
|
||||||
|
constant_layers = layers |>
|
||||||
|
filter(dataset_id %in% constant_layers_ids)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
dynamic_layers = layers |>
|
||||||
|
filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |>
|
||||||
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
download_slice = function(scenario_value, decade_start) {
|
||||||
|
scenario_layers = dynamic_layers |>
|
||||||
|
filter(scenario == scenario_value)
|
||||||
|
|
||||||
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
||||||
|
|
||||||
|
slice_constraints = list(
|
||||||
|
time = c(time_point, time_point),
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
|
||||||
|
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
|
||||||
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
|
||||||
|
slice_rasters = sapply(
|
||||||
|
scenario_layers$dataset_id,
|
||||||
|
function(id) download_layers(
|
||||||
|
id,
|
||||||
|
constraints = slice_constraints,
|
||||||
|
directory = download_dir
|
||||||
|
),
|
||||||
|
simplify = TRUE
|
||||||
|
)
|
||||||
|
|
||||||
|
return(slice_rasters)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_rasters = download_slice("baseline", 2010)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_brick = rast(baseline_rasters)
|
||||||
|
baseline_brick_depths = baseline_brick |>
|
||||||
|
names() |>
|
||||||
|
str_extract("depth[:alpha:]+")
|
||||||
|
baseline_brick_longnames = baseline_brick |> longnames()
|
||||||
|
baseline_brick_varnames = baseline_brick |> varnames()
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
subset_baseline_layer_names = tibble(
|
||||||
|
name = names(baseline_brick),
|
||||||
|
longname = baseline_brick_longnames,
|
||||||
|
varname = baseline_brick_varnames,
|
||||||
|
depth = baseline_brick_depths
|
||||||
|
) |>
|
||||||
|
separate_wider_delim(
|
||||||
|
varname,
|
||||||
|
delim = "_",
|
||||||
|
names = c("var", "type")
|
||||||
|
) |>
|
||||||
|
filter(
|
||||||
|
!(
|
||||||
|
depth == "depthmax" |
|
||||||
|
var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") |
|
||||||
|
type %in% c("ltmin", "ltmax", "range")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
subset_baseline_brick = baseline_brick |>
|
||||||
|
subset(subset_baseline_layer_names$name)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
names(baseline_brick) = paste(baseline_brick_longnames, baseline_brick_depths)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# features_brick = c(baseline_brick, terrain_raster)
|
||||||
|
features_brick = c(subset_baseline_brick)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cropped_bbox = ext(
|
||||||
|
-20,
|
||||||
|
72,
|
||||||
|
60,
|
||||||
|
85
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cropped_features_brick = features_brick |>
|
||||||
|
crop(cropped_bbox)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_df = cropped_features_brick |>
|
||||||
|
as.data.frame(cells = TRUE, xy = TRUE)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
vif_input_df <- baseline_df |>
|
||||||
|
select(-cell) |>
|
||||||
|
drop_na()
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
vif_sample = vif_input_df |>
|
||||||
|
sample_n(10000)
|
||||||
|
|
||||||
|
vif_sample = vif_sample[, sapply(vif_sample, function(x) var(x) > 0)]
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
corr_matrix <- cor(vif_sample)
|
||||||
|
|
||||||
|
ggcorrplot(corr_matrix,
|
||||||
|
# hc.order = TRUE, # Clusters similar variables together
|
||||||
|
type = "lower", # Only show half (it's symmetrical anyway)
|
||||||
|
outline.col = "white",
|
||||||
|
colors = c("#6D9EC1", "white", "#E46726"),
|
||||||
|
lab = FALSE) + # Set to TRUE only if you have <20 variables
|
||||||
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
||||||
|
axis.text.y = element_text(size = 7))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
high_cor_pairs <- melt(corr_matrix) |>
|
||||||
|
filter(abs(value) > 0.8) |>
|
||||||
|
filter(Var1 != Var2) |> # Remove self-correlations (1.0 on diagonal)
|
||||||
|
distinct(value, .keep_all = TRUE) |> # Remove duplicates (A-B and B-A)
|
||||||
|
arrange(desc(abs(value))) |>
|
||||||
|
mutate(Var1 = as.character(Var1),
|
||||||
|
Var2 = as.character(Var2))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
graph_data <- as_tbl_graph(high_cor_pairs)
|
||||||
|
|
||||||
|
# Plot the clusters
|
||||||
|
ggraph(graph_data, layout = "nicely") +
|
||||||
|
geom_edge_link(aes(alpha = abs(value)), color = "orange") +
|
||||||
|
geom_node_point(size = 2, color = "steelblue") +
|
||||||
|
geom_node_text(aes(label = name), repel = TRUE, size = 5) +
|
||||||
|
theme_void() +
|
||||||
|
labs(title = "Network of Redundant Variables (|r| > 0.8)")
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
vif_results <- vifstep(vif_sample, th = 10)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
keeper_vars <- vif_results@results$Variables
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_df_subset = baseline_df |>
|
||||||
|
select(cell, x, y, keeper_vars)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cropped_seal_range_raster = seal_range |>
|
||||||
|
rasterize(cropped_features_brick[[1]], field="", background=0)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cropped_seal_range_df = cropped_seal_range_raster |>
|
||||||
|
as.data.frame(cells = TRUE) |>
|
||||||
|
rename(target = layer)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_baseline = dplyr::left_join(baseline_df_subset, cropped_seal_range_df, by = "cell")
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_baseline_sf = st_as_sf(seal_baseline, coords = c("x", "y"), crs = 4326)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
sb <- cv_spatial(x = seal_baseline_sf,
|
||||||
|
column = "target",
|
||||||
|
size = 500000,
|
||||||
|
k = 3,
|
||||||
|
selection = "random")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_baseline$block_id <- sb$folds_ids
|
||||||
|
seal_baseline$target <- as.factor(make.names(seal_baseline$target))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# save(seal_baseline, file = "seal_baseline.Rda")
|
||||||
|
load(file = "seal_baseline.Rda")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
indices <- CAST::CreateSpacetimeFolds(seal_baseline,
|
||||||
|
spacevar = "block_id",
|
||||||
|
k = 3)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_baseline_sample = sample_n(seal_baseline, 100000)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ctrl <- trainControl(method = "cv",
|
||||||
|
index = indices$index,
|
||||||
|
indexOut = indices$indexOut,
|
||||||
|
classProbs = TRUE,
|
||||||
|
summaryFunction = twoClassSummary,
|
||||||
|
verboseIter = TRUE)
|
||||||
|
|
||||||
|
ffs_model <- ffs(
|
||||||
|
predictors = seal_baseline |> select(-cell, -x, -y, -target),
|
||||||
|
response = seal_baseline$target,
|
||||||
|
method = "ranger",
|
||||||
|
metric = "ROC",
|
||||||
|
trControl = ctrl,
|
||||||
|
tuneGrid = expand.grid(mtry = 2,
|
||||||
|
splitrule = "gini",
|
||||||
|
min.node.size = 10),
|
||||||
|
num.trees = 50,
|
||||||
|
num.threads = parallel::detectCores() - 1,
|
||||||
|
withinSE = TRUE,
|
||||||
|
minDiff = 0.005,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# save(ffs_model, file = "ffs_model.Rda")
|
||||||
|
load(file = "ffs_model.Rda")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# 1. Take a small, representative sample for the calculation
|
||||||
|
pdp_sample <- seal_baseline[sample(nrow(seal_baseline), 500), ]
|
||||||
|
|
||||||
|
# 2. Run the partial function with 'train = pdp_sample' and 'grid.resolution'
|
||||||
|
pdp_temp <- partial(ffs_model,
|
||||||
|
pred.var = "Minimum OceanTemperature depthsurf",
|
||||||
|
prob = TRUE,
|
||||||
|
which.class = "X1",
|
||||||
|
train = pdp_sample, # This is the secret to speed
|
||||||
|
grid.resolution = 20) # 20 points is plenty for a smooth line
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# 3. Plot it
|
||||||
|
autoplot(pdp_temp) + theme_minimal()
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# 2. Create the plot
|
||||||
|
# The 'rug = TRUE' adds little tick marks at the bottom showing
|
||||||
|
# where your actual data points sit.
|
||||||
|
autoplot(pdp_temp, rug = TRUE, train = seal_baseline) +
|
||||||
|
theme_minimal() +
|
||||||
|
labs(title = "Partial Dependence: Min Ocean Temperature",
|
||||||
|
subtitle = "How Ocean Temp influences Seal Presence Probability",
|
||||||
|
x = "Temperature (°C)",
|
||||||
|
y = "Probability of Presence") +
|
||||||
|
geom_line(size = 1.2, color = "steelblue")
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
final_vars <- c(
|
||||||
|
"Minimum Chlorophyll depthsurf",
|
||||||
|
"Maximum TotalPhytoplankton depthsurf",
|
||||||
|
"Maximum AirTemperature depthsurf",
|
||||||
|
"Minimum OceanTemperature depthsurf",
|
||||||
|
"Average Chlorophyll depthsurf",
|
||||||
|
"Maximum OceanTemperature depthmean",
|
||||||
|
"Average TotalPhytoplankton depthmean"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
train_data <- seal_baseline %>%
|
||||||
|
mutate(target_num = ifelse(target == "X1", 1, 0))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
unique_blocks <- unique(train_data$block_id)
|
||||||
|
train_blocks <- sample(unique_blocks, size = round(0.7 * length(unique_blocks)))
|
||||||
|
|
||||||
|
# 3. Create the dataframes based on the blocks
|
||||||
|
train_df <- train_data %>% filter(block_id %in% train_blocks)
|
||||||
|
test_df <- train_data %>% filter(!(block_id %in% train_blocks))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
train_pool <- catboost.load_pool(
|
||||||
|
data = train_df[, final_vars],
|
||||||
|
label = train_df$target_num
|
||||||
|
)
|
||||||
|
|
||||||
|
test_pool = catboost.load_pool(
|
||||||
|
data = test_df[, final_vars],
|
||||||
|
label = test_df$target_num
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
params <- list(
|
||||||
|
loss_function = 'Logloss',
|
||||||
|
eval_metric = 'AUC',
|
||||||
|
iterations = 100, # Plenty of trees for a smooth fit
|
||||||
|
depth = 3, # Standard depth to prevent overfitting
|
||||||
|
learning_rate = 0.06, # Lower learning rate is better for high ROC data
|
||||||
|
l2_leaf_reg = 30, # Stronger regularization to handle that 0.998 ROC
|
||||||
|
random_seed = 42,
|
||||||
|
rsm = 0.5,
|
||||||
|
verbose = 10 # Log progress every 100 iterations
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cat_model <- catboost.train(train_pool, test_pool = test_pool, params = params)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
explainer_cat <- explain(
|
||||||
|
model = cat_model,
|
||||||
|
data = train_df[, final_vars],
|
||||||
|
y = train_df$target_num,
|
||||||
|
label = "CatBoost Harp Seal Model",
|
||||||
|
predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability")
|
||||||
|
)
|
||||||
|
```
|
||||||
|
```{r}
|
||||||
|
pdp_temp <- model_profile(
|
||||||
|
explainer = explainer_cat,
|
||||||
|
variables = "Minimum OceanTemperature depthsurf"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3. Plot it
|
||||||
|
plot(pdp_temp)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
importanc2e <- catboost.get_feature_importance(cat_model, train_pool)
|
||||||
|
```
|
||||||
@ -0,0 +1,605 @@
|
|||||||
|
## Load required R packages
|
||||||
|
These libraries provide spatial handling, machine learning, and model explainability tools used throughout the workflow.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
library(tidyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(terra)
|
||||||
|
library(mregions2)
|
||||||
|
library(biooracler)
|
||||||
|
library(stringr)
|
||||||
|
library(tibble)
|
||||||
|
library(catboost)
|
||||||
|
library(caret)
|
||||||
|
library(blockCV)
|
||||||
|
library(sf)
|
||||||
|
library(usdm)
|
||||||
|
library(ggcorrplot)
|
||||||
|
library(reshape2)
|
||||||
|
library(tidygraph)
|
||||||
|
library(ggraph)
|
||||||
|
library(CAST)
|
||||||
|
library(pdp)
|
||||||
|
library(ggplot2)
|
||||||
|
library(DALEX)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Define study area and spatial constraints
|
||||||
|
Here we load the harp seal range shapefile and derive the longitude/latitude bounds used to constrain Bio-ORACLE downloads.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
|
||||||
|
bbox = ext(seal_range) |> extend(5)
|
||||||
|
|
||||||
|
lon_range = c(bbox$xmin, bbox$xmax)
|
||||||
|
lat_range = c(bbox$ymin, bbox$ymax)
|
||||||
|
|
||||||
|
constraints = list(
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## List and filter Bio-ORACLE layers
|
||||||
|
We list available Bio-ORACLE layers, manually remove unsupported ones, and separate constant (terrain) layers from dynamic variables.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
layers = list_layers()
|
||||||
|
|
||||||
|
# Нет прогнозных данных :/
|
||||||
|
removed_layers_ids = c(
|
||||||
|
"par_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"chl_baseline_2000_2018_depthmax",
|
||||||
|
"chl_baseline_2000_2018_depthmean",
|
||||||
|
"chl_baseline_2000_2018_depthmin"
|
||||||
|
)
|
||||||
|
|
||||||
|
constant_layers_ids = c("terrain_characteristics")
|
||||||
|
|
||||||
|
constant_layers = layers |>
|
||||||
|
filter(dataset_id %in% constant_layers_ids)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Prepare dynamic layers metadata
|
||||||
|
We keep only dynamic environmental variables and parse dataset IDs into variable, scenario, time, and depth components.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
dynamic_layers = layers |>
|
||||||
|
filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |>
|
||||||
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE)
|
||||||
|
|
||||||
|
saveRDS(dynamic_layers, 'dynamic_layers.rds')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Helper to download a single temporal slice
|
||||||
|
This function downloads all dynamic layers for a given scenario and decade within the spatial and temporal constraints.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
download_slice = function(scenario_value, decade_start) {
|
||||||
|
scenario_layers = dynamic_layers |>
|
||||||
|
filter(scenario == scenario_value)
|
||||||
|
|
||||||
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
||||||
|
|
||||||
|
slice_constraints = list(
|
||||||
|
time = c(time_point, time_point),
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
|
||||||
|
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
|
||||||
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
|
||||||
|
slice_rasters = sapply(
|
||||||
|
scenario_layers$dataset_id,
|
||||||
|
function(id) download_layers(
|
||||||
|
id,
|
||||||
|
constraints = slice_constraints,
|
||||||
|
directory = download_dir
|
||||||
|
),
|
||||||
|
simplify = TRUE
|
||||||
|
)
|
||||||
|
|
||||||
|
return(slice_rasters)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Download baseline environmental slice
|
||||||
|
We obtain baseline (historical) rasters for the 2010 decade over the study area.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_rasters = download_slice("baseline", 2010)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Build baseline raster brick and extract metadata
|
||||||
|
We combine downloaded rasters into a brick and extract depth, long names, and variable names for later filtering.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_brick = rast(baseline_rasters)
|
||||||
|
baseline_brick_depths = baseline_brick |>
|
||||||
|
names() |>
|
||||||
|
str_extract("depth[:alpha:]+")
|
||||||
|
baseline_brick_longnames = baseline_brick |> longnames()
|
||||||
|
baseline_brick_varnames = baseline_brick |> varnames()
|
||||||
|
names(baseline_brick) = paste(baseline_brick_longnames, baseline_brick_depths)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Select ecologically relevant baseline variables
|
||||||
|
We filter out less relevant or redundant variables and keep a focused subset of candidate predictors.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
suitable_baseline_layer_names = tibble(
|
||||||
|
name = names(baseline_brick),
|
||||||
|
longname = baseline_brick_longnames,
|
||||||
|
varname = baseline_brick_varnames,
|
||||||
|
depth = baseline_brick_depths,
|
||||||
|
) |>
|
||||||
|
separate_wider_delim(
|
||||||
|
varname,
|
||||||
|
delim = "_",
|
||||||
|
names = c("var", "type")
|
||||||
|
) |>
|
||||||
|
filter(
|
||||||
|
!(
|
||||||
|
depth == "depthmax" |
|
||||||
|
var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") |
|
||||||
|
type %in% c("ltmin", "ltmax", "range")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
subset_baseline_layer_names = suitable_baseline_layer_names |>
|
||||||
|
filter(
|
||||||
|
name %in% c(
|
||||||
|
"Minimum SeaIceCover depthsurf",
|
||||||
|
"Minimum OceanTemperature depthsurf",
|
||||||
|
"Average SeaIceThickness depthsurf",
|
||||||
|
"Average Chlorophyll depthsurf",
|
||||||
|
"Maximum OceanTemperature depthmin"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
## Inspect chosen baseline variables
|
||||||
|
We preview the table of selected variables to confirm that only the intended layers remain.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
subset_baseline_layer_names
|
||||||
|
saveRDS(subset_baseline_layer_names, file = "subset_baseline_layer_names.rds")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Build subset raster brick
|
||||||
|
We subset the baseline raster brick to include only the selected variables.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
subset_baseline_brick = baseline_brick |>
|
||||||
|
subset(subset_baseline_layer_names$name)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Combine features into a single brick
|
||||||
|
The final feature brick contains the chosen environmental predictors (terrain can be added later if needed).
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# features_brick = c(baseline_brick, terrain_raster)
|
||||||
|
features_brick = c(subset_baseline_brick)
|
||||||
|
```
|
||||||
|
|
||||||
|
## (Optional) Crop feature brick to a subregion
|
||||||
|
This chunk shows how to restrict the analysis to a smaller bounding box if desired (currently commented out).
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# cropped_bbox = ext(
|
||||||
|
# -20,
|
||||||
|
# 72,
|
||||||
|
# 60,
|
||||||
|
# 85
|
||||||
|
# )
|
||||||
|
|
||||||
|
# cropped_features_brick = features_brick |>
|
||||||
|
# crop(cropped_bbox)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Convert raster brick to data frame
|
||||||
|
We convert the environmental rasters to a tidy data frame with cell indices and coordinates.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_df = features_brick |>
|
||||||
|
as.data.frame(cells = TRUE, xy = TRUE)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# vif_input_df <- baseline_df |>
|
||||||
|
# select(-cell) |>
|
||||||
|
# drop_na()
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# vif_sample = vif_input_df |>
|
||||||
|
# sample_n(10000)
|
||||||
|
|
||||||
|
# vif_sample = vif_sample[, sapply(vif_sample, function(x) var(x) > 0)]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Explore correlations among predictors
|
||||||
|
We randomly sample cells, compute a correlation matrix, and visualize pairwise correlations.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
sample = baseline_df |>
|
||||||
|
sample_n(10000) |>
|
||||||
|
select(-cell, -x, -y) |>
|
||||||
|
drop_na()
|
||||||
|
corr_matrix <- cor(sample)
|
||||||
|
|
||||||
|
ggcorrplot(corr_matrix,
|
||||||
|
hc.order = TRUE, # Clusters similar variables together
|
||||||
|
type = "lower", # Only show half (it's symmetrical anyway)
|
||||||
|
outline.col = "white",
|
||||||
|
colors = c("#6D9EC1", "white", "#E46726"),
|
||||||
|
lab = FALSE) + # Set to TRUE only if you have <20 variables
|
||||||
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
||||||
|
axis.text.y = element_text(size = 7))
|
||||||
|
```
|
||||||
|
|
||||||
|
## Identify highly correlated variable pairs
|
||||||
|
We list variable pairs with strong correlations to better understand redundancy among predictors.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
high_cor_pairs <- melt(corr_matrix) |>
|
||||||
|
filter(abs(value) > 0.8) |>
|
||||||
|
filter(Var1 != Var2) |> # Remove self-correlations (1.0 on diagonal)
|
||||||
|
distinct(value, .keep_all = TRUE) |> # Remove duplicates (A-B and B-A)
|
||||||
|
arrange(desc(abs(value))) |>
|
||||||
|
mutate(Var1 = as.character(Var1),
|
||||||
|
Var2 = as.character(Var2))
|
||||||
|
```
|
||||||
|
|
||||||
|
## Perform VIF-based variable selection
|
||||||
|
Variance Inflation Factor (VIF) is used to remove collinear predictors and retain a stable subset.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
vif_results <- vifstep(sample, th = 10)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Extract retained predictor names
|
||||||
|
We pull out the names of variables that passed the VIF threshold.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
keeper_vars <- vif_results@results$Variables
|
||||||
|
```
|
||||||
|
|
||||||
|
## Subset baseline data frame to VIF-selected variables
|
||||||
|
We keep only the selected predictors along with cell indices and coordinates.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_df_subset = baseline_df |>
|
||||||
|
select(cell, x, y, all_of(keeper_vars))
|
||||||
|
```
|
||||||
|
## Rasterize harp seal range
|
||||||
|
We convert the harp seal polygon range into a raster aligned with the environmental brick (presence/absence mask).
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range_raster = seal_range |>
|
||||||
|
rasterize(features_brick[[1]], field="", background=0)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
writeRaster(seal_range_raster, "seal_range_raster.tif")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Define spatial blocks for cross-validation
|
||||||
|
We create block IDs over the study area and split them into train and test sets.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
all_blocks <- 1:15
|
||||||
|
set.seed(321) # For reproducibility
|
||||||
|
|
||||||
|
test_blocks <- sample(all_blocks, 5) # Randomly pick 5 blocks for testing
|
||||||
|
train_blocks <- setdiff(all_blocks, test_blocks)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
block_grid = seal_range_raster |>
|
||||||
|
ext() |>
|
||||||
|
st_bbox() |>
|
||||||
|
st_make_grid(n = c(5, 3)) |>
|
||||||
|
st_sf() |>
|
||||||
|
mutate(block_id = row_number()) |>
|
||||||
|
mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train"))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
block_raster = block_grid |>
|
||||||
|
rasterize(seal_range_raster, field = "block_id")
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range_raster$block_id = block_raster$block_id
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(seal_range_raster$layer)
|
||||||
|
plot(vect(block_grid), add = TRUE, border = "black", lwd = 1)
|
||||||
|
plot(
|
||||||
|
vect(block_grid |> filter(type == "Test (Hold-out)")),
|
||||||
|
add = TRUE,
|
||||||
|
border = "red",
|
||||||
|
lwd = 3)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range_df = seal_range_raster |>
|
||||||
|
as.data.frame(cells = TRUE) |>
|
||||||
|
rename(target = layer)
|
||||||
|
|
||||||
|
saveRDS(seal_range_df, file = "seal_range_df.rds")
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_baseline = dplyr::left_join(baseline_df_subset, seal_range_df, by = "cell")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# 3. Create the dataframes based on the blocks
|
||||||
|
train_df <- seal_baseline %>% filter(block_id %in% train_blocks)
|
||||||
|
test_df <- seal_baseline %>% filter(block_id %in% test_blocks)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
train_pool <- catboost.load_pool(
|
||||||
|
data = train_df |> select(-cell, -x, -y, -block_id, -target),
|
||||||
|
label = train_df$target
|
||||||
|
)
|
||||||
|
|
||||||
|
test_pool = catboost.load_pool(
|
||||||
|
data = test_df |> select(-cell, -x, -y, -block_id, -target),
|
||||||
|
label = test_df$target
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
params <- list(
|
||||||
|
loss_function = 'Logloss',
|
||||||
|
eval_metric = 'AUC',
|
||||||
|
iterations = 200, # Plenty of trees for a smooth fit
|
||||||
|
depth = 2, # Standard depth to prevent overfitting
|
||||||
|
learning_rate = 0.02, # Lower learning rate is better for high ROC data
|
||||||
|
l2_leaf_reg = 15, # Stronger regularization to handle that 0.998 ROC
|
||||||
|
random_seed = 42,
|
||||||
|
rsm = 0.5,
|
||||||
|
verbose = 10,
|
||||||
|
od_type = "Iter",
|
||||||
|
od_wait = 20
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cat_model <- catboost.train(train_pool, test_pool = test_pool, params = params)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
explainer_cat <- explain(
|
||||||
|
model = cat_model,
|
||||||
|
data = train_df |> select(-cell, -x, -y, -block_id, -target),
|
||||||
|
y = train_df$target,
|
||||||
|
label = "CatBoost Harp Seal Model",
|
||||||
|
predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability")
|
||||||
|
)
|
||||||
|
```
|
||||||
|
```{r}
|
||||||
|
pdp_temp <- model_profile(
|
||||||
|
explainer = explainer_cat,
|
||||||
|
variables = "Average Chlorophyll depthsurf"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3. Plot it
|
||||||
|
plot(pdp_temp)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
importanc2e <- catboost.get_feature_importance(cat_model, train_pool) |>
|
||||||
|
enframe()
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
catboost.save_model(cat_model, "cat_model.cbm")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Make a prediction
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
download_slice_subset = function(scenario_value, decade_start, layers_to_download) {
|
||||||
|
scenario_layers = dynamic_layers |>
|
||||||
|
filter(
|
||||||
|
scenario == scenario_value &
|
||||||
|
var %in% layers_to_download$var &
|
||||||
|
depth %in% layers_to_download$depth
|
||||||
|
)
|
||||||
|
|
||||||
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
||||||
|
|
||||||
|
slice_constraints = list(
|
||||||
|
time = c(time_point, time_point),
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
|
||||||
|
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
|
||||||
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
|
||||||
|
slice_rasters = sapply(
|
||||||
|
scenario_layers$dataset_id,
|
||||||
|
function(id) download_layers(
|
||||||
|
id,
|
||||||
|
constraints = slice_constraints,
|
||||||
|
directory = download_dir
|
||||||
|
),
|
||||||
|
simplify = TRUE
|
||||||
|
)
|
||||||
|
|
||||||
|
return(slice_rasters)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ssp585 = download_slice_subset("ssp585", 2090, subset_baseline_layer_names)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cat_model <- catboost.load_model('cat_model.cbm')
|
||||||
|
subset_baseline_layer_names = readRDS('subset_baseline_layer_names.rds')
|
||||||
|
seal_range_df = readRDS('seal_range_df.rds')
|
||||||
|
seal_range_raster = rast('seal_range_raster.tif')
|
||||||
|
dynamic_layers = readRDS('dynamic_layers.rds')
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
get_prediction = function(ssp_code, decade) {
|
||||||
|
ssp_slice = download_slice_subset(ssp_code, decade, subset_baseline_layer_names)
|
||||||
|
|
||||||
|
ssp_slice_brick = rast(ssp_slice)
|
||||||
|
ssp_slice_brick_depths = ssp_slice_brick |>
|
||||||
|
names() |>
|
||||||
|
str_extract("depth[:alpha:]+")
|
||||||
|
ssp_slice_brick_longnames = ssp_slice_brick |> longnames()
|
||||||
|
# baseline_brick_varnames = baseline_brick |> varnames() // коды longnames
|
||||||
|
|
||||||
|
names(ssp_slice_brick) = paste(ssp_slice_brick_longnames, ssp_slice_brick_depths)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
ssp_slice_df = ssp_slice_brick |>
|
||||||
|
as.data.frame(cells = TRUE, xy = TRUE)
|
||||||
|
|
||||||
|
ssp_slice_features = ssp_slice_df |> select(-cell, -x, -y)
|
||||||
|
|
||||||
|
ssp_slice_pool <- catboost.load_pool(data = ssp_slice_features)
|
||||||
|
|
||||||
|
preds_prob <- catboost.predict(cat_model, ssp_slice_pool, prediction_type = 'Probability')
|
||||||
|
preds_class <- ifelse(preds_prob > 0.5, 1, 0)
|
||||||
|
|
||||||
|
ssp_slice_prediction = ssp_slice_df |>
|
||||||
|
mutate(prediction = preds_class) |>
|
||||||
|
select(cell, prediction)
|
||||||
|
|
||||||
|
ssp_slice_diff = seal_range_df |>
|
||||||
|
left_join(ssp_slice_prediction, by = "cell") |>
|
||||||
|
mutate(diff = 2*target + prediction)
|
||||||
|
|
||||||
|
r = rast(ssp_slice_brick)
|
||||||
|
r[ssp_slice_diff$cell] = ssp_slice_diff$diff
|
||||||
|
|
||||||
|
writeRaster(r[[1]], paste0(ssp_code, "-", decade, ".tif"))
|
||||||
|
|
||||||
|
png(filename = paste0(ssp_code, "-", decade, ".png"), width = 800, height = 800)
|
||||||
|
plot(
|
||||||
|
r[[1]],
|
||||||
|
type="classes",
|
||||||
|
col=c("grey", "green", "red", "purple"),
|
||||||
|
# col=c("grey", "#7fc97f", "#fdc086", "#beaed4"),
|
||||||
|
# levels=c("0 → 0", "0 → 1", "1 → 0", "1 → 1"),
|
||||||
|
levels=c("00", "01", "10", "11"),
|
||||||
|
main=paste0(ssp_code, "-", decade)
|
||||||
|
)
|
||||||
|
dev.off()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
get_prediction("ssp585", 2050)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
sapply(seq(2050, 2050, by=10), function(decade) {
|
||||||
|
get_prediction("ssp585", decade)
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ssp585_2090_brick = rast(ssp585)
|
||||||
|
ssp585_2090_brick_depths = ssp585_2090_brick |>
|
||||||
|
names() |>
|
||||||
|
str_extract("depth[:alpha:]+")
|
||||||
|
ssp585_2090_brick_longnames = ssp585_2090_brick |> longnames()
|
||||||
|
# baseline_brick_varnames = baseline_brick |> varnames() // коды longnames
|
||||||
|
|
||||||
|
names(ssp585_2090_brick) = paste(ssp585_2090_brick_longnames, ssp585_2090_brick_depths)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ssp585_2090_df = ssp585_2090_brick |>
|
||||||
|
as.data.frame(cells = TRUE, xy = TRUE)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ssp585_2090_features = ssp585_2090_df |> select(-cell, -x, -y)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ssp585_2090_pool <- catboost.load_pool(data = ssp585_2090_features)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
preds_prob <- catboost.predict(cat_model, ssp585_2090_pool, prediction_type = 'Probability')
|
||||||
|
preds_class <- ifelse(preds_prob > 0.5, 1, 0)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ssp585_2090_prediction = ssp585_2090_df |>
|
||||||
|
mutate(prediction = preds_class) |>
|
||||||
|
select(cell, prediction)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ssp585_2090_diff = seal_range_df |>
|
||||||
|
left_join(ssp585_2090_prediction, by = "cell") |>
|
||||||
|
mutate(diff = 2*target + prediction)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
hist(ssp585_2090_diff$diff)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
r = rast(baseline_brick)
|
||||||
|
r[ssp585_2090_diff$cell] = ssp585_2090_diff$diff
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
writeRaster(r[[1]], "ssp585-2090.tif")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(
|
||||||
|
r[[1]],
|
||||||
|
type="classes",
|
||||||
|
col=c("grey", "green", "red", "purple"),
|
||||||
|
# col=c("grey", "#7fc97f", "#fdc086", "#beaed4"),
|
||||||
|
levels=c("0 → 0", "0 → 1", "1 → 0", "1 → 1"),
|
||||||
|
main="SSP 585 - 2090"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
a = rast('ssp585-2090.tif')
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(a)
|
||||||
|
```
|
||||||
@ -0,0 +1,567 @@
|
|||||||
|
---
|
||||||
|
title: "bio-oracle-5"
|
||||||
|
format: html
|
||||||
|
---
|
||||||
|
|
||||||
|
## Libraries
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
library(dplyr)
|
||||||
|
library(tidyr)
|
||||||
|
library(tibble)
|
||||||
|
library(stringr)
|
||||||
|
|
||||||
|
library(terra)
|
||||||
|
|
||||||
|
library(biooracler)
|
||||||
|
library(sf)
|
||||||
|
library(ggplot2)
|
||||||
|
library(ggcorrplot)
|
||||||
|
|
||||||
|
library(usdm)
|
||||||
|
|
||||||
|
library(catboost)
|
||||||
|
|
||||||
|
library(DALEX)
|
||||||
|
library(pdp)
|
||||||
|
# library(ggspatial)
|
||||||
|
# library(rnaturalearth)
|
||||||
|
# library(tidyterra)
|
||||||
|
# source("./scripts/degree_labels.R")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Range and study area
|
||||||
|
|
||||||
|
Load the species range from IUCN and 5° buffer to define an area of the study.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
|
||||||
|
bbox = ext(seal_range) |> extend(5)
|
||||||
|
bbox_vect = bbox |> as.lines(crs="EPSG:4326")
|
||||||
|
# land = ne_download(scale=110, type="land", category = "physical", returnclass = "sv")
|
||||||
|
land = vect("land.geojson")
|
||||||
|
|
||||||
|
lon_range = c(bbox$xmin, bbox$xmax)
|
||||||
|
lat_range = c(bbox$ymin, bbox$ymax)
|
||||||
|
|
||||||
|
constraints_geo = list(
|
||||||
|
longitude = lon_range,
|
||||||
|
latitude = lat_range
|
||||||
|
)
|
||||||
|
|
||||||
|
saveRDS(constraints_geo, file="constraints_geo.Rda")
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(seal_range, col="#bcbddc", xlim=c(-170, 170), ylim=c(90, -80))
|
||||||
|
plot(land, col="#f0f0f0", add=T)
|
||||||
|
lines(bbox, col="#756bb1")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Bio-ORACLE
|
||||||
|
|
||||||
|
> Environmental predictors were sourced from the Bio-ORACLE v3.0 database, providing standardized global marine rasters for present-day conditions and future climate projections under CMIP6 Shared Socioeconomic Pathways (SSPs).
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
all_layers = list_layers()
|
||||||
|
|
||||||
|
ids_to_remove = c(
|
||||||
|
# no projection data
|
||||||
|
# the database flaws (?)
|
||||||
|
"par_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"chl_baseline_2000_2018_depthmax",
|
||||||
|
"chl_baseline_2000_2018_depthmean",
|
||||||
|
"chl_baseline_2000_2018_depthmin",
|
||||||
|
# nature of the variable
|
||||||
|
"terrain_characteristics"
|
||||||
|
)
|
||||||
|
|
||||||
|
layers = all_layers |>
|
||||||
|
filter(! dataset_id %in% c(ids_to_remove)) |>
|
||||||
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE) |>
|
||||||
|
mutate(
|
||||||
|
var_depth = paste0(var, "_", depth),
|
||||||
|
var_depth_humane = str_extract(title, ".*]") |> str_remove("Bio-Oracle ")
|
||||||
|
)
|
||||||
|
# aware that not all variables have ssp126
|
||||||
|
|
||||||
|
saveRDS(layers, "layers.Rda")
|
||||||
|
|
||||||
|
layers |> select(var_depth_humane, var_depth) |> distinct() |> print.data.frame()
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
download_slice = function(scenario_value, decade_start, layers_to_filter) {
|
||||||
|
scenario_layers = layers_to_filter |>
|
||||||
|
filter(scenario == scenario_value)
|
||||||
|
|
||||||
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
||||||
|
|
||||||
|
slice_constraints = list(
|
||||||
|
time = c(time_point, time_point),
|
||||||
|
longitude = constraints_geo$longitude,
|
||||||
|
latitude = constraints_geo$latitude
|
||||||
|
)
|
||||||
|
|
||||||
|
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
|
||||||
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
|
||||||
|
slice_rasters = sapply(
|
||||||
|
scenario_layers$dataset_id,
|
||||||
|
function(id) download_layers(
|
||||||
|
id,
|
||||||
|
constraints = slice_constraints,
|
||||||
|
directory = download_dir
|
||||||
|
),
|
||||||
|
simplify = TRUE
|
||||||
|
)
|
||||||
|
|
||||||
|
return(slice_rasters)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
slice_to_brick = function(list_of_rasters) {
|
||||||
|
brick = rast(list_of_rasters)
|
||||||
|
depths = brick |> names() |> str_extract("depth[:letter:]+")
|
||||||
|
var_stat = brick |>
|
||||||
|
varnames() |>
|
||||||
|
as_tibble() |>
|
||||||
|
separate_wider_delim("value",delim="_", names=c("var", "stat"))
|
||||||
|
|
||||||
|
prev_longnames = longnames(brick)
|
||||||
|
longnames(brick) = paste0(prev_longnames, " [", depths ,"]")
|
||||||
|
names(brick) = paste(var_stat$var, depths, var_stat$stat, sep = "_")
|
||||||
|
return(brick)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Data exploration
|
||||||
|
|
||||||
|
Feel free to skip this step as it shows the logic behind the layers selected for
|
||||||
|
analysis.
|
||||||
|
|
||||||
|
### Download
|
||||||
|
|
||||||
|
```{r eval=FALSE}
|
||||||
|
baseline_rasters = download_slice("baseline", 2010, layers)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_brick = slice_to_brick(baseline_rasters)
|
||||||
|
```
|
||||||
|
|
||||||
|
300 hundred layers seem too many for a controlled analysis
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
nlyr(baseline_brick)
|
||||||
|
```
|
||||||
|
### Filter by ecology
|
||||||
|
|
||||||
|
Knowing smth about the species lets clean up variables before any formal analysis of variables releations
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
filtered_layers = tibble(
|
||||||
|
names = names(baseline_brick),
|
||||||
|
longnames = longnames(baseline_brick)
|
||||||
|
) |>
|
||||||
|
separate_wider_delim(
|
||||||
|
"names",
|
||||||
|
delim="_",
|
||||||
|
names=c("var", "depth", "stat"),
|
||||||
|
cols_remove=F
|
||||||
|
) |>
|
||||||
|
filter(
|
||||||
|
!(
|
||||||
|
depth %in% c("depthmax", "depthmean") |
|
||||||
|
var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") |
|
||||||
|
stat %in% c("ltmin", "ltmax", "range")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
baseline_brick_subset = baseline_brick |>
|
||||||
|
subset(filtered_layers$names)
|
||||||
|
|
||||||
|
filtered_layers |> select(longnames) |> print.data.frame()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sample for correlation analysis
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ten_percent_cells = nrow(baseline_brick_subset) * ncol(baseline_brick_subset) * 0.1
|
||||||
|
|
||||||
|
baseline_brick_subset_sample = baseline_brick_subset |>
|
||||||
|
spatSample(size=ten_percent_cells, method="regular", na.rm=TRUE)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Initial correlation
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
corr_matrix = cor(baseline_brick_subset_sample)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ggcorrplot(corr_matrix,
|
||||||
|
type = "lower", # Only show half (it's symmetrical anyway)
|
||||||
|
outline.col = "white",
|
||||||
|
colors = c("#6D9EC1", "white", "#E46726"),
|
||||||
|
lab = FALSE) + # don't label values
|
||||||
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
||||||
|
axis.text.y = element_text(size = 7))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
high_cor_pairs <- corr_matrix |>
|
||||||
|
as.data.frame() |>
|
||||||
|
rownames_to_column("Var1") |>
|
||||||
|
pivot_longer(-Var1, names_to = "Var2", values_to = "value") |> # 900 total pairs
|
||||||
|
# Var1 < Var2 removes self-correlation AND picks only one of the AB/BA pairs
|
||||||
|
filter(abs(value) > 0.8 & Var1 < Var2) |>
|
||||||
|
mutate(value = round(value, 3)) |>
|
||||||
|
arrange(desc(abs(value)))
|
||||||
|
|
||||||
|
# 59 are highly correlated
|
||||||
|
print.data.frame(high_cor_pairs)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Variance Inflation Factor
|
||||||
|
|
||||||
|
> It calculates how much one variable can be predicted by a linear combination of all other variables.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
vif_results = vifstep(baseline_brick_subset_sample, th = 10)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
vars_to_keep = vif_results@results$Variables
|
||||||
|
vif_results@results
|
||||||
|
```
|
||||||
|
|
||||||
|
Then check the correlations of variables filtered by VIF step
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_brick_subset_sample_vif = baseline_brick_subset_sample |>
|
||||||
|
select(all_of(vars_to_keep))
|
||||||
|
|
||||||
|
corr_matrix_vif = cor(baseline_brick_subset_sample_vif)
|
||||||
|
|
||||||
|
ggcorrplot(corr_matrix_vif,
|
||||||
|
outline.col = "white",
|
||||||
|
colors = c("#6D9EC1", "white", "#E46726"),
|
||||||
|
lab = FALSE) + # don't label values
|
||||||
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
||||||
|
axis.text.y = element_text(size = 7))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
high_cor_pairs_vif <- corr_matrix_vif |>
|
||||||
|
as.data.frame() |>
|
||||||
|
rownames_to_column("Var1") |>
|
||||||
|
pivot_longer(-Var1, names_to = "Var2", values_to = "value") |> # 144 total pairs
|
||||||
|
# Var1 < Var2 removes self-correlation AND picks only one of the AB/BA pairs
|
||||||
|
filter(abs(value) > 0.8 & Var1 < Var2) |>
|
||||||
|
mutate(value = round(value, 3)) |>
|
||||||
|
arrange(desc(abs(value)))
|
||||||
|
|
||||||
|
# 3 are highly correlated
|
||||||
|
print.data.frame(high_cor_pairs_vif)
|
||||||
|
```
|
||||||
|
Having high correlation pairs and VIF values we manually select variables we can interpret
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
manually_selected_vars = c(
|
||||||
|
"siconc_depthsurf_min",
|
||||||
|
"thetao_depthsurf_min",
|
||||||
|
"thetao_depthmin_max",
|
||||||
|
"chl_depthsurf_mean",
|
||||||
|
"phyc_depthmin_max"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Then again check correlation
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_brick_subset_sample_manual = baseline_brick_subset_sample |>
|
||||||
|
select(all_of(manually_selected_vars))
|
||||||
|
|
||||||
|
corr_matrix_manual = cor(baseline_brick_subset_sample_manual)
|
||||||
|
|
||||||
|
ggcorrplot(corr_matrix_manual,
|
||||||
|
outline.col = "white",
|
||||||
|
colors = c("#6D9EC1", "white", "#E46726"),
|
||||||
|
lab = FALSE) + # don't label values
|
||||||
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
||||||
|
axis.text.y = element_text(size = 7))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
selected_layers = filtered_layers |>
|
||||||
|
filter(names %in% manually_selected_vars)
|
||||||
|
|
||||||
|
saveRDS(selected_layers, file="selected_layers.Rda")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Learning
|
||||||
|
|
||||||
|
### Input layers
|
||||||
|
|
||||||
|
Filter layers based on selected layers info.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
layers = readRDS("layers.Rda")
|
||||||
|
selected_layers = readRDS("selected_layers.Rda")
|
||||||
|
constraints_geo = readRDS("constraints_geo.Rda")
|
||||||
|
|
||||||
|
features_layers = inner_join(selected_layers, layers, by=c("var", "depth"))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_features_rasters = download_slice("baseline", 2010, features_layers)
|
||||||
|
```
|
||||||
|
|
||||||
|
Set up features raster brick
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_features_brick = slice_to_brick(baseline_features_rasters) |>
|
||||||
|
subset(c(selected_layers$names))
|
||||||
|
```
|
||||||
|
|
||||||
|
And target raster
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
|
||||||
|
ocean_mask = ifel(is.na(baseline_features_brick[[1]]), NA, 1)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range_raster = rasterize(
|
||||||
|
seal_range,
|
||||||
|
baseline_features_brick[[1]],
|
||||||
|
field = "",
|
||||||
|
background = 0
|
||||||
|
) |>
|
||||||
|
mask(ocean_mask)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(c(seal_range_raster, baseline_features_brick))
|
||||||
|
```
|
||||||
|
|
||||||
|
### Spatial blocks
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
ROWS = 3
|
||||||
|
COLUMNS = 5
|
||||||
|
nblocks = ROWS * COLUMNS
|
||||||
|
|
||||||
|
all_blocks = 1:(nblocks)
|
||||||
|
set.seed(321) # For reproducibility
|
||||||
|
|
||||||
|
test_blocks = seq(2, nblocks, by = 2)
|
||||||
|
train_blocks = setdiff(all_blocks, test_blocks)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
block_grid = seal_range_raster |>
|
||||||
|
ext() |>
|
||||||
|
st_bbox() |>
|
||||||
|
st_make_grid(n = c(COLUMNS, ROWS)) |>
|
||||||
|
st_sf() |>
|
||||||
|
mutate(block_id = row_number()) |>
|
||||||
|
mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train"))
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
block_raster = block_grid |>
|
||||||
|
rasterize(seal_range_raster, field = "block_id")
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(seal_range_raster$layer)
|
||||||
|
plot(vect(block_grid), add = TRUE, border = "black", lwd = 1)
|
||||||
|
plot(
|
||||||
|
vect(block_grid |> filter(type == "Test (Hold-out)")),
|
||||||
|
add = TRUE,
|
||||||
|
border = "red",
|
||||||
|
lwd = 3)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range_raster$block_id = block_raster$block_id
|
||||||
|
```
|
||||||
|
|
||||||
|
### Catboost
|
||||||
|
|
||||||
|
#### Prep
|
||||||
|
|
||||||
|
Set up the dataframe for machine learning
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
seal_range_df = seal_range_raster |>
|
||||||
|
as.data.frame(cells = TRUE, na.rm=TRUE) |>
|
||||||
|
rename(target = layer)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_features_df = baseline_features_brick |>
|
||||||
|
as.data.frame(cells = TRUE, na.rm=TRUE)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
target_features = left_join(seal_range_df, baseline_features_df, by = "cell")
|
||||||
|
```
|
||||||
|
|
||||||
|
Divide training and testing pools
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
train_df = target_features %>% filter(block_id %in% train_blocks)
|
||||||
|
test_df = target_features %>% filter(block_id %in% test_blocks)
|
||||||
|
|
||||||
|
train_pool <- catboost.load_pool(
|
||||||
|
data = train_df |> select(-cell, -block_id, -target),
|
||||||
|
label = train_df$target
|
||||||
|
)
|
||||||
|
test_pool = catboost.load_pool(
|
||||||
|
data = test_df |> select(-cell, -block_id, -target),
|
||||||
|
label = test_df$target
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Learning
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
params = list(
|
||||||
|
loss_function = 'Logloss',
|
||||||
|
eval_metric = 'AUC',
|
||||||
|
iterations = 200, # Plenty of trees for a smooth fit
|
||||||
|
depth = 2, # Standard depth to prevent overfitting
|
||||||
|
learning_rate = 0.02, # Lower learning rate is better for high ROC data
|
||||||
|
l2_leaf_reg = 15, # Stronger regularization to handle that 0.998 ROC
|
||||||
|
random_seed = 42,
|
||||||
|
rsm = 0.5,
|
||||||
|
verbose = 10,
|
||||||
|
od_type = "Iter",
|
||||||
|
od_wait = 20
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
cat_model = catboost.train(train_pool, test_pool = test_pool, params = params)
|
||||||
|
saveRDS(cat_model, "cat_model.Rda")
|
||||||
|
```
|
||||||
|
#### Catboost Result
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
whole_pool = catboost.load_pool(
|
||||||
|
data = target_features |> select(-cell, -block_id, -target)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
preds_prob = catboost.predict(cat_model, whole_pool, prediction_type = 'Probability')
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_prediction_raster = seal_range_raster
|
||||||
|
baseline_prediction_raster[target_features$cell] = preds_prob
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(baseline_prediction_raster$layer)
|
||||||
|
plot(seal_range, col=NA, border="cyan", lwd=1, add=T)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Post-processing
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(seal_range_raster)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
range_distance = gridDist(seal_range_raster, target=1)
|
||||||
|
distance_decay = exp(-0.000001 * range_distance) |>
|
||||||
|
subst(NA, 0)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(distance_decay)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_prediction_raster$layer_dist = baseline_prediction_raster$layer * distance_decay
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(baseline_prediction_raster$layer_dist)
|
||||||
|
plot(seal_range, col=NA, border="cyan", lwd=1, add=T)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
plot(ifel(baseline_prediction_raster$layer_dist > 0.7, 1, 0))
|
||||||
|
plot(seal_range, col=NA, border="cyan", lwd=1, add=T)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
#### Interpret
|
||||||
|
|
||||||
|
Maps
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Plots
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
explainer_cat = explain(
|
||||||
|
model = cat_model,
|
||||||
|
data = train_df |> select(-cell, -block_id, -target),
|
||||||
|
y = train_df$target,
|
||||||
|
label = "CatBoost Harp Seal Model",
|
||||||
|
predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability")
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
vi_cat = model_parts(explainer_cat)
|
||||||
|
plot(vi_cat)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
pdp_cat = lapply(selected_layers$names, function(X) model_profile(explainer_cat, variables = X))
|
||||||
|
plot(pdp_cat)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
mp_cat = model_performance(explainer_cat)
|
||||||
|
plot(mp_cat, geom = "boxplot")
|
||||||
|
plot(mp_cat, geom = "roc") # Or geom = "boxplot" for residuals
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
# Pick a specific "wrong" pixel from your dataframe
|
||||||
|
caspian_pixel = train_df[train_df$cell == 120363, ]
|
||||||
|
|
||||||
|
bd_cat = predict_parts(explainer_cat, new_observation = caspian_pixel)
|
||||||
|
plot(bd_cat)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -0,0 +1,296 @@
|
|||||||
|
## Learning Pipeline (Training)
|
||||||
|
This file contains the learning stage of the prototype: data download, feature filtering, spatial split, and CatBoost model training.
|
||||||
|
|
||||||
|
## Load required R packages
|
||||||
|
```{r}
|
||||||
|
library(tidyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(terra)
|
||||||
|
library(mregions2)
|
||||||
|
library(biooracler)
|
||||||
|
library(stringr)
|
||||||
|
library(tibble)
|
||||||
|
library(catboost)
|
||||||
|
library(caret)
|
||||||
|
library(blockCV)
|
||||||
|
library(sf)
|
||||||
|
library(usdm)
|
||||||
|
library(ggcorrplot)
|
||||||
|
library(reshape2)
|
||||||
|
library(tidygraph)
|
||||||
|
library(ggraph)
|
||||||
|
library(CAST)
|
||||||
|
library(pdp)
|
||||||
|
library(ggplot2)
|
||||||
|
library(DALEX)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Load shared helpers and define run configuration
|
||||||
|
```{r}
|
||||||
|
source("R/shared-utils.R")
|
||||||
|
|
||||||
|
config = list(
|
||||||
|
range_shapefile = "data/iucn/Pagophilus_groenlandicus.shp",
|
||||||
|
bbox_expand_degrees = 5,
|
||||||
|
baseline_scenario = "baseline",
|
||||||
|
baseline_decade = 2010,
|
||||||
|
n_corr_sample = 10000,
|
||||||
|
n_blocks_total = 15,
|
||||||
|
n_blocks_test = 5,
|
||||||
|
seed_blocks = 321,
|
||||||
|
vif_threshold = 10,
|
||||||
|
artifacts = list(
|
||||||
|
dynamic_layers = "dynamic_layers.rds",
|
||||||
|
subset_layer_names = "subset_baseline_layer_names.rds",
|
||||||
|
seal_range_df = "seal_range_df.rds",
|
||||||
|
seal_range_raster = "seal_range_raster.tif",
|
||||||
|
model = "cat_model.cbm",
|
||||||
|
manifest = "artifacts-manifest-learning.csv",
|
||||||
|
session_info = "session-info-learning.txt"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Define study area and spatial constraints
|
||||||
|
|
||||||
|
First we get a target species range from IUNC
|
||||||
|
```{r}
|
||||||
|
study_bounds = make_study_bounds(
|
||||||
|
range_shapefile = config$range_shapefile,
|
||||||
|
expand_degrees = config$bbox_expand_degrees
|
||||||
|
)
|
||||||
|
seal_range = study_bounds$seal_range
|
||||||
|
lon_range = study_bounds$lon_range
|
||||||
|
lat_range = study_bounds$lat_range
|
||||||
|
```
|
||||||
|
|
||||||
|
## List and filter Bio-ORACLE layers
|
||||||
|
|
||||||
|
Load Bio-ORACLE layers. Remove layers without forecast data: terrain characteristics are constant and some layers doesn't have the forcast data as a matter of fact.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
layers = list_layers()
|
||||||
|
|
||||||
|
# Нет прогнозных данных :/
|
||||||
|
removed_layers_ids = c(
|
||||||
|
"par_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
||||||
|
"chl_baseline_2000_2018_depthmax",
|
||||||
|
"chl_baseline_2000_2018_depthmean",
|
||||||
|
"chl_baseline_2000_2018_depthmin"
|
||||||
|
)
|
||||||
|
|
||||||
|
constant_layers_ids = c("terrain_characteristics")
|
||||||
|
|
||||||
|
constant_layers = layers |>
|
||||||
|
filter(dataset_id %in% constant_layers_ids)
|
||||||
|
|
||||||
|
dynamic_layers = layers |>
|
||||||
|
filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |>
|
||||||
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Download baseline and prepare predictor brick
|
||||||
|
|
||||||
|
We download the data for current time slice as it will be the learning data.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_rasters = download_biooracle_slice(
|
||||||
|
dynamic_layers = dynamic_layers,
|
||||||
|
scenario_value = config$baseline_scenario,
|
||||||
|
decade_start = config$baseline_decade,
|
||||||
|
lon_range = lon_range,
|
||||||
|
lat_range = lat_range
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
And construct a raster brick from all context layers.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
baseline_brick = rast(baseline_rasters)
|
||||||
|
baseline_brick = set_brick_names_with_depth(baseline_brick)
|
||||||
|
baseline_brick_depths = names(baseline_brick) |> str_extract("depth[:alpha:]+")
|
||||||
|
baseline_brick_longnames = baseline_brick |> longnames()
|
||||||
|
baseline_brick_varnames = baseline_brick |> varnames()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Select baseline variables
|
||||||
|
|
||||||
|
Next filter layers matters based on our knowledge about the species.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
suitable_baseline_layer_names = tibble(
|
||||||
|
name = names(baseline_brick),
|
||||||
|
longname = baseline_brick_longnames,
|
||||||
|
varname = baseline_brick_varnames,
|
||||||
|
depth = baseline_brick_depths
|
||||||
|
) |>
|
||||||
|
separate_wider_delim(
|
||||||
|
varname,
|
||||||
|
delim = "_",
|
||||||
|
names = c("var", "type")
|
||||||
|
) |>
|
||||||
|
filter(
|
||||||
|
!(
|
||||||
|
depth == "depthmax" |
|
||||||
|
var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") |
|
||||||
|
type %in% c("ltmin", "ltmax", "range")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
subset_baseline_layer_names = suitable_baseline_layer_names |>
|
||||||
|
filter(
|
||||||
|
name %in% c(
|
||||||
|
"Minimum SeaIceCover depthsurf",
|
||||||
|
"Minimum OceanTemperature depthsurf",
|
||||||
|
"Average SeaIceThickness depthsurf",
|
||||||
|
"Average Chlorophyll depthsurf",
|
||||||
|
"Maximum OceanTemperature depthmin"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Build feature table
|
||||||
|
```{r}
|
||||||
|
subset_baseline_brick = baseline_brick |>
|
||||||
|
subset(subset_baseline_layer_names$name)
|
||||||
|
|
||||||
|
features_brick = c(subset_baseline_brick)
|
||||||
|
|
||||||
|
baseline_df = features_brick |>
|
||||||
|
as.data.frame(cells = TRUE, xy = TRUE)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Correlation and VIF-based selection
|
||||||
|
```{r}
|
||||||
|
sample = baseline_df |>
|
||||||
|
sample_n(config$n_corr_sample) |>
|
||||||
|
select(-cell, -x, -y) |>
|
||||||
|
drop_na()
|
||||||
|
|
||||||
|
corr_matrix = cor(sample)
|
||||||
|
|
||||||
|
ggcorrplot(
|
||||||
|
corr_matrix,
|
||||||
|
hc.order = TRUE,
|
||||||
|
type = "lower",
|
||||||
|
outline.col = "white",
|
||||||
|
colors = c("#6D9EC1", "white", "#E46726"),
|
||||||
|
lab = FALSE
|
||||||
|
) +
|
||||||
|
theme(
|
||||||
|
axis.text.x = element_text(size = 7, angle = 90),
|
||||||
|
axis.text.y = element_text(size = 7)
|
||||||
|
)
|
||||||
|
|
||||||
|
high_cor_pairs = melt(corr_matrix) |>
|
||||||
|
filter(abs(value) > 0.8) |>
|
||||||
|
filter(Var1 != Var2) |>
|
||||||
|
distinct(value, .keep_all = TRUE) |>
|
||||||
|
arrange(desc(abs(value))) |>
|
||||||
|
mutate(Var1 = as.character(Var1), Var2 = as.character(Var2))
|
||||||
|
|
||||||
|
vif_results = vifstep(sample, th = config$vif_threshold)
|
||||||
|
keeper_vars = vif_results@results$Variables
|
||||||
|
|
||||||
|
baseline_df_subset = baseline_df |>
|
||||||
|
select(cell, x, y, all_of(keeper_vars))
|
||||||
|
```
|
||||||
|
|
||||||
|
## Build target and spatial blocks
|
||||||
|
```{r}
|
||||||
|
seal_range_raster = seal_range |>
|
||||||
|
rasterize(features_brick[[1]], field = "", background = 0)
|
||||||
|
|
||||||
|
all_blocks = seq_len(config$n_blocks_total)
|
||||||
|
set.seed(config$seed_blocks)
|
||||||
|
test_blocks = sample(all_blocks, config$n_blocks_test)
|
||||||
|
train_blocks = setdiff(all_blocks, test_blocks)
|
||||||
|
|
||||||
|
block_grid = seal_range_raster |>
|
||||||
|
ext() |>
|
||||||
|
st_bbox() |>
|
||||||
|
st_make_grid(n = c(5, 3)) |>
|
||||||
|
st_sf() |>
|
||||||
|
mutate(block_id = row_number()) |>
|
||||||
|
mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train"))
|
||||||
|
|
||||||
|
block_raster = block_grid |>
|
||||||
|
rasterize(seal_range_raster, field = "block_id")
|
||||||
|
|
||||||
|
seal_range_raster$block_id = block_raster$block_id
|
||||||
|
seal_range_df = seal_range_raster |>
|
||||||
|
as.data.frame(cells = TRUE) |>
|
||||||
|
rename(target = layer)
|
||||||
|
|
||||||
|
seal_baseline = dplyr::left_join(baseline_df_subset, seal_range_df, by = "cell")
|
||||||
|
train_df = seal_baseline %>% filter(block_id %in% train_blocks)
|
||||||
|
test_df = seal_baseline %>% filter(block_id %in% test_blocks)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Train CatBoost model
|
||||||
|
```{r}
|
||||||
|
train_pool = catboost.load_pool(
|
||||||
|
data = train_df |> select(-cell, -x, -y, -block_id, -target),
|
||||||
|
label = train_df$target
|
||||||
|
)
|
||||||
|
|
||||||
|
test_pool = catboost.load_pool(
|
||||||
|
data = test_df |> select(-cell, -x, -y, -block_id, -target),
|
||||||
|
label = test_df$target
|
||||||
|
)
|
||||||
|
|
||||||
|
params = list(
|
||||||
|
loss_function = "Logloss",
|
||||||
|
eval_metric = "AUC",
|
||||||
|
iterations = 200,
|
||||||
|
depth = 2,
|
||||||
|
learning_rate = 0.02,
|
||||||
|
l2_leaf_reg = 15,
|
||||||
|
random_seed = 42,
|
||||||
|
rsm = 0.5,
|
||||||
|
verbose = 10,
|
||||||
|
od_type = "Iter",
|
||||||
|
od_wait = 20
|
||||||
|
)
|
||||||
|
|
||||||
|
cat_model = catboost.train(train_pool, test_pool = test_pool, params = params)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Model interpretation outputs
|
||||||
|
```{r}
|
||||||
|
explainer_cat = explain(
|
||||||
|
model = cat_model,
|
||||||
|
data = train_df |> select(-cell, -x, -y, -block_id, -target),
|
||||||
|
y = train_df$target,
|
||||||
|
label = "CatBoost Harp Seal Model",
|
||||||
|
predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability")
|
||||||
|
)
|
||||||
|
|
||||||
|
pdp_temp = model_profile(
|
||||||
|
explainer = explainer_cat,
|
||||||
|
variables = "Average Chlorophyll depthsurf"
|
||||||
|
)
|
||||||
|
plot(pdp_temp)
|
||||||
|
|
||||||
|
importanc2e = catboost.get_feature_importance(cat_model, train_pool) |>
|
||||||
|
enframe()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Shared artifacts for prediction stage
|
||||||
|
These files are the explicit interface between learning and prediction.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
saveRDS(dynamic_layers, config$artifacts$dynamic_layers)
|
||||||
|
saveRDS(subset_baseline_layer_names, config$artifacts$subset_layer_names)
|
||||||
|
saveRDS(seal_range_df, config$artifacts$seal_range_df)
|
||||||
|
writeRaster(seal_range_raster, config$artifacts$seal_range_raster, overwrite = TRUE)
|
||||||
|
catboost.save_model(cat_model, config$artifacts$model)
|
||||||
|
|
||||||
|
artifact_manifest = tibble::tibble(
|
||||||
|
artifact = names(config$artifacts)[1:5],
|
||||||
|
path = unlist(config$artifacts[1:5])
|
||||||
|
)
|
||||||
|
utils::write.csv(artifact_manifest, config$artifacts$manifest, row.names = FALSE)
|
||||||
|
utils::capture.output(utils::sessionInfo(), file = config$artifacts$session_info)
|
||||||
|
```
|
||||||
@ -0,0 +1,126 @@
|
|||||||
|
## Prediction Pipeline (Projection)
|
||||||
|
This file contains the prediction stage and consumes artifacts produced by `bio-oracle-learning.qmd`.
|
||||||
|
|
||||||
|
## Load required R packages
|
||||||
|
```{r}
|
||||||
|
library(tidyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(terra)
|
||||||
|
library(mregions2)
|
||||||
|
library(biooracler)
|
||||||
|
library(stringr)
|
||||||
|
library(tibble)
|
||||||
|
library(catboost)
|
||||||
|
library(sf)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Load shared helpers and define run configuration
|
||||||
|
```{r}
|
||||||
|
source("R/shared-utils.R")
|
||||||
|
|
||||||
|
config <- list(
|
||||||
|
range_shapefile = "data/iucn/Pagophilus_groenlandicus.shp",
|
||||||
|
bbox_expand_degrees = 5,
|
||||||
|
artifacts = list(
|
||||||
|
dynamic_layers = "dynamic_layers.rds",
|
||||||
|
subset_layer_names = "subset_baseline_layer_names.rds",
|
||||||
|
seal_range_df = "seal_range_df.rds",
|
||||||
|
seal_range_raster = "seal_range_raster.tif",
|
||||||
|
model = "cat_model.cbm",
|
||||||
|
manifest = "artifacts-manifest-learning.csv"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Recreate spatial constraints
|
||||||
|
These bounds are needed for downloading future Bio-ORACLE slices.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
study_bounds <- make_study_bounds(
|
||||||
|
range_shapefile = config$range_shapefile,
|
||||||
|
expand_degrees = config$bbox_expand_degrees
|
||||||
|
)
|
||||||
|
seal_range <- study_bounds$seal_range
|
||||||
|
lon_range <- study_bounds$lon_range
|
||||||
|
lat_range <- study_bounds$lat_range
|
||||||
|
```
|
||||||
|
|
||||||
|
## Load shared artifacts from learning stage
|
||||||
|
```{r}
|
||||||
|
required_artifacts <- unlist(config$artifacts[c("dynamic_layers", "subset_layer_names", "seal_range_df", "seal_range_raster", "model")])
|
||||||
|
assert_required_files(required_artifacts)
|
||||||
|
|
||||||
|
cat_model <- catboost.load_model(config$artifacts$model)
|
||||||
|
subset_baseline_layer_names = readRDS(config$artifacts$subset_layer_names)
|
||||||
|
seal_range_df = readRDS(config$artifacts$seal_range_df)
|
||||||
|
seal_range_raster = rast(config$artifacts$seal_range_raster)
|
||||||
|
dynamic_layers = readRDS(config$artifacts$dynamic_layers)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Shared artifacts manifest (optional inspection)
|
||||||
|
```{r}
|
||||||
|
if (file.exists(config$artifacts$manifest)) {
|
||||||
|
artifacts_manifest <- utils::read.csv(config$artifacts$manifest)
|
||||||
|
artifacts_manifest
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Prediction function
|
||||||
|
```{r}
|
||||||
|
get_prediction = function(ssp_code, decade) {
|
||||||
|
ssp_slice = download_biooracle_slice_subset(
|
||||||
|
dynamic_layers = dynamic_layers,
|
||||||
|
scenario_value = ssp_code,
|
||||||
|
decade_start = decade,
|
||||||
|
layers_to_download = subset_baseline_layer_names,
|
||||||
|
lon_range = lon_range,
|
||||||
|
lat_range = lat_range
|
||||||
|
)
|
||||||
|
|
||||||
|
ssp_slice_brick = rast(ssp_slice)
|
||||||
|
ssp_slice_brick = set_brick_names_with_depth(ssp_slice_brick)
|
||||||
|
|
||||||
|
ssp_slice_df = ssp_slice_brick |>
|
||||||
|
as.data.frame(cells = TRUE, xy = TRUE)
|
||||||
|
|
||||||
|
ssp_slice_features = ssp_slice_df |> select(-cell, -x, -y)
|
||||||
|
ssp_slice_pool <- catboost.load_pool(data = ssp_slice_features)
|
||||||
|
|
||||||
|
preds_prob <- catboost.predict(cat_model, ssp_slice_pool, prediction_type = "Probability")
|
||||||
|
preds_class <- ifelse(preds_prob > 0.5, 1, 0)
|
||||||
|
|
||||||
|
ssp_slice_prediction = ssp_slice_df |>
|
||||||
|
mutate(prediction = preds_class) |>
|
||||||
|
select(cell, prediction)
|
||||||
|
|
||||||
|
ssp_slice_diff = seal_range_df |>
|
||||||
|
left_join(ssp_slice_prediction, by = "cell") |>
|
||||||
|
mutate(diff = 2 * target + prediction)
|
||||||
|
|
||||||
|
r = rast(ssp_slice_brick)
|
||||||
|
r[ssp_slice_diff$cell] = ssp_slice_diff$diff
|
||||||
|
|
||||||
|
writeRaster(r[[1]], paste0(ssp_code, "-", decade, ".tif"), overwrite = TRUE)
|
||||||
|
|
||||||
|
png(filename = paste0(ssp_code, "-", decade, ".png"), width = 800, height = 800)
|
||||||
|
plot(
|
||||||
|
r[[1]],
|
||||||
|
type = "classes",
|
||||||
|
col = c("grey", "green", "red", "purple"),
|
||||||
|
levels = c("00", "01", "10", "11"),
|
||||||
|
main = paste0(ssp_code, "-", decade)
|
||||||
|
)
|
||||||
|
dev.off()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example runs
|
||||||
|
```{r}
|
||||||
|
get_prediction("ssp585", 2020)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
sapply(seq(2050, 2050, by = 10), function(decade) {
|
||||||
|
get_prediction("ssp585", decade)
|
||||||
|
})
|
||||||
|
```
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
Version: 1.0
|
||||||
|
|
||||||
|
RestoreWorkspace: Default
|
||||||
|
SaveWorkspace: Default
|
||||||
|
AlwaysSaveHistory: Default
|
||||||
|
|
||||||
|
EnableCodeIndexing: Yes
|
||||||
|
UseSpacesForTab: Yes
|
||||||
|
NumSpacesForTab: 2
|
||||||
|
Encoding: UTF-8
|
||||||
|
|
||||||
|
RnwWeave: Sweave
|
||||||
|
LaTeX: pdfLaTeX
|
||||||
File diff suppressed because one or more lines are too long
@ -0,0 +1,6 @@
|
|||||||
|
Брать чисто Баренцево море -- получаем нерепрезентативную выборку по параметрам.
|
||||||
|
|
||||||
|
Брать весь ареал -- слишком много ресурсов.
|
||||||
|
|
||||||
|
|
||||||
|
Getting every endangered species of Barentsz sea can be challenging as it requires much more calculation especially for big ranges
|
||||||
@ -0,0 +1,24 @@
|
|||||||
|
lons = seq(-180, 180, by = 30)
|
||||||
|
lats = seq(-90, 90, by = 30)
|
||||||
|
|
||||||
|
grat = st_graticule(lon = lons, lat = lats)
|
||||||
|
|
||||||
|
box = st_bbox(c(xmin = -180, xmax = 180,
|
||||||
|
ymax = 90, ymin = -90),
|
||||||
|
crs = st_crs(4326)) |>
|
||||||
|
st_as_sfc() |>
|
||||||
|
smoothr::densify(max_distance = 1)
|
||||||
|
|
||||||
|
degree_labels = function(grat, vjust, hjust, size, lon = T, lat = T) {
|
||||||
|
pts = grat |>
|
||||||
|
st_cast('POINT') |>
|
||||||
|
group_by(degree, type, degree_label) |>
|
||||||
|
filter(row_number() == 1)
|
||||||
|
|
||||||
|
list(
|
||||||
|
if (lon) geom_sf_text(data = filter(pts, type == 'E'), vjust = vjust, size = size,
|
||||||
|
mapping = aes(label = degree_label), parse = TRUE),
|
||||||
|
if (lat) geom_sf_text(data = filter(pts, type == 'N'), hjust = hjust, size = size,
|
||||||
|
mapping = aes(label = degree_label), parse = TRUE)
|
||||||
|
)
|
||||||
|
}
|
||||||
@ -0,0 +1,45 @@
|
|||||||
|
R version 4.4.2 (2024-10-31 ucrt)
|
||||||
|
Platform: x86_64-w64-mingw32/x64
|
||||||
|
Running under: Windows 10 x64 (build 19044)
|
||||||
|
|
||||||
|
Matrix products: default
|
||||||
|
|
||||||
|
|
||||||
|
locale:
|
||||||
|
[1] LC_COLLATE=Russian_Russia.utf8 LC_CTYPE=Russian_Russia.utf8 LC_MONETARY=Russian_Russia.utf8
|
||||||
|
[4] LC_NUMERIC=C LC_TIME=Russian_Russia.utf8
|
||||||
|
|
||||||
|
time zone: Etc/GMT-3
|
||||||
|
tzcode source: internal
|
||||||
|
|
||||||
|
attached base packages:
|
||||||
|
[1] stats graphics grDevices utils datasets methods base
|
||||||
|
|
||||||
|
other attached packages:
|
||||||
|
[1] DALEX_2.5.3 pdp_0.8.3 CAST_1.0.4 ggraph_2.2.2
|
||||||
|
[5] tidygraph_1.3.1 reshape2_1.4.5 ggcorrplot_0.1.4.1 usdm_2.1-7
|
||||||
|
[9] sf_1.0-19 blockCV_3.2-0 caret_7.0-1 lattice_0.22-6
|
||||||
|
[13] ggplot2_4.0.1 catboost_1.2.8 tibble_3.2.1 stringr_1.5.1
|
||||||
|
[17] biooracler_0.0.0.9000 mregions2_1.1.2 terra_1.8-5 dplyr_1.1.4
|
||||||
|
[21] tidyr_1.3.1
|
||||||
|
|
||||||
|
loaded via a namespace (and not attached):
|
||||||
|
[1] DBI_1.2.3 pROC_1.19.0.1 gridExtra_2.3 rlang_1.1.4 magrittr_2.0.3
|
||||||
|
[6] e1071_1.7-16 compiler_4.4.2 systemfonts_1.1.0 vctrs_0.6.5 httpcode_0.3.0
|
||||||
|
[11] crayon_1.5.3 pkgconfig_2.0.3 fastmap_1.2.0 backports_1.5.0 labeling_0.4.3
|
||||||
|
[16] prodlim_2025.04.28 ragg_1.5.0 purrr_1.0.2 cachem_1.1.0 jsonlite_1.8.9
|
||||||
|
[21] recipes_1.3.1 tweenr_2.0.3 parallel_4.4.2 R6_2.5.1 stringi_1.8.4
|
||||||
|
[26] RColorBrewer_1.1-3 hoardr_0.5.5 parallelly_1.46.0 rpart_4.1.23 lubridate_1.9.4
|
||||||
|
[31] Rcpp_1.0.13-1 iterators_1.0.14 future.apply_1.20.1 triebeard_0.4.1 Matrix_1.7-1
|
||||||
|
[36] splines_4.4.2 nnet_7.3-19 igraph_2.2.1 timechange_0.3.0 tidyselect_1.2.1
|
||||||
|
[41] viridis_0.6.5 timeDate_4051.111 codetools_0.2-20 curl_7.0.0 listenv_0.10.0
|
||||||
|
[46] plyr_1.8.9 withr_3.0.2 S7_0.2.1 future_1.68.0 survival_3.7-0
|
||||||
|
[51] units_0.8-5 proxy_0.4-27 polyclip_1.10-7 xml2_1.3.6 pillar_1.10.0
|
||||||
|
[56] KernSmooth_2.23-24 checkmate_2.3.3 foreach_1.5.2 stats4_4.4.2 ncdf4_1.24
|
||||||
|
[61] generics_0.1.3 sp_2.2-0 scales_1.4.0 globals_0.18.0 ingredients_2.3.0
|
||||||
|
[66] class_7.3-22 glue_1.8.0 tools_4.4.2 data.table_1.16.4 ModelMetrics_1.2.2.2
|
||||||
|
[71] gower_1.0.2 forcats_1.0.0 graphlayouts_1.2.2 grid_4.4.2 urltools_1.7.3.1
|
||||||
|
[76] ipred_0.9-15 nlme_3.1-166 raster_3.6-32 ggforce_0.5.0 rerddap_1.2.1
|
||||||
|
[81] cli_3.6.5 rappdirs_0.3.3 textshaping_0.4.1 viridisLite_0.4.2 lava_1.8.2
|
||||||
|
[86] gtable_0.3.6 digest_0.6.37 classInt_0.4-10 ggrepel_0.9.6 crul_1.6.0
|
||||||
|
[91] farver_2.1.2 memoise_2.0.1 lifecycle_1.0.4 hardhat_1.4.2 MASS_7.3-61
|
||||||
Loading…
Reference in new issue