## Load required R packages These libraries provide spatial handling, machine learning, and model explainability tools used throughout the workflow. ```{r} library(tidyr) library(dplyr) library(terra) library(mregions2) library(biooracler) library(stringr) library(tibble) library(catboost) library(caret) library(blockCV) library(sf) library(usdm) library(ggcorrplot) library(reshape2) library(tidygraph) library(ggraph) library(CAST) library(pdp) library(ggplot2) library(DALEX) ``` ## Define study area and spatial constraints Here we load the harp seal range shapefile and derive the longitude/latitude bounds used to constrain Bio-ORACLE downloads. ```{r} seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp") bbox = ext(seal_range) |> extend(5) lon_range = c(bbox$xmin, bbox$xmax) lat_range = c(bbox$ymin, bbox$ymax) constraints = list( longitude = lon_range, latitude = lat_range ) ``` ## List and filter Bio-ORACLE layers We list available Bio-ORACLE layers, manually remove unsupported ones, and separate constant (terrain) layers from dynamic variables. ```{r} layers = list_layers() # Нет прогнозных данных :/ removed_layers_ids = c( "par_mean_baseline_2000_2020_depthsurf", "kdpar_mean_baseline_2000_2020_depthsurf", "chl_baseline_2000_2018_depthmax", "chl_baseline_2000_2018_depthmean", "chl_baseline_2000_2018_depthmin" ) constant_layers_ids = c("terrain_characteristics") constant_layers = layers |> filter(dataset_id %in% constant_layers_ids) ``` ## Prepare dynamic layers metadata We keep only dynamic environmental variables and parse dataset IDs into variable, scenario, time, and depth components. ```{r} dynamic_layers = layers |> filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |> separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE) saveRDS(dynamic_layers, 'dynamic_layers.rds') ``` ## Helper to download a single temporal slice This function downloads all dynamic layers for a given scenario and decade within the spatial and temporal constraints. ```{r} download_slice = function(scenario_value, decade_start) { scenario_layers = dynamic_layers |> filter(scenario == scenario_value) time_point = paste0(decade_start, "-01-01T00:00:00Z") slice_constraints = list( time = c(time_point, time_point), longitude = lon_range, latitude = lat_range ) download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start) dir.create(download_dir, recursive = TRUE, showWarnings = FALSE) slice_rasters = sapply( scenario_layers$dataset_id, function(id) download_layers( id, constraints = slice_constraints, directory = download_dir ), simplify = TRUE ) return(slice_rasters) } ``` ## Download baseline environmental slice We obtain baseline (historical) rasters for the 2010 decade over the study area. ```{r} baseline_rasters = download_slice("baseline", 2010) ``` ## Build baseline raster brick and extract metadata We combine downloaded rasters into a brick and extract depth, long names, and variable names for later filtering. ```{r} baseline_brick = rast(baseline_rasters) baseline_brick_depths = baseline_brick |> names() |> str_extract("depth[:alpha:]+") baseline_brick_longnames = baseline_brick |> longnames() baseline_brick_varnames = baseline_brick |> varnames() names(baseline_brick) = paste(baseline_brick_longnames, baseline_brick_depths) ``` ## Select ecologically relevant baseline variables We filter out less relevant or redundant variables and keep a focused subset of candidate predictors. ```{r} suitable_baseline_layer_names = tibble( name = names(baseline_brick), longname = baseline_brick_longnames, varname = baseline_brick_varnames, depth = baseline_brick_depths, ) |> separate_wider_delim( varname, delim = "_", names = c("var", "type") ) |> filter( !( depth == "depthmax" | var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") | type %in% c("ltmin", "ltmax", "range") ) ) subset_baseline_layer_names = suitable_baseline_layer_names |> filter( name %in% c( "Minimum SeaIceCover depthsurf", "Minimum OceanTemperature depthsurf", "Average SeaIceThickness depthsurf", "Average Chlorophyll depthsurf", "Maximum OceanTemperature depthmin" ) ) ``` ## Inspect chosen baseline variables We preview the table of selected variables to confirm that only the intended layers remain. ```{r} subset_baseline_layer_names saveRDS(subset_baseline_layer_names, file = "subset_baseline_layer_names.rds") ``` ## Build subset raster brick We subset the baseline raster brick to include only the selected variables. ```{r} subset_baseline_brick = baseline_brick |> subset(subset_baseline_layer_names$name) ``` ## Combine features into a single brick The final feature brick contains the chosen environmental predictors (terrain can be added later if needed). ```{r} # features_brick = c(baseline_brick, terrain_raster) features_brick = c(subset_baseline_brick) ``` ## (Optional) Crop feature brick to a subregion This chunk shows how to restrict the analysis to a smaller bounding box if desired (currently commented out). ```{r} # cropped_bbox = ext( # -20, # 72, # 60, # 85 # ) # cropped_features_brick = features_brick |> # crop(cropped_bbox) ``` ## Convert raster brick to data frame We convert the environmental rasters to a tidy data frame with cell indices and coordinates. ```{r} baseline_df = features_brick |> as.data.frame(cells = TRUE, xy = TRUE) ``` ```{r} # vif_input_df <- baseline_df |> # select(-cell) |> # drop_na() ``` ```{r} # vif_sample = vif_input_df |> # sample_n(10000) # vif_sample = vif_sample[, sapply(vif_sample, function(x) var(x) > 0)] ``` ## Explore correlations among predictors We randomly sample cells, compute a correlation matrix, and visualize pairwise correlations. ```{r} sample = baseline_df |> sample_n(10000) |> select(-cell, -x, -y) |> drop_na() corr_matrix <- cor(sample) ggcorrplot(corr_matrix, hc.order = TRUE, # Clusters similar variables together type = "lower", # Only show half (it's symmetrical anyway) outline.col = "white", colors = c("#6D9EC1", "white", "#E46726"), lab = FALSE) + # Set to TRUE only if you have <20 variables theme(axis.text.x = element_text(size = 7, angle = 90), axis.text.y = element_text(size = 7)) ``` ## Identify highly correlated variable pairs We list variable pairs with strong correlations to better understand redundancy among predictors. ```{r} high_cor_pairs <- melt(corr_matrix) |> filter(abs(value) > 0.8) |> filter(Var1 != Var2) |> # Remove self-correlations (1.0 on diagonal) distinct(value, .keep_all = TRUE) |> # Remove duplicates (A-B and B-A) arrange(desc(abs(value))) |> mutate(Var1 = as.character(Var1), Var2 = as.character(Var2)) ``` ## Perform VIF-based variable selection Variance Inflation Factor (VIF) is used to remove collinear predictors and retain a stable subset. ```{r} vif_results <- vifstep(sample, th = 10) ``` ## Extract retained predictor names We pull out the names of variables that passed the VIF threshold. ```{r} keeper_vars <- vif_results@results$Variables ``` ## Subset baseline data frame to VIF-selected variables We keep only the selected predictors along with cell indices and coordinates. ```{r} baseline_df_subset = baseline_df |> select(cell, x, y, all_of(keeper_vars)) ``` ## Rasterize harp seal range We convert the harp seal polygon range into a raster aligned with the environmental brick (presence/absence mask). ```{r} seal_range_raster = seal_range |> rasterize(features_brick[[1]], field="", background=0) ``` ```{r} writeRaster(seal_range_raster, "seal_range_raster.tif") ``` ## Define spatial blocks for cross-validation We create block IDs over the study area and split them into train and test sets. ```{r} all_blocks <- 1:15 set.seed(321) # For reproducibility test_blocks <- sample(all_blocks, 5) # Randomly pick 5 blocks for testing train_blocks <- setdiff(all_blocks, test_blocks) ``` ```{r} block_grid = seal_range_raster |> ext() |> st_bbox() |> st_make_grid(n = c(5, 3)) |> st_sf() |> mutate(block_id = row_number()) |> mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train")) ``` ```{r} block_raster = block_grid |> rasterize(seal_range_raster, field = "block_id") ``` ```{r} seal_range_raster$block_id = block_raster$block_id ``` ```{r} plot(seal_range_raster$layer) plot(vect(block_grid), add = TRUE, border = "black", lwd = 1) plot( vect(block_grid |> filter(type == "Test (Hold-out)")), add = TRUE, border = "red", lwd = 3) ``` ```{r} seal_range_df = seal_range_raster |> as.data.frame(cells = TRUE) |> rename(target = layer) saveRDS(seal_range_df, file = "seal_range_df.rds") ``` ```{r} seal_baseline = dplyr::left_join(baseline_df_subset, seal_range_df, by = "cell") ``` ```{r} # 3. Create the dataframes based on the blocks train_df <- seal_baseline %>% filter(block_id %in% train_blocks) test_df <- seal_baseline %>% filter(block_id %in% test_blocks) ``` ```{r} train_pool <- catboost.load_pool( data = train_df |> select(-cell, -x, -y, -block_id, -target), label = train_df$target ) test_pool = catboost.load_pool( data = test_df |> select(-cell, -x, -y, -block_id, -target), label = test_df$target ) ``` ```{r} params <- list( loss_function = 'Logloss', eval_metric = 'AUC', iterations = 200, # Plenty of trees for a smooth fit depth = 2, # Standard depth to prevent overfitting learning_rate = 0.02, # Lower learning rate is better for high ROC data l2_leaf_reg = 15, # Stronger regularization to handle that 0.998 ROC random_seed = 42, rsm = 0.5, verbose = 10, od_type = "Iter", od_wait = 20 ) ``` ```{r} cat_model <- catboost.train(train_pool, test_pool = test_pool, params = params) ``` ```{r} explainer_cat <- explain( model = cat_model, data = train_df |> select(-cell, -x, -y, -block_id, -target), y = train_df$target, label = "CatBoost Harp Seal Model", predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability") ) ``` ```{r} pdp_temp <- model_profile( explainer = explainer_cat, variables = "Average Chlorophyll depthsurf" ) # 3. Plot it plot(pdp_temp) ``` ```{r} importanc2e <- catboost.get_feature_importance(cat_model, train_pool) |> enframe() ``` ```{r} catboost.save_model(cat_model, "cat_model.cbm") ``` ## Make a prediction ```{r} download_slice_subset = function(scenario_value, decade_start, layers_to_download) { scenario_layers = dynamic_layers |> filter( scenario == scenario_value & var %in% layers_to_download$var & depth %in% layers_to_download$depth ) time_point = paste0(decade_start, "-01-01T00:00:00Z") slice_constraints = list( time = c(time_point, time_point), longitude = lon_range, latitude = lat_range ) download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start) dir.create(download_dir, recursive = TRUE, showWarnings = FALSE) slice_rasters = sapply( scenario_layers$dataset_id, function(id) download_layers( id, constraints = slice_constraints, directory = download_dir ), simplify = TRUE ) return(slice_rasters) } ``` ```{r} ssp585 = download_slice_subset("ssp585", 2090, subset_baseline_layer_names) ``` ```{r} cat_model <- catboost.load_model('cat_model.cbm') subset_baseline_layer_names = readRDS('subset_baseline_layer_names.rds') seal_range_df = readRDS('seal_range_df.rds') seal_range_raster = rast('seal_range_raster.tif') dynamic_layers = readRDS('dynamic_layers.rds') ``` ```{r} get_prediction = function(ssp_code, decade) { ssp_slice = download_slice_subset(ssp_code, decade, subset_baseline_layer_names) ssp_slice_brick = rast(ssp_slice) ssp_slice_brick_depths = ssp_slice_brick |> names() |> str_extract("depth[:alpha:]+") ssp_slice_brick_longnames = ssp_slice_brick |> longnames() # baseline_brick_varnames = baseline_brick |> varnames() // коды longnames names(ssp_slice_brick) = paste(ssp_slice_brick_longnames, ssp_slice_brick_depths) ssp_slice_df = ssp_slice_brick |> as.data.frame(cells = TRUE, xy = TRUE) ssp_slice_features = ssp_slice_df |> select(-cell, -x, -y) ssp_slice_pool <- catboost.load_pool(data = ssp_slice_features) preds_prob <- catboost.predict(cat_model, ssp_slice_pool, prediction_type = 'Probability') preds_class <- ifelse(preds_prob > 0.5, 1, 0) ssp_slice_prediction = ssp_slice_df |> mutate(prediction = preds_class) |> select(cell, prediction) ssp_slice_diff = seal_range_df |> left_join(ssp_slice_prediction, by = "cell") |> mutate(diff = 2*target + prediction) r = rast(ssp_slice_brick) r[ssp_slice_diff$cell] = ssp_slice_diff$diff writeRaster(r[[1]], paste0(ssp_code, "-", decade, ".tif")) png(filename = paste0(ssp_code, "-", decade, ".png"), width = 800, height = 800) plot( r[[1]], type="classes", col=c("grey", "green", "red", "purple"), # col=c("grey", "#7fc97f", "#fdc086", "#beaed4"), # levels=c("0 → 0", "0 → 1", "1 → 0", "1 → 1"), levels=c("00", "01", "10", "11"), main=paste0(ssp_code, "-", decade) ) dev.off() } ``` ```{r} get_prediction("ssp585", 2050) ``` ```{r} sapply(seq(2050, 2050, by=10), function(decade) { get_prediction("ssp585", decade) }) ``` ```{r} ssp585_2090_brick = rast(ssp585) ssp585_2090_brick_depths = ssp585_2090_brick |> names() |> str_extract("depth[:alpha:]+") ssp585_2090_brick_longnames = ssp585_2090_brick |> longnames() # baseline_brick_varnames = baseline_brick |> varnames() // коды longnames names(ssp585_2090_brick) = paste(ssp585_2090_brick_longnames, ssp585_2090_brick_depths) ``` ```{r} ssp585_2090_df = ssp585_2090_brick |> as.data.frame(cells = TRUE, xy = TRUE) ``` ```{r} ssp585_2090_features = ssp585_2090_df |> select(-cell, -x, -y) ``` ```{r} ssp585_2090_pool <- catboost.load_pool(data = ssp585_2090_features) ``` ```{r} preds_prob <- catboost.predict(cat_model, ssp585_2090_pool, prediction_type = 'Probability') preds_class <- ifelse(preds_prob > 0.5, 1, 0) ``` ```{r} ssp585_2090_prediction = ssp585_2090_df |> mutate(prediction = preds_class) |> select(cell, prediction) ``` ```{r} ssp585_2090_diff = seal_range_df |> left_join(ssp585_2090_prediction, by = "cell") |> mutate(diff = 2*target + prediction) ``` ```{r} hist(ssp585_2090_diff$diff) ``` ```{r} r = rast(baseline_brick) r[ssp585_2090_diff$cell] = ssp585_2090_diff$diff ``` ```{r} writeRaster(r[[1]], "ssp585-2090.tif") ``` ```{r} plot( r[[1]], type="classes", col=c("grey", "green", "red", "purple"), # col=c("grey", "#7fc97f", "#fdc086", "#beaed4"), levels=c("0 → 0", "0 → 1", "1 → 0", "1 → 1"), main="SSP 585 - 2090" ) ``` ```{r} a = rast('ssp585-2090.tif') ``` ```{r} plot(a) ```