## Learning Pipeline (Training) This file contains the learning stage of the prototype: data download, feature filtering, spatial split, and CatBoost model training. ## Load required R packages ```{r} library(tidyr) library(dplyr) library(terra) library(mregions2) library(biooracler) library(stringr) library(tibble) library(catboost) library(caret) library(blockCV) library(sf) library(usdm) library(ggcorrplot) library(reshape2) library(tidygraph) library(ggraph) library(CAST) library(pdp) library(ggplot2) library(DALEX) ``` ## Load shared helpers and define run configuration ```{r} source("R/shared-utils.R") config = list( range_shapefile = "data/iucn/Pagophilus_groenlandicus.shp", bbox_expand_degrees = 5, baseline_scenario = "baseline", baseline_decade = 2010, n_corr_sample = 10000, n_blocks_total = 15, n_blocks_test = 5, seed_blocks = 321, vif_threshold = 10, artifacts = list( dynamic_layers = "dynamic_layers.rds", subset_layer_names = "subset_baseline_layer_names.rds", seal_range_df = "seal_range_df.rds", seal_range_raster = "seal_range_raster.tif", model = "cat_model.cbm", manifest = "artifacts-manifest-learning.csv", session_info = "session-info-learning.txt" ) ) ``` ## Define study area and spatial constraints First we get a target species range from IUNC ```{r} study_bounds = make_study_bounds( range_shapefile = config$range_shapefile, expand_degrees = config$bbox_expand_degrees ) seal_range = study_bounds$seal_range lon_range = study_bounds$lon_range lat_range = study_bounds$lat_range ``` ## List and filter Bio-ORACLE layers Load Bio-ORACLE layers. Remove layers without forecast data: terrain characteristics are constant and some layers doesn't have the forcast data as a matter of fact. ```{r} layers = list_layers() # Нет прогнозных данных :/ removed_layers_ids = c( "par_mean_baseline_2000_2020_depthsurf", "kdpar_mean_baseline_2000_2020_depthsurf", "chl_baseline_2000_2018_depthmax", "chl_baseline_2000_2018_depthmean", "chl_baseline_2000_2018_depthmin" ) constant_layers_ids = c("terrain_characteristics") constant_layers = layers |> filter(dataset_id %in% constant_layers_ids) dynamic_layers = layers |> filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |> separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE) ``` ## Download baseline and prepare predictor brick We download the data for current time slice as it will be the learning data. ```{r} baseline_rasters = download_biooracle_slice( dynamic_layers = dynamic_layers, scenario_value = config$baseline_scenario, decade_start = config$baseline_decade, lon_range = lon_range, lat_range = lat_range ) ``` And construct a raster brick from all context layers. ```{r} baseline_brick = rast(baseline_rasters) baseline_brick = set_brick_names_with_depth(baseline_brick) baseline_brick_depths = names(baseline_brick) |> str_extract("depth[:alpha:]+") baseline_brick_longnames = baseline_brick |> longnames() baseline_brick_varnames = baseline_brick |> varnames() ``` ## Select baseline variables Next filter layers matters based on our knowledge about the species. ```{r} suitable_baseline_layer_names = tibble( name = names(baseline_brick), longname = baseline_brick_longnames, varname = baseline_brick_varnames, depth = baseline_brick_depths ) |> separate_wider_delim( varname, delim = "_", names = c("var", "type") ) |> filter( !( depth == "depthmax" | var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") | type %in% c("ltmin", "ltmax", "range") ) ) subset_baseline_layer_names = suitable_baseline_layer_names |> filter( name %in% c( "Minimum SeaIceCover depthsurf", "Minimum OceanTemperature depthsurf", "Average SeaIceThickness depthsurf", "Average Chlorophyll depthsurf", "Maximum OceanTemperature depthmin" ) ) ``` ## Build feature table ```{r} subset_baseline_brick = baseline_brick |> subset(subset_baseline_layer_names$name) features_brick = c(subset_baseline_brick) baseline_df = features_brick |> as.data.frame(cells = TRUE, xy = TRUE) ``` ## Correlation and VIF-based selection ```{r} sample = baseline_df |> sample_n(config$n_corr_sample) |> select(-cell, -x, -y) |> drop_na() corr_matrix = cor(sample) ggcorrplot( corr_matrix, hc.order = TRUE, type = "lower", outline.col = "white", colors = c("#6D9EC1", "white", "#E46726"), lab = FALSE ) + theme( axis.text.x = element_text(size = 7, angle = 90), axis.text.y = element_text(size = 7) ) high_cor_pairs = melt(corr_matrix) |> filter(abs(value) > 0.8) |> filter(Var1 != Var2) |> distinct(value, .keep_all = TRUE) |> arrange(desc(abs(value))) |> mutate(Var1 = as.character(Var1), Var2 = as.character(Var2)) vif_results = vifstep(sample, th = config$vif_threshold) keeper_vars = vif_results@results$Variables baseline_df_subset = baseline_df |> select(cell, x, y, all_of(keeper_vars)) ``` ## Build target and spatial blocks ```{r} seal_range_raster = seal_range |> rasterize(features_brick[[1]], field = "", background = 0) all_blocks = seq_len(config$n_blocks_total) set.seed(config$seed_blocks) test_blocks = sample(all_blocks, config$n_blocks_test) train_blocks = setdiff(all_blocks, test_blocks) block_grid = seal_range_raster |> ext() |> st_bbox() |> st_make_grid(n = c(5, 3)) |> st_sf() |> mutate(block_id = row_number()) |> mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train")) block_raster = block_grid |> rasterize(seal_range_raster, field = "block_id") seal_range_raster$block_id = block_raster$block_id seal_range_df = seal_range_raster |> as.data.frame(cells = TRUE) |> rename(target = layer) seal_baseline = dplyr::left_join(baseline_df_subset, seal_range_df, by = "cell") train_df = seal_baseline %>% filter(block_id %in% train_blocks) test_df = seal_baseline %>% filter(block_id %in% test_blocks) ``` ## Train CatBoost model ```{r} train_pool = catboost.load_pool( data = train_df |> select(-cell, -x, -y, -block_id, -target), label = train_df$target ) test_pool = catboost.load_pool( data = test_df |> select(-cell, -x, -y, -block_id, -target), label = test_df$target ) params = list( loss_function = "Logloss", eval_metric = "AUC", iterations = 200, depth = 2, learning_rate = 0.02, l2_leaf_reg = 15, random_seed = 42, rsm = 0.5, verbose = 10, od_type = "Iter", od_wait = 20 ) cat_model = catboost.train(train_pool, test_pool = test_pool, params = params) ``` ## Model interpretation outputs ```{r} explainer_cat = explain( model = cat_model, data = train_df |> select(-cell, -x, -y, -block_id, -target), y = train_df$target, label = "CatBoost Harp Seal Model", predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability") ) pdp_temp = model_profile( explainer = explainer_cat, variables = "Average Chlorophyll depthsurf" ) plot(pdp_temp) importanc2e = catboost.get_feature_importance(cat_model, train_pool) |> enframe() ``` ## Shared artifacts for prediction stage These files are the explicit interface between learning and prediction. ```{r} saveRDS(dynamic_layers, config$artifacts$dynamic_layers) saveRDS(subset_baseline_layer_names, config$artifacts$subset_layer_names) saveRDS(seal_range_df, config$artifacts$seal_range_df) writeRaster(seal_range_raster, config$artifacts$seal_range_raster, overwrite = TRUE) catboost.save_model(cat_model, config$artifacts$model) artifact_manifest = tibble::tibble( artifact = names(config$artifacts)[1:5], path = unlist(config$artifacts[1:5]) ) utils::write.csv(artifact_manifest, config$artifacts$manifest, row.names = FALSE) utils::capture.output(utils::sessionInfo(), file = config$artifacts$session_info) ```