--- title: "bio-oracle-5" format: html --- ## Libraries ```{r} library(dplyr) library(tidyr) library(tibble) library(stringr) library(terra) library(biooracler) library(sf) library(ggplot2) library(ggcorrplot) library(usdm) library(catboost) library(DALEX) library(pdp) # library(ggspatial) # library(rnaturalearth) # library(tidyterra) # source("./scripts/degree_labels.R") ``` ## Range and study area Load the species range from IUCN and 5° buffer to define an area of the study. ```{r} seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp") bbox = ext(seal_range) |> extend(5) bbox_vect = bbox |> as.lines(crs="EPSG:4326") # land = ne_download(scale=110, type="land", category = "physical", returnclass = "sv") land = vect("land.geojson") lon_range = c(bbox$xmin, bbox$xmax) lat_range = c(bbox$ymin, bbox$ymax) constraints_geo = list( longitude = lon_range, latitude = lat_range ) saveRDS(constraints_geo, file="constraints_geo.Rda") ``` ```{r} plot(seal_range, col="#bcbddc", xlim=c(-170, 170), ylim=c(90, -80)) plot(land, col="#f0f0f0", add=T) lines(bbox, col="#756bb1") ``` ## Bio-ORACLE > Environmental predictors were sourced from the Bio-ORACLE v3.0 database, providing standardized global marine rasters for present-day conditions and future climate projections under CMIP6 Shared Socioeconomic Pathways (SSPs). ```{r} all_layers = list_layers() ids_to_remove = c( # no projection data # the database flaws (?) "par_mean_baseline_2000_2020_depthsurf", "kdpar_mean_baseline_2000_2020_depthsurf", "chl_baseline_2000_2018_depthmax", "chl_baseline_2000_2018_depthmean", "chl_baseline_2000_2018_depthmin", # nature of the variable "terrain_characteristics" ) layers = all_layers |> filter(! dataset_id %in% c(ids_to_remove)) |> separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE) |> mutate( var_depth = paste0(var, "_", depth), var_depth_humane = str_extract(title, ".*]") |> str_remove("Bio-Oracle ") ) # aware that not all variables have ssp126 saveRDS(layers, "layers.Rda") layers |> select(var_depth_humane, var_depth) |> distinct() |> print.data.frame() ``` ```{r} download_slice = function(scenario_value, decade_start, layers_to_filter) { scenario_layers = layers_to_filter |> filter(scenario == scenario_value) time_point = paste0(decade_start, "-01-01T00:00:00Z") slice_constraints = list( time = c(time_point, time_point), longitude = constraints_geo$longitude, latitude = constraints_geo$latitude ) download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start) dir.create(download_dir, recursive = TRUE, showWarnings = FALSE) slice_rasters = sapply( scenario_layers$dataset_id, function(id) download_layers( id, constraints = slice_constraints, directory = download_dir ), simplify = TRUE ) return(slice_rasters) } ``` ```{r} slice_to_brick = function(list_of_rasters) { brick = rast(list_of_rasters) depths = brick |> names() |> str_extract("depth[:letter:]+") var_stat = brick |> varnames() |> as_tibble() |> separate_wider_delim("value",delim="_", names=c("var", "stat")) prev_longnames = longnames(brick) longnames(brick) = paste0(prev_longnames, " [", depths ,"]") names(brick) = paste(var_stat$var, depths, var_stat$stat, sep = "_") return(brick) } ``` ## Data exploration Feel free to skip this step as it shows the logic behind the layers selected for analysis. ### Download ```{r eval=FALSE} baseline_rasters = download_slice("baseline", 2010, layers) ``` ```{r} baseline_brick = slice_to_brick(baseline_rasters) ``` 300 hundred layers seem too many for a controlled analysis ```{r} nlyr(baseline_brick) ``` ### Filter by ecology Knowing smth about the species lets clean up variables before any formal analysis of variables releations ```{r} filtered_layers = tibble( names = names(baseline_brick), longnames = longnames(baseline_brick) ) |> separate_wider_delim( "names", delim="_", names=c("var", "depth", "stat"), cols_remove=F ) |> filter( !( depth %in% c("depthmax", "depthmean") | var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") | stat %in% c("ltmin", "ltmax", "range") ) ) baseline_brick_subset = baseline_brick |> subset(filtered_layers$names) filtered_layers |> select(longnames) |> print.data.frame() ``` ### Sample for correlation analysis ```{r} ten_percent_cells = nrow(baseline_brick_subset) * ncol(baseline_brick_subset) * 0.1 baseline_brick_subset_sample = baseline_brick_subset |> spatSample(size=ten_percent_cells, method="regular", na.rm=TRUE) ``` #### Initial correlation ```{r} corr_matrix = cor(baseline_brick_subset_sample) ``` ```{r} ggcorrplot(corr_matrix, type = "lower", # Only show half (it's symmetrical anyway) outline.col = "white", colors = c("#6D9EC1", "white", "#E46726"), lab = FALSE) + # don't label values theme(axis.text.x = element_text(size = 7, angle = 90), axis.text.y = element_text(size = 7)) ``` ```{r} high_cor_pairs <- corr_matrix |> as.data.frame() |> rownames_to_column("Var1") |> pivot_longer(-Var1, names_to = "Var2", values_to = "value") |> # 900 total pairs # Var1 < Var2 removes self-correlation AND picks only one of the AB/BA pairs filter(abs(value) > 0.8 & Var1 < Var2) |> mutate(value = round(value, 3)) |> arrange(desc(abs(value))) # 59 are highly correlated print.data.frame(high_cor_pairs) ``` #### Variance Inflation Factor > It calculates how much one variable can be predicted by a linear combination of all other variables. ```{r} vif_results = vifstep(baseline_brick_subset_sample, th = 10) ``` ```{r} vars_to_keep = vif_results@results$Variables vif_results@results ``` Then check the correlations of variables filtered by VIF step ```{r} baseline_brick_subset_sample_vif = baseline_brick_subset_sample |> select(all_of(vars_to_keep)) corr_matrix_vif = cor(baseline_brick_subset_sample_vif) ggcorrplot(corr_matrix_vif, outline.col = "white", colors = c("#6D9EC1", "white", "#E46726"), lab = FALSE) + # don't label values theme(axis.text.x = element_text(size = 7, angle = 90), axis.text.y = element_text(size = 7)) ``` ```{r} high_cor_pairs_vif <- corr_matrix_vif |> as.data.frame() |> rownames_to_column("Var1") |> pivot_longer(-Var1, names_to = "Var2", values_to = "value") |> # 144 total pairs # Var1 < Var2 removes self-correlation AND picks only one of the AB/BA pairs filter(abs(value) > 0.8 & Var1 < Var2) |> mutate(value = round(value, 3)) |> arrange(desc(abs(value))) # 3 are highly correlated print.data.frame(high_cor_pairs_vif) ``` Having high correlation pairs and VIF values we manually select variables we can interpret ```{r} manually_selected_vars = c( "siconc_depthsurf_min", "thetao_depthsurf_min", "thetao_depthmin_max", "chl_depthsurf_mean", "phyc_depthmin_max" ) ``` Then again check correlation ```{r} baseline_brick_subset_sample_manual = baseline_brick_subset_sample |> select(all_of(manually_selected_vars)) corr_matrix_manual = cor(baseline_brick_subset_sample_manual) ggcorrplot(corr_matrix_manual, outline.col = "white", colors = c("#6D9EC1", "white", "#E46726"), lab = FALSE) + # don't label values theme(axis.text.x = element_text(size = 7, angle = 90), axis.text.y = element_text(size = 7)) ``` ```{r} selected_layers = filtered_layers |> filter(names %in% manually_selected_vars) saveRDS(selected_layers, file="selected_layers.Rda") ``` ## Learning ### Input layers Filter layers based on selected layers info. ```{r} layers = readRDS("layers.Rda") selected_layers = readRDS("selected_layers.Rda") constraints_geo = readRDS("constraints_geo.Rda") features_layers = inner_join(selected_layers, layers, by=c("var", "depth")) ``` ```{r} baseline_features_rasters = download_slice("baseline", 2010, features_layers) ``` Set up features raster brick ```{r} baseline_features_brick = slice_to_brick(baseline_features_rasters) |> subset(c(selected_layers$names)) ``` And target raster ```{r} seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp") ocean_mask = ifel(is.na(baseline_features_brick[[1]]), NA, 1) ``` ```{r} seal_range_raster = rasterize( seal_range, baseline_features_brick[[1]], field = "", background = 0 ) |> mask(ocean_mask) ``` ```{r} plot(c(seal_range_raster, baseline_features_brick)) ``` ### Spatial blocks ```{r} ROWS = 3 COLUMNS = 5 nblocks = ROWS * COLUMNS all_blocks = 1:(nblocks) set.seed(321) # For reproducibility test_blocks = seq(2, nblocks, by = 2) train_blocks = setdiff(all_blocks, test_blocks) ``` ```{r} block_grid = seal_range_raster |> ext() |> st_bbox() |> st_make_grid(n = c(COLUMNS, ROWS)) |> st_sf() |> mutate(block_id = row_number()) |> mutate(type = ifelse(block_id %in% test_blocks, "Test (Hold-out)", "Train")) ``` ```{r} block_raster = block_grid |> rasterize(seal_range_raster, field = "block_id") ``` ```{r} plot(seal_range_raster$layer) plot(vect(block_grid), add = TRUE, border = "black", lwd = 1) plot( vect(block_grid |> filter(type == "Test (Hold-out)")), add = TRUE, border = "red", lwd = 3) ``` ```{r} seal_range_raster$block_id = block_raster$block_id ``` ### Catboost #### Prep Set up the dataframe for machine learning ```{r} seal_range_df = seal_range_raster |> as.data.frame(cells = TRUE, na.rm=TRUE) |> rename(target = layer) ``` ```{r} baseline_features_df = baseline_features_brick |> as.data.frame(cells = TRUE, na.rm=TRUE) ``` ```{r} target_features = left_join(seal_range_df, baseline_features_df, by = "cell") ``` Divide training and testing pools ```{r} train_df = target_features %>% filter(block_id %in% train_blocks) test_df = target_features %>% filter(block_id %in% test_blocks) train_pool <- catboost.load_pool( data = train_df |> select(-cell, -block_id, -target), label = train_df$target ) test_pool = catboost.load_pool( data = test_df |> select(-cell, -block_id, -target), label = test_df$target ) ``` #### Learning ```{r} params = list( loss_function = 'Logloss', eval_metric = 'AUC', iterations = 200, # Plenty of trees for a smooth fit depth = 2, # Standard depth to prevent overfitting learning_rate = 0.02, # Lower learning rate is better for high ROC data l2_leaf_reg = 15, # Stronger regularization to handle that 0.998 ROC random_seed = 42, rsm = 0.5, verbose = 10, od_type = "Iter", od_wait = 20 ) ``` ```{r} cat_model = catboost.train(train_pool, test_pool = test_pool, params = params) saveRDS(cat_model, "cat_model.Rda") ``` #### Catboost Result ```{r} whole_pool = catboost.load_pool( data = target_features |> select(-cell, -block_id, -target) ) ``` ```{r} preds_prob = catboost.predict(cat_model, whole_pool, prediction_type = 'Probability') ``` ```{r} baseline_prediction_raster = seal_range_raster baseline_prediction_raster[target_features$cell] = preds_prob ``` ```{r} plot(baseline_prediction_raster$layer) plot(seal_range, col=NA, border="cyan", lwd=1, add=T) ``` #### Post-processing ```{r} plot(seal_range_raster) ``` ```{r} range_distance = gridDist(seal_range_raster, target=1) distance_decay = exp(-0.000001 * range_distance) |> subst(NA, 0) ``` ```{r} plot(distance_decay) ``` ```{r} baseline_prediction_raster$layer_dist = baseline_prediction_raster$layer * distance_decay ``` ```{r} plot(baseline_prediction_raster$layer_dist) plot(seal_range, col=NA, border="cyan", lwd=1, add=T) ``` ```{r} plot(ifel(baseline_prediction_raster$layer_dist > 0.7, 1, 0)) plot(seal_range, col=NA, border="cyan", lwd=1, add=T) ``` #### Interpret Maps Plots ```{r} explainer_cat = explain( model = cat_model, data = train_df |> select(-cell, -block_id, -target), y = train_df$target, label = "CatBoost Harp Seal Model", predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability") ) ``` ```{r} vi_cat = model_parts(explainer_cat) plot(vi_cat) ``` ```{r} pdp_cat = lapply(selected_layers$names, function(X) model_profile(explainer_cat, variables = X)) plot(pdp_cat) ``` ```{r} mp_cat = model_performance(explainer_cat) plot(mp_cat, geom = "boxplot") plot(mp_cat, geom = "roc") # Or geom = "boxplot" for residuals ``` ```{r} # Pick a specific "wrong" pixel from your dataframe caspian_pixel = train_df[train_df$cell == 120363, ] bd_cat = predict_parts(explainer_cat, new_observation = caspian_pixel) plot(bd_cat) ```