```{r} library(tidyr) library(dplyr) library(terra) library(mregions2) library(biooracler) library(stringr) library(tibble) library(catboost) library(caret) library(blockCV) library(sf) library(usdm) library(ggcorrplot) library(reshape2) library(tidygraph) library(ggraph) library(CAST) library(pdp) library(ggplot2) library(DALEX) ``` ```{r} seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp") bbox = ext(seal_range) |> extend(5) lon_range = c(bbox$xmin, bbox$xmax) lat_range = c(bbox$ymin, bbox$ymax) constraints = list( longitude = lon_range, latitude = lat_range ) ``` ```{r} layers = list_layers() # Нет прогнозных данных :/ removed_layers_ids = c( "par_mean_baseline_2000_2020_depthsurf", "kdpar_mean_baseline_2000_2020_depthsurf", "chl_baseline_2000_2018_depthmax", "chl_baseline_2000_2018_depthmean", "chl_baseline_2000_2018_depthmin" ) constant_layers_ids = c("terrain_characteristics") constant_layers = layers |> filter(dataset_id %in% constant_layers_ids) ``` ```{r} dynamic_layers = layers |> filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |> separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE) ``` ```{r} download_slice = function(scenario_value, decade_start) { scenario_layers = dynamic_layers |> filter(scenario == scenario_value) time_point = paste0(decade_start, "-01-01T00:00:00Z") slice_constraints = list( time = c(time_point, time_point), longitude = lon_range, latitude = lat_range ) download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start) dir.create(download_dir, recursive = TRUE, showWarnings = FALSE) slice_rasters = sapply( scenario_layers$dataset_id, function(id) download_layers( id, constraints = slice_constraints, directory = download_dir ), simplify = TRUE ) return(slice_rasters) } ``` ```{r} baseline_rasters = download_slice("baseline", 2010) ``` ```{r} baseline_brick = rast(baseline_rasters) baseline_brick_depths = baseline_brick |> names() |> str_extract("depth[:alpha:]+") baseline_brick_longnames = baseline_brick |> longnames() baseline_brick_varnames = baseline_brick |> varnames() ``` ```{r} subset_baseline_layer_names = tibble( name = names(baseline_brick), longname = baseline_brick_longnames, varname = baseline_brick_varnames, depth = baseline_brick_depths ) |> separate_wider_delim( varname, delim = "_", names = c("var", "type") ) |> filter( !( depth == "depthmax" | var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") | type %in% c("ltmin", "ltmax", "range") ) ) ``` ```{r} subset_baseline_brick = baseline_brick |> subset(subset_baseline_layer_names$name) ``` ```{r} names(baseline_brick) = paste(baseline_brick_longnames, baseline_brick_depths) ``` ```{r} # features_brick = c(baseline_brick, terrain_raster) features_brick = c(subset_baseline_brick) ``` ```{r} cropped_bbox = ext( -20, 72, 60, 85 ) ``` ```{r} cropped_features_brick = features_brick |> crop(cropped_bbox) ``` ```{r} baseline_df = cropped_features_brick |> as.data.frame(cells = TRUE, xy = TRUE) ``` ```{r} vif_input_df <- baseline_df |> select(-cell) |> drop_na() ``` ```{r} vif_sample = vif_input_df |> sample_n(10000) vif_sample = vif_sample[, sapply(vif_sample, function(x) var(x) > 0)] ``` ```{r} corr_matrix <- cor(vif_sample) ggcorrplot(corr_matrix, # hc.order = TRUE, # Clusters similar variables together type = "lower", # Only show half (it's symmetrical anyway) outline.col = "white", colors = c("#6D9EC1", "white", "#E46726"), lab = FALSE) + # Set to TRUE only if you have <20 variables theme(axis.text.x = element_text(size = 7, angle = 90), axis.text.y = element_text(size = 7)) ``` ```{r} high_cor_pairs <- melt(corr_matrix) |> filter(abs(value) > 0.8) |> filter(Var1 != Var2) |> # Remove self-correlations (1.0 on diagonal) distinct(value, .keep_all = TRUE) |> # Remove duplicates (A-B and B-A) arrange(desc(abs(value))) |> mutate(Var1 = as.character(Var1), Var2 = as.character(Var2)) ``` ```{r} graph_data <- as_tbl_graph(high_cor_pairs) # Plot the clusters ggraph(graph_data, layout = "nicely") + geom_edge_link(aes(alpha = abs(value)), color = "orange") + geom_node_point(size = 2, color = "steelblue") + geom_node_text(aes(label = name), repel = TRUE, size = 5) + theme_void() + labs(title = "Network of Redundant Variables (|r| > 0.8)") ``` ```{r} vif_results <- vifstep(vif_sample, th = 10) ``` ```{r} keeper_vars <- vif_results@results$Variables ``` ```{r} baseline_df_subset = baseline_df |> select(cell, x, y, keeper_vars) ``` ```{r} cropped_seal_range_raster = seal_range |> rasterize(cropped_features_brick[[1]], field="", background=0) ``` ```{r} cropped_seal_range_df = cropped_seal_range_raster |> as.data.frame(cells = TRUE) |> rename(target = layer) ``` ```{r} seal_baseline = dplyr::left_join(baseline_df_subset, cropped_seal_range_df, by = "cell") ``` ```{r} seal_baseline_sf = st_as_sf(seal_baseline, coords = c("x", "y"), crs = 4326) ``` ```{r} sb <- cv_spatial(x = seal_baseline_sf, column = "target", size = 500000, k = 3, selection = "random") ``` ```{r} seal_baseline$block_id <- sb$folds_ids seal_baseline$target <- as.factor(make.names(seal_baseline$target)) ``` ```{r} # save(seal_baseline, file = "seal_baseline.Rda") load(file = "seal_baseline.Rda") ``` ```{r} indices <- CAST::CreateSpacetimeFolds(seal_baseline, spacevar = "block_id", k = 3) ``` ```{r} seal_baseline_sample = sample_n(seal_baseline, 100000) ``` ```{r} ctrl <- trainControl(method = "cv", index = indices$index, indexOut = indices$indexOut, classProbs = TRUE, summaryFunction = twoClassSummary, verboseIter = TRUE) ffs_model <- ffs( predictors = seal_baseline |> select(-cell, -x, -y, -target), response = seal_baseline$target, method = "ranger", metric = "ROC", trControl = ctrl, tuneGrid = expand.grid(mtry = 2, splitrule = "gini", min.node.size = 10), num.trees = 50, num.threads = parallel::detectCores() - 1, withinSE = TRUE, minDiff = 0.005, ) ``` ```{r} # save(ffs_model, file = "ffs_model.Rda") load(file = "ffs_model.Rda") ``` ```{r} # 1. Take a small, representative sample for the calculation pdp_sample <- seal_baseline[sample(nrow(seal_baseline), 500), ] # 2. Run the partial function with 'train = pdp_sample' and 'grid.resolution' pdp_temp <- partial(ffs_model, pred.var = "Minimum OceanTemperature depthsurf", prob = TRUE, which.class = "X1", train = pdp_sample, # This is the secret to speed grid.resolution = 20) # 20 points is plenty for a smooth line ``` ```{r} # 3. Plot it autoplot(pdp_temp) + theme_minimal() ``` ```{r} # 2. Create the plot # The 'rug = TRUE' adds little tick marks at the bottom showing # where your actual data points sit. autoplot(pdp_temp, rug = TRUE, train = seal_baseline) + theme_minimal() + labs(title = "Partial Dependence: Min Ocean Temperature", subtitle = "How Ocean Temp influences Seal Presence Probability", x = "Temperature (°C)", y = "Probability of Presence") + geom_line(size = 1.2, color = "steelblue") ``` ```{r} final_vars <- c( "Minimum Chlorophyll depthsurf", "Maximum TotalPhytoplankton depthsurf", "Maximum AirTemperature depthsurf", "Minimum OceanTemperature depthsurf", "Average Chlorophyll depthsurf", "Maximum OceanTemperature depthmean", "Average TotalPhytoplankton depthmean" ) ``` ```{r} train_data <- seal_baseline %>% mutate(target_num = ifelse(target == "X1", 1, 0)) ``` ```{r} unique_blocks <- unique(train_data$block_id) train_blocks <- sample(unique_blocks, size = round(0.7 * length(unique_blocks))) # 3. Create the dataframes based on the blocks train_df <- train_data %>% filter(block_id %in% train_blocks) test_df <- train_data %>% filter(!(block_id %in% train_blocks)) ``` ```{r} train_pool <- catboost.load_pool( data = train_df[, final_vars], label = train_df$target_num ) test_pool = catboost.load_pool( data = test_df[, final_vars], label = test_df$target_num ) ``` ```{r} params <- list( loss_function = 'Logloss', eval_metric = 'AUC', iterations = 100, # Plenty of trees for a smooth fit depth = 3, # Standard depth to prevent overfitting learning_rate = 0.06, # Lower learning rate is better for high ROC data l2_leaf_reg = 30, # Stronger regularization to handle that 0.998 ROC random_seed = 42, rsm = 0.5, verbose = 10 # Log progress every 100 iterations ) ``` ```{r} cat_model <- catboost.train(train_pool, test_pool = test_pool, params = params) ``` ```{r} explainer_cat <- explain( model = cat_model, data = train_df[, final_vars], y = train_df$target_num, label = "CatBoost Harp Seal Model", predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability") ) ``` ```{r} pdp_temp <- model_profile( explainer = explainer_cat, variables = "Minimum OceanTemperature depthsurf" ) # 3. Plot it plot(pdp_temp) ``` ```{r} importanc2e <- catboost.get_feature_importance(cat_model, train_pool) ```