You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
414 lines
10 KiB
414 lines
10 KiB
```{r}
|
|
library(tidyr)
|
|
library(dplyr)
|
|
library(terra)
|
|
library(mregions2)
|
|
library(biooracler)
|
|
library(stringr)
|
|
library(tibble)
|
|
library(catboost)
|
|
library(caret)
|
|
library(blockCV)
|
|
library(sf)
|
|
library(usdm)
|
|
library(ggcorrplot)
|
|
library(reshape2)
|
|
library(tidygraph)
|
|
library(ggraph)
|
|
library(CAST)
|
|
library(pdp)
|
|
library(ggplot2)
|
|
library(DALEX)
|
|
```
|
|
|
|
|
|
```{r}
|
|
seal_range = vect("data/iucn/Pagophilus_groenlandicus.shp")
|
|
bbox = ext(seal_range) |> extend(5)
|
|
|
|
lon_range = c(bbox$xmin, bbox$xmax)
|
|
lat_range = c(bbox$ymin, bbox$ymax)
|
|
|
|
constraints = list(
|
|
longitude = lon_range,
|
|
latitude = lat_range
|
|
)
|
|
```
|
|
|
|
|
|
```{r}
|
|
layers = list_layers()
|
|
|
|
# Нет прогнозных данных :/
|
|
removed_layers_ids = c(
|
|
"par_mean_baseline_2000_2020_depthsurf",
|
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
|
"chl_baseline_2000_2018_depthmax",
|
|
"chl_baseline_2000_2018_depthmean",
|
|
"chl_baseline_2000_2018_depthmin"
|
|
)
|
|
|
|
constant_layers_ids = c("terrain_characteristics")
|
|
|
|
constant_layers = layers |>
|
|
filter(dataset_id %in% constant_layers_ids)
|
|
```
|
|
|
|
```{r}
|
|
dynamic_layers = layers |>
|
|
filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |>
|
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE)
|
|
```
|
|
|
|
```{r}
|
|
download_slice = function(scenario_value, decade_start) {
|
|
scenario_layers = dynamic_layers |>
|
|
filter(scenario == scenario_value)
|
|
|
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
|
|
|
slice_constraints = list(
|
|
time = c(time_point, time_point),
|
|
longitude = lon_range,
|
|
latitude = lat_range
|
|
)
|
|
|
|
download_dir = file.path("./data/bio-oracle-2", scenario_value, decade_start)
|
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
slice_rasters = sapply(
|
|
scenario_layers$dataset_id,
|
|
function(id) download_layers(
|
|
id,
|
|
constraints = slice_constraints,
|
|
directory = download_dir
|
|
),
|
|
simplify = TRUE
|
|
)
|
|
|
|
return(slice_rasters)
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
baseline_rasters = download_slice("baseline", 2010)
|
|
```
|
|
|
|
```{r}
|
|
baseline_brick = rast(baseline_rasters)
|
|
baseline_brick_depths = baseline_brick |>
|
|
names() |>
|
|
str_extract("depth[:alpha:]+")
|
|
baseline_brick_longnames = baseline_brick |> longnames()
|
|
baseline_brick_varnames = baseline_brick |> varnames()
|
|
```
|
|
|
|
```{r}
|
|
subset_baseline_layer_names = tibble(
|
|
name = names(baseline_brick),
|
|
longname = baseline_brick_longnames,
|
|
varname = baseline_brick_varnames,
|
|
depth = baseline_brick_depths
|
|
) |>
|
|
separate_wider_delim(
|
|
varname,
|
|
delim = "_",
|
|
names = c("var", "type")
|
|
) |>
|
|
filter(
|
|
!(
|
|
depth == "depthmax" |
|
|
var %in% c("ph", "si", "dfe", "no3", "po4", "clt", "o2", "mlotst", "sws", "swd", "so") |
|
|
type %in% c("ltmin", "ltmax", "range")
|
|
)
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
subset_baseline_brick = baseline_brick |>
|
|
subset(subset_baseline_layer_names$name)
|
|
```
|
|
|
|
```{r}
|
|
names(baseline_brick) = paste(baseline_brick_longnames, baseline_brick_depths)
|
|
```
|
|
|
|
```{r}
|
|
# features_brick = c(baseline_brick, terrain_raster)
|
|
features_brick = c(subset_baseline_brick)
|
|
```
|
|
|
|
```{r}
|
|
cropped_bbox = ext(
|
|
-20,
|
|
72,
|
|
60,
|
|
85
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
cropped_features_brick = features_brick |>
|
|
crop(cropped_bbox)
|
|
```
|
|
|
|
```{r}
|
|
baseline_df = cropped_features_brick |>
|
|
as.data.frame(cells = TRUE, xy = TRUE)
|
|
```
|
|
|
|
```{r}
|
|
vif_input_df <- baseline_df |>
|
|
select(-cell) |>
|
|
drop_na()
|
|
```
|
|
|
|
```{r}
|
|
vif_sample = vif_input_df |>
|
|
sample_n(10000)
|
|
|
|
vif_sample = vif_sample[, sapply(vif_sample, function(x) var(x) > 0)]
|
|
```
|
|
|
|
```{r}
|
|
corr_matrix <- cor(vif_sample)
|
|
|
|
ggcorrplot(corr_matrix,
|
|
# hc.order = TRUE, # Clusters similar variables together
|
|
type = "lower", # Only show half (it's symmetrical anyway)
|
|
outline.col = "white",
|
|
colors = c("#6D9EC1", "white", "#E46726"),
|
|
lab = FALSE) + # Set to TRUE only if you have <20 variables
|
|
theme(axis.text.x = element_text(size = 7, angle = 90),
|
|
axis.text.y = element_text(size = 7))
|
|
```
|
|
|
|
```{r}
|
|
high_cor_pairs <- melt(corr_matrix) |>
|
|
filter(abs(value) > 0.8) |>
|
|
filter(Var1 != Var2) |> # Remove self-correlations (1.0 on diagonal)
|
|
distinct(value, .keep_all = TRUE) |> # Remove duplicates (A-B and B-A)
|
|
arrange(desc(abs(value))) |>
|
|
mutate(Var1 = as.character(Var1),
|
|
Var2 = as.character(Var2))
|
|
```
|
|
|
|
```{r}
|
|
graph_data <- as_tbl_graph(high_cor_pairs)
|
|
|
|
# Plot the clusters
|
|
ggraph(graph_data, layout = "nicely") +
|
|
geom_edge_link(aes(alpha = abs(value)), color = "orange") +
|
|
geom_node_point(size = 2, color = "steelblue") +
|
|
geom_node_text(aes(label = name), repel = TRUE, size = 5) +
|
|
theme_void() +
|
|
labs(title = "Network of Redundant Variables (|r| > 0.8)")
|
|
```
|
|
|
|
```{r}
|
|
vif_results <- vifstep(vif_sample, th = 10)
|
|
```
|
|
|
|
|
|
```{r}
|
|
keeper_vars <- vif_results@results$Variables
|
|
```
|
|
|
|
|
|
```{r}
|
|
baseline_df_subset = baseline_df |>
|
|
select(cell, x, y, keeper_vars)
|
|
```
|
|
|
|
```{r}
|
|
cropped_seal_range_raster = seal_range |>
|
|
rasterize(cropped_features_brick[[1]], field="", background=0)
|
|
```
|
|
|
|
```{r}
|
|
cropped_seal_range_df = cropped_seal_range_raster |>
|
|
as.data.frame(cells = TRUE) |>
|
|
rename(target = layer)
|
|
```
|
|
|
|
```{r}
|
|
seal_baseline = dplyr::left_join(baseline_df_subset, cropped_seal_range_df, by = "cell")
|
|
```
|
|
|
|
```{r}
|
|
seal_baseline_sf = st_as_sf(seal_baseline, coords = c("x", "y"), crs = 4326)
|
|
```
|
|
|
|
|
|
```{r}
|
|
sb <- cv_spatial(x = seal_baseline_sf,
|
|
column = "target",
|
|
size = 500000,
|
|
k = 3,
|
|
selection = "random")
|
|
```
|
|
|
|
|
|
```{r}
|
|
seal_baseline$block_id <- sb$folds_ids
|
|
seal_baseline$target <- as.factor(make.names(seal_baseline$target))
|
|
```
|
|
|
|
```{r}
|
|
# save(seal_baseline, file = "seal_baseline.Rda")
|
|
load(file = "seal_baseline.Rda")
|
|
```
|
|
|
|
|
|
```{r}
|
|
indices <- CAST::CreateSpacetimeFolds(seal_baseline,
|
|
spacevar = "block_id",
|
|
k = 3)
|
|
```
|
|
|
|
```{r}
|
|
seal_baseline_sample = sample_n(seal_baseline, 100000)
|
|
```
|
|
|
|
|
|
```{r}
|
|
ctrl <- trainControl(method = "cv",
|
|
index = indices$index,
|
|
indexOut = indices$indexOut,
|
|
classProbs = TRUE,
|
|
summaryFunction = twoClassSummary,
|
|
verboseIter = TRUE)
|
|
|
|
ffs_model <- ffs(
|
|
predictors = seal_baseline |> select(-cell, -x, -y, -target),
|
|
response = seal_baseline$target,
|
|
method = "ranger",
|
|
metric = "ROC",
|
|
trControl = ctrl,
|
|
tuneGrid = expand.grid(mtry = 2,
|
|
splitrule = "gini",
|
|
min.node.size = 10),
|
|
num.trees = 50,
|
|
num.threads = parallel::detectCores() - 1,
|
|
withinSE = TRUE,
|
|
minDiff = 0.005,
|
|
)
|
|
```
|
|
|
|
|
|
```{r}
|
|
# save(ffs_model, file = "ffs_model.Rda")
|
|
load(file = "ffs_model.Rda")
|
|
```
|
|
|
|
|
|
```{r}
|
|
# 1. Take a small, representative sample for the calculation
|
|
pdp_sample <- seal_baseline[sample(nrow(seal_baseline), 500), ]
|
|
|
|
# 2. Run the partial function with 'train = pdp_sample' and 'grid.resolution'
|
|
pdp_temp <- partial(ffs_model,
|
|
pred.var = "Minimum OceanTemperature depthsurf",
|
|
prob = TRUE,
|
|
which.class = "X1",
|
|
train = pdp_sample, # This is the secret to speed
|
|
grid.resolution = 20) # 20 points is plenty for a smooth line
|
|
```
|
|
|
|
```{r}
|
|
# 3. Plot it
|
|
autoplot(pdp_temp) + theme_minimal()
|
|
```
|
|
|
|
|
|
```{r}
|
|
# 2. Create the plot
|
|
# The 'rug = TRUE' adds little tick marks at the bottom showing
|
|
# where your actual data points sit.
|
|
autoplot(pdp_temp, rug = TRUE, train = seal_baseline) +
|
|
theme_minimal() +
|
|
labs(title = "Partial Dependence: Min Ocean Temperature",
|
|
subtitle = "How Ocean Temp influences Seal Presence Probability",
|
|
x = "Temperature (°C)",
|
|
y = "Probability of Presence") +
|
|
geom_line(size = 1.2, color = "steelblue")
|
|
```
|
|
|
|
```{r}
|
|
final_vars <- c(
|
|
"Minimum Chlorophyll depthsurf",
|
|
"Maximum TotalPhytoplankton depthsurf",
|
|
"Maximum AirTemperature depthsurf",
|
|
"Minimum OceanTemperature depthsurf",
|
|
"Average Chlorophyll depthsurf",
|
|
"Maximum OceanTemperature depthmean",
|
|
"Average TotalPhytoplankton depthmean"
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
train_data <- seal_baseline %>%
|
|
mutate(target_num = ifelse(target == "X1", 1, 0))
|
|
```
|
|
|
|
```{r}
|
|
unique_blocks <- unique(train_data$block_id)
|
|
train_blocks <- sample(unique_blocks, size = round(0.7 * length(unique_blocks)))
|
|
|
|
# 3. Create the dataframes based on the blocks
|
|
train_df <- train_data %>% filter(block_id %in% train_blocks)
|
|
test_df <- train_data %>% filter(!(block_id %in% train_blocks))
|
|
```
|
|
|
|
```{r}
|
|
train_pool <- catboost.load_pool(
|
|
data = train_df[, final_vars],
|
|
label = train_df$target_num
|
|
)
|
|
|
|
test_pool = catboost.load_pool(
|
|
data = test_df[, final_vars],
|
|
label = test_df$target_num
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
params <- list(
|
|
loss_function = 'Logloss',
|
|
eval_metric = 'AUC',
|
|
iterations = 100, # Plenty of trees for a smooth fit
|
|
depth = 3, # Standard depth to prevent overfitting
|
|
learning_rate = 0.06, # Lower learning rate is better for high ROC data
|
|
l2_leaf_reg = 30, # Stronger regularization to handle that 0.998 ROC
|
|
random_seed = 42,
|
|
rsm = 0.5,
|
|
verbose = 10 # Log progress every 100 iterations
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
cat_model <- catboost.train(train_pool, test_pool = test_pool, params = params)
|
|
```
|
|
|
|
```{r}
|
|
explainer_cat <- explain(
|
|
model = cat_model,
|
|
data = train_df[, final_vars],
|
|
y = train_df$target_num,
|
|
label = "CatBoost Harp Seal Model",
|
|
predict_function = function(model, x) catboost.predict(model, catboost.load_pool(x), prediction_type = "Probability")
|
|
)
|
|
```
|
|
```{r}
|
|
pdp_temp <- model_profile(
|
|
explainer = explainer_cat,
|
|
variables = "Minimum OceanTemperature depthsurf"
|
|
)
|
|
|
|
# 3. Plot it
|
|
plot(pdp_temp)
|
|
```
|
|
|
|
```{r}
|
|
importanc2e <- catboost.get_feature_importance(cat_model, train_pool)
|
|
``` |