|
|
---
|
|
|
title: "Untitled"
|
|
|
format: html
|
|
|
---
|
|
|
|
|
|
## Libraries
|
|
|
|
|
|
```{r}
|
|
|
library(tidyr)
|
|
|
library(dplyr)
|
|
|
library(terra)
|
|
|
library(mregions2)
|
|
|
library(biooracler)
|
|
|
library(stringr)
|
|
|
library(tibble)
|
|
|
library(catboost)
|
|
|
library(caret)
|
|
|
library(blockCV)
|
|
|
library(sf)
|
|
|
```
|
|
|
|
|
|
## Download
|
|
|
|
|
|
### Sea
|
|
|
|
|
|
Загружаем данные по Баренцеву морю
|
|
|
|
|
|
```{r}
|
|
|
barentsz_mrgid = 4247
|
|
|
geo = gaz_geometry(barentsz_mrgid, format = "sfc") |> vect()
|
|
|
```
|
|
|
|
|
|
|
|
|
Фиксируем охват
|
|
|
|
|
|
```{r}
|
|
|
bbox = ext(geo)
|
|
|
```
|
|
|
|
|
|
### Bio-Oracle
|
|
|
|
|
|
Фиксируем доступные в Bio-Oracle слои
|
|
|
|
|
|
```{r}
|
|
|
layers = list_layers()
|
|
|
```
|
|
|
|
|
|
|
|
|
Фиксируем слои, на которые нет прогнозных данных. Их мы не будем использовать в обучении и предсказании.
|
|
|
|
|
|
```{r}
|
|
|
# Нет прогнозных данных :/
|
|
|
removed_layers_ids = c(
|
|
|
"par_mean_baseline_2000_2020_depthsurf",
|
|
|
"kdpar_mean_baseline_2000_2020_depthsurf",
|
|
|
"chl_baseline_2000_2018_depthmax",
|
|
|
"chl_baseline_2000_2018_depthmean",
|
|
|
"chl_baseline_2000_2018_depthmin"
|
|
|
)
|
|
|
```
|
|
|
|
|
|
|
|
|
Фиксируем слой с характеристиками поверхности дна. Они будут одинаковыми для всех временных срезов и сценариев.
|
|
|
|
|
|
```{r}
|
|
|
constant_layers_ids = c("terrain_characteristics")
|
|
|
|
|
|
constant_layers = layers |>
|
|
|
filter(dataset_id %in% constant_layers_ids)
|
|
|
```
|
|
|
|
|
|
|
|
|
Фиксируем динамические слои. Они являются основой проводимого анализа. Разбиваем код на отдельные поля: переменная, сценарий, период, глубина.
|
|
|
|
|
|
```{r}
|
|
|
dynamic_layers = layers |>
|
|
|
filter(! dataset_id %in% c(constant_layers_ids, removed_layers_ids)) |>
|
|
|
separate_wider_delim(dataset_id, delim = "_", names = c("var", "scenario", "year_star", "year_end", "depth"), cols_remove = FALSE)
|
|
|
```
|
|
|
|
|
|
|
|
|
Создаём заготовку для формирования срезов. Пока включаем только пространственную составляющую.
|
|
|
|
|
|
```{r}
|
|
|
lon_range = c(bbox$xmin, bbox$xmax)
|
|
|
lat_range = c(bbox$ymin, bbox$ymax)
|
|
|
constraints = list(
|
|
|
longitude = lon_range,
|
|
|
latitude = lat_range
|
|
|
)
|
|
|
```
|
|
|
|
|
|
|
|
|
Загружаем данные о поверхности дна.
|
|
|
|
|
|
```{r}
|
|
|
terrain_raster = download_layers(
|
|
|
constant_layers$dataset_id[1],
|
|
|
constraints = constraints,
|
|
|
directory = "data/bio-oracle/terrain_characteristics"
|
|
|
)
|
|
|
# Переименовываем слои в полные названия, чтобы потом поля понятно назывались
|
|
|
names(terrain_raster) = longnames(terrain_raster)
|
|
|
```
|
|
|
|
|
|
|
|
|
Формируем функцию загрузки среза данных. Аргументами являются сценарий климатического развития и декада. Параметры охвата беруться из констант (см. `lon_range`, `lat_range`).
|
|
|
|
|
|
```{r}
|
|
|
download_slice = function(scenario_value, decade_start) {
|
|
|
scenario_layers = dynamic_layers |>
|
|
|
filter(scenario == scenario_value)
|
|
|
|
|
|
time_point = paste0(decade_start, "-01-01T00:00:00Z")
|
|
|
|
|
|
slice_constraints = list(
|
|
|
time = c(time_point, time_point),
|
|
|
longitude = lon_range,
|
|
|
latitude = lat_range
|
|
|
)
|
|
|
|
|
|
download_dir = file.path("./data/bio-oracle", scenario_value, decade_start)
|
|
|
dir.create(download_dir, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
|
|
slice_rasters = sapply(
|
|
|
scenario_layers$dataset_id,
|
|
|
function(id) download_layers(
|
|
|
id,
|
|
|
constraints = slice_constraints,
|
|
|
directory = download_dir
|
|
|
),
|
|
|
simplify = TRUE
|
|
|
)
|
|
|
|
|
|
return(slice_rasters)
|
|
|
}
|
|
|
```
|
|
|
|
|
|
|
|
|
Выполняем загрузку среза на текущую декаду. Он будет использоваться в качестве контекстных данных для обучения.
|
|
|
|
|
|
```{r}
|
|
|
baseline_rasters = download_slice("baseline", 2010)
|
|
|
```
|
|
|
|
|
|
### IUCN
|
|
|
<!-- Publish dataset -->
|
|
|
|
|
|
```{r}
|
|
|
mammals_range = vect("data/iucn/MAMMALS_MARINE_ONLY.shp")
|
|
|
```
|
|
|
|
|
|
|
|
|
Загружаем данные об ареалах видов в пределах озвата Баренцевого моря.
|
|
|
|
|
|
#### Crop to the sea
|
|
|
|
|
|
```{r}
|
|
|
# mammals_range_barentsz = mammals_range |>
|
|
|
# crop(bbox)
|
|
|
|
|
|
# writeVector(mammals_range_barentsz, "data/iunc/mammals_barentsz.shp")
|
|
|
|
|
|
mammals_range_barentsz = vect("./data/iucn/mammals_barentsz.shp")
|
|
|
```
|
|
|
|
|
|
## Transform
|
|
|
|
|
|
Приводим загруженные данные к виду, пригодному для машинного обучения
|
|
|
|
|
|
### Bio-Oracle
|
|
|
|
|
|
Чтобы в df потом были нормальные названия колонок
|
|
|
```{r}
|
|
|
baseline_brick = rast(baseline_rasters)
|
|
|
|
|
|
baseline_brick_depths = baseline_brick |>
|
|
|
names() |>
|
|
|
str_extract("depth[:alpha:]+")
|
|
|
baseline_brick_longnames = baseline_brick |> longnames()
|
|
|
# baseline_brick_varnames = baseline_brick |> varnames() // краткая запись называний слоёв
|
|
|
# Человекочитаемые названия слоёв -> полей в датафрейме (см. следующий блок)
|
|
|
names(baseline_brick) = paste(baseline_brick_longnames, baseline_brick_depths)
|
|
|
```
|
|
|
|
|
|
|
|
|
Трансформируем растр в датафрейм. Сохраняем идентификаторы ячеек для надёжного установления соответствия между контекстными и целевыми данными.
|
|
|
|
|
|
```{r}
|
|
|
baseline_df = c(baseline_brick, terrain_raster) |>
|
|
|
as.data.frame(cells = TRUE)
|
|
|
```
|
|
|
|
|
|
### IUCN
|
|
|
|
|
|
Трансформируем данные об ареалах в растр с параметрами, аналогичными растрам параметров местообитаний.
|
|
|
|
|
|
```{r}
|
|
|
template_raster = baseline_rasters[[1]]
|
|
|
orca_range = mammals_range_barentsz |>
|
|
|
subset(mammals_range_barentsz$sci_name == "Orcinus orca")
|
|
|
|
|
|
orca_range_boundaries_buffer = orca_range |>
|
|
|
as.lines() |>
|
|
|
buffer(50000) |>
|
|
|
rasterize(template_raster, 1)
|
|
|
|
|
|
orca_range_raster = orca_range |>
|
|
|
rasterize(template_raster, "", background=0) |>
|
|
|
mask(orca_range_boundaries_buffer, inverse = TRUE)
|
|
|
|
|
|
plot(orca_range_raster)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
orca_df = orca_range_raster |>
|
|
|
as.data.frame(cells = TRUE) |>
|
|
|
mutate(target = factor(if_else(layer == 0, "0", "1"), levels = c("0", "1"))) |>
|
|
|
select(-layer)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
mammals_brick = mammals_range_barentsz$sci_name |>
|
|
|
sapply(function(sci_name) {
|
|
|
mammals_range_barentsz |>
|
|
|
subset(mammals_range_barentsz$sci_name == sci_name) |>
|
|
|
rasterize(baseline_rasters[[1]], field="sci_name")
|
|
|
}) |>
|
|
|
rast()
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
plot(mammals_brick$`Orcinus orca`)
|
|
|
```
|
|
|
|
|
|
|
|
|
Трансформируем растр ареалов животных в датафрейм с идентификаторами ячеек. Так как параметры растров аналогичны, можно установить однозначное соответствие между ячейкой ареала и параметрами местообитания в данной ячейке.
|
|
|
|
|
|
```{r}
|
|
|
mammals_df = mammals_brick |> as.data.frame(cells = TRUE)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
m_r = ifel(is.na(mammals_brick), 0, 1)
|
|
|
mammals_df2 = m_r |>
|
|
|
as.data.frame(xy = TRUE, cells = TRUE)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
# mammals_sf = mammals_df2
|
|
|
# st_as_sf(coords = c("x", "y"), crs = 4326)
|
|
|
# b = cv_spatial(
|
|
|
# x = mammals_sf,
|
|
|
# column = "Orcinus orca",
|
|
|
# # r = baseline_brick,
|
|
|
# # k = 3,
|
|
|
# size = 500000,
|
|
|
# # hexagon = TRUE,
|
|
|
# # selection = "random"
|
|
|
# )
|
|
|
# train_cells = mammals_sf[b$folds_list[[1]][[1]], ]$cell
|
|
|
# test_cells = mammals_sf[b$folds_list[[1]][[2]], ]$cell
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
train_cells = mammals_df2 |>
|
|
|
select(cell, x, y) |>
|
|
|
filter(y > 75) |>
|
|
|
pull(cell)
|
|
|
test_cells = mammals_df2 |>
|
|
|
select(cell, x, y) |>
|
|
|
filter(y < 75) |>
|
|
|
pull(cell)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
cv_plot(
|
|
|
cv = b, # a blockCV object
|
|
|
x = mammals_sf, # sample points
|
|
|
)
|
|
|
```
|
|
|
|
|
|
<!-- Выбрать только виды, у которых присутсвие меньше 80% -->
|
|
|
|
|
|
## Learn
|
|
|
|
|
|
Learn what conditions does animals prefer
|
|
|
|
|
|
```{r}
|
|
|
learn = function(species_name, hyperparam) {
|
|
|
# Подготовка данных
|
|
|
species_df = mammals_df |>
|
|
|
select(cell, all_of(species_name)) |>
|
|
|
rename(target = species_name)
|
|
|
|
|
|
baseline_species_df = baseline_df |>
|
|
|
left_join(species_df, by = "cell") |>
|
|
|
mutate(target = factor(if_else(is.na(target), "0", "1"), levels = c("0", "1")))# |>
|
|
|
#select(-cell)
|
|
|
|
|
|
# Разделение выборки на обучающую и тестовую
|
|
|
# set.seed(123)
|
|
|
# train_index = createDataPartition(baseline_species_df$target, p = 0.8, list = FALSE)
|
|
|
|
|
|
train_df = baseline_species_df |>
|
|
|
filter(cell %in% train_cells)
|
|
|
train_features = train_df %>% select(-cell, -target) # параметры
|
|
|
train_labels = train_df$target # наличие ареала
|
|
|
train_pool = catboost.load_pool(data = train_features, label = train_labels)
|
|
|
|
|
|
test_df = baseline_species_df |>
|
|
|
filter(cell %in% test_cells)
|
|
|
test_features = test_df %>% select(-cell, -target)
|
|
|
test_labels = test_df$target
|
|
|
test_pool = catboost.load_pool(data = test_features, label = test_labels)
|
|
|
|
|
|
|
|
|
# Обучение
|
|
|
model = catboost.train(train_pool, test_pool = test_pool, params = hyperparam)
|
|
|
|
|
|
return(model)
|
|
|
}
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
baseline_orca_df = baseline_df |>
|
|
|
left_join(orca_df, by = "cell") |>
|
|
|
filter(!is.na(target))
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
learn_orca = function(species_name, hyperparam) {
|
|
|
baseline_species_df = baseline_df |>
|
|
|
left_join(orca_df, by = "cell") |>
|
|
|
filter(!is.na(target))
|
|
|
|
|
|
train_df = baseline_species_df |>
|
|
|
filter(cell %in% train_cells)
|
|
|
train_features = train_df %>% select(-cell, -target) # параметры
|
|
|
train_labels = train_df$target # наличие ареала
|
|
|
train_pool = catboost.load_pool(data = train_features, label = train_labels)
|
|
|
|
|
|
test_df = baseline_species_df |>
|
|
|
filter(cell %in% test_cells)
|
|
|
test_features = test_df %>% select(-cell, -target)
|
|
|
test_labels = test_df$target
|
|
|
test_pool = catboost.load_pool(data = test_features, label = test_labels)
|
|
|
|
|
|
|
|
|
# Обучение
|
|
|
model = catboost.train(train_pool, test_pool = test_pool, params = hyperparam)
|
|
|
|
|
|
return(model)
|
|
|
}
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
fit_params <- list(
|
|
|
iterations = 100,
|
|
|
learning_rate = 0.03,
|
|
|
depth = 6,
|
|
|
loss_function = 'Logloss', # Standard for binary classification
|
|
|
eval_metric = 'AUC', # Good metric for imbalanced data
|
|
|
random_seed = 42,
|
|
|
verbose = 10 # Print progress every 100 iterations
|
|
|
# od_type = "Iter", # Optional: Early stopping
|
|
|
# od_wait = 50
|
|
|
)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
m_orca = learn("Orcinus orca", fit_params)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
m_orca_2 = learn_orca("Orcinus orca", fit_params)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
preds_prob <- catboost.predict(m_orca, pool_test, prediction_type = 'Probability')
|
|
|
preds_class <- ifelse(preds_prob > 0.5, 1, 0)
|
|
|
a = confusionMatrix(factor(preds_class), factor(labels_test))
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
importance <- catboost.get_feature_importance(model, pool = pool_train)
|
|
|
```
|
|
|
|
|
|
|
|
|
## Predict
|
|
|
|
|
|
Predict changes in habitat ranges using different scenarios
|
|
|
|
|
|
```{r}
|
|
|
ssp585_2090_rasters = download_slice("ssp585", 2090)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
ssp585_2090_brick = rast(ssp585_2090_rasters)
|
|
|
ssp585_2090_brick_depths = ssp585_2090_brick |>
|
|
|
names() |>
|
|
|
str_extract("depth[:alpha:]+")
|
|
|
ssp585_2090_brick_longnames = ssp585_2090_brick |> longnames()
|
|
|
# baseline_brick_varnames = baseline_brick |> varnames() // коды longnames
|
|
|
|
|
|
names(ssp585_2090_brick) = paste(ssp585_2090_brick_longnames, ssp585_2090_brick_depths)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
ssp585_2090_df = c(ssp585_2090_brick, terrain_raster) |>
|
|
|
as.data.frame(cells = TRUE)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
ssp585_2090_features = ssp585_2090_df |> select(-cell)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
ssp585_2090_pool <- catboost.load_pool(data = ssp585_2090_features)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
preds_prob <- catboost.predict(model, ssp585_2090_pool, prediction_type = 'Probability')
|
|
|
preds_class <- ifelse(preds_prob > 0.5, 1, 0)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
ssp585_2090_prediction = ssp585_2090_df |>
|
|
|
mutate(prediction = preds_class) |>
|
|
|
select(cell, prediction)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
a = orca_df |>
|
|
|
left_join(ssp585_2090_prediction, by = "cell") |>
|
|
|
mutate(target = if_else(is.na(target), 0, 1)) |>
|
|
|
mutate(diff = prediction - target)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
hist(a$diff)
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
# Select only your numeric predictors (environmental layers)
|
|
|
# Calculate correlation matrix
|
|
|
|
|
|
cor_matrix <- baseline_df |>
|
|
|
select(-cell, -landmass, -coastline) |>
|
|
|
cor()
|
|
|
|
|
|
# Find attributes that are highly corrected (ideal cutoff is debatable, 0.85 or 0.9 is common)
|
|
|
high_cor_features <- findCorrelation(cor_matrix, cutoff = 0.9)
|
|
|
|
|
|
# Print names of features suggested for removal
|
|
|
print(colnames(baseline_df)[high_cor_features])
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
r = rast(baseline_brick)
|
|
|
r[a$cell] = a$diff
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
values(r) = NA
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
plot(r$`Long-term maximum AirTemperature depthsurf`)
|
|
|
```
|
|
|
|