source("setup.R", echo = FALSE)
suppressPackageStartupMessages(library(tidysdm))
= get_bb(form = 'polygon')
bb = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>
coast ::st_geometry()
sf= read_obis() |>
x ::filter(date >= as.Date("2000-01-01")) dplyr
Exploring tidysdm
Here we try our hand with the tidysdm, which is a Species Distribution Modeling (“sdm”) wrapper around and extension of the tidymodels suite of R packages. We’ll walk through the steps shown in tidysdm’s overview vignette using our Mola mola
dataset. We already have a good sense of our obseravtions and covariates so we can skip the data preparation steps.
1 Special installation
First we need to install a developmental version of tidysdm. The dev
branch is subect to a pull request; if it is accepted into the upstream repos then it will work its way into a CRAN release.
remotes::install_github("BigelowLab/tidysdm", ref = "dev")
2 Thinning the observations
Here we thin the data. The idea of thinning is to have one presence point per cell of the target raster output. For this purpose we’ll need not just the observation data, but also a raster of the desired extent and resolution. We have two rasterized covariate datasets we can load in, and then use one or the other as our template.
First, the observations…
And now the covariates…
= "data/oisst"
sst_path = oisster::read_database(sst_path) |>
sst_db ::arrange(date)
dplyr
= "data/nbs"
wind_path = nbs::read_database(wind_path) |>
wind_db ::arrange(date)
dplyr
= read_predictors(sst_db = sst_db,
preds windspeed_db = wind_db |> dplyr::filter(param == "windspeed"),
u_wind_db = wind_db |> dplyr::filter(param == "u_wind"),
v_wind_db = wind_db |> dplyr::filter(param == "v_wind"))
We’ll take the first slice of sst as a template and convert it into a mask.
= dplyr::slice(preds['sst'], "time", 1) |>
mask ::set_names("mask")|>
rlang::mutate(mask = factor(c("mask", NA_character_)[as.numeric(is.na(mask) + 1)],
dplyrlevels = "mask"))
plot(mask, breaks = "equal", axes = TRUE, reset = FALSE)
plot(sf::st_geometry(x), pch = "+", add = TRUE)
plot(coast, col = "orange", add = TRUE)
Now we can thin using thin_by_cell()
. You can see the number of observations is greatly winnowed.
set.seed(1234)
<- tidysdm::thin_by_cell(x, raster = mask)
thinx plot(mask, breaks = "equal", axes = TRUE, reset = FALSE)
plot(sf::st_geometry(thinx), pch = "+", add = TRUE)
plot(coast, col = "orange", add = TRUE)
Next is to thin again by separation distance.
<- tidysdm::thin_by_dist(thinx, dist_min = km2m(20))
thinx plot(mask, breaks = "equal", axes = TRUE, reset = FALSE)
plot(sf::st_geometry(thinx), pch = "+", add = TRUE)
plot(coast, col = "orange", add = TRUE)
3 Sampling background
Here we depart a little from the original workflow to use the full compliment of observations to generate a sampling bias map. We diverge from the original workflow because we have no observations of similar species”- Mola mola is unique.
First we develop a sampling density map to guide the backgroud selection if there is a sampling bias.
= st_as_sf(mask)
mask_vec = dplyr::mutate(x, ID = 1) |>
y ::select(ID)
dplyr<- aggregate(y, mask_vec, FUN = length)
agg = st_rasterize(agg)
background_raster = terra::map.pal("viridis", 50)
pal plot(background_raster,
col = pal,
nbreaks = length(pal) + 1,
breaks = "equal",
reset = FALSE)
plot(coast, col = "orange", add = TRUE)
Ahhh, as you might have surmised, there are some places where observations occur more frequently than in other places (bias!) Now we select random points using this weighted map to guide the background point selection.
set.seed(1234)
<- tidysdm::sample_background(
input data = thinx,
raster = background_raster,
n = 3 * nrow(thinx),
method = "bias",
class_label = "background",
return_pres = TRUE) |>
::mutate(time = lubridate::NA_Date_, .after = 1)
dplyr<- input$class == "presence"
ix $time[ix] <- oisster::current_month(thinx$date) input
plot(input['class'], pch = 1, cex = 0.2,reset = FALSE, axes = TRUE)
plot(coast, col = "orange", add = TRUE)
Next we need to think about sampling through time. We’ll borrow from ?@sec-introduction to make a weighted time sample.
set.seed(1234)
= dplyr::count(sf::st_drop_geometry(input), class) |>
nback ::filter(class == "presence") |>
dplyr::pull(n)
dplyr= sample_time(x$date,
days_sample size = nback,
by = "month",
replace = TRUE,
weighted = TRUE)
# recall ix is the logical identifying the class "presence"
$time[!ix] <- days_sample input
4 Extract points data from covariates
It’s easy to extract point data from raster.
= stars::st_extract(preds, at = input, time_column = "time") |>
input_data ::st_as_sf() |>
sf::as_tibble() |>
dplyr::select(dplyr::all_of(names(preds)))
dplyr= dplyr::bind_cols(input, input_data) |>
input ::glimpse() dplyr
Rows: 1,452
Columns: 7
$ class <fct> presence, presence, presence, presence, presence, presence, …
$ time <date> 2012-06-01, 2016-08-01, 2017-11-01, 2003-07-01, 2018-11-01,…
$ geometry <POINT [°]> POINT (-66.60798 42.2943), POINT (-68.00023 42.60569),…
$ sst <dbl> 12.45633, 20.16968, 14.89867, 18.12452, 16.69533, 27.77900, …
$ windspeed <dbl> 4.911994, 4.860674, 8.411347, 4.308283, 8.377000, 7.495667, …
$ u_wind <dbl> -0.27018008, 1.58849359, 2.05362248, 1.27765775, 0.86364067,…
$ v_wind <dbl> -0.54305273, 0.75563574, -0.73934704, 2.79159832, -1.1114074…
Now let’s look at the relationships among these predicitors. First a distribution plot.
|>
input ::select(dplyr::all_of(c("class", names(preds)))) |>
dplyr::plot_pres_vs_bg(class) tidysdm
We can then compute a distance metric that measures the “distance” between presence and background records for a given covariate. The bigger the distance the more suitable the covariate is for modeling. Just what consitutes enough distance is harder to know.
|>
input ::select(dplyr::all_of(c("class", names(preds)))) |>
dplyr::dist_pres_vs_bg(class) tidysdm
sst v_wind u_wind windspeed
0.28953807 0.11841485 0.10805954 0.07949534
We can alos look into correlations between the covariates. Except for windspeed
and u_wind
(the east-west component of wind) there isn’t any obvious correlation.
pairs(preds)
We can also deploy a filtering function that will indentify the uncorrelated covariates (and the correlated ones, too!) Noe that none are recommened for removal, so we proceed with all four. If any had been flagged for removal we would do just that.
<- filter_collinear(preds, cutoff = 0.7, method = "cor_caret")
vars_uncor vars_uncor
[1] "windspeed" "u_wind" "v_wind" "sst"
attr(,"to_remove")
character(0)
5 Fit the model by cross-validation
= dplyr::select(input, dplyr::all_of(c("class", names(preds))))
model_input <- recipes::recipe(model_input, formula = class ~ .)
rec rec
── Recipe ──────────────────────────────────────────────────────────────────────
── Inputs
Number of variables by role
outcome: 1
predictor: 4
coords: 2
%>% check_sdm_presence(class) model_input
[1] TRUE
<-
models # create the workflow_set
workflow_set(
preproc = list(default = rec),
models = list(
# the standard glm specs
glm = sdm_spec_glm(),
# rf specs with tuning
rf = sdm_spec_rf(),
# boosted tree model (gbm) specs with tuning
gbm = sdm_spec_boost_tree(),
# maxent specs with tuning
maxent = sdm_spec_maxent()
),# make all combinations of preproc and models,
cross = TRUE
%>%
) # tweak controls to store information needed later to create the ensemble
option_add(control = control_ensemble_grid())
set.seed(100)
<- spatial_block_cv(data = model_input, v = 3, n = 5)
input_cv autoplot(input_cv)
set.seed(1234567)
<-
models %>%
models workflow_map("tune_grid",
resamples = input_cv, grid = 3,
metrics = sdm_metric_set(), verbose = TRUE
)
i No tuning parameters. `fit_resamples()` will be attempted
i 1 of 4 resampling: default_glm
✔ 1 of 4 resampling: default_glm (293ms)
i 2 of 4 tuning: default_rf
i Creating pre-processing data to finalize unknown parameter: mtry
✔ 2 of 4 tuning: default_rf (2.9s)
i 3 of 4 tuning: default_gbm
i Creating pre-processing data to finalize unknown parameter: mtry
✔ 3 of 4 tuning: default_gbm (7.2s)
i 4 of 4 tuning: default_maxent
✔ 4 of 4 tuning: default_maxent (1.9s)
autoplot(models)