<- psp |>
psp ::mutate(classification = psptools::recode_classification(.data$total_toxicity, c(0,10,30,80))) dplyr
Building Training Samples
A look inside make_image_list()
In order to forecast unevenly observed data, we’ll need to group consecutive observations together to create a training set of labeled “samples”.
First, we’ll start with a table of observations. The table needs to have a date column, and we’ll add one for year.
Steps:
Add a classification for each row
glimpse(psp)
Rows: 11,695
Columns: 21
$ id <chr> "DMC_2019-06-11_mytilus", "Isleboro_2019-07-24_mytilus"…
$ location_id <chr> "DMC", "Isleboro", "MIDGIx", "PE3M", "PEN BB", "PEN FIX…
$ species <chr> "mytilus", "mytilus", "mytilus", "mytilus", "mytilus", …
$ total_toxicity <dbl> 137.9188000, 0.0000000, 14.9866900, 0.0000000, 0.000000…
$ gtx4 <dbl> 18.70973, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ gtx1 <dbl> 37.792166, 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
$ dcgtx3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ gtx5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ dcgtx2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ gtx3 <dbl> 24.6304364, 0.0000000, 8.5274400, 0.0000000, 0.0000000,…
$ gtx2 <dbl> 20.63097, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ neo <dbl> 21.01449, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ dcstx <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ stx <dbl> 15.140990, 0.000000, 6.459252, 0.000000, 0.000000, 0.00…
$ c1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ c2 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ date <date> 2019-06-11, 2019-07-24, 2019-06-24, 2019-07-18, 2019-0…
$ year <chr> "2019", "2019", "2019", "2019", "2019", "2019", "2019",…
$ region <chr> "maine", "maine", "maine", "maine", "maine", "maine", "…
$ gap_days <dbl> 0, 0, 21, 0, 0, 0, 0, 7, 0, 7, 0, 0, 0, 7, 7, 0, 6, 7, …
$ classification <dbl> 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 1…
summary(psp$classification)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.3431 0.0000 3.0000
Normalize the input columns to put them all on the same scale
need to remove environmentals field from functions - maybe change toxins to predictors?
<- c("gtx4","gtx1","dcgtx3","gtx5","dcgtx2","gtx3","gtx2","neo","dcstx","stx","c1","c2") toxins
<- psp |>
psp ::normalize_input(toxins, environmentals=c())
psptools
summary(psp)
id location_id species total_toxicity
Length:11695 Length:11695 Length:11695 Min. : 0.0000
Class :character Class :character Class :character 1st Qu.: 0.0000
Mode :character Mode :character Mode :character Median : 0.8584
Mean : 16.4425
3rd Qu.: 6.6318
Max. :3092.8320
date year region gap_days
Min. :2014-04-01 Length:11695 Length:11695 Min. : 0.000
1st Qu.:2015-07-20 Class :character Class :character 1st Qu.: 6.000
Median :2017-07-16 Mode :character Mode :character Median : 7.000
Mean :2018-05-10 Mean : 6.838
3rd Qu.:2020-07-27 3rd Qu.: 7.000
Max. :2025-04-02 Max. :104.000
classification gtx4 gtx1 dcgtx3
Min. :0.0000 Min. :0.000000 Min. :0.000000 Min. :0.000000
1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
Median :0.0000 Median :0.000000 Median :0.000000 Median :0.000000
Mean :0.3431 Mean :0.004992 Mean :0.004707 Mean :0.001454
3rd Qu.:0.0000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
Max. :3.0000 Max. :1.000000 Max. :1.000000 Max. :1.000000
gtx5 dcgtx2 gtx3 gtx2
Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000
Median :0.0000000 Median :0.000000 Median :0.0007194 Median :0.000000
Mean :0.0009571 Mean :0.002252 Mean :0.0046837 Mean :0.003984
3rd Qu.:0.0000000 3rd Qu.:0.000000 3rd Qu.:0.0028843 3rd Qu.:0.000000
Max. :1.0000000 Max. :1.000000 Max. :1.0000000 Max. :1.000000
neo dcstx stx c1
Min. :0.000000 Min. :0.00e+00 Min. :0.000000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.000000 1st Qu.:0.000000
Median :0.000000 Median :0.00e+00 Median :0.000000 Median :0.000000
Mean :0.003551 Mean :9.68e-05 Mean :0.004118 Mean :0.002771
3rd Qu.:0.000000 3rd Qu.:0.00e+00 3rd Qu.:0.000000 3rd Qu.:0.000000
Max. :1.000000 Max. :1.00e+00 Max. :1.000000 Max. :1.000000
c2
Min. :0.000000
1st Qu.:0.000000
Median :0.000000
Mean :0.005855
3rd Qu.:0.000000
Max. :1.000000
Group by location and year
For the purpose of this demo, we’ll just work on one group
Add a gap column - the number of days between consecutive samples in the same season
Check the gap_status of each sample (Is the gap of days since the last sample between the minimum and maximum defined in the configuration?)
<- psp |>
g filter(location_id == "PSP12.28" & year == 2014) |>
compute_gap() |>
mutate(meets_gap = psptools::check_gap(.data$gap_days, minimum_gap=4, maximum_gap=10))
g
# A tibble: 28 × 22
id location_id species total_toxicity date year region gap_days
<chr> <chr> <chr> <dbl> <date> <chr> <chr> <dbl>
1 PSP12.28… PSP12.28 mytilus 5.33 2014-04-04 2014 maine 0
2 PSP12.28… PSP12.28 mytilus 0.565 2014-04-08 2014 maine 4
3 PSP12.28… PSP12.28 mytilus 8.82 2014-04-15 2014 maine 7
4 PSP12.28… PSP12.28 mytilus 12.9 2014-04-22 2014 maine 7
5 PSP12.28… PSP12.28 mytilus 18.1 2014-04-28 2014 maine 6
6 PSP12.28… PSP12.28 mytilus 7.86 2014-05-06 2014 maine 8
7 PSP12.28… PSP12.28 mytilus 15.3 2014-05-12 2014 maine 6
8 PSP12.28… PSP12.28 mytilus 22.0 2014-05-19 2014 maine 7
9 PSP12.28… PSP12.28 mytilus 148. 2014-05-27 2014 maine 8
10 PSP12.28… PSP12.28 mytilus 492. 2014-06-02 2014 maine 6
# ℹ 18 more rows
# ℹ 14 more variables: classification <dbl>, gtx4 <dbl>, gtx1 <dbl>,
# dcgtx3 <dbl>, gtx5 <dbl>, dcgtx2 <dbl>, gtx3 <dbl>, gtx2 <dbl>, neo <dbl>,
# dcstx <dbl>, stx <dbl>, c1 <dbl>, c2 <dbl>, meets_gap <lgl>
For each location-year subset
<- 2
n_steps <- 1 forecast_steps
Find how many samples can be made (n_batches) List the samples indices (batches)
<- n_batches(nrow(g), (n_steps+forecast_steps))
nb
nb
[1] 26
<- compute_batches(nb, (n_steps+forecast_steps))
batches
1:4] batches[
[[1]]
[1] 1 2 3
[[2]]
[1] 2 3 4
[[3]]
[1] 3 4 5
[[4]]
[1] 4 5 6
For each batch
Slice out the rows for the image
Check gap status of each row - if any don’t meet the requirement (besides the first), move on to the next batch
<- batches[[1]]
batch batch
[1] 1 2 3
<- g |>
b ::slice(batch)
dplyr b
# A tibble: 3 × 22
id location_id species total_toxicity date year region gap_days
<chr> <chr> <chr> <dbl> <date> <chr> <chr> <dbl>
1 PSP12.28_… PSP12.28 mytilus 5.33 2014-04-04 2014 maine 0
2 PSP12.28_… PSP12.28 mytilus 0.565 2014-04-08 2014 maine 4
3 PSP12.28_… PSP12.28 mytilus 8.82 2014-04-15 2014 maine 7
# ℹ 14 more variables: classification <dbl>, gtx4 <dbl>, gtx1 <dbl>,
# dcgtx3 <dbl>, gtx5 <dbl>, dcgtx2 <dbl>, gtx3 <dbl>, gtx2 <dbl>, neo <dbl>,
# dcstx <dbl>, stx <dbl>, c1 <dbl>, c2 <dbl>, meets_gap <lgl>
Select the predictor variable columns, and make them into a matrix (2 dimensions - predictors/columns and number of weeks/rows)
<- as.matrix(dplyr::ungroup(b) |>
i ::select(dplyr::all_of(toxins)))
dplyrround(i,3)
gtx4 gtx1 dcgtx3 gtx5 dcgtx2 gtx3 gtx2 neo dcstx stx c1 c2
[1,] 0 0.00 0 0 0 0.006 0 0 0 0 0 0.000
[2,] 0 0.00 0 0 0 0.001 0 0 0 0 0 0.000
[3,] 0 0.01 0 0 0 0.001 0 0 0 0 0 0.006
Store the image, label and metadata in a list
list(status=TRUE,
year = "2014",
location_id = b$location_id[1],
species = b$species[1],
classification = b$classification[n_steps+forecast_steps],
toxicity = b$total_toxicity[n_steps+forecast_steps],
date = b$date[n_steps],
image = i[1:n_steps,])
$status
[1] TRUE
$year
[1] "2014"
$location_id
[1] "PSP12.28"
$species
mytilus
"mytilus"
$classification
[1] 0
$toxicity
[1] 8.821341
$date
[1] "2014-04-08"
$image
gtx4 gtx1 dcgtx3 gtx5 dcgtx2 gtx3 gtx2 neo dcstx stx c1 c2
[1,] 0 0 0 0 0 0.0062662855 0 0 0 0 0 0
[2,] 0 0 0 0 0 0.0006640863 0 0 0 0 0 0