<- psp |>
psp mutate(year = format(date, format="%Y")) |>
filter(species == "mytilus")
Building Training Samples
A look inside make_image_list()
In order to forecast unevenly observed data, we’ll need to group consecutive observations together to create a training set of labeled “samples”.
First, we’ll start with a table of observations. The table needs to have a date column, and we’ll add one for year.
Steps:
Add a gap column - the number of days between consecutive samples in the same season
<- psp |>
psp ::compute_gap() psptools
Add a classification for each row and check gap requirement
<- psp |>
psp ::mutate(classification = psptools::recode_classification(.data$total_toxicity, c(0,10,30,80)),
dplyrmeets_gap = psptools::check_gap(.data$gap_days, minimum_gap=4, maximum_gap=10))
glimpse(psp)
Rows: 11,229
Columns: 24
$ id <chr> "DMC_2019-06-11_mytilus", "Isleboro_2019-07-24_mytilus"…
$ location_id <chr> "DMC", "Isleboro", "MIDGIx", "PE3M", "PEN BB", "PEN FIX…
$ date <date> 2019-06-11, 2019-07-24, 2019-06-24, 2019-07-18, 2019-0…
$ species <chr> "mytilus", "mytilus", "mytilus", "mytilus", "mytilus", …
$ sampleid <chr> "061119DMCmyt", "072419Isleboro myt", "062419MIDGIxmyt"…
$ total_toxicity <dbl> 137.9188000, 0.0000000, 14.9866900, 0.0000000, 0.000000…
$ lat <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 43.09612, 43.10…
$ lon <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -70.76781, -70.…
$ gtx4 <dbl> 18.70973, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ gtx1 <dbl> 37.792166, 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
$ dcgtx3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ gtx5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ dcgtx2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ gtx3 <dbl> 24.6304364, 0.0000000, 8.5274400, 0.0000000, 0.0000000,…
$ gtx2 <dbl> 20.63097, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ neo <dbl> 21.01449, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ dcstx <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ stx <dbl> 15.140990, 0.000000, 6.459252, 0.000000, 0.000000, 0.00…
$ c1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ c2 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ year <chr> "2019", "2019", "2019", "2019", "2019", "2019", "2019",…
$ gap_days <dbl> 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 0, 0, 7, 7, 1, 6, 7, 7…
$ classification <dbl> 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 1…
$ meets_gap <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, …
summary(psp$gap_days)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 6.000 7.000 6.965 7.000 96.000
summary(psp$classification)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.3405 0.0000 3.0000
Normalize the input columns to put them all on the same scale
need to remove environmentals field from functions - maybe change toxins to predictors?
<- c("gtx4","gtx1","dcgtx3","gtx5","dcgtx2","gtx3","gtx2","neo","dcstx","stx","c1","c2") toxins
<- psp |>
psp ::normalize_input(toxins, environmentals=c())
psptools
summary(psp)
id location_id date species
Length:11229 Length:11229 Min. :2014-04-01 Length:11229
Class :character Class :character 1st Qu.:2015-07-13 Class :character
Mode :character Mode :character Median :2017-06-27 Mode :character
Mean :2018-02-05
3rd Qu.:2020-05-20
Max. :2024-06-10
sampleid total_toxicity lat lon
Length:11229 Min. : 0.000 Min. :43.10 Min. :-70.77
Class :character 1st Qu.: 0.000 1st Qu.:43.77 1st Qu.:-69.94
Mode :character Median : 0.810 Median :43.93 Median :-69.33
Mean : 16.664 Mean :44.10 Mean :-68.93
3rd Qu.: 6.252 3rd Qu.:44.52 3rd Qu.:-67.79
Max. :3092.832 Max. :44.97 Max. :-66.98
NA's :29 NA's :29
year gap_days classification meets_gap
Length:11229 Min. : 0.000 Min. :0.0000 Mode :logical
Class :character 1st Qu.: 6.000 1st Qu.:0.0000 FALSE:1815
Mode :character Median : 7.000 Median :0.0000 TRUE :9414
Mean : 6.965 Mean :0.3405
3rd Qu.: 7.000 3rd Qu.:0.0000
Max. :96.000 Max. :3.0000
gtx4 gtx1 dcgtx3 gtx5
Min. :0.00000 Min. :0.000000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000
Median :0.00000 Median :0.000000 Median :0.000000 Median :0.0000000
Mean :0.00512 Mean :0.004778 Mean :0.001515 Mean :0.0009969
3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.0000000
Max. :1.00000 Max. :1.000000 Max. :1.000000 Max. :1.0000000
dcgtx2 gtx3 gtx2 neo
Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000
Median :0.000000 Median :0.0006825 Median :0.000000 Median :0.000000
Mean :0.002345 Mean :0.0047486 Mean :0.003967 Mean :0.003653
3rd Qu.:0.000000 3rd Qu.:0.0028490 3rd Qu.:0.000000 3rd Qu.:0.000000
Max. :1.000000 Max. :1.0000000 Max. :1.000000 Max. :1.000000
dcstx stx c1 c2
Min. :0.00e+00 Min. :0.000000 Min. :0.000000 Min. :0.00000
1st Qu.:0.00e+00 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000
Median :0.00e+00 Median :0.000000 Median :0.000000 Median :0.00000
Mean :9.53e-05 Mean :0.004128 Mean :0.002749 Mean :0.00608
3rd Qu.:0.00e+00 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.00000
Max. :1.00e+00 Max. :1.000000 Max. :1.000000 Max. :1.00000
Group by location and year
For the purpose of this demo, we’ll just work on one group
<- psp |>
g filter(location_id == "PSP12.28" & year == 2014)
g
# A tibble: 28 × 24
id location_id date species sampleid total_toxicity lat lon
<chr> <chr> <date> <chr> <chr> <dbl> <dbl> <dbl>
1 PSP12.28_… PSP12.28 2014-04-04 mytilus 040414P… 5.33 43.8 -69.9
2 PSP12.28_… PSP12.28 2014-04-08 mytilus 040814P… 0.565 43.8 -69.9
3 PSP12.28_… PSP12.28 2014-04-15 mytilus 041514P… 8.82 43.8 -69.9
4 PSP12.28_… PSP12.28 2014-04-22 mytilus 042214P… 12.9 43.8 -69.9
5 PSP12.28_… PSP12.28 2014-04-28 mytilus 042814P… 18.1 43.8 -69.9
6 PSP12.28_… PSP12.28 2014-05-06 mytilus 050614P… 7.86 43.8 -69.9
7 PSP12.28_… PSP12.28 2014-05-12 mytilus 051214P… 15.3 43.8 -69.9
8 PSP12.28_… PSP12.28 2014-05-19 mytilus 051914P… 22.0 43.8 -69.9
9 PSP12.28_… PSP12.28 2014-05-27 mytilus 052714P… 148. 43.8 -69.9
10 PSP12.28_… PSP12.28 2014-06-02 mytilus 060214P… 492. 43.8 -69.9
# ℹ 18 more rows
# ℹ 16 more variables: year <chr>, gap_days <dbl>, classification <dbl>,
# meets_gap <lgl>, gtx4 <dbl>, gtx1 <dbl>, dcgtx3 <dbl>, gtx5 <dbl>,
# dcgtx2 <dbl>, gtx3 <dbl>, gtx2 <dbl>, neo <dbl>, dcstx <dbl>, stx <dbl>,
# c1 <dbl>, c2 <dbl>
For each location-year subset
<- 2
n_steps <- 1 forecast_steps
Find how many samples can be made (n_batches) List the samples indices (batches)
<- n_batches(nrow(g), (n_steps+forecast_steps))
nb
nb
[1] 26
<- compute_batches(nb, (n_steps+forecast_steps))
batches
1:4] batches[
[[1]]
[1] 1 2 3
[[2]]
[1] 2 3 4
[[3]]
[1] 3 4 5
[[4]]
[1] 4 5 6
For each batch
Slice out the rows for the image
Check gap status of each row - if any don’t meet the requirement (besides the first), move on to the next batch
<- batches[[1]]
batch batch
[1] 1 2 3
<- g |>
b ::slice(batch)
dplyr b
# A tibble: 3 × 24
id location_id date species sampleid total_toxicity lat lon year
<chr> <chr> <date> <chr> <chr> <dbl> <dbl> <dbl> <chr>
1 PSP1… PSP12.28 2014-04-04 mytilus 040414P… 5.33 43.8 -69.9 2014
2 PSP1… PSP12.28 2014-04-08 mytilus 040814P… 0.565 43.8 -69.9 2014
3 PSP1… PSP12.28 2014-04-15 mytilus 041514P… 8.82 43.8 -69.9 2014
# ℹ 15 more variables: gap_days <dbl>, classification <dbl>, meets_gap <lgl>,
# gtx4 <dbl>, gtx1 <dbl>, dcgtx3 <dbl>, gtx5 <dbl>, dcgtx2 <dbl>, gtx3 <dbl>,
# gtx2 <dbl>, neo <dbl>, dcstx <dbl>, stx <dbl>, c1 <dbl>, c2 <dbl>
Select the predictor variable columns, and make them into a matrix (2 dimensions - predictors/columns and number of weeks/rows)
<- as.matrix(dplyr::ungroup(b) |>
i ::select(dplyr::all_of(toxins)))
dplyrround(i,3)
gtx4 gtx1 dcgtx3 gtx5 dcgtx2 gtx3 gtx2 neo dcstx stx c1 c2
[1,] 0 0.00 0 0 0 0.006 0 0 0 0 0 0.000
[2,] 0 0.00 0 0 0 0.001 0 0 0 0 0 0.000
[3,] 0 0.01 0 0 0 0.001 0 0 0 0 0 0.006
Store the image, label and metadata in a list
list(status=TRUE,
year = "2014",
location_id = b$location_id[1],
classification = b$classification[n_steps+forecast_steps],
toxicity = b$total_toxicity[n_steps+forecast_steps],
date = b$date[n_steps],
image = i[1:n_steps,])
$status
[1] TRUE
$year
[1] "2014"
$location_id
[1] "PSP12.28"
$classification
[1] 0
$toxicity
[1] 8.821341
$date
[1] "2014-04-08"
$image
gtx4 gtx1 dcgtx3 gtx5 dcgtx2 gtx3 gtx2 neo dcstx stx c1 c2
[1,] 0 0 0 0 0 0.0062662855 0 0 0 0 0 0
[2,] 0 0 0 0 0 0.0006640863 0 0 0 0 0 0