Building Training Samples

A look inside make_image_list()

To forecast from unevenly spaced observations, we’ll need to group consecutive observations together to create a training set of labeled “samples”.

First, we’ll start with a table of observations. The table needs to have a date column, and we’ll add a year column derived from it, as sketched below.
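
Assuming year is simply the calendar year taken from the date column (stored as a character, matching the glimpse output below), it could be added like this:

psp <- psp |>
  dplyr::mutate(year = format(.data$date, "%Y"))  # character year derived from date (assumed approach)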

Steps:

Add a classification for each row

psp <- psp |> 
    dplyr::mutate(classification = psptools::recode_classification(.data$total_toxicity, c(0,10,30,80)))
glimpse(psp)
Rows: 11,695
Columns: 21
$ id             <chr> "DMC_2019-06-11_mytilus", "Isleboro_2019-07-24_mytilus"…
$ location_id    <chr> "DMC", "Isleboro", "MIDGIx", "PE3M", "PEN BB", "PEN FIX…
$ species        <chr> "mytilus", "mytilus", "mytilus", "mytilus", "mytilus", …
$ total_toxicity <dbl> 137.9188000, 0.0000000, 14.9866900, 0.0000000, 0.000000…
$ gtx4           <dbl> 18.70973, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ gtx1           <dbl> 37.792166, 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
$ dcgtx3         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ gtx5           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ dcgtx2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ gtx3           <dbl> 24.6304364, 0.0000000, 8.5274400, 0.0000000, 0.0000000,…
$ gtx2           <dbl> 20.63097, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ neo            <dbl> 21.01449, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ dcstx          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ stx            <dbl> 15.140990, 0.000000, 6.459252, 0.000000, 0.000000, 0.00…
$ c1             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ c2             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ date           <date> 2019-06-11, 2019-07-24, 2019-06-24, 2019-07-18, 2019-0…
$ year           <chr> "2019", "2019", "2019", "2019", "2019", "2019", "2019",…
$ region         <chr> "maine", "maine", "maine", "maine", "maine", "maine", "…
$ gap_days       <dbl> 0, 0, 21, 0, 0, 0, 0, 7, 0, 7, 0, 0, 0, 7, 7, 0, 6, 7, …
$ classification <dbl> 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 1…
summary(psp$classification)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.3431  0.0000  3.0000 
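
Based on the output above, the breakpoints c(0,10,30,80) map total_toxicity into four classes (0-3). A rough base-R analogue of that mapping (an assumption; recode_classification() may treat interval endpoints differently) is:

# hypothetical stand-in using cut(); interval closure may differ from recode_classification()
cut(c(5, 15, 50, 137.9),
    breaks = c(0, 10, 30, 80, Inf),
    labels = c(0, 1, 2, 3),
    include.lowest = TRUE, right = FALSE)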

Normalize the input columns to put them all on the same scale

(A note for future revisions: the environmentals argument may be removed from these functions, and toxins may be renamed to predictors.)

toxins <- c("gtx4","gtx1","dcgtx3","gtx5","dcgtx2","gtx3","gtx2","neo","dcstx","stx","c1","c2")
psp <- psp |>
  psptools::normalize_input(toxins, environmentals=c())

summary(psp)
      id            location_id          species          total_toxicity     
 Length:11695       Length:11695       Length:11695       Min.   :   0.0000  
 Class :character   Class :character   Class :character   1st Qu.:   0.0000  
 Mode  :character   Mode  :character   Mode  :character   Median :   0.8584  
                                                          Mean   :  16.4425  
                                                          3rd Qu.:   6.6318  
                                                          Max.   :3092.8320  
      date                year              region             gap_days      
 Min.   :2014-04-01   Length:11695       Length:11695       Min.   :  0.000  
 1st Qu.:2015-07-20   Class :character   Class :character   1st Qu.:  6.000  
 Median :2017-07-16   Mode  :character   Mode  :character   Median :  7.000  
 Mean   :2018-05-10                                         Mean   :  6.838  
 3rd Qu.:2020-07-27                                         3rd Qu.:  7.000  
 Max.   :2025-04-02                                         Max.   :104.000  
 classification        gtx4               gtx1              dcgtx3        
 Min.   :0.0000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.0000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
 Median :0.0000   Median :0.000000   Median :0.000000   Median :0.000000  
 Mean   :0.3431   Mean   :0.004992   Mean   :0.004707   Mean   :0.001454  
 3rd Qu.:0.0000   3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
 Max.   :3.0000   Max.   :1.000000   Max.   :1.000000   Max.   :1.000000  
      gtx5               dcgtx2              gtx3                gtx2         
 Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000  
 Median :0.0000000   Median :0.000000   Median :0.0007194   Median :0.000000  
 Mean   :0.0009571   Mean   :0.002252   Mean   :0.0046837   Mean   :0.003984  
 3rd Qu.:0.0000000   3rd Qu.:0.000000   3rd Qu.:0.0028843   3rd Qu.:0.000000  
 Max.   :1.0000000   Max.   :1.000000   Max.   :1.0000000   Max.   :1.000000  
      neo               dcstx               stx                 c1          
 Min.   :0.000000   Min.   :0.00e+00   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.000000   1st Qu.:0.000000  
 Median :0.000000   Median :0.00e+00   Median :0.000000   Median :0.000000  
 Mean   :0.003551   Mean   :9.68e-05   Mean   :0.004118   Mean   :0.002771  
 3rd Qu.:0.000000   3rd Qu.:0.00e+00   3rd Qu.:0.000000   3rd Qu.:0.000000  
 Max.   :1.000000   Max.   :1.00e+00   Max.   :1.000000   Max.   :1.000000  
       c2          
 Min.   :0.000000  
 1st Qu.:0.000000  
 Median :0.000000  
 Mean   :0.005855  
 3rd Qu.:0.000000  
 Max.   :1.000000  
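
Each toxin column in the summary now runs from 0 to 1, which is consistent with per-column min-max scaling. A minimal sketch of that idea (assuming normalize_input() rescales each column by its own minimum and maximum; the real function may differ) looks like:

# per-column min-max scaling; applied to already-normalized columns this is a no-op,
# shown only to illustrate the idea
minmax <- function(x) (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
psp_demo <- psp |>
  dplyr::mutate(dplyr::across(dplyr::all_of(toxins), minmax))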

Group by location and year

For the purposes of this demo, we’ll work with just one group.

Add a gap column - the number of days between consecutive samples in the same season

Check the gap status of each sample (is the number of days since the previous sample between the minimum and maximum defined in the configuration?)

g <- psp |>
  dplyr::filter(.data$location_id == "PSP12.28" & .data$year == "2014") |>
  compute_gap() |>
  dplyr::mutate(meets_gap = psptools::check_gap(.data$gap_days, minimum_gap=4, maximum_gap=10))

g
# A tibble: 28 × 22
   id        location_id species total_toxicity date       year  region gap_days
   <chr>     <chr>       <chr>            <dbl> <date>     <chr> <chr>     <dbl>
 1 PSP12.28… PSP12.28    mytilus          5.33  2014-04-04 2014  maine         0
 2 PSP12.28… PSP12.28    mytilus          0.565 2014-04-08 2014  maine         4
 3 PSP12.28… PSP12.28    mytilus          8.82  2014-04-15 2014  maine         7
 4 PSP12.28… PSP12.28    mytilus         12.9   2014-04-22 2014  maine         7
 5 PSP12.28… PSP12.28    mytilus         18.1   2014-04-28 2014  maine         6
 6 PSP12.28… PSP12.28    mytilus          7.86  2014-05-06 2014  maine         8
 7 PSP12.28… PSP12.28    mytilus         15.3   2014-05-12 2014  maine         6
 8 PSP12.28… PSP12.28    mytilus         22.0   2014-05-19 2014  maine         7
 9 PSP12.28… PSP12.28    mytilus        148.    2014-05-27 2014  maine         8
10 PSP12.28… PSP12.28    mytilus        492.    2014-06-02 2014  maine         6
# ℹ 18 more rows
# ℹ 14 more variables: classification <dbl>, gtx4 <dbl>, gtx1 <dbl>,
#   dcgtx3 <dbl>, gtx5 <dbl>, dcgtx2 <dbl>, gtx3 <dbl>, gtx2 <dbl>, neo <dbl>,
#   dcstx <dbl>, stx <dbl>, c1 <dbl>, c2 <dbl>, meets_gap <lgl>
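
This demo continues with the single group above, but the full workflow repeats these steps for every location-year group. One way to enumerate the groups (an assumed approach; make_image_list() may iterate over them differently) is:

# one tibble per location-year combination (assumed grouping approach)
groups <- psp |>
  dplyr::group_by(.data$location_id, .data$year) |>
  dplyr::group_split()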

For each location-year subset

n_steps <- 2         # number of consecutive observations used as predictors in each sample
forecast_steps <- 1  # how many observations past the predictors the label is taken from

Find how many samples can be made (n_batches), then list the row indices for each sample (batches).

nb <- n_batches(nrow(g), (n_steps+forecast_steps))

nb
[1] 26
batches <- compute_batches(nb, (n_steps+forecast_steps))

batches[1:4]
[[1]]
[1] 1 2 3

[[2]]
[1] 2 3 4

[[3]]
[1] 3 4 5

[[4]]
[1] 4 5 6
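
With 28 rows in g and a window of n_steps + forecast_steps = 3, there are 28 - 3 + 1 = 26 overlapping windows, matching nb above. Simple stand-ins for these helpers (assuming plain sliding windows of consecutive row indices; the package functions may differ) could be:

# hypothetical stand-ins for the batching helpers
n_batches_demo <- function(n_rows, width) n_rows - width + 1
compute_batches_demo <- function(n, width) lapply(seq_len(n), function(start) start:(start + width - 1))
compute_batches_demo(n_batches_demo(nrow(g), 3), 3)[1:4]  # should match batches[1:4] above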

For each batch

Slice out the rows for the image

Check the gap status of each row. If any row other than the first fails the requirement, skip this batch and move on to the next (see the sketch after the batch preview below).

batch <- batches[[1]]
batch
[1] 1 2 3
b <- g |> 
  dplyr::slice(batch)
b
# A tibble: 3 × 22
  id         location_id species total_toxicity date       year  region gap_days
  <chr>      <chr>       <chr>            <dbl> <date>     <chr> <chr>     <dbl>
1 PSP12.28_… PSP12.28    mytilus          5.33  2014-04-04 2014  maine         0
2 PSP12.28_… PSP12.28    mytilus          0.565 2014-04-08 2014  maine         4
3 PSP12.28_… PSP12.28    mytilus          8.82  2014-04-15 2014  maine         7
# ℹ 14 more variables: classification <dbl>, gtx4 <dbl>, gtx1 <dbl>,
#   dcgtx3 <dbl>, gtx5 <dbl>, dcgtx2 <dbl>, gtx3 <dbl>, gtx2 <dbl>, neo <dbl>,
#   dcstx <dbl>, stx <dbl>, c1 <dbl>, c2 <dbl>, meets_gap <lgl>
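
In this batch only the later rows need to satisfy the gap requirement; the first row’s gap_days of 0 is ignored. Inside the loop over batches, that check might look like:

# skip this batch if any row other than the first fails the gap requirement
if (!all(b$meets_gap[-1])) {
  # skip: e.g. `next` in a for loop, or return(NULL) from an lapply
}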

Select the predictor variable columns and convert them into a matrix (2 dimensions: rows are the consecutive observations/weeks, columns are the predictors).

i <- as.matrix(dplyr::ungroup(b) |> 
      dplyr::select(dplyr::all_of(toxins)))
round(i,3)
     gtx4 gtx1 dcgtx3 gtx5 dcgtx2  gtx3 gtx2 neo dcstx stx c1    c2
[1,]    0 0.00      0    0      0 0.006    0   0     0   0  0 0.000
[2,]    0 0.00      0    0      0 0.001    0   0     0   0  0 0.000
[3,]    0 0.01      0    0      0 0.001    0   0     0   0  0 0.006

Store the image, label and metadata in a list

list(status=TRUE,
     year =           "2014",
     location_id =    b$location_id[1],
     species =        b$species[1],
     classification = b$classification[n_steps+forecast_steps], # label comes from the forecast row
     toxicity =       b$total_toxicity[n_steps+forecast_steps], # toxicity of the forecast row
     date =           b$date[n_steps],                          # date of the last predictor row
     image =          i[1:n_steps,])                            # predictors from the first n_steps rows
$status
[1] TRUE

$year
[1] "2014"

$location_id
[1] "PSP12.28"

$species
  mytilus 
"mytilus" 

$classification
[1] 0

$toxicity
[1] 8.821341

$date
[1] "2014-04-08"

$image
     gtx4 gtx1 dcgtx3 gtx5 dcgtx2         gtx3 gtx2 neo dcstx stx c1 c2
[1,]    0    0      0    0      0 0.0062662855    0   0     0   0  0  0
[2,]    0    0      0    0      0 0.0006640863    0   0     0   0  0  0
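
Putting these steps together, the per-batch work can be wrapped in a loop over batches. A condensed sketch of that loop (not the actual make_image_list() implementation, which handles more cases) might look like:

# condensed sketch of the per-batch loop for one location-year group
images <- lapply(batches, function(batch) {
  b <- dplyr::slice(g, batch)
  if (!all(b$meets_gap[-1])) return(NULL)  # skip batches with unacceptable gaps
  i <- as.matrix(dplyr::select(dplyr::ungroup(b), dplyr::all_of(toxins)))
  list(status = TRUE,
       year = b$year[1],
       location_id = b$location_id[1],
       species = b$species[1],
       classification = b$classification[n_steps + forecast_steps],
       toxicity = b$total_toxicity[n_steps + forecast_steps],
       date = b$date[n_steps],
       image = i[1:n_steps, ])
})
images <- images[!vapply(images, is.null, logical(1))]  # drop the skipped batches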