Building Training Samples

A look inside make_image_list()

In order to forecast unevenly observed data, we’ll need to group consecutive observations together to create a training set of labeled “samples”.

First, we’ll start with a table of observations. The table needs to have a date column, and we’ll add one for year.

psp <- psp |>
  mutate(year = format(date, format="%Y")) |>
  filter(species == "mytilus")

Steps:

Add a gap column - the number of days between consecutive samples in the same season

psp <- psp |>
  psptools::compute_gap()

Add a classification for each row and check gap requirement

psp <- psp |> 
    dplyr::mutate(classification = psptools::recode_classification(.data$total_toxicity, c(0,10,30,80)),
                  meets_gap = psptools::check_gap(.data$gap_days, minimum_gap=4, maximum_gap=10))
glimpse(psp)
Rows: 11,229
Columns: 24
$ id             <chr> "DMC_2019-06-11_mytilus", "Isleboro_2019-07-24_mytilus"…
$ location_id    <chr> "DMC", "Isleboro", "MIDGIx", "PE3M", "PEN BB", "PEN FIX…
$ date           <date> 2019-06-11, 2019-07-24, 2019-06-24, 2019-07-18, 2019-0…
$ species        <chr> "mytilus", "mytilus", "mytilus", "mytilus", "mytilus", …
$ sampleid       <chr> "061119DMCmyt", "072419Isleboro myt", "062419MIDGIxmyt"…
$ total_toxicity <dbl> 137.9188000, 0.0000000, 14.9866900, 0.0000000, 0.000000…
$ lat            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 43.09612, 43.10…
$ lon            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -70.76781, -70.…
$ gtx4           <dbl> 18.70973, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ gtx1           <dbl> 37.792166, 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
$ dcgtx3         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ gtx5           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ dcgtx2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ gtx3           <dbl> 24.6304364, 0.0000000, 8.5274400, 0.0000000, 0.0000000,…
$ gtx2           <dbl> 20.63097, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ neo            <dbl> 21.01449, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
$ dcstx          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ stx            <dbl> 15.140990, 0.000000, 6.459252, 0.000000, 0.000000, 0.00…
$ c1             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ c2             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ year           <chr> "2019", "2019", "2019", "2019", "2019", "2019", "2019",…
$ gap_days       <dbl> 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 0, 0, 7, 7, 1, 6, 7, 7…
$ classification <dbl> 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 1…
$ meets_gap      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, …
summary(psp$gap_days)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   6.000   7.000   6.965   7.000  96.000 
summary(psp$classification)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.3405  0.0000  3.0000 

Normalize the input columns to put them all on the same scale

need to remove environmentals field from functions - maybe change toxins to predictors?

toxins <- c("gtx4","gtx1","dcgtx3","gtx5","dcgtx2","gtx3","gtx2","neo","dcstx","stx","c1","c2")
psp <- psp |>
  psptools::normalize_input(toxins, environmentals=c())

summary(psp)
      id            location_id             date              species         
 Length:11229       Length:11229       Min.   :2014-04-01   Length:11229      
 Class :character   Class :character   1st Qu.:2015-07-13   Class :character  
 Mode  :character   Mode  :character   Median :2017-06-27   Mode  :character  
                                       Mean   :2018-02-05                     
                                       3rd Qu.:2020-05-20                     
                                       Max.   :2024-06-10                     
                                                                              
   sampleid         total_toxicity          lat             lon        
 Length:11229       Min.   :   0.000   Min.   :43.10   Min.   :-70.77  
 Class :character   1st Qu.:   0.000   1st Qu.:43.77   1st Qu.:-69.94  
 Mode  :character   Median :   0.810   Median :43.93   Median :-69.33  
                    Mean   :  16.664   Mean   :44.10   Mean   :-68.93  
                    3rd Qu.:   6.252   3rd Qu.:44.52   3rd Qu.:-67.79  
                    Max.   :3092.832   Max.   :44.97   Max.   :-66.98  
                                       NA's   :29      NA's   :29      
     year              gap_days      classification   meets_gap      
 Length:11229       Min.   : 0.000   Min.   :0.0000   Mode :logical  
 Class :character   1st Qu.: 6.000   1st Qu.:0.0000   FALSE:1815     
 Mode  :character   Median : 7.000   Median :0.0000   TRUE :9414     
                    Mean   : 6.965   Mean   :0.3405                  
                    3rd Qu.: 7.000   3rd Qu.:0.0000                  
                    Max.   :96.000   Max.   :3.0000                  
                                                                     
      gtx4              gtx1              dcgtx3              gtx5          
 Min.   :0.00000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000  
 Median :0.00000   Median :0.000000   Median :0.000000   Median :0.0000000  
 Mean   :0.00512   Mean   :0.004778   Mean   :0.001515   Mean   :0.0009969  
 3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.0000000  
 Max.   :1.00000   Max.   :1.000000   Max.   :1.000000   Max.   :1.0000000  
                                                                            
     dcgtx2              gtx3                gtx2               neo          
 Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000  
 Median :0.000000   Median :0.0006825   Median :0.000000   Median :0.000000  
 Mean   :0.002345   Mean   :0.0047486   Mean   :0.003967   Mean   :0.003653  
 3rd Qu.:0.000000   3rd Qu.:0.0028490   3rd Qu.:0.000000   3rd Qu.:0.000000  
 Max.   :1.000000   Max.   :1.0000000   Max.   :1.000000   Max.   :1.000000  
                                                                             
     dcstx               stx                 c1                 c2         
 Min.   :0.00e+00   Min.   :0.000000   Min.   :0.000000   Min.   :0.00000  
 1st Qu.:0.00e+00   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000  
 Median :0.00e+00   Median :0.000000   Median :0.000000   Median :0.00000  
 Mean   :9.53e-05   Mean   :0.004128   Mean   :0.002749   Mean   :0.00608  
 3rd Qu.:0.00e+00   3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.00000  
 Max.   :1.00e+00   Max.   :1.000000   Max.   :1.000000   Max.   :1.00000  
                                                                           

Group by location and year

For the purpose of this demo, we’ll just work on one group

g <- psp |>
  filter(location_id == "PSP12.28" & year == 2014)

g
# A tibble: 28 × 24
   id         location_id date       species sampleid total_toxicity   lat   lon
   <chr>      <chr>       <date>     <chr>   <chr>             <dbl> <dbl> <dbl>
 1 PSP12.28_… PSP12.28    2014-04-04 mytilus 040414P…          5.33   43.8 -69.9
 2 PSP12.28_… PSP12.28    2014-04-08 mytilus 040814P…          0.565  43.8 -69.9
 3 PSP12.28_… PSP12.28    2014-04-15 mytilus 041514P…          8.82   43.8 -69.9
 4 PSP12.28_… PSP12.28    2014-04-22 mytilus 042214P…         12.9    43.8 -69.9
 5 PSP12.28_… PSP12.28    2014-04-28 mytilus 042814P…         18.1    43.8 -69.9
 6 PSP12.28_… PSP12.28    2014-05-06 mytilus 050614P…          7.86   43.8 -69.9
 7 PSP12.28_… PSP12.28    2014-05-12 mytilus 051214P…         15.3    43.8 -69.9
 8 PSP12.28_… PSP12.28    2014-05-19 mytilus 051914P…         22.0    43.8 -69.9
 9 PSP12.28_… PSP12.28    2014-05-27 mytilus 052714P…        148.     43.8 -69.9
10 PSP12.28_… PSP12.28    2014-06-02 mytilus 060214P…        492.     43.8 -69.9
# ℹ 18 more rows
# ℹ 16 more variables: year <chr>, gap_days <dbl>, classification <dbl>,
#   meets_gap <lgl>, gtx4 <dbl>, gtx1 <dbl>, dcgtx3 <dbl>, gtx5 <dbl>,
#   dcgtx2 <dbl>, gtx3 <dbl>, gtx2 <dbl>, neo <dbl>, dcstx <dbl>, stx <dbl>,
#   c1 <dbl>, c2 <dbl>

For each location-year subset

n_steps <- 2
forecast_steps <- 1

Find how many samples can be made (n_batches) List the samples indices (batches)

nb <- n_batches(nrow(g), (n_steps+forecast_steps))

nb
[1] 26
batches <- compute_batches(nb, (n_steps+forecast_steps))

batches[1:4]
[[1]]
[1] 1 2 3

[[2]]
[1] 2 3 4

[[3]]
[1] 3 4 5

[[4]]
[1] 4 5 6

For each batch

Slice out the rows for the image

Check gap status of each row - if any don’t meet the requirement (besides the first), move on to the next batch

batch <- batches[[1]]
batch
[1] 1 2 3
b <- g |> 
  dplyr::slice(batch)
b
# A tibble: 3 × 24
  id    location_id date       species sampleid total_toxicity   lat   lon year 
  <chr> <chr>       <date>     <chr>   <chr>             <dbl> <dbl> <dbl> <chr>
1 PSP1… PSP12.28    2014-04-04 mytilus 040414P…          5.33   43.8 -69.9 2014 
2 PSP1… PSP12.28    2014-04-08 mytilus 040814P…          0.565  43.8 -69.9 2014 
3 PSP1… PSP12.28    2014-04-15 mytilus 041514P…          8.82   43.8 -69.9 2014 
# ℹ 15 more variables: gap_days <dbl>, classification <dbl>, meets_gap <lgl>,
#   gtx4 <dbl>, gtx1 <dbl>, dcgtx3 <dbl>, gtx5 <dbl>, dcgtx2 <dbl>, gtx3 <dbl>,
#   gtx2 <dbl>, neo <dbl>, dcstx <dbl>, stx <dbl>, c1 <dbl>, c2 <dbl>

Select the predictor variable columns, and make them into a matrix (2 dimensions - predictors/columns and number of weeks/rows)

i <- as.matrix(dplyr::ungroup(b) |> 
      dplyr::select(dplyr::all_of(toxins)))
round(i,3)
     gtx4 gtx1 dcgtx3 gtx5 dcgtx2  gtx3 gtx2 neo dcstx stx c1    c2
[1,]    0 0.00      0    0      0 0.006    0   0     0   0  0 0.000
[2,]    0 0.00      0    0      0 0.001    0   0     0   0  0 0.000
[3,]    0 0.01      0    0      0 0.001    0   0     0   0  0 0.006

Store the image, label and metadata in a list

list(status=TRUE,
     year =           "2014",
     location_id =    b$location_id[1],
     classification = b$classification[n_steps+forecast_steps],
     toxicity =       b$total_toxicity[n_steps+forecast_steps],
     date =           b$date[n_steps],
     image =          i[1:n_steps,])
$status
[1] TRUE

$year
[1] "2014"

$location_id
[1] "PSP12.28"

$classification
[1] 0

$toxicity
[1] 8.821341

$date
[1] "2014-04-08"

$image
     gtx4 gtx1 dcgtx3 gtx5 dcgtx2         gtx3 gtx2 neo dcstx stx c1 c2
[1,]    0    0      0    0      0 0.0062662855    0   0     0   0  0  0
[2,]    0    0      0    0      0 0.0006640863    0   0     0   0  0  0