diff --git a/examples/3_rowcrop/01_ERA5_nc_to_clim.R b/examples/3_rowcrop/01_ERA5_nc_to_clim.R
index 0fc5e51..42aa45e 100755
--- a/examples/3_rowcrop/01_ERA5_nc_to_clim.R
+++ b/examples/3_rowcrop/01_ERA5_nc_to_clim.R
@@ -84,8 +84,8 @@ furrr::future_pwalk(
args$site_era5_path,
paste("ERA5", site_id, ens_id, sep = "_")
),
- start_date = args$start_date,
- end_date = args$end_date,
+ start_date = start_date,
+ end_date = end_date,
in.prefix = paste0("ERA5.", ens_id),
outfolder = file.path(args$site_sipnet_met_path, site_id)
)
diff --git a/examples/3_rowcrop/02_ic_build.R b/examples/3_rowcrop/02_ic_build.R
index 9c07737..f333760 100755
--- a/examples/3_rowcrop/02_ic_build.R
+++ b/examples/3_rowcrop/02_ic_build.R
@@ -10,7 +10,7 @@ options <- list(
help = "CSV giving ids, locations, and PFTs for sites of interest"
),
optparse::make_option("--field_shape_path",
- default = "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb",
+ default = "data_raw/management/crops/v4.1/parcels-consolidated.gpkg",
help = "file containing site geometries, used for extraction from rasters"
),
optparse::make_option("--ic_ensemble_size",
@@ -49,7 +49,7 @@ options <- list(
)
),
optparse::make_option("--params_read_from_pft",
- default = "SLA,leafC", # SLA units are m2/kg, leafC units are %
+ default = "SLA,leafC,leafGrowth", # Units: SLA m2/kg, leafC percent, leafGrowth g C/m2
help = "Parameters to read from the PFT file, comma separated"
),
optparse::make_option("--landtrendr_raw_files",
@@ -124,7 +124,7 @@ additional_params <- args$additional_params |>
site_info <- read.csv(
args$site_info_path,
- colClasses = c(field_id = "character")
+ colClasses = c(id = "character", field_id = "character")
)
site_info$start_date <- args$run_start_date
site_info$LAI_date <- args$run_LAI_date
@@ -136,7 +136,11 @@ PEcAn.logger::logger.info("Getting estimated soil carbon from SoilGrids 250m")
soilc_csv_path <- file.path(args$data_dir, "soilgrids_soilC_data.csv")
if (file.exists(soilc_csv_path)) {
PEcAn.logger::logger.info("using existing soil C file", soilc_csv_path)
- soil_carbon_est <- read.csv(soilc_csv_path, check.names = FALSE)
+ soil_carbon_est <- read.csv(
+ soilc_csv_path,
+ check.names = FALSE,
+ colClasses = c(Site_ID = "character", Site_Name = "character")
+ )
sites_needing_soilc <- site_info |>
filter(!id %in% soil_carbon_est$Site_ID)
} else {
@@ -166,7 +170,9 @@ sm_outdir <- file.path(args$data_dir, "soil_moisture") |>
sm_csv_path <- file.path(args$data_dir, "sm.csv") # name is hardcorded by fn
if (file.exists(sm_csv_path)) {
PEcAn.logger::logger.info("using existing soil moisture file", sm_csv_path)
- soil_moisture_est <- read.csv(sm_csv_path)
+ soil_moisture_est <- read.csv(
+ sm_csv_path,
+ colClasses = c(site.id = "character"))
sites_needing_soilmoist <- site_info |>
filter(!id %in% soil_moisture_est$site.id)
} else {
@@ -199,7 +205,11 @@ PEcAn.logger::logger.info("LAI")
lai_csv_path <- file.path(args$data_dir, "LAI_bysite.csv")
if (file.exists(lai_csv_path)) {
PEcAn.logger::logger.info("using existing LAI file", lai_csv_path)
- lai_est <- read.csv(lai_csv_path, check.names = FALSE) # TODO edit MODIS_LAI_prep to use valid colnames?
+ lai_est <- read.csv(
+ lai_csv_path,
+ check.names = FALSE, # TODO edit MODIS_LAI_prep to use valid colnames?
+ colClasses = c(site_id = "character")
+ )
sites_needing_lai <- site_info |>
filter(!id %in% lai_est$site_id)
} else {
@@ -235,7 +245,10 @@ if (file.exists(landtrendr_csv_path)) {
"using existing LandTrendr AGB file",
landtrendr_csv_path
)
- agb_est <- read.csv(landtrendr_csv_path)
+ agb_est <- read.csv(
+ landtrendr_csv_path,
+ colClasses = c(site_id = "character")
+ )
sites_needing_agb <- site_info |>
filter(!id %in% agb_est$site_id)
} else {
@@ -256,18 +269,19 @@ if (nagb > 0) {
lt_sd <- terra::rast(lt_sd_path)
field_shp <- terra::vect(args$field_shape_path)
- site_bnds <- field_shp[field_shp$UniqueID %in% sites_needing_agb$field_id, ] |>
+ site_bnds <- field_shp[field_shp$parcel_id %in% sites_needing_agb$field_id, ] |>
terra::project(lt_med)
# Check for unmatched sites
# TODO is stopping here too strict? Could reduce to warning if needed
- stopifnot(all(sites_needing_agb$field_id %in% site_bnds$UniqueID))
+ stopifnot(all(sites_needing_agb$field_id %in% site_bnds$parcel_id))
new_agb <- lt_med |>
terra::extract(x = _, y = site_bnds, fun = mean, bind = TRUE) |>
terra::extract(x = lt_sd, y = _, fun = mean, bind = TRUE) |>
as.data.frame() |>
- left_join(sites_needing_agb, by = c("UniqueID" = "field_id")) |>
+ mutate(parcel_id = as.character(parcel_id)) |>
+ left_join(sites_needing_agb, by = c("parcel_id" = "field_id")) |>
dplyr::select(
site_id = id,
AGB_median_Mg_ha = ends_with("median"),
@@ -414,6 +428,9 @@ ic_samples <- initial_condition_estimated |>
tidyr::pivot_wider(names_from = variable, values_from = sample) |>
dplyr::left_join(pft_var_samples, by = c("site_id", "replicate")) |>
dplyr::mutate(
+ # Experimental hack: Assume leafGrowth==0 means this is an annual crop, force starting biomass to zero even if LandTrendr says otherwise.
+ # This might be a terrible idea.
+ AbvGrndBiomass = if_else(leafGrowth == 0.0, 0, AbvGrndBiomass),
AbvGrndWood = AbvGrndBiomass * wood_carbon_fraction,
leaf_carbon_content = tidyr::replace_na(LAI, 0) / SLA * (leafC / 100),
wood_carbon_content = pmax(AbvGrndWood - leaf_carbon_content, 0)
diff --git a/examples/3_rowcrop/02a_build_events.R b/examples/3_rowcrop/02a_build_events.R
new file mode 100755
index 0000000..3d30262
--- /dev/null
+++ b/examples/3_rowcrop/02a_build_events.R
@@ -0,0 +1,160 @@
+#!/usr/bin/env Rscript
+
+# Creates a single PEcAn-formatted `events.JSON` containing all management
+# events from all years of all the locations in `site_info.csv`,
+# then divides it into a Sipnet-formatted `events.in` for each site.
+
+## ---------------------- parse command-line options --------------------------
+options <- list(
+ optparse::make_option("--site_info_path",
+ default = "site_info.csv",
+ help = "CSV giving ids, locations, and PFTs for sites of interest"
+ ),
+ optparse::make_option("--mgmt_file_dir",
+ default = "data_raw/management",
+ help = paste(
+ "Directory containing management inputs in Parquet format.",
+ "Note: Code currently looks for subpaths that include hard-coded",
+ "version numbers for each management type."
+ )
+ ),
+ optparse::make_option("--event_outdir",
+ default = "data/events",
+ help = "directory to write events-*.in, events.json, and phenology.csv"
+ )
+) |>
+ # Show default values in help message
+ purrr::modify(\(x) {
+ x@help <- paste(x@help, "[default: %default]")
+ x
+ })
+
+args <- optparse::OptionParser(option_list = options) |>
+ optparse::parse_args()
+
+## -------------------------- end option parsing ------------------------------
+
+library(tidyverse)
+
+# TODO these probably deserve to be runtime args,
+# but first better fix other version-specific assumptions below
+mgmt_subdirs <- list(
+ pheno = "phenology/v1.0",
+ plant = "planting/v1.0",
+ harv = "harvest/v1.0",
+ till = "tillage/v1.0",
+ irri = "irrigation/v1.0",
+ fert = NULL, # TODO
+ occ = NULL # TODO
+)
+
+if (!dir.exists(args$event_outdir)) {
+ dir.create(args$event_outdir, recursive = TRUE)
+}
+
+ids <- read.csv(
+ args$site_info_path,
+ colClasses = c(field_id = "character")
+)$field_id
+
+# read management files from inside mgmt directory
+# TODO will break if not all mgmt types live at same path
+# (e.g. user wants to pull in their own tillage but use monitored plant/harv)
+read_parquet_years <- function(dir, parcel_ids, id_var = "parcel_id") {
+ file.path(args$mgmt_file_dir, dir) |>
+ list.files("*.parq(uet)?$", full.names = TRUE) |>
+ arrow::open_dataset() |>
+ # as.character is a hack bc planting dataset has id as string some years and int others.
+ # TODO remove when fixed upstream.
+ filter(as.character(.data[[id_var]]) %in% parcel_ids) |>
+ collect()
+}
+
+# phenology doesn't have to be json -- we'll just write one CSV of all sites and years
+pheno <- read_parquet_years(mgmt_subdirs$pheno, ids, "site_id") |>
+ # TODO some seasons wrap past year end (e.g. 2025-09-08 to 2026-03-29) --
+ # current code drops year so leafonday > leafoffday, which write.config skips
+ # as invalid.
+ mutate(
+ leafonday = lubridate::yday(leafonday),
+ leafoffday = lubridate::yday(leafoffday)
+ )
+# If csv exists already, append instead of clobbering
+# TODO keep old version of the dups instead of updating?
+# Not sure which will be less surprising
+pheno_out_path <- file.path(args$event_outdir, "phenology.csv")
+if (file.exists(pheno_out_path)) {
+ existing_pheno <- read.csv(
+ pheno_out_path,
+ colClasses = c(site_id = "character"))
+ site_dups <- intersect(pheno$site_id, existing_pheno$site_id)
+ if (length(site_dups) > 0) {
+ warning(
+ "Overwriting existing phenology records for ",
+ length(site_dups), " sites in ", pheno_out_path
+ )
+ }
+ pheno <- existing_pheno |>
+ filter(!(site_id %in% site_dups)) |>
+ bind_rows(pheno)
+}
+write.csv(pheno, file = pheno_out_path, row.names = FALSE)
+
+plant <- read_parquet_years(mgmt_subdirs$plant, ids, "site_id") |>
+ # TODO fix upstream
+ dplyr::rename(
+ leaf_c_kg_m2 = C_LEAF,
+ wood_c_kg_m2 = C_STEM,
+ fine_root_c_kg_m2 = C_FINEROOT,
+ coarse_root_c_kg_m2 = C_COARSEROOT,
+ ) |>
+ # Some planting events happen in the year previous to bulk of growth.
+ # This is only a problem at the very start of the run, where the reported
+ # planting date lands before start of simulation and causes a Sipnet error.
+ # Temporary workaround: Adjust dates forward.
+ # TODO: Should also (/instead?) adjust initial pool sizes in these cases.
+ dplyr::mutate(date = pmax(date, "2016-01-01"))
+
+harv <- read_parquet_years(mgmt_subdirs$harv, ids, "site_id")
+
+till <- read_parquet_years(mgmt_subdirs$till, ids, "site_id") |>
+ mutate(
+ date = OGMn_date,
+ # Placeholder -- better-informed conversion under development.
+ # Note that NaNs get forced to 0. This was an arbitrary choice.
+ tillage_eff_0to1 = pmax(0, pmin(1, ndti_pct_change / 100), na.rm = TRUE)
+ )
+
+irrig <- read_parquet_years(mgmt_subdirs$irri, ids, "parcel_id") |>
+ filter(
+ # Ignore ensemble dimension. TODO support event ensembles across all event types
+ ens_id == "irr_ens_001",
+ # ignore all irrigation events before start of simulation
+ date >= "2016-01-01"
+ ) |>
+ mutate(
+ event_type = "irrigation",
+ site_id = as.character(parcel_id),
+ date = as.character(date)
+ ) |>
+ select(site_id, event_type, date, amount_mm, method)
+
+# TODO add fertilization / NCC here when available
+
+all_events <- dplyr::bind_rows(plant, harv, till, irrig) |>
+ dplyr::arrange(site_id, date, event_type) |>
+ dplyr::nest_by(site_id, .key="events") |>
+ dplyr::mutate(pecan_events_version = "0.1.0")
+
+evt_json_path <- file.path(args$event_outdir, "combined_events.json")
+if (file.exists(evt_json_path)) {
+ # TODO append to existing file like for phenology?
+ # need more care to handle nesting right.
+ warning("Overwriting existing events file ", evt_json_path)
+}
+jsonlite::write_json(all_events, evt_json_path)
+
+# Now divide the json into Sipnet events files
+PEcAn.SIPNET::write.events.SIPNET(evt_json_path, args$event_outdir)
+
+
diff --git a/examples/3_rowcrop/03_xml_build.R b/examples/3_rowcrop/03_xml_build.R
index b89e906..68ceb70 100755
--- a/examples/3_rowcrop/03_xml_build.R
+++ b/examples/3_rowcrop/03_xml_build.R
@@ -88,6 +88,8 @@ args <- optparse::OptionParser(option_list = options) |>
## Whew, that was a lot of lines to define a few defaults!
+# papply emits a lot of uninformative debug messages; let's ignore those
+PEcAn.logger::logger.setLevel("INFO")
site_info <- read.csv(args$site_file)
stopifnot(
@@ -157,6 +159,14 @@ settings <- settings |>
path = args$event_dir,
path_template = "{path}/events-{id}.in"
) |>
+ # For now, hard-coding one phenology path for all sites, placed inside event dir
+ # TODO make settable?
+ setEnsemblePaths(
+ n_reps = args$n_ens,
+ input_type = "leaf_phenology",
+ path = args$event_dir,
+ path_template = "{path}/phenology.csv"
+ ) |>
papply(add_soil_pft)
# Update just the first component of the output directory,
diff --git a/examples/3_rowcrop/05_run_model.R b/examples/3_rowcrop/05_run_model.R
index 6667cc4..3988c0e 100755
--- a/examples/3_rowcrop/05_run_model.R
+++ b/examples/3_rowcrop/05_run_model.R
@@ -43,12 +43,13 @@ options(error = quote({
library("PEcAn.all")
-# Report package versions for provenance
-PEcAn.all::pecan_version()
-
# Open and read in settings file for PEcAn run.
settings <- PEcAn.settings::read.settings(args$settings)
+# Report package and model versions for provenance
+PEcAn.all::pecan_version()
+PEcAn.logger::logger.info(system2(settings$model$binary, "-v", stdout = TRUE))
+
# Start ecosystem model runs
if (PEcAn.utils::status.check("MODEL") == 0) {
PEcAn.utils::status.start("MODEL")
diff --git a/examples/3_rowcrop/README.md b/examples/3_rowcrop/README.md
index 6962262..19ca40e 100644
--- a/examples/3_rowcrop/README.md
+++ b/examples/3_rowcrop/README.md
@@ -27,18 +27,14 @@ simulation period.
## Caveats
-TODO UPDATE when no longer true
-
-This simulation is under active development and all the notes below are subject
-to change as we update the code.
-For now, instructions assume a local run on MacOS and will be updated for a
+Instructions assume a local run on MacOS and will be updated for a
Linux + Slurm + Apptainer HPC environment as we finish testing and deployment.
Aspirationally, any command prefixed with `[host_args]` is one that ought to
work on HPC by "just" adding a system-specific prefix, e.g.
-`./01_ERA4_nc_to_clim.R --start_date=2016-01-01` on my machine becomes
+`./01_ERA5_nc_to_clim.R --start_date=2016-01-01` on my machine becomes
`sbatch -n16 --mem=12G --mail-type=ALL --uid=jdoe \
- ./01_ERA4_nc_to_clim.R --start_date=2016-01-01` on yours.
+ ./01_ERA5_nc_to_clim.R --start_date=2016-01-01` on yours.
## Running the workflow
@@ -72,17 +68,14 @@ Contact chelsea.carey@arb.ca.gov for more information about the dataset.
Once obtained, place them in `data_raw/private/HSP` and run
```{sh}
-../tools/build_validation_siteinfo.R
+../../tools/build_validation_siteinfo.R
```
to create `validation_site_info.csv`.
### 1. Convert climate driver files
-TODO 1: current development version of PEcAn.sipnet still writes 13-col
- clim files with constants for grid index and soil water. Document which version writes correctly.
-
-TODO 2: show how to pass n_cores from host_args
+TODO: show how to pass n_cores from host_args
(NSLOTS? SLURM_CPUS_PER_TASK?)
```{sh}
@@ -91,7 +84,7 @@ TODO 2: show how to pass n_cores from host_args
--site_sipnet_met_path=data/ERA5_CA_SIPNET \
--site_info_file=data_raw/ca_half_degree_grid.csv \
--start_date=2016-01-01 \
- --end_date=2025-12-31 \
+ --end_date=2023-12-31 \
--n_cores=7
```
@@ -110,6 +103,9 @@ so I symlinked `data/IC_prep_val/soil_moisture/` to `data/IC_prep/soil_moisture/
--pft_dir=data_raw/pfts \
--data_dir=data/IC_prep_val \
--ic_outdir=data/IC_files
+
+../../tools/build_site_info.R --location_file=../../data/design_points.csv
+
[host_args] ./02_ic_build.R \
--site_info_path=site_info.csv \
--pft_dir=data_raw/pfts \
@@ -117,14 +113,65 @@ so I symlinked `data/IC_prep_val/soil_moisture/` to `data/IC_prep/soil_moisture/
--ic_outdir=data/IC_files
```
-### 3. generate settings file
+### 2a. Generate event files
+
+Management events are read from files produced by the monitoring pipeline or equivalent sources.
+The current version assumes all event types are provided as Parquet files, and that they live in subdirectories of `mgmt_file_dir` that include specific, currently hardcoded, version numbers. See script for the details.
+Future versions of the event_build script may support passing separate paths per data product and potentially also add support for alternate storage formats (e.g. allowing csv as well as parquet).
+
+```{sh}
+[host_args] ./02a_build_events.R \
+ --site_info_path=validation_site_info.csv \
+ --mgmt_file_dir=data_raw/management \
+ --event_outdir=data/val_events
+[host_args] ./02a_build_events.R \
+ --site_info_path=site_info.csv \
+ --mgmt_file_dir=data_raw/management \
+ --event_outdir=data/events
+```
+
+For validation we need an additional hack:
+`02a_build_events.R` created event files named by their *parcel* id,
+but the validation dataset uses a separate set of *site* ids derived by hashing locations, experiment names, and treatment codes.
+This is done to (1) keep locations opaque since the validation data are nonpublic, and (2) allow simulation of separate plots (treatments) whose locations all fall in a single parcel of the statewide map.
+A future solution would be to teach `03_xml_build.R` how to find event files that are named by parcel id, so that sites which share a parcel do not need to duplicate the file.
+For now though, let's duplicate the files of interest so they're named after the site IDs PEcAn will use. I'm not cleaning up the originals afterwards -- they're small and might be used by other runs that reuse this data directory.
+
+```{r}
+vsi <- read.csv("validation_site_info.csv") |>
+ dplyr::distinct(id, field_id) |>
+ dplyr::rename(site_id = id)
+vsi |> purrr::pwalk(
+ \(site_id, field_id) file.copy(
+ paste0("data/val_events/events-", field_id, ".in"),
+ paste0("data/val_events/events-", site_id, ".in")
+ )
+)
+
+# And rename inside the phenology file, too
+read.csv("data/val_events/phenology.csv") |>
+ dplyr::rename(field_id = site_id) |>
+ dplyr::left_join(vsi) |>
+ write.csv("data/val_events/phenology.csv", row.names = FALSE)
+```
+
+
+### 3. Generate settings file
```{sh}
[host_args] ./03_xml_build.R \
+ --end_date=2023-12-31 \
--ic_dir=data/IC_files \
--site_file=validation_site_info.csv \
+ --event_dir=data/val_events \
--output_file=validation_settings.xml \
--output_dir_name=val_out
+[host_args] ./03_xml_build.R \
+ --ic_dir=data/IC_files \
+ --end_date=2023-12-31 \
+ --site_file=site_info.csv \
+ --output_file=settings.xml \
+ --output_dir_name=output
```
### 4. Set up model run directories
@@ -134,14 +181,16 @@ stage instead of in xml_build.
```{sh}
[host_args] ./04_set_up_runs.R --settings=validation_settings.xml
+[host_args] ./04_set_up_runs.R --settings=settings.xml
```
### 5. Run model
```{sh}
export NCPUS=8
-ln -s [your/path/to]/sipnet/sipnet sipnet.git
+ln -s [your/path/to]/sipnet/sipnet sipnet
[host_args] ./05_run_model.R --settings=val_out/pecan.CONFIGS.xml
+[host_args] ./05_run_model.R --settings=output/pecan.CONFIGS.xml
```
### 6. Validate
diff --git a/examples/3_rowcrop/template.xml b/examples/3_rowcrop/template.xml
index 574014c..6386690 100644
--- a/examples/3_rowcrop/template.xml
+++ b/examples/3_rowcrop/template.xml
@@ -9,6 +9,11 @@
output/out
output/run
+
+ annual_crop
+ data_raw/pfts/annual_crop/post.distns.Rdata
+ data_raw/pfts/annual_crop/
+
temperate.deciduous
data_raw/pfts/temperate.deciduous/post.distns.Rdata
@@ -44,6 +49,9 @@
sampling
+
+ sampling
+
@@ -54,14 +62,27 @@
SIPNET
2.0.0
TRUE
- sipnet.git
+ ./sipnet
0
- 1
- 1
+ 0
+ 0
1
+
+ localhost
+ output/out
+ output/run
+
+
+ squeue -j @JOBID@ || echo DONE
+
+ parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' ::::
+
+ 1000
+
+
@@ -79,21 +100,11 @@
+
+
+
-
- localhost
- output/out
- output/run
-
-
- squeue -j @JOBID@ || echo DONE
-
- parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' ::::
-
- 1000
-
-
diff --git a/examples/3_rowcrop/validate.R b/examples/3_rowcrop/validate.R
index 9c54442..7a3e985 100755
--- a/examples/3_rowcrop/validate.R
+++ b/examples/3_rowcrop/validate.R
@@ -29,6 +29,35 @@ args <- optparse::OptionParser(option_list = options) |>
library(tidyverse)
+## Function definitions
+
+# Calculate change per year from successive timepoints.
+# @param df: dataframe containing at least column `year`
+# plus any others to be converted to differences between years
+# @return dataframe one row shorter than input, with year column removed
+# and all others converted to difference on a yearly basis
+# @examples
+# data.frame(
+# year = c(2020, 2021, 2025),
+# x = c(1, 2, 4),
+# y = c(10, 5, 1)
+# ) |> diff_years()
+# # x y
+# # 2 1.0 -5
+# # 3 0.5 -1
+diff_years <- function(df) {
+ df |>
+ arrange(year) |>
+ # convert all vars, including year, to deltas for each timestep
+ mutate_all(\(x) (x - lag(x))) |>
+ # then scale non-time deltas to timestep length
+ mutate(across(-year, \(x) x / year)) |>
+ select(-year) |>
+ _[-1,]
+}
+
+
+
## Read validation data, identify target years from each site
soc_obs <- read.csv(args$val_data_path) |>
@@ -91,8 +120,8 @@ soc_sim <- sim_files_wanted |>
)
) |>
unnest(contents) |>
- group_by(ens_num, site, year) |>
- slice_max(posix)
+ ungroup() |>
+ slice_max(posix, by = c(ens_num, site, year))
## Combine and align obs + sim
@@ -100,14 +129,40 @@ soc_compare <- soc_sim |>
left_join(soc_obs) |>
# TODO these filters need refinement --
# eg Are NAs actually expected or should they trigger complaints?
- drop_na(obs_SOC) |>
+ drop_na(obs_SOC) #|>
# TODO excluded as surprisingly high
# May want to re-include after inspecting data for individual sites
- filter(obs_SOC < 20)
+ # filter(obs_SOC < 20)
# TODO will eventually want to have PFTs labeled here
if (!dir.exists(args$output_dir)) dir.create(args$output_dir, recursive = TRUE)
+# Debug use only:
+# Contains private treatment names, so do not leave this CSV sitting in your output directory
+# write.csv(
+# soc_compare |> select(-path),
+# file.path(args$output_dir, "soc_compare_tmp.csv"),
+# row.names = FALSE
+# )
+
+
+
+# SOC change between sequential measurements, in g/m2/yr
+soc_timedelta <- soc_compare |>
+ # some years have multiple samples, others don't; can't track individual samples across years.
+ # Instead collapse these to treatment means
+ # TODO propagate variance?
+ summarize(
+ across(c(TotSoilCarb, obs_SOC), mean),
+ .by = c(ens_num, site, year)
+ ) |>
+ nest_by(ens_num, site) |>
+ filter(nrow(data) > 1) |>
+ mutate(yrly_diff = map(list(data), diff_years)) |>
+ unnest(yrly_diff)
+
+
+
## lm fit + CIs
soc_fits <- soc_compare |>
@@ -147,6 +202,34 @@ soc_ci <- soc_fits |>
pred_mean = mean(pred),
)
+soc_timedelta_fits <- soc_timedelta |>
+ ungroup() |>
+ nest_by(ens_num) |>
+ mutate(
+ fit = list(lm(TotSoilCarb ~ obs_SOC, data = data)),
+ r2 = summary(fit)$adj.r.squared,
+ nse = 1 - (
+ sum((data$obs_SOC - data$TotSoilCarb)^2) /
+ sum((data$obs_SOC - mean(data$obs_SOC))^2)
+ ),
+ rmse = sqrt(mean((data$obs_SOC - data$TotSoilCarb)^2)),
+ bias = mean(data$TotSoilCarb - data$obs_SOC)
+ )
+soc_timedelta_ci <- soc_timedelta_fits |>
+ mutate(
+ predx = list(seq(min(data$obs_SOC), max(data$obs_SOC), by = 0.1)),
+ pred = list(predict(fit, data.frame(obs_SOC = predx)))
+ ) |>
+ unnest(c(predx, pred)) |>
+ ungroup() |>
+ group_by(predx) |>
+ summarize(
+ pred_q5 = quantile(pred, 0.05),
+ pred_q95 = quantile(pred, 0.95),
+ pred_mean = mean(pred),
+ )
+
+
## Scatterplot
soc_lm_plot <- ggplot(soc_compare) +
@@ -186,6 +269,35 @@ ggsave(
)
+soc_timedelta_plot <- ggplot(soc_timedelta) +
+ aes(obs_SOC, TotSoilCarb) +
+ geom_point() +
+ geom_abline(lty = "dotted") +
+ geom_ribbon(
+ data = soc_timedelta_ci,
+ mapping = aes(
+ x = predx,
+ ymin = pred_q5,
+ ymax = pred_q95,
+ y = NULL
+ ),
+ alpha = 0.4
+ ) +
+ geom_line(
+ data = soc_timedelta_ci,
+ mapping = aes(predx, pred_mean),
+ col = "blue"
+ ) +
+ xlab("Change in measured 0-30 cm soil C stock (kg C / m2 / yr)") +
+ ylab("Change in simulated 0-30 cm soil C stock (kg C / m2 / yr)") +
+ theme_bw()
+ggsave(
+ file.path(args$output_dir, "SOC_yrly_scatter.png"),
+ plot = soc_timedelta_plot,
+ height = 8,
+ width = 8
+)
+
soc_fits |>
ungroup() |>
summarize(across(r2:bias, c(mean = mean, sd = sd))) |>
diff --git a/tools/build_site_info.R b/tools/build_site_info.R
new file mode 100755
index 0000000..6a95c56
--- /dev/null
+++ b/tools/build_site_info.R
@@ -0,0 +1,155 @@
+#!/usr/bin/env Rscript
+
+# (re)building site info from an existing set of design points,
+# retaining location but updating to use harmonized parcel IDs
+# and PFT assignments for 2016 (previous version used 2018)
+
+# This may need further adjustment to account for PFT timeseries once restarts
+# are enabled.
+
+## ---------------------- parse command-line options --------------------------
+options <- list(
+ optparse::make_option("--location_file",
+ default = "../data/design_points.csv",
+ help = paste(
+ "CSV giving at least lat and lon for sites of interest.",
+ "Any other columns will be passed unchanged to the output."
+ )
+ ),
+ optparse::make_option("--out_file",
+ default = "site_info.csv",
+ help = "Path to write CSV with parcel ids and PFTs added"
+ ),
+ optparse::make_option("--parcel_file",
+ default = "data_raw/management/crops/v4.1/parcels-consolidated.gpkg",
+ help = "Geopackage to be used for spatial lookup of parcel IDs"
+ ),
+ optparse::make_option("--crop_file",
+ default = "data_raw/management/crops/v4.1/crops_all_years.parq",
+ help = "Parquet file containing harmonized DWR crop history"
+ )
+) |>
+ # Show default values in help message
+ purrr::modify(\(x) {
+ x@help <- paste(x@help, "[default: %default]")
+ x
+ })
+
+args <- optparse::OptionParser(option_list = options) |>
+ optparse::parse_args()
+
+## -------------------------- end option parsing ------------------------------
+
+
+library(tidyverse)
+
+
+
+#' Assign DWR/LandIQ California crop codes to Sipnet PFT names
+#'
+#' Built for the MAGiC project, may or may not be applicable elsewhere
+#'
+#' @param CLASS vector of crop class codes (1-2 capital letters each)
+#' @param SUBCLASS vector of crop identifiers (1-2 numeric digits each)
+#' @return PFT assignments as character, NA if unclassified
+dwr_crop_to_pft <- function(CLASS, SUBCLASS) {
+ dplyr::case_when(
+
+ ## N fixers, treated as generic annual until N fixer PFT is created
+ CLASS == "F" & SUBCLASS %in% c(10) ~ "annual_crop", # dry beans
+ CLASS == "P" & SUBCLASS %in% c(1, 2) ~ "annual_crop", # alfalfa, clover
+ CLASS == "T" & SUBCLASS %in% c(3, 11) ~ "annual_crop", # Green beans, peas
+
+ ## subclasses with physiology differing from the rest of their class
+ # woody berries
+ CLASS == "T" & SUBCLASS %in% c(19, 28) ~ "temperate.deciduous",
+ # "Flowers, nursery & Christmas tree farms"
+ # (A weird grouping, but assuming these are most likely to be tree-like)
+ CLASS == "T" & SUBCLASS %in% c(16) ~ "temperate.deciduous",
+
+ ## Whole-class assignments
+ CLASS %in% c("F", "G", "T") ~ "annual_crop", # field crops, grains/hay, truck crops
+ CLASS %in% c("P") ~ "grass", # perennial pasture grass; annual grasses in G
+ CLASS %in% c("D", "C", "V", "YP") ~ "temperate.deciduous", # deciduous, citrus, vineyard, young perennial
+ CLASS %in% c("R") ~ "grass", # Rice; TODO update when rice PFT is created
+ CLASS %in% c("X", "I") ~ "annual_crop", # fallow, not cropped, or unclassified
+ # TODO maybe this should just get soil PFT or be skipped during site selection?
+ # Logic for defaulting to annual crop here:
+ # Temporarily idle/fallow likely to grow small amount annual weeds;
+ # If no plant/harv events, annual will grow very little.
+
+ # Urban, industrial, native vegetation, semi-agricultural, vacant, etc
+ TRUE ~ NA_character_
+ )
+}
+
+#' Look up parcel IDs from harmonized DWR California crop map
+#'
+#' @param df dataframe with at least columns `lat` and `lon`
+#' Any other columns will be passed through unchanged
+#'
+#' @return dataframe with `parcel_id` column added
+#'
+#' @examples
+#' point_to_dwr_parcelid(
+#'   data.frame(
+#'     lat = c(32.18, 32.22), lon = c(-122.22, -123.18)),
+#'   geo_file = "data_raw/management/crops/v4.1/parcels-consolidated.gpkg"
+#' )
+#'
+point_to_dwr_parcelid <- function(df, geo_file = args$parcel_file) {
+ stopifnot(is.numeric(df$lat), is.numeric(df$lon))
+ parcel_geo <- terra::vect(geo_file)
+ nearest_parcels <- df |>
+ terra::vect(crs="epsg:4326") |>
+ terra::project(parcel_geo) |>
+ terra::nearest(parcel_geo)
+
+ df |>
+ dplyr::mutate(parcel_id = parcel_geo$parcel_id[nearest_parcels$to_id])
+}
+
+#' @param ids vector of parcel ids
+#' @param years,seasons numeric vectors to subset by.
+#' If not specified, returns all years and seasons.
+#' @param crop_file path to a Parquet file containing harmonized DWR crop history
+#' @return dataframe of crop info
+dwr_parcelid_to_crop <- function(
+ ids,
+ years = NULL,
+ seasons = NULL,
+ crop_file = args$crop_file) {
+  cropdat <- arrow::open_dataset(crop_file) |>
+ dplyr::filter(.data$parcel_id %in% ids) |>
+ select(parcel_id, year, season, CLASS, SUBCLASS)
+ if (!is.null(years)) {
+ cropdat <- cropdat |> dplyr::filter(.data$year %in% years)
+ }
+ if (!is.null(seasons)) {
+ cropdat <- cropdat |> dplyr::filter(.data$season %in% seasons)
+ }
+
+ dplyr::collect(cropdat)
+}
+
+
+design_pts <- read.csv(args$location_file)
+pts_matched <- point_to_dwr_parcelid(design_pts)
+crop_2016 <- dwr_parcelid_to_crop(pts_matched$parcel_id, years = 2016, seasons = 2) |>
+ mutate(site.pft = dwr_crop_to_pft(CLASS, SUBCLASS)) |>
+ dplyr::select("parcel_id", "site.pft")
+
+if (!is.null(design_pts$id) && anyDuplicated(design_pts$id)) {
+ PEcAn.logger::logger.severe("column `id` of design points is not unique")
+}
+
+site_info <- pts_matched |>
+ left_join(crop_2016, by = "parcel_id") |>
+ rename(field_id = parcel_id) # TODO propagate `parcel_id` convention further downstream?
+ # OR rethink naming: id vs site_id vs something else?
+
+if (is.null(site_info$id)) {
+ site_info$id <- site_info$field_id
+}
+
+write.csv(site_info, args$out_file, row.names = FALSE)
diff --git a/tools/build_validation_siteinfo.R b/tools/build_validation_siteinfo.R
index 53d95d5..e8872d8 100755
--- a/tools/build_validation_siteinfo.R
+++ b/tools/build_validation_siteinfo.R
@@ -14,13 +14,12 @@ data_dir <- "data_raw/private/HSP/"
soc_file <- "Harmonized_Data_Croplands.csv"
mgmt_file <- "Harmonized_SiteMngmt_Croplands.csv"
+loc_tmp_file <- "validation_site_locs.csv"
+output_file <- "validation_site_info.csv"
+on.exit(unlink(loc_tmp_file), add = TRUE) # FIXME(review): top-level on.exit fires after this statement, not at script end -- confirm, and move unlink to end of script if so
# can't use datapoints from before our simulations start
min_yr <- 2016
-# parcel ID lookup
-field_map <- "data_raw/management/crops/v4.1/parcels-consolidated.gpkg"
-field_pft_info <- "data_raw/management/crops/v4.1/crops_all_years.parq"
-
# For first pass, selecting only the control/no-treatment plots.
# TODO: revisit this as we build more management into the workflow.
site_ids <- read.csv(file.path(data_dir, mgmt_file)) |>
@@ -32,47 +31,28 @@ site_locs <- read.csv(file.path(data_dir, soc_file)) |>
# some IDs in site_locs contain spaces stripped in site_id; let's match
mutate(BaseID = gsub("\\s+", "", BaseID)) |>
distinct(ProjectName, BaseID, Latitude, Longitude) |>
- rename(lat = Latitude, lon = Longitude) |>
- inner_join(site_ids)
-
-dwr_fields <- terra::vect(field_map)
-dwr_field_pfts <- arrow::read_parquet(field_pft_info) |>
- dplyr::filter(year == min_yr, season == 2) |>
- select(parcel_id, crop_class = CLASS)
-site_dwr_ids <- site_locs |>
- terra::vect(crs = "epsg:4326") |>
- terra::project(dwr_fields) |>
- terra::nearest(dwr_fields) |>
- as.data.frame() |>
- mutate(
- ProjectName = site_locs$ProjectName[from_id],
- BaseID = site_locs$BaseID[from_id],
- parcel_id = dwr_fields$parcel_id[to_id],
- ) |>
- left_join(dwr_field_pfts)
-
-stopifnot(nrow(site_dwr_ids) == nrow(site_locs))
-
-site_locs |>
- left_join(site_dwr_ids, by = c("ProjectName", "BaseID")) |>
mutate(
- id = paste(ProjectName, BaseID, lat, lon) |>
- purrr::map_chr(rlang::hash),
- pft = dplyr::case_when(
- crop_class %in% c("G", "F", "P", "T") ~ "grass",
- crop_class %in% c("D", "C", "V", "YP") ~ "temperate.deciduous",
- # TODO later: R = rice, T19 & T28 = woody berries
- TRUE ~ NA_character_
- )
+ val_id = paste(ProjectName, BaseID, Latitude, Longitude) |>
+ purrr::map_chr(rlang::hash)
) |>
- # TODO is this desirable?
- # In production, may be better to complain if no PFT match
- drop_na(pft) |>
+ inner_join(site_ids) |>
+ distinct(id = val_id, lat = Latitude, lon = Longitude)
+
+# Match against field IDs and crop codes using same script used on non-val files
+# TODO avoid round trip to CSV here...
+# it's only here because I replaced a chunk of code that was redundant with
+# `build_site_info.R` and this was faster than thinking how to refactor further
+write.csv(site_locs, loc_tmp_file, row.names = FALSE)
+callr::rscript(
+ "../../tools/build_site_info.R",
+ cmdargs = c(paste0("--location_file=", loc_tmp_file),
+ paste0("--out_file=", output_file))
+)
+read.csv(output_file) |>
# Temporary hack:
# Where multiple treatments share a location,
# use only one of them
- group_by(lat, lon, pft) |>
+ group_by(lat, lon, site.pft) |>
slice_sample(n = 1) |>
ungroup() |>
- select(id, field_id = parcel_id, lat, lon, site.pft = pft) |>
- write.csv("validation_site_info.csv", row.names = FALSE)
+ write.csv(output_file, row.names = FALSE)
diff --git a/tools/run_CA_grid_ERA5_nc_extraction.R b/tools/run_CA_grid_ERA5_nc_extraction.R
index c0bf935..68d6fdb 100644
--- a/tools/run_CA_grid_ERA5_nc_extraction.R
+++ b/tools/run_CA_grid_ERA5_nc_extraction.R
@@ -17,7 +17,7 @@ ca_bboxgrid <- expand.grid(
lon = seq(from = -124.5, to = -114, by = 0.5),
lat = seq(from = 32.5, to = 42, by = 0.5)
) |>
- mutate(id = paste0(lat, "N_", abs(lon), "W"))
+ dplyr::mutate(id = paste0(lat, "N_", abs(lon), "W"))
ca_gridcell_ids <- ca_bboxgrid |>
vect(crs = "epsg:4326") |>
project(ca_shp) |>
@@ -25,7 +25,7 @@ ca_gridcell_ids <- ca_bboxgrid |>
intersect(ca_shp) |>
_$id
ca_grid <- ca_bboxgrid |>
- filter(id %in% ca_gridcell_ids)
+ dplyr::filter(id %in% ca_gridcell_ids)
PEcAn.data.atmosphere::extract.nc.ERA5(
slat = ca_grid$lat,