Skip to content

Commit a5a84e3

Browse files
authored
feat: mock APIs for snapshot tests (#361)
* feat: update US tests to use snapshotted API return data * chore: update snapshots * feat: update Makefile for new testing structure * feat: add github workflow for regular validation of API * chore: update docs
1 parent f4fdadd commit a5a84e3

41 files changed

Lines changed: 49052 additions & 3372 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
name: API snapshot check
2+
3+
on:
4+
schedule:
5+
- cron: '0 16 * * 1' # Monday 08:00 PST (UTC-8)
6+
workflow_dispatch:
7+
8+
jobs:
9+
check-api-snapshot:
10+
runs-on: ubuntu-latest
11+
12+
steps:
13+
- name: Checkout
14+
uses: actions/checkout@v5
15+
16+
- name: Update and install Linux packages
17+
run: |
18+
sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable
19+
sudo apt-get update
20+
sudo apt-get install libgdal-dev gdal-bin python3-gdal
21+
22+
- name: Setup Python
23+
uses: actions/setup-python@v6
24+
with:
25+
python-version: '3.13.7'
26+
cache-dependency-path: |
27+
'requirements.txt'
28+
'requirements-dev.txt'
29+
30+
- name: Install uv
31+
uses: astral-sh/setup-uv@v6
32+
with:
33+
enable-cache: true
34+
cache-dependency-glob: 'requirements**.txt'
35+
36+
- name: Install Python dependencies
37+
run: |
38+
make install DC_ENV=ci
39+
make install_dev DC_ENV=ci
40+
41+
- name: Run API snapshot tests
42+
run: make test_api_snapshot

.github/workflows/build.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232
cache-dependency-glob: 'requirements**.txt'
3333

3434
- name: Install Python dependencies
35-
run: make install-dev DC_ENV=ci
35+
run: make install_dev DC_ENV=ci
3636

3737
- name: Run lint checks
3838
run: make lint
@@ -68,7 +68,7 @@ jobs:
6868
- name: Install Python dependencies
6969
run: |
7070
make install DC_ENV=ci
71-
make install-dev DC_ENV=ci
71+
make install_dev DC_ENV=ci
7272
7373
- name: Restore data cache
7474
uses: actions/cache@v4
@@ -79,7 +79,7 @@ jobs:
7979

8080
- name: Download data
8181
if: ${{ hashFiles('Data/*') == '' }}
82-
run: make download-soil-data
82+
run: make download_soil_data
8383

8484
- name: Start soil id DB
8585
run: docker compose up -d

Makefile

Lines changed: 32 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ endif
55
install:
66
uv pip install -r requirements.txt $(UV_FLAGS)
77

8-
install-dev:
8+
install_dev:
99
uv pip install -r requirements-dev.txt $(UV_FLAGS)
1010

11-
setup-git-hooks:
11+
setup_git_hooks:
1212
@pre-commit install
1313

1414
lint:
@@ -22,46 +22,54 @@ format:
2222
lock:
2323
CUSTOM_COMPILE_COMMAND="make lock" uv pip compile --upgrade --generate-hashes requirements/base.in -o requirements.txt
2424

25-
lock-package:
25+
lock_package:
2626
CUSTOM_COMPILE_COMMAND="make lock" uv pip compile --upgrade-package $(PACKAGE) --generate-hashes --emit-build-options requirements/base.in requirements/deploy.in -o requirements.txt
2727

28-
lock-dev:
28+
lock_dev:
2929
CUSTOM_COMPILE_COMMAND="make lock-dev" uv pip compile --upgrade --generate-hashes requirements/dev.in -o requirements-dev.txt
3030

31-
lock-dev-package:
31+
lock_dev_package:
3232
CUSTOM_COMPILE_COMMAND="make lock-dev" uv pip compile --upgrade-package $(PACKAGE) --generate-hashes requirements/dev.in -o requirements-dev.txt
3333

34-
build:
35-
echo "Building TK..."
36-
37-
check_rebuild:
38-
./scripts/rebuild.sh
39-
4034
clean:
4135
@find . -name *.pyc -delete
4236
@find . -name __pycache__ -delete
4337

44-
test: clean check_rebuild
38+
# run the standard test suite (unit + integration, no api_snapshots)
39+
test:
4540
if [ -z "$(PATTERN)" ]; then \
46-
$(DC_RUN_CMD) pytest soil_id -vv; \
41+
pytest soil_id -m "not api_snapshot"; \
4742
else \
48-
$(DC_RUN_CMD) pytest soil_id -vv -k "$(PATTERN)"; \
43+
pytest soil_id -m "not api_snapshot" -k "$(PATTERN)"; \
4944
fi
5045

51-
test_update_snapshots: clean check_rebuild
52-
if [ -z "$(PATTERN)" ]; then \
53-
$(DC_RUN_CMD) pytest soil_id --snapshot-update; \
54-
else \
55-
$(DC_RUN_CMD) pytest soil_id --snapshot-update -k "$(PATTERN)"; \
56-
fi
46+
# All tests except api_snapshot and integration (no live external APIs)
47+
test_unit:
48+
pytest soil_id -m "not api_snapshot and not integration"
49+
50+
# update the unit test snapshots (but not the API snapshots)
51+
# update the unit test snapshots (but not the API snapshots)
test_update_unit_snapshots:
	pytest soil_id -m "not api_snapshot and not integration" --snapshot-update
53+
54+
# Integration smoke tests only (full live API run, no output validation)
55+
test_integration:
56+
pytest soil_id -m integration
57+
58+
# API response snapshot tests only (compares live API responses to stored snapshots)
59+
test_api_snapshot:
60+
pytest soil_id -m api_snapshot
61+
62+
# Refresh stored API response snapshots from live APIs
63+
test_update_api_snapshots:
64+
pytest soil_id -m api_snapshot --snapshot-update
5765

58-
test-verbose:
66+
test_verbose:
5967
pytest soil_id --capture=no
6068

61-
test-profile:
69+
test_profile:
6270
pytest soil_id --profile
6371

64-
test-graphs: test-profile graphs
72+
# profile the test run, then render call graphs from the profile output
# NOTE: prerequisite renamed from `test-profile` to `test_profile` to match
# this commit's underscore naming convention; the old name no longer exists.
test_graphs: test_profile graphs
6573

6674
graphs:
6775
# gprof2dot -f pstats prof/combined.prof | dot -Tsvg -o prof/combined.svg
@@ -96,7 +104,7 @@ process_bulk_test_results_legacy:
96104
# 1P3xl1YRlfcMjfO_4PM39tkrrlL3hoLzv: gsmsoilmu_a_us.prj
97105
# 1K0GkqxhZiVUND6yfFmaI7tYanLktekyp: gsmsoilmu_a_us.dbf
98106
# 1z7foFFHv_mTsuxMYnfOQRvXT5LKYlYFN: SoilID_US_Areas.shz
99-
download-soil-data:
107+
download_soil_data:
100108
mkdir -p Data
101109
cd Data; \
102110
gdown 1tN23iVe6X1fcomcfveVp4w3Pwd0HJuTe; \

README.md

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,42 @@
22

33
## Requirements
44

5-
- Python: 3.12 or better
5+
- Python: 3.12 or better
66

77
# Contributing
88

99
Configure git to automatically lint your code and validate your commit messages:
1010

1111
```sh
12-
$ make setup-git-hooks
12+
$ make setup_git_hooks
1313
```
1414

1515
Set up a virtual environment and install dependencies:
1616

1717
```sh
1818
$ uv venv
1919
$ source .venv/bin/activate
20-
$ make install && make install-dev
20+
$ make install && make install_dev
2121
```
2222

2323
## explanation of algorithm
2424

2525
### terminology
2626

27-
- soil map unit: (possibly disjoint) geographic area that is associated with soil component percentage / arial coverage
28-
- soil series: collection of related soil components
29-
- soil component: description of various soil properties at specific depth intervals
27+
- soil map unit: (possibly disjoint) geographic area that is associated with soil component percentage / areal coverage
28+
- soil series: collection of related soil components
29+
- soil component: description of various soil properties at specific depth intervals
3030

3131
### references
3232

33-
- equation 1 in https://landpotential.org/wp-content/uploads/2020/07/sssaj-0-0-sssaj2017.09.0337.pdf
33+
- equation 1 in https://landpotential.org/wp-content/uploads/2020/07/sssaj-0-0-sssaj2017.09.0337.pdf
3434

3535
### dependencies
3636

37-
- simple features: https://r-spatial.github.io/sf/index.html
38-
- well-known geometry: https://paleolimbot.github.io/wk/
39-
- R package for querying soilDB: https://ncss-tech.github.io/soilDB/
40-
- dplyr: https://dplyr.tidyverse.org/
37+
- simple features: https://r-spatial.github.io/sf/index.html
38+
- well-known geometry: https://paleolimbot.github.io/wk/
39+
- R package for querying soilDB: https://ncss-tech.github.io/soilDB/
40+
- dplyr: https://dplyr.tidyverse.org/
4141

4242
### algorithm
4343

@@ -72,19 +72,27 @@ Input: a specific point in lat/lon, and a set of depth intervals.
7272

7373
### Regular tests
7474

75-
There is a small suite of integration tests which can be run with the `make test` command, and gets run regularly by CI.
75+
There are several smaller test suites:
76+
77+
- There is a set of "unit" tests, which really are testing the entire codebase more or less, but don't rely on any external API services, instead using snapshotted data from those services. You can run these tests with `make test_unit`.
78+
- These tests mostly produce snapshots of algorithm output rather than validating specific properties of the output, so they validate that the algorithm hasn't changed (or how it has changed) rather than that it is correct. If the snapshots have changed in a desirable way, you can update them with `make test_update_unit_snapshots`.
79+
- For US only, there is a set of "integration" tests which run the algorithm against the live API services, but just confirm that the algorithm doesn't crash, they don't validate the output since it can change over time. These can be run with `make test_integration`.
80+
- The unit and integration tests can be run together with `make test` for convenience: this is what must pass for a PR to be mergeable.
81+
- The API snapshots themselves can be checked against the live API for drift using `make test_api_snapshot`. They can be updated to the new live API values using `make test_update_api_snapshots`.
7682

7783
### Bulk test
7884

7985
There is a large suite of integration tests which takes many hours to run. It comes in the format of two scripts:
8086

81-
- Run `make generate_bulk_test_results` to run the algorithm over a collection of 3000 soil pits, which will accumulate the results in a log file.
82-
- Run `RESULTS_FILE=$RESULTS_FILE make process_bulk_test_results` to view statistics calculated over that log file.
87+
- Run `make generate_bulk_test_results_us` or `make generate_bulk_test_results_global` to run the algorithm over a collection of thousands of soil pits with soil IDs given by trained data collectors, which will accumulate the results in a log file. This can take several hours or potentially need to run overnight (the US tests especially are slow due to the speed of the external API services).
88+
- Run `RESULTS_FILE=$RESULTS_FILE make process_bulk_test_results_us` or `RESULTS_FILE=$RESULTS_FILE make process_bulk_test_results_global` to view statistics calculated over that log file. This can be run concurrently with `generate_bulk_test_results` to see statistics over the soil pits which have been run so far.
89+
- It has been nice to have these as two separate scripts because then you can iterate on the processing and display of statistics without interrupting the data collection.
90+
- It would be of value to also be able to run these US tests against snapshotted API data, it would just be much more onerous to collect and update the data.
8391

8492
## Acknowledgements
8593

86-
- Beaudette, D., Roudier, P., Brown, A. (2023). [aqp: Algorithms for Quantitative Pedology](https://CRAN.R-project.org/package=aqp). R package version 2.0.
94+
- Beaudette, D., Roudier, P., Brown, A. (2023). [aqp: Algorithms for Quantitative Pedology](https://CRAN.R-project.org/package=aqp). R package version 2.0.
8795

88-
- Beaudette, D.E., Roudier, P., O'Geen, A.T. [Algorithms for quantitative pedology: A toolkit for soil scientists, Computers & Geosciences](http://dx.doi.org/10.1016/j.cageo.2012.10.020), Volume 52, March 2013, Pages 258-268, ISSN 0098-3004.
96+
- Beaudette, D.E., Roudier, P., O'Geen, A.T. [Algorithms for quantitative pedology: A toolkit for soil scientists, Computers & Geosciences](http://dx.doi.org/10.1016/j.cageo.2012.10.020), Volume 52, March 2013, Pages 258-268, ISSN 0098-3004.
8997

90-
- soilDB: Beaudette, D., Skovlin, J., Roecker, S., Brown, A. (2024). [soilDB: Soil Database Interface](https://CRAN.R-project.org/package=soilDB). R package version 2.8.3.
98+
- soilDB: Beaudette, D., Skovlin, J., Roecker, S., Brown, A. (2024). [soilDB: Soil Database Interface](https://CRAN.R-project.org/package=soilDB). R package version 2.8.3.

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ extend-ignore = ["E203"]
2424
[tool.pytest.ini_options]
2525
log_cli = true
2626
log_cli_level = "INFO"
27+
markers = [
28+
"integration: runs against live external APIs; does not validate output",
29+
"api_snapshot: captures and compares API response snapshots against live APIs",
30+
]
2731

2832
[tool.setuptools.dynamic]
2933
dependencies = { file = ["requirements/base.in"] }

0 commit comments

Comments
 (0)