mehta-lab · edyoshikun · May 22, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/.envrc b/.envrc
@@ -0,0 +1,3 @@
+export CUDA_PATH=/hpc/apps/cuda/12.8.0_570.86.10  # CUDA toolkit root used for the PATH / LD_LIBRARY_PATH entries below
+export PATH="$CUDA_PATH/bin:$PATH"  # prepend the toolkit's bin directory
+export LD_LIBRARY_PATH="$CUDA_PATH/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"  # append ':<old value>' only when set; a trailing ':' is an empty entry, which the loader treats as the current directory
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -118,10 +118,50 @@ jobs:
         run: uv run --frozen pytest
         working-directory: applications/${{ matrix.application }}
 
+  test-dynacell-configs:
+    name: Test dynacell benchmark configs (Python 3.13, ubuntu-latest)
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Set up uv with Python 3.13
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: "3.13"
+          enable-cache: true
+          cache-suffix: ubuntu-latest-3.13
+
+      - name: Install minimal dynacell (base deps + test group)
+        run: uv sync --frozen --group test  # --frozen: install from uv.lock as-is, without updating the lockfile
+        working-directory: applications/dynacell
+
+      - name: Run benchmark-schema + submit-tool + eval-runtime tests
+        # - tests/test_runtime.py + tests/test_evaluation_pipeline_parallel.py:
+        #   the dynacell.evaluation.runtime module + FovResult pickle contract.
+        # - tests/test_evaluation_pipeline_parallel_cpu.py: evaluate_predictions
+        #   end-to-end (serial vs spawn-process) on a tiny iohub fixture + prebuilt
+        #   mask cache. No eval extras (cellpose, transformers, cubic) needed:
+        #   target_name=er + require_complete_cache=true short-circuit the
+        #   segmenter and feature-extractor loads.
+        # - tests/test_evaluation_grouped.py: the multi-condition driver against
+        #   the same cache-only fixture.
+        run: |
+          uv run --frozen pytest \
+            tests/test_benchmark_config_composition.py \
+            tests/test_submit_benchmark_job.py \
+            tests/test_runtime.py \
+            tests/test_evaluation_pipeline_parallel.py \
+            tests/test_evaluation_pipeline_parallel_cpu.py \
+            tests/test_evaluation_grouped.py \
+            -v
+        working-directory: applications/dynacell
+
   check:
     name: All tests pass
     if: always()
-    needs: [test, test-data, test-data-extras, test-applications]
+    needs: [test, test-data, test-data-extras, test-applications, test-dynacell-configs]
     runs-on: ubuntu-latest
     steps:
       - name: Verify all test jobs succeeded

diff --git a/.gitignore b/.gitignore
@@ -66,3 +66,7 @@ slurm*.out
 lightning_logs/
 
 # NOTE: uv.lock is NOT ignored - it should be tracked for reproducibility
+
+checkpoints/
+
+plot_related/
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -1,16 +1,10 @@
-# CLAUDE.md
+# VisCy — Claude Code Reference
 
-Project-specific instructions for Claude Code sessions in this repository.
+## Project
 
-## Git Workflow
-- **NEVER** use `git commit --amend` or `git push --force` / `--force-with-lease` unless the user explicitly requests it. Always create NEW commits.
-- ALWAYS use atomic commits: one logical change per commit. Never bundle unrelated changes.
-- Never use `git add -A` or `git add .`. Always stage specific files by name.
-- Always pull before pushing. If push is rejected, pull and retry — never force-push.
-
-## Repository Structure
+VisCy is a **uv workspace monorepo** for virtual staining and computational microscopy. Sub-packages live under `packages/`.
 
-VisCy is a **uv workspace monorepo**. Sub-packages live under `packages/`:
+## Repo Layout
 
 ```
 pyproject.toml              # Root config (ruff, pytest, uv workspace)
@@ -28,51 +22,116 @@ applications/               # Self-contained research applications
 - **Applications must not import from each other.** If two applications need the same logic, move it to an existing package or create a new one.
 - Applications are consumers of packages — the dependency graph always flows `applications/ → packages/`, never sideways.
 
-## Code Style
+---
+
+## Development
 
+### Environment Setup
 
-## Testing
+Use the `uv` package manager. Run commands with `uv run <command>`. To change dependencies, edit `pyproject.toml`, then run `uv sync` to update `uv.lock`.
 
 ```sh
-uv run pytest                          # all tests
-uv run pytest packages/viscy-data/     # single package (data)
-uv run pytest packages/viscy-models/   # single package (models)
+uv venv -p 3.13
+uv sync --all-packages --all-extras
 ```
 
-## Common Commands
+If `uv` is not installed:
+```sh
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
 
+On HPC, symlink the uv cache out of your home directory first:
 ```sh
-uvx ruff check packages/       # lint
+mkdir -p /hpc/mydata/firstname.lastname/.cache/uv && ln -s /hpc/mydata/firstname.lastname/.cache/uv ~/.cache/uv
+```
+
+For full setup instructions (installing uv, creating a venv, syncing dependencies), see [CONTRIBUTING.md](./CONTRIBUTING.md).
+
+### SLURM scripts for Lightning DDP jobs
+
+When hand-writing `.slurm` scripts that launch Lightning via `srun`, always use `--ntasks-per-node=N` (not `--ntasks=N`). Lightning's `SLURMEnvironment` validates `SLURM_NTASKS_PER_NODE` at trainer init and raises `RuntimeError: You set --ntasks=N in your SLURM bash script, but this variable is not supported. HINT: Use --ntasks-per-node=N instead.` — the job then dies seconds into the allocation.
+
+Invariant: `#SBATCH --ntasks-per-node=N` must equal `trainer.devices` in the YAML config and `#SBATCH --gpus=N` (single-node) or `#SBATCH --gpus-per-node=N` (multi-node).
+
+The dynacell launcher (`applications/dynacell/tools/submit_benchmark_job.py`) already emits `--ntasks-per-node` correctly; this note is for hand-written scripts (e.g., `applications/cytoland/examples/configs/*/run_*.slurm`).
+
+### Joint vs single-set training batch semantics
+
+`HCSDataModule` and `BatchedConcatDataModule` produce the same number of GPU samples per training step — but the YAML `batch_size` value that gets there is **different by a factor of `num_samples`**. This is easy to misread when skimming either config.
+
+| DataModule | `train_dataloader` divides by `num_samples`? | Samples per step |
+|---|---|---|
+| `HCSDataModule` (single-set) | yes (`hcs.py` `train_dataloader`) | `batch_size` |
+| `ConcatDataModule` (parent class) | yes (`combined.py` `train_dataloader`) | `batch_size` |
+| `BatchedConcatDataModule` (joint) | **no** (`combined.py` overrides; uses `batch_size` as-is) | `batch_size * num_samples` |
+
+To match the same effective per-step samples between a single-set and a joint config, **set `joint.batch_size = single_set.batch_size / num_samples`**.
+
+Examples (verified against the `applications/dynacell/configs/benchmarks/virtual_staining/_internal/shared/model/data_overlays/` overlays + their joint leaves):
+
+- FCMAE (`fcmae_vscyto3d_*`): single-set `batch_size: 32, num_samples: 4` → joint `batch_size: 8, num_samples: 4` → both yield **32 samples/step**.
+- FNet3D (`fnet3d_paper`): single-set `batch_size: 48, num_samples: 8` → joint `batch_size: 6, num_samples: 8` → both yield **48 samples/step**.
+
+`HCSDataModule._train_transform` enforces `batch_size % num_samples == 0` for single-set use because `train_dataloader` would otherwise round down silently. The check is suppressed for `BatchedConcatDataModule` children via the `_is_batched_concat_child` flag set in the wrapper's `setup()` — joint configs are free to pick any `(batch_size, num_samples)` pair as long as the product is the desired sample count. **Do not** "fix" a joint config by raising `batch_size` to satisfy the divisibility rule; it would multiply effective samples by `num_samples`.
+
+When in doubt, read both `train_dataloader` overrides directly — they are short. Don't infer from comments alone.
+
+### Common Commands
+
+```sh
+uvx ruff check packages/        # lint
 uvx ruff check --fix packages/  # lint + auto-fix
 uvx ruff format packages/       # format
+uv run pytest                    # all tests
 ```
 
-## Code Style
+### Testing
+
+```sh
+uv run pytest                          # all tests
+uv run pytest packages/viscy-data/     # single package (data)
+uv run pytest packages/viscy-models/   # single package (models)
+```
+
+Prefer `{file}_test.py` in the same directory as `{file}.py`, unless there are import issues, in which case use `tests/`.
+
+---
+
+## Project Conventions
+
+- Ruff config is centralized in the root `pyproject.toml` only. Sub-packages must NOT have their own `[tool.ruff.*]` sections. Ruff does not inherit config — any `[tool.ruff.*]` in a sub-package silently overrides the entire root config (including `lint.select`, `per-file-ignores`, etc.).
+- Run `uvx prek run --files {files_you_edited}` (unless the change was simple) and fix typing and linting errors. Use `# type: ignore` as needed. The pre-commit hooks report type errors, which is useful — especially for catching incorrect code — but for many minor changes it's better to run them after testing. Use a subagent to apply complex fixes.
+
+---
+
+## Engineering Standards
+
+### Git Workflow
+
+- **NEVER** use `git commit --amend` or `git push --force` / `--force-with-lease` unless the user explicitly requests it. Always create NEW commits.
+- ALWAYS use atomic commits: one logical change per commit. Never bundle unrelated changes.
+- Never use `git add -A` or `git add .`. Always stage specific files by name.
+- Always pull before pushing. If push is rejected, pull and retry — never force-push.
+
+### Code Style
 
-### General
-- **Ruff config is centralized in the root `pyproject.toml` only.**
-  Sub-packages must NOT have their own `[tool.ruff.*]` sections.
-  Ruff does not inherit config — any `[tool.ruff.*]` in a sub-package
-  silently overrides the entire root config (including `lint.select`,
-  `per-file-ignores`, etc.).
 - Docstrings use **numpy style** (`convention = "numpy"`).
 - Lint rules: `D, E, F, I, NPY, PD, W`.
 - `D` rules are ignored in `**/tests/**` and notebooks.
 - Format: double quotes, spaces, 120 char line length.
-- Prefer {file}_test.py in the same directory as {file}.py, unless there are import issues, in which case use tests/...
-- Run `uvx prek run --files {files_you_editted}` (unless the change was simple) and fix typing and linting errors, you make `# type: ignore` as needed.
-  The precommit will give you type errors which is nice - especially to know if you have incorrect code - but for many minor changes it's better to do this after testing.
-  Use a subagent to apply complex fixes.
-- Use a subagent to run tests and complex bash commands, especially that which you think will return complex output.
+- Use a subagent to run tests and complex bash commands, especially those expected to return complex output.
+- Run independent tasks (multi-file edits across separate concerns, cross-cutting verifications, distinct review angles) in parallel via concurrent subagents in a single message. Subagent startup overhead is negligible relative to sequential blocking. Only sequence subagents when a later task needs an earlier task's output.
 
-### Avoid Backwards Compatibility
-In most cases it is incorrect to maintain backwards compatibility with a previous pipeline. This is a research codebase - changes are expected and encouraged. Keeping backwards compatibility risks MORE bugs, since someone can unknowingly run old code.
+#### Avoid Backwards Compatibility
+
+In most cases it is incorrect to maintain backwards compatibility with a previous pipeline. This is a research codebase — changes are expected and encouraged. Keeping backwards compatibility risks MORE bugs, since someone can unknowingly run old code.
 
 If you believe it is important to maintain backwards compatibility, explicitly ask the user if you should do so during the planning stage. If the user says no, then do not maintain backwards compatibility.
 
 Delete and remove old code that is not used.
 
-### Use Context Managers for Resources
+#### Use Context Managers for Resources
+
 Always use context managers (`with` statements) when opening external resources like zarr stores, files, or database connections. Never assign them to a variable without a context manager — this leaks file handles and locks.
 
 ```python
@@ -84,95 +143,76 @@ with open_ome_zarr(path, mode="r") as plate:
 plate = open_ome_zarr(path, mode="r")
 ```
 
-### Prefer Raising Errors
-In general, prefer raising errors instead of silently catching them. Errors are good and warn us of issues in the script. For example, prefer `value = my_dictionary['key']` over `value = my_dictionary.get('key')` since the former will raise a `KeyError` to signal that the underlying data is not behaving as expected.
+#### Prefer Raising Errors
+
+Prefer raising errors instead of silently catching them. Errors are good and warn us of issues. For example, prefer `value = my_dictionary['key']` over `value = my_dictionary.get('key')` since the former will raise a `KeyError` to signal that the underlying data is not behaving as expected.
 
 Only catch errors when there is a good reason to do so: for example, catching HTTP errors in order to retry a request.
 
 If you find yourself writing an if statement, fallback, or except statement designed to avoid errors, ask yourself if it would be better to raise the error as a signal to the user.
 
+#### Use Real Integration Tests
 
-### Use Real Integration Tests
-Tests should directly *import* the actual code we are trying to test. For example, if you are trying to test `my_function` on some sample data, your test should directly import `my_function` and run it on the sample data. AVOID testing "key behavior" or components of the pipeline, since this can miss bugs.
+Tests should directly *import* the actual code we are trying to test. For example, if you are trying to test `my_function` on some sample data, your test should directly import `my_function` and run it on the sample data. Avoid testing "key behavior" or components in isolation when an integration test would catch more bugs.
 
 Ask yourself if your test is actually covering the true function.
 
-### Imports
-- Import at the top of the file. Don't use inline imports without strong reason.
-- Use absolute imports (`from projects.my_directory.my_file`) instead of relative.
-- Do not modify `sys.path` for imports.
-
-## Development Environment
-
-### Environment
-Use `uv` package manager. Run commands with `uv run <command>`. Edit `pyproject.toml` to modify dependencies and sync to update `uv.lock`
-
-For full setup instructions (installing uv, creating a venv, syncing dependencies), see [CONTRIBUTING.md](./CONTRIBUTING.md).
-
-Quick start:
-```sh
-uv venv -p 3.13
-uv sync --all-packages --all-extras
-uv run pytest
-```
+#### Imports
 
-If `uv` is not installed:
-```sh
-curl -LsSf https://astral.sh/uv/install.sh | sh
-```
+- Import at the top of the file. No inline imports without strong reason.
+- Use absolute imports (`from packages.my_directory.my_file`) instead of relative.
+- Do not modify `sys.path` for imports.
 
-On HPC, symlink the uv cache out of your home directory first:
-```sh
-mkdir -p /hpc/mydata/firstname.lastname/.cache/uv && ln -s /hpc/mydata/firstname.lastname/.cache/uv ~/.cache/uv
-```
+### Coding Philosophy
 
-## Coding
+#### 1. Think Before Coding
 
-1. Think Before Coding
 Don't assume. Don't hide confusion. Surface tradeoffs.
 
 Before implementing:
+- State your assumptions explicitly. If uncertain, ask.
+- If multiple interpretations exist, present them — don't pick silently.
+- If a simpler approach exists, say so. Push back when warranted.
+- If something is unclear, stop. Name what's confusing. Ask.
+
+#### 2. Simplicity First
 
-State your assumptions explicitly. If uncertain, ask.
-If multiple interpretations exist, present them - don't pick silently.
-If a simpler approach exists, say so. Push back when warranted.
-If something is unclear, stop. Name what's confusing. Ask.
-2. Simplicity First
 Minimum code that solves the problem. Nothing speculative.
 
-No features beyond what was asked.
-No abstractions for single-use code.
-No "flexibility" or "configurability" that wasn't requested.
-No error handling for impossible scenarios.
-If you write 200 lines and it could be 50, rewrite it.
-Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify.
+- No features beyond what was asked.
+- No abstractions for single-use code.
+- No "flexibility" or "configurability" that wasn't requested.
+- No error handling for impossible scenarios.
+- If you write 200 lines and it could be 50, rewrite it.
+- Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify.
+
+#### 3. Surgical Changes
 
-3. Surgical Changes
 Touch only what you must. Clean up only your own mess.
 
 When editing existing code:
+- Don't "improve" adjacent code, comments, or formatting.
+- Don't refactor things that aren't broken.
+- Match existing style, even if you'd do it differently.
+- If you notice unrelated dead code, mention it — don't delete it.
 
-Don't "improve" adjacent code, comments, or formatting.
-Don't refactor things that aren't broken.
-Match existing style, even if you'd do it differently.
-If you notice unrelated dead code, mention it - don't delete it.
 When your changes create orphans:
+- Remove imports/variables/functions that YOUR changes made unused.
+- Don't remove pre-existing dead code unless asked.
+
+The test: every changed line should trace directly to the user's request.
 
-Remove imports/variables/functions that YOUR changes made unused.
-Don't remove pre-existing dead code unless asked.
-The test: Every changed line should trace directly to the user's request.
+#### 4. Goal-Driven Execution
 
-4. Goal-Driven Execution
 Define success criteria. Loop until verified.
 
 Transform tasks into verifiable goals:
+- "Add validation" → "Write tests for invalid inputs, then make them pass"
+- "Fix the bug" → "Write a test that reproduces it, then make it pass"
+- "Refactor X" → "Ensure tests pass before and after"
 
-"Add validation" → "Write tests for invalid inputs, then make them pass"
-"Fix the bug" → "Write a test that reproduces it, then make it pass"
-"Refactor X" → "Ensure tests pass before and after"
 For multi-step tasks, state a brief plan:
-
 1. [Step] → verify: [check]
 2. [Step] → verify: [check]
-3. [Step] → verify: [check]
+
 Strong success criteria let you loop independently. Weak criteria ("make it work") require constant clarification.
diff --git a/applications/cytoland/examples/configs/dynacell/fit_fnet3d_sec61b.yml b/applications/cytoland/examples/configs/dynacell/fit_fnet3d_sec61b.yml
@@ -5,7 +5,8 @@
 # Batch related launches with:
 #   export VISCY_WANDB_LAUNCH=20260401-augfix-r1
 base:
-  - ../recipes/trainer/fit_1gpu.yml
+  - ../recipes/trainer/fit.yml
+  - ../recipes/topology/single_gpu.yml
   - ../recipes/data/hcs_sec61b_3d.yml
   - ../recipes/models/fnet3d_z8.yml
 
@@ -20,9 +21,12 @@ model:
     schedule: WarmupCosine
 
 trainer:
+  precision: bf16-mixed
   max_epochs: 100
   logger:
     init_args:
+      # Override cytoland's default project: this bridge trains on a dynacell dataset (iPSC SEC61B).
+      project: dynacell
       name: FNet3D_iPSC_SEC61B
       save_dir: /hpc/projects/comp.micro/virtual_staining/models/dynacell_cytoland/ipsc/sec61b/fnet3d
   callbacks: