ucgmsim · Copilot · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026 · Mar 16, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,11 +18,13 @@ dependencies = [
   "qcore-utils>=2025.12.2",
   "source_modelling>=2025.12.1",
   # Data Formats
+  "dask",
+  "flacarray",
   "geopandas",
+  "h5py",
   "pandas[parquet, hdf5]",
   "pyyaml",
   "xarray[io]",
-
   # Numerics
   "numpy",
   "scipy",
@@ -35,7 +37,6 @@ dependencies = [
   "schema",    # For loading realisations
   "structlog", # Logging.
   "psutil",    # To get the CPU affinity for jobs
-
 ]
 
 [project.optional-dependencies]
@@ -64,6 +65,7 @@ generate-stoch = "workflow.scripts.generate_stoch:app"
 merge-ts = "workflow.scripts.merge_ts:app"
 hf-sim = "workflow.scripts.hf_sim:app"
 bb-sim = "workflow.scripts.bb_sim:app"
+compress-waveform = "workflow.scripts.compress_waveform:app"
 im-calc = "workflow.scripts.im_calc:app"
 check-srf = "workflow.scripts.check_srf:app"
 check-domain = "workflow.scripts.check_domain:app"

diff --git a/tests/test_compress_waveform.py b/tests/test_compress_waveform.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+import numpy as np
+import xarray as xr
+
+from workflow import waveform
+from workflow.scripts.compress_waveform import (
+    compress_waveform,
+)
+
+# Shape of the synthetic waveform array and the step of its time coordinate.
+N_COMPONENTS, N_STATIONS, N_TIME = 3, 5, 1000
+DT = 0.05
+
+
+def _make_test_dataset() -> xr.Dataset:
+    """Create a simple synthetic waveform dataset for testing."""
+    time = np.arange(N_TIME) * DT
+    waveform = (
+        np.sin(time * 2 * np.pi * 1.0)
+        + np.random.default_rng(42).standard_normal((N_COMPONENTS, N_STATIONS, N_TIME))
+        * 0.1
+    )
+
+    return xr.Dataset(
+        {"waveform": (["component", "station", "time"], waveform.astype(np.float32))},
+        coords={
+            "component": ["x", "y", "z"],
+            "station": [f"STA{i:02d}" for i in range(N_STATIONS)],
+            "time": time,
+            "lat": ("station", np.linspace(-45, -43, N_STATIONS)),
+        },
+        attrs={"units": "m/s", "source": "test_gen"},
+    )
+
+
+def test_waveform_roundtrip_integrity(tmp_path: Path) -> None:
+    """Verify waveform values and metadata survive the compression roundtrip."""
+    with _make_test_dataset() as ds:
+        input_path = tmp_path / "input.h5"
+        original_attrs = ds.attrs
+        ds.to_netcdf(input_path, engine="h5netcdf")
+    output_path = tmp_path / "output.h5"
+
+    compress_waveform(input_path, output_path)
+    restored = waveform.load_waveform_dataset(output_path).compute()
+
+    restored_subset = {k: v for k, v in restored.attrs.items() if k in original_attrs}
+    assert restored_subset == original_attrs, (
+        "Restored attributes do not match original attributes."
+    )
+
+    for coord in ds.coords:
+        np.testing.assert_array_equal(restored[coord].values, ds[coord].values)
+
+    xr.testing.assert_allclose(restored, ds, atol=5e-4)
+
+
+def test_compression_efficiency(tmp_path: Path) -> None:
+    """Verify the compressed file is actually smaller than the raw values."""
+    input_path = tmp_path / "input.h5"
+    output_path = tmp_path / "output.h5"
+
+    with _make_test_dataset() as ds:
+        ds.to_netcdf(input_path, engine="h5netcdf")
+
+    compress_waveform(input_path, output_path)
+
+    raw_size = input_path.stat().st_size
+    compressed_size = output_path.stat().st_size
+
+    assert compressed_size < raw_size, (
+        f"Compression failed to reduce size: {compressed_size} >= {raw_size}"
+    )
diff --git a/uv.lock b/uv.lock
diff --git a/workflow/scripts/compress_waveform.py b/workflow/scripts/compress_waveform.py
@@ -0,0 +1,89 @@
+"""Compress Waveform.
+
+Description
+-----------
+Compress a broadband waveform HDF5 file using FLAC compression.
+
+Inputs
+------
+1. A broadband waveform file (HDF5/NetCDF4 format, output of ``bb-sim``).
+
+Outputs
+-------
+A compressed waveform file in HDF5 format with FlacArray-encoded waveform data.
+
+Environment
+-----------
+Can be run in the cybershake container. Can also be run from your own
+computer using the ``compress-waveform`` command which is installed after running
+``pip install workflow@git+https://github.com/ucgmsim/workflow``.
+
+Usage
+-----
+``compress-waveform WAVEFORM_FFP OUTPUT_FFP``
+
+For More Help
+-------------
+See the output of ``compress-waveform --help``.
+"""
+
+from pathlib import Path
+from typing import Annotated
+
+import flacarray.hdf5
+import h5py
+import typer
+import xarray as xr
+
+from qcore import cli
+from workflow import log_utils
+
+app = typer.Typer()  # CLI application object; `compress_waveform` is attached to it below.
+
+
+@cli.from_docstring(app)
+@log_utils.log_call()
+def compress_waveform(
+    waveform_ffp: Annotated[Path, typer.Argument(dir_okay=False, exists=True)],
+    output_ffp: Annotated[Path, typer.Argument(dir_okay=False, writable=True)],
+    level: Annotated[int, typer.Option(min=0, max=8)] = 5,
+    precision: Annotated[int, typer.Option(min=1)] = 4,
+) -> None:
+    """Compress a broadband waveform file using FLAC.
+
+    Parameters
+    ----------
+    waveform_ffp : Path
+        Path to the input broadband waveform file (HDF5/NetCDF4).
+    output_ffp : Path
+        Path to the output compressed HDF5 file.
+    level : int, optional
+        FLAC compression level (0-8). Higher values compress more but
+        are slower. Defaults to 5.
+    precision : int, optional
+        FLAC precision level (in significant digits of input data). Higher values compress less but
+        have more precision. Defaults to 4.
+    """
+    with (
+        xr.open_dataset(waveform_ffp, engine="h5netcdf") as broadband,
+    ):
+        broadband.drop_vars("waveform").to_netcdf(output_ffp, engine="h5netcdf")
+        with h5py.File(output_ffp, "a") as hdf:
+            group = hdf.create_group("_flac_compressed_waveform")
+            group.attrs["flac_array"] = True
+            group.attrs["name"] = "waveform"
+            group.attrs["shape"] = broadband.waveform.shape
+            group.attrs["dims"] = broadband.waveform.dims
+            group.attrs["dtype"] = str(broadband.waveform.dtype)
+
+            flacarray.hdf5.write_array(
+                broadband.waveform.values,
+                group,
+                precision=precision,
+                level=level,
+                use_threads=True,
+            )
+
+
+if __name__ == "__main__":
+    app()  # Invoke the Typer CLI when this module is run as a script.
diff --git a/workflow/scripts/hf_sim.py b/workflow/scripts/hf_sim.py
@@ -33,6 +33,7 @@
 """
 
 import concurrent.futures
+import hashlib
 import subprocess
 import tempfile
 from collections.abc import Iterable