Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b713d91
Initial plan
Copilot Mar 15, 2026
a055112
Add compress-waveform stage using FlacArray compression
Copilot Mar 15, 2026
29c7d3b
Address code review: use dtype.kind check instead of np.issubdtype fo…
Copilot Mar 15, 2026
928491a
Rewrite compress_waveform with int16 scaling, delta encoding, and das…
Copilot Mar 16, 2026
a238447
Address code review: rename constant, document dask materialization, …
Copilot Mar 16, 2026
c20a76c
Fix dask chunking to station axis, add component delta encoding via m…
Copilot Mar 16, 2026
c542e55
Address code review: explicit .compute(), clearer scale_factor cast, …
Copilot Mar 16, 2026
cbbdfc5
Improve waveform recovery: scale to int32 range (23-bit) instead of i…
Copilot Mar 16, 2026
f30674c
Fix waveform drift: remove time-delta encoding, keep component-delta …
Copilot Mar 16, 2026
40091b9
fix: do not delta encode waveform components
lispandfound Mar 16, 2026
a5930d2
Use level argument
lispandfound Mar 16, 2026
4401979
actually use the precision argument
lispandfound Mar 16, 2026
3511c09
tests(compress-waveform): simplify tests to reflect simplified code
lispandfound Mar 16, 2026
03bf833
fix(compress-waveform): satisfy type checker
lispandfound Mar 16, 2026
80d018d
tests(compress-waveform): remove ds make test dataset
lispandfound Mar 16, 2026
05ff473
fix(hf-sim): fix import
lispandfound Mar 16, 2026
0568c17
remove unused import
lispandfound Mar 16, 2026
034183d
include h5py as a dependency and remove dask
lispandfound Mar 16, 2026
a086d55
feat: add xarray shim for waveform reading
lispandfound Mar 16, 2026
f434df7
fix: ci nitpicks
lispandfound Mar 16, 2026
40f4786
docs: reduce documentation
lispandfound Mar 16, 2026
cd8d660
docs: make CLI docs simpler
lispandfound Mar 16, 2026
4fe3930
docs: precision accuracy
lispandfound Mar 16, 2026
b313109
fix slice indexing
lispandfound Mar 16, 2026
32d785a
feat: support stepped timestepping values
lispandfound Mar 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ dependencies = [
"qcore-utils>=2025.12.2",
"source_modelling>=2025.12.1",
# Data Formats
"dask",
"flacarray",
"geopandas",
"h5py",
"pandas[parquet, hdf5]",
"pyyaml",
"xarray[io]",

# Numerics
"numpy",
"scipy",
Expand All @@ -35,7 +37,6 @@ dependencies = [
"schema", # For loading realisations
"structlog", # Logging.
"psutil", # To get the CPU affinity for jobs

]

[project.optional-dependencies]
Expand Down Expand Up @@ -64,6 +65,7 @@ generate-stoch = "workflow.scripts.generate_stoch:app"
merge-ts = "workflow.scripts.merge_ts:app"
hf-sim = "workflow.scripts.hf_sim:app"
bb-sim = "workflow.scripts.bb_sim:app"
compress-waveform = "workflow.scripts.compress_waveform:app"
im-calc = "workflow.scripts.im_calc:app"
check-srf = "workflow.scripts.check_srf:app"
check-domain = "workflow.scripts.check_domain:app"
Expand Down
74 changes: 74 additions & 0 deletions tests/test_compress_waveform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from pathlib import Path

import numpy as np
import xarray as xr

from workflow import waveform
from workflow.scripts.compress_waveform import (
compress_waveform,
)

# Constants for test data generation
N_COMPONENTS, N_STATIONS, N_TIME = 3, 5, 1000
DT = 0.05


def _make_test_dataset() -> xr.Dataset:
"""Create a simple synthetic waveform dataset for testing."""
time = np.arange(N_TIME) * DT
waveform = (
np.sin(time * 2 * np.pi * 1.0)
+ np.random.default_rng(42).standard_normal((N_COMPONENTS, N_STATIONS, N_TIME))
* 0.1
)

return xr.Dataset(
{"waveform": (["component", "station", "time"], waveform.astype(np.float32))},
coords={
"component": ["x", "y", "z"],
"station": [f"STA{i:02d}" for i in range(N_STATIONS)],
"time": time,
"lat": ("station", np.linspace(-45, -43, N_STATIONS)),
},
attrs={"units": "m/s", "source": "test_gen"},
)


def test_waveform_roundtrip_integrity(tmp_path: Path) -> None:
"""Verify waveform values and metadata survive the compression roundtrip."""
with _make_test_dataset() as ds:
input_path = tmp_path / "input.h5"
original_attrs = ds.attrs
ds.to_netcdf(input_path, engine="h5netcdf")
output_path = tmp_path / "output.h5"

compress_waveform(input_path, output_path)
restored = waveform.load_waveform_dataset(output_path).compute()

restored_subset = {k: v for k, v in restored.attrs.items() if k in original_attrs}
assert restored_subset == original_attrs, (
"Restored attributes do not match original attributes."
)

for coord in ds.coords:
np.testing.assert_array_equal(restored[coord].values, ds[coord].values)

xr.testing.assert_allclose(restored, ds, atol=5e-4)


def test_compression_efficiency(tmp_path: Path) -> None:
"""Verify the compressed file is actually smaller than the raw values."""
input_path = tmp_path / "input.h5"
output_path = tmp_path / "output.h5"

with _make_test_dataset() as ds:
ds.to_netcdf(input_path, engine="h5netcdf")

compress_waveform(input_path, output_path)

raw_size = input_path.stat().st_size
compressed_size = output_path.stat().st_size

assert compressed_size < raw_size, (
f"Compression failed to reduce size: {compressed_size} >= {raw_size}"
)
83 changes: 83 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

89 changes: 89 additions & 0 deletions workflow/scripts/compress_waveform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Compress Waveform.

Description
-----------
Compress a broadband waveform HDF5 file using FLAC compression.

Inputs
------
1. A broadband waveform file (HDF5/NetCDF4 format, output of ``bb-sim``).

Outputs
-------
A compressed waveform file in HDF5 format with FlacArray-encoded waveform data.

Environment
-----------
Can be run in the cybershake container. Can also be run from your own
computer using the ``compress-waveform`` command which is installed after running
``pip install workflow@git+https://github.com/ucgmsim/workflow``.

Usage
-----
``compress-waveform WAVEFORM_FFP OUTPUT_FFP``

For More Help
-------------
See the output of ``compress-waveform --help``.
"""

from pathlib import Path
from typing import Annotated

import flacarray.hdf5
import h5py
import typer
import xarray as xr

from qcore import cli
from workflow import log_utils

app = typer.Typer()


@cli.from_docstring(app)
@log_utils.log_call()
def compress_waveform(
waveform_ffp: Annotated[Path, typer.Argument(dir_okay=False, exists=True)],
output_ffp: Annotated[Path, typer.Argument(dir_okay=False, writable=True)],
level: Annotated[int, typer.Option(min=0, max=8)] = 5,
precision: Annotated[int, typer.Option(min=1)] = 4,
) -> None:
"""Compress a broadband waveform file using FLAC.

Parameters
----------
waveform_ffp : Path
Path to the input broadband waveform file (HDF5/NetCDF4).
output_ffp : Path
Path to the output compressed HDF5 file.
level : int, optional
FLAC compression level (0-8). Higher values compress more but
are slower. Defaults to 5.
precision : int, optional
FLAC precision level (in significant digits of input data). Higher values compress less but
have more precision. Defaults to 4.
"""
with (
xr.open_dataset(waveform_ffp, engine="h5netcdf") as broadband,
):
broadband.drop_vars("waveform").to_netcdf(output_ffp, engine="h5netcdf")
with h5py.File(output_ffp, "a") as hdf:
group = hdf.create_group("_flac_compressed_waveform")
group.attrs["flac_array"] = True
group.attrs["name"] = "waveform"
group.attrs["shape"] = broadband.waveform.shape
group.attrs["dims"] = broadband.waveform.dims
group.attrs["dtype"] = str(broadband.waveform.dtype)

flacarray.hdf5.write_array(
broadband.waveform.values,
group,
precision=precision,
level=level,
use_threads=True,
)


if __name__ == "__main__":
app()
1 change: 1 addition & 0 deletions workflow/scripts/hf_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"""

import concurrent.futures
import hashlib
import subprocess
import tempfile
from collections.abc import Iterable
Expand Down
Loading
Loading