From 24a96090ddf26d4cc57762ccabc66b701881c563 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 25 May 2026 17:33:39 +0100 Subject: [PATCH 1/3] Add loom run examples Signed-off-by: Arthit Suriyawongkul --- .github/workflows/lint.yml | 8 +- .github/workflows/typecheck.yml | 47 +-- CHANGELOG.md | 15 + CITATION.cff | 4 +- README.md | 6 +- codemeta.json | 6 +- docs/design/format-neutral-representation.md | 2 +- docs/design/mlflow-extractor.md | 2 +- docs/design/sbom-fragments.md | 36 +- docs/implementation/demo.md | 2 +- docs/implementation/summary.md | 4 +- examples/sentimentdemo-aibom/.gitignore | 8 + examples/sentimentdemo-aibom/README.md | 385 ++++++++++++++++++ examples/sentimentdemo-aibom/data/raw/neg.txt | 15 + examples/sentimentdemo-aibom/data/raw/pos.txt | 15 + examples/sentimentdemo-aibom/pyproject.toml | 67 +++ .../scripts/run_pipeline.sh | 57 +++ .../src/sentimentdemo/__about__.py | 6 + .../src/sentimentdemo/__init__.py | 9 + .../src/sentimentdemo/evaluate.py | 57 +++ .../src/sentimentdemo/preprocess.py | 111 +++++ .../src/sentimentdemo/train.py | 73 ++++ pyproject.toml | 45 +- src/pitloom/__about__.py | 2 +- src/pitloom/assemble/spdx3/deps.py | 8 +- src/pitloom/assemble/spdx3/fragments.py | 2 +- src/pitloom/export/spdx3_json.py | 4 +- src/pitloom/extract/_pytorch.py | 4 +- src/pitloom/loom.py | 111 ++--- tests/fixtures/fragments/README.md | 2 +- .../training-run-fragment.spdx3.json | 2 +- tests/test_extract_fasttext.py | 2 +- tests/test_fragments.py | 2 +- tests/test_loom.py | 35 +- 34 files changed, 986 insertions(+), 168 deletions(-) create mode 100644 examples/sentimentdemo-aibom/.gitignore create mode 100644 examples/sentimentdemo-aibom/README.md create mode 100644 examples/sentimentdemo-aibom/data/raw/neg.txt create mode 100644 examples/sentimentdemo-aibom/data/raw/pos.txt create mode 100644 examples/sentimentdemo-aibom/pyproject.toml create mode 100755 examples/sentimentdemo-aibom/scripts/run_pipeline.sh create mode 100644 
examples/sentimentdemo-aibom/src/sentimentdemo/__about__.py create mode 100644 examples/sentimentdemo-aibom/src/sentimentdemo/__init__.py create mode 100644 examples/sentimentdemo-aibom/src/sentimentdemo/evaluate.py create mode 100644 examples/sentimentdemo-aibom/src/sentimentdemo/preprocess.py create mode 100644 examples/sentimentdemo-aibom/src/sentimentdemo/train.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 145e97e..5b1f64d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -30,16 +30,16 @@ jobs: - name: Ruff linter run: | - ruff check src/ tests/ + ruff check examples/ src/ tests/ - name: Pylint run: | - pylint src/ tests/ + pylint examples/ src/ tests/ - name: Flake8 run: | - flake8 src/ tests/ + flake8 examples/ src/ tests/ - name: Ruff formatter check run: | - ruff format --check src/ tests/ + ruff format --check examples/ src/ tests/ diff --git a/.github/workflows/typecheck.yml b/.github/workflows/typecheck.yml index 1168b18..a4efa2d 100644 --- a/.github/workflows/typecheck.yml +++ b/.github/workflows/typecheck.yml @@ -29,51 +29,12 @@ jobs: - name: Run mypy run: | - mypy src/pitloom --ignore-missing-imports --no-strict-optional - continue-on-error: true - - pyright: - name: pyright - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v6 - - - name: Set up Python - uses: actions/setup-python@v6 - with: - python-version: "3.10" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pyright - pip install -e . 
+ mypy examples/ src/ tests/ --ignore-missing-imports --no-strict-optional - name: Run pyright run: | - pyright src/pitloom - continue-on-error: true - - pyrefly: - name: pyrefly (via pyflakes) - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v6 - - - name: Set up Python - uses: actions/setup-python@v6 - with: - python-version: "3.10" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pyflakes - pip install -e . + pyright examples/ src/ tests/ - - name: Run pyflakes + - name: Run pyrefly run: | - pyflakes src/pitloom - continue-on-error: true + pyrefly check examples/ src/ tests/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c79b01..946c738 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,21 @@ and this project adheres to - Full release notes: - Commit history: +## [UNRELEASED] + +### Added + +- A complete example for SBOM fragments generation and aggregation + +### Changed + +- Rename context manager method from `loom.shoot` to `loom.run`; + make consistent with MLflow + +### Fixed + +- Fix wrong fickling import in PyTorch extractor + ## [0.7.1] - 2026-05-14 ### Changed diff --git a/CITATION.cff b/CITATION.cff index a147300..067306a 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -9,7 +9,7 @@ abstract: Automated transparency, woven from the ground up. SBOM generation for repository-code: "https://github.com/bact/pitloom" type: software doi: 10.5281/zenodo.19243681 -version: 0.7.1 +version: 0.8.0 license-url: "https://spdx.org/licenses/Apache-2.0" keywords: - sbom @@ -21,4 +21,4 @@ keywords: - gguf - onnx - safetensors -date-released: 2026-05-14 +date-released: 2026-05-25 diff --git a/README.md b/README.md index 0035f81..91c075f 100644 --- a/README.md +++ b/README.md @@ -273,14 +273,14 @@ external SBOM fragments that Pitloom will merge during the build process: from pitloom import loom # Use as a function decorator... 
-@loom.shoot(output_file="fragments/sentiment_model.json") +@loom.run(output_file="fragments/sentiment_model.json") def train_model(): loom.set_model("sentiment-clf") loom.add_dataset("imdb-reviews", dataset_type="text") # ... training logic ... # ...or use as a context manager -with loom.shoot(output_file="fragments/sentiment_model.json"): +with loom.run(output_file="fragments/sentiment_model.json"): loom.set_model("sentiment-clf") loom.add_dataset("imdb-reviews", dataset_type="text") ``` @@ -305,6 +305,8 @@ The generated SBOM will include: - Creator and creation timestamp information - **Metadata provenance** tracking for transparency +See a more complete example in the [`examples/`](./examples/) directory. + ## Metadata provenance Pitloom tracks the source of each metadata field in the SBOM using the SPDX 3 diff --git a/codemeta.json b/codemeta.json index af46a2f..5b69506 100644 --- a/codemeta.json +++ b/codemeta.json @@ -18,8 +18,8 @@ "codeRepository": "https://github.com/bact/pitloom", "copyrightYear": 2026, "dateCreated": "2026-03-27", - "dateModified": "2026-05-14", - "datePublished": "2026-05-14", + "dateModified": "2026-05-25", + "datePublished": "2026-05-25", "description": "Automated transparency, woven from the ground up. SBOM generation for Python & AI projects. 
Extract metadata from GGUF, ONNX, PyTorch, and Safetensors models with native Hatchling build-hook support.", "developmentStatus": "active", "downloadUrl": "https://github.com/bact/pitloom/releases", @@ -54,5 +54,5 @@ "programmingLanguage": "Python 3", "readme": "https://github.com/bact/pitloom/blob/main/README.md", "url": "https://github.com/bact/pitloom", - "version": "0.7.1" + "version": "0.8.0" } diff --git a/docs/design/format-neutral-representation.md b/docs/design/format-neutral-representation.md index 2051d5c..775d7ff 100644 --- a/docs/design/format-neutral-representation.md +++ b/docs/design/format-neutral-representation.md @@ -40,7 +40,7 @@ Extractors Core model Serializers / Assemblers read_pyproject() ─┐ read_setuptools() ─┤ read_ai_model() ─┤─-> DocumentModel ─-> Spdx3Assembler -> SPDX 3 JSON-LD -loom.shoot() (fragments) ─┘ (pitloom.core) [future] CycloneDXAssembler -> CycloneDX JSON +loom.run() (fragments) ─┘ (pitloom.core) [future] CycloneDXAssembler -> CycloneDX JSON [future] AidocRenderer -> AIDOC markdown [future] TechOpsDoc -> documentation ``` diff --git a/docs/design/mlflow-extractor.md b/docs/design/mlflow-extractor.md index ec30fee..f5affc1 100644 --- a/docs/design/mlflow-extractor.md +++ b/docs/design/mlflow-extractor.md @@ -38,7 +38,7 @@ with mlflow.start_run(): mlflow.log_param("learning_rate", 3e-4) mlflow.log_metric("accuracy", 0.95) -with loom.shoot("fragment.spdx3.json") as run: +with loom.run("fragment.spdx3.json") as run: run.set_model("my-transformer") # The same facts, typed again ``` diff --git a/docs/design/sbom-fragments.md b/docs/design/sbom-fragments.md index 8f385ab..d71e571 100644 --- a/docs/design/sbom-fragments.md +++ b/docs/design/sbom-fragments.md @@ -147,7 +147,7 @@ for each object. This approach: ### 3. `loom.py` SDK: sparse API surface -The current `Shoot` context manager supports only `set_model` and +The current `Run` context manager supports only `set_model` and `add_dataset`. 
This is far less expressive than ML tracking SDKs that practitioners already use daily. Key missing capabilities: @@ -158,7 +158,7 @@ practitioners already use daily. Key missing capabilities: - No serialisation of individual dataset elements with schema, provenance, or curation notes. - No model evaluation / scoring records. -- Error is raised if `loom.*` functions are called outside a `Shoot` block; +- Error is raised if `loom.*` functions are called outside a `Run` block; notebook workflows need a more lenient persistent-session mode. ### 4. No W&B Weave integration @@ -326,19 +326,19 @@ in `docs/design/mlflow-extractor.md`. from pitloom import loom # --- Context-managed fragment recording (existing, enhanced) --- -with loom.shoot("fragments/bert-v3.spdx3.json") as shot: - shot.set_model("my-bert", type_of_model="transformer") +with loom.run("fragments/bert-v3.spdx3.json") as run: + run.set_model("my-bert", model_type="transformer") # MLflow-compatible logging functions - shot.log_param("learning_rate", 3e-4) - shot.log_param("batch_size", 32) - shot.log_metric("accuracy", 0.91) - shot.log_metric("f1_score", 0.88) - shot.log_tag("domain", "natural_language_processing") - shot.log_tag(stav.INFO_TRAINING, "Fine-tuned on FLORES-200") + run.log_param("learning_rate", 3e-4) + run.log_param("batch_size", 32) + run.log_metric("accuracy", 0.91) + run.log_metric("f1_score", 0.88) + run.log_tag("domain", "natural_language_processing") + run.log_tag(stav.INFO_TRAINING, "Fine-tuned on FLORES-200") # Dataset documentation - ds = shot.add_dataset("flores-200", dataset_type="text") + ds = run.add_dataset("flores-200", dataset_type="text") ds.set_size(rows=5_000_000) ds.set_license("CC-BY-4.0") ds.set_source_url("https://huggingface.co/datasets/facebook/flores") @@ -346,13 +346,13 @@ with loom.shoot("fragments/bert-v3.spdx3.json") as shot: ds.log_tag("language_count", "200") # Evaluation results (maps to SPDX Annotation) - shot.log_evaluation("flores-dev", {"accuracy": 0.91, 
"bleu": 42.3}) + run.log_evaluation("flores-dev", {"accuracy": 0.91, "bleu": 42.3}) # --- Persistent session mode (for notebooks) --- loom.start_session("fragments/notebook-run.spdx3.json") # ... cell 1 ... -loom.set_model("incremental-model", type_of_model="classifier") +loom.set_model("incremental-model", model_type="classifier") loom.log_param("epochs", 10) # ... cell 2 (appends to same session) ... @@ -367,12 +367,12 @@ loom.end_session() ### Accumulation mode for notebooks The persistent session is backed by an `_ActiveSession` object (distinct -from `_ActiveShot`) that persists in module-level state and writes a +from `_ActiveRun`) that persists in module-level state and writes a checkpoint file to disk on each `loom.save_session()` call. If the kernel restarts, `loom.resume_session("fragments/notebook-run.spdx3.json")` reads the last checkpoint and continues accumulating. -Key difference from `Shoot`: a session does **not** discard partial output +Key difference from `Run`: a session does **not** discard partial output on exception; it preserves whatever has been recorded up to the crash. ### IPython magic integration @@ -659,11 +659,11 @@ all earlier items; each can be delivered independently. ### Phase 2: SDK improvements (notebook and ML workflow ergonomics) -1. **`log_param`, `log_metric`, `log_tag` on `_ActiveShot`** -- expands the - existing `Shoot` API without breaking changes. +1. **`log_param`, `log_metric`, `log_tag` on `_ActiveRun`** -- expands the + existing `Run` API without breaking changes. 2. **`add_dataset` builder object** -- replace the current `add_dataset(name, type)` with a fluent builder that supports `set_size`, `set_license`, etc. -3. **`log_evaluation` on `_ActiveShot`** -- maps to SPDX `Annotation` elements. +3. **`log_evaluation` on `_ActiveRun`** -- maps to SPDX `Annotation` elements. 4. **Persistent session mode** -- `loom.start_session()` / `loom.end_session()`. 5. 
**IPython magic** -- `%%pitloom_record` cell magic; optional, only activated if `ipython` is installed. diff --git a/docs/implementation/demo.md b/docs/implementation/demo.md index 8f7be59..b0f35e8 100644 --- a/docs/implementation/demo.md +++ b/docs/implementation/demo.md @@ -125,7 +125,7 @@ dynamically from training loops into your main SBOM seamlessly: ```python from pitloom import loom -@loom.shoot("fragments/model.json") +@loom.run("fragments/model.json") def run(): loom.set_model("my-classifier") loom.add_dataset("my-text-data") diff --git a/docs/implementation/summary.md b/docs/implementation/summary.md index ac73d70..ce022e5 100644 --- a/docs/implementation/summary.md +++ b/docs/implementation/summary.md @@ -76,7 +76,7 @@ SPDX 3.0 compliant SBOMs in JSON-LD format. - See [docs/design/metadata-provenance.md](../design/metadata-provenance.md) 8. **ML tracking SDK** (`src/pitloom/loom.py`) - - Dual-syntax ContextDecorator (`@loom.shoot` and `with loom.shoot`) + - Dual-syntax ContextDecorator (`@loom.run` and `with loom.run`) - Emits SPDX 3 SBOM fragments automatically during ML executions - Seamlessly ingested into project SBOMs using `[tool.pitloom.fragments]` config @@ -229,7 +229,7 @@ pitloom/ │ ├── __about__.py # Package version (__version__) │ ├── __init__.py │ ├── __main__.py # CLI entry point (loom / python -m pitloom) -│ ├── loom.py # ML tracking SDK (Shoot context manager / decorator) +│ ├── loom.py # ML tracking SDK (Run context manager / decorator) │ └── py.typed # PEP 561 marker ├── tests/ │ ├── fixtures/ diff --git a/examples/sentimentdemo-aibom/.gitignore b/examples/sentimentdemo-aibom/.gitignore new file mode 100644 index 0000000..c2f9ce2 --- /dev/null +++ b/examples/sentimentdemo-aibom/.gitignore @@ -0,0 +1,8 @@ +# Build output +dist/ +*.egg-info/ + +# Generated artefacts - regenerate by re-running the pipeline. 
+data/processed/ +models/ +fragments/ diff --git a/examples/sentimentdemo-aibom/README.md b/examples/sentimentdemo-aibom/README.md new file mode 100644 index 0000000..067e665 --- /dev/null +++ b/examples/sentimentdemo-aibom/README.md @@ -0,0 +1,385 @@ +--- +SPDX-FileCopyrightText: 2026-present Arthit Suriyawongkul +SPDX-FileType: DOCUMENTATION +SPDX-License-Identifier: CC0-1.0 +--- + +# Sentiment Demo - end-to-end AI SBOM walkthrough + +This worked example shows, step by step, how to build an +**AI Bill of Materials (AI BOM)** for a small AI project by combining +information from many different sources at many different stages of the +AI lifecycle. + +It is based on an example repository and two independent tools: + +| Tool | Role in this example | +| :--- | :--- | +| [Sentiment Demo](https://github.com/bact/sentimentdemo) | The underlying AI project. A tiny fastText sentiment classifier. | +| [Pitloom](https://github.com/bact/pitloom) | SBOM generator. Records fragments at run time (decorator / context manager), extracts metadata from AI model files, and merges everything at build time via a Hatchling hook. | +| [STAV](https://github.com/bact/stav) | Vocabulary library. Supplies the Pitloom decorator with standardised IRI strings used for dataset types, preprocessing operations, and AI techniques. | + +The lifecycle is split into **five stages**, each contributing one slice +of the final AI SBOM: + +```text ++-------------+ +-----------+ +------------+ +-------------+ +----------+ +| 1 Pre- | | 2 Train | | 3 Evaluate | | 4 Inspect | | 5 Build | +| process |-> | |-> | |-> | model file |-> | & embed | +| (decorator) | | (context) | | (context) | | (CLI) | | (hook) | ++------+------+ +-----+-----+ +------+-----+ +------+------+ +-----+----+ + | | | | | + v v v v v + fragments/01 fragments/02 fragments/03 fragments/04 wheel SBOM + preprocess train evaluate model_file (composite) +``` + +Each stage writes a self-contained SPDX 3 JSON-LD **fragment**. 
The +build hook then weaves all of them - plus dependency information taken +straight from `pyproject.toml` - into a single composite SBOM that +ships inside the wheel at `.dist-info/sboms/sentimentdemo.spdx3.json` +([PEP 770](https://peps.python.org/pep-0770/)). + +--- + +## Project layout + +```text +sentimentdemo-aibom/ +|-- pyproject.toml # project metadata + Pitloom hatch hook config +|-- data/raw/ # source training data (committed) +| |-- pos.txt +| `-- neg.txt +|-- data/processed/ # generated by stage 1 +|-- models/sentiment.bin # generated by stage 2 +|-- fragments/ # generated by stages 1-4 +|-- dist/ # generated by stage 5 +|-- scripts/run_pipeline.sh # runs every stage in order +`-- src/sentimentdemo/ + |-- __about__.py + |-- preprocess.py # @loom.run decorator (stage 1) + |-- train.py # with loom.run(): ... (stage 2) + `-- evaluate.py # with loom.run(): ... (stage 3) +``` + +--- + +## Setup + +```bash +# From the pitloom checkout +pip install -e . # install pitloom in dev mode +pip install fasttext stav build # runtime dependencies of this example +cd examples/sentimentdemo-aibom +``` + +Or, in one go: + +```bash +./scripts/run_pipeline.sh +``` + +The rest of this document walks through what each stage does, what it +records, and how the pieces fit together. + +--- + +## Stage 1 - Data preprocessing (Pitloom decorator) + +**Script:** `src/sentimentdemo/preprocess.py` +**Output fragment:** `fragments/01_preprocess.spdx3.json` + +The preprocessing function is wrapped with the +`@pitloom.loom.run(...)` *decorator*. The decorator opens a recording +session before the function runs, captures every `loom.add_*` call made +inside the function, and writes a fragment file when the function +returns. 
+ +```python +@loom.run(FRAGMENT_PATH, pretty=True) +def preprocess() -> tuple[Path, Path]: + loom.add_input_dataset("data/raw/pos.txt", + dataset_type=stav_spdx.dataset.DatasetType.text.name) + loom.add_input_dataset("data/raw/neg.txt", + dataset_type=stav_spdx.dataset.DatasetType.text.name) + + # ... tokenise, label, split ... + + loom.add_output_dataset("data/processed/train.txt", + dataset_type=stav_spdx.dataset.DatasetType.text.name, + data_preprocessing=PREPROCESSING_STEPS) + loom.add_output_dataset("data/processed/test.txt", + dataset_type=stav_spdx.dataset.DatasetType.text.name, + data_preprocessing=PREPROCESSING_STEPS) +``` + +**Where STAV comes in.** Both the `dataset_type` argument and every entry +in `PREPROCESSING_STEPS` come from STAV's standardised vocabulary +(`stav.vocab.spdx.dataset.DatasetType`, plus IRIs in +`stav.vocab` for the operations). Using STAV means the operation names +in your SBOM are *the same strings* everyone else in the AI SBOM +community uses, instead of free-form labels. + +Run it: + +```bash +python -m sentimentdemo.preprocess +``` + +**What the fragment contains:** + +- Two `dataset_DatasetPackage` elements for the raw inputs. +- Two `dataset_DatasetPackage` elements for the processed outputs, + each carrying a `dataset_dataPreprocessing` array of STAV IRIs. +- One SPDX `Relationship` (`hasInput`) per output dataset, linking it + back to both raw inputs. This is the **data lineage**. + +--- + +## Stage 2 - Model training (Pitloom context manager) + +**Script:** `src/sentimentdemo/train.py` +**Output fragment:** `fragments/02_train.spdx3.json` + +For training we use Pitloom as a **context manager** instead of a +decorator. 
Same recording semantics, different syntax - convenient +inside larger functions or when you want to scope the recording to just +part of a script: + +```python +with loom.run(FRAGMENT_PATH, pretty=True): + loom.set_model( + name="sentimentdemo", + model_type=str(stav_dpv.ai.AITechnique.SupervisedLearning), + hyperparameters={"lr": "0.5", "epoch": "25", "wordNgrams": "2", ...}, + ) + loom.add_dataset("data/processed/train.txt", + dataset_type=stav_spdx.dataset.DatasetType.text.name) + + model = fasttext.train_supervised(input=str(TRAIN_FILE), **HYPERPARAMS) + model.save_model(str(MODEL_PATH)) +``` + +**Where STAV comes in.** `model_type` is the IRI +`https://w3id.org/dpv/ai#SupervisedLearning`, pulled from +`stav.vocab.dpv.ai.AITechnique`. Anyone reading the SBOM can resolve +that IRI back to the formal DPV-AI concept - no guessing what +"supervised" means in context. + +Run it: + +```bash +python -m sentimentdemo.train +``` + +**What the fragment contains:** + +- One `ai_AIPackage` element for the model, with `ai_typeOfModel` and + every hyperparameter as a `DictionaryEntry` under `ai_hyperparameter`. +- One `dataset_DatasetPackage` element for the training set. +- A `trainedOn` SPDX `Relationship` from the model to that dataset. + +--- + +## Stage 3 - Model evaluation (Pitloom context manager) + +**Script:** `src/sentimentdemo/evaluate.py` +**Output fragment:** `fragments/03_evaluate.spdx3.json` + +Evaluation looks structurally the same as training, but uses +`loom.add_validation_dataset(...)` instead of `loom.add_dataset(...)` +so that the relationship type comes out as `testedOn` instead of +`trainedOn`. 
+ +```python +with loom.run(FRAGMENT_PATH, pretty=True): + loom.set_model(name="sentimentdemo", + model_type=str(stav_dpv.ai.AITechnique.SupervisedLearning)) + loom.add_validation_dataset("data/processed/test.txt", + dataset_type=stav_spdx.dataset.DatasetType.text.name) + + model = fasttext.load_model(str(MODEL_PATH)) + n, p_at_1, r_at_1 = model.test(str(TEST_FILE)) +``` + +Run it: + +```bash +python -m sentimentdemo.evaluate +``` + +**What the fragment contains:** + +- The same model element (re-declared so the fragment is standalone). +- One `dataset_DatasetPackage` element for the test set. +- A `testedOn` SPDX `Relationship` from the model to the test set. + +--- + +## Stage 4 - Direct AI model extraction (Pitloom CLI) + +**Tool:** `loom -m` (Pitloom CLI) +**Output fragment:** `fragments/04_model_file.spdx3.json` + +Stages 1-3 record information that the *training code* knows. But the +trained model file itself carries information that only it can supply: +its actual architecture, vocabulary size, embedding dimension, output +labels, file hash, and so on. Pitloom reads those straight out of the +binary: + +```bash +loom -m models/sentiment.bin -o fragments/04_model_file.spdx3.json --pretty +``` + +This is **independent of the training script** - you can run it against +any fastText / GGUF / ONNX / PyTorch / Safetensors model, including +ones you didn't train yourself. It is also how a downstream integrator +who only has the model artefact (no source) can still produce an +AI BOM fragment for it. + +**What the fragment contains:** + +- An `ai_AIPackage` describing the model file, with file hashes, + hyperparameters extracted from the binary, and the SPDX AI profile. + +--- + +## Stage 5 - Build time aggregation (Pitloom Hatchling hook) + +**Script:** `pyproject.toml` (Hatchling build hook configuration) +**Output:** `dist/sentimentdemo-0.1.0-*.whl` containing the final composite SBOM + +This is where everything comes together. 
The build hook is declared +in `pyproject.toml`: + +```toml +[build-system] +requires = ["hatchling>=1.28.0", "pitloom"] +build-backend = "hatchling.build" + +[tool.hatch.build.hooks.pitloom] +enabled = true +sbom-basename = "sentimentdemo" +fragments = [ + "fragments/01_preprocess.spdx3.json", + "fragments/02_train.spdx3.json", + "fragments/03_evaluate.spdx3.json", + "fragments/04_model_file.spdx3.json", +] +``` + +When you run `python -m build --wheel`, the Pitloom Hatchling hook +fires automatically and does three things: + +1. **Reads `pyproject.toml`** - extracts project name, version (from + `__about__.py`), description, license, authors, and the full + `[project.dependencies]` list. Each dependency becomes a + `software_Package` element with a `dependsOn` relationship. +2. **Merges every fragment** listed under + `[tool.hatch.build.hooks.pitloom].fragments`. Every element + inside each fragment - datasets, AI packages, relationships - + is added to the wheel SBOM with its original `spdxId` preserved, + so the lineage from stages 1-4 survives intact. +3. **Embeds the composite SBOM** in the wheel at + `sentimentdemo-0.1.0.dist-info/sboms/sentimentdemo.spdx3.json` + per [PEP 770](https://peps.python.org/pep-0770/). + +Build it: + +```bash +python -m build --wheel --no-isolation +``` + +(`--no-isolation` is only needed when you're testing against a local +checkout of Pitloom. With `pip install pitloom` from PyPI you can drop +the flag.) 
+ +--- + +## What ends up in the composite SBOM + +The final SBOM brings together six categories of information from five +different sources: + +| Category | Source | Captured by | Stage | +| :--- | :--- | :--- | :--- | +| Package metadata (name, version, license, description) | `pyproject.toml` | Pitloom hatch hook | 5 | +| Runtime dependencies | `pyproject.toml` `[project.dependencies]` | Pitloom hatch hook | 5 | +| Raw + processed datasets, with `hasInput` lineage | Preprocess script | `@loom.run` decorator + STAV IRIs | 1 | +| AI model, hyperparameters, `trainedOn` relationship | Training script | `loom.run` context manager + STAV IRIs | 2 | +| Validation set, `testedOn` relationship | Eval script | `loom.run` context manager + STAV IRIs | 3 | +| Architecture, file hash, vocab/dim/labels | Model file (`sentiment.bin`) | `loom -m` CLI extractor | 4 | + +You can inspect the final SBOM with: + +```bash +python - <<'PY' +import json, zipfile, sys +wheel = next(__import__("pathlib").Path("dist").glob("*.whl")) +with zipfile.ZipFile(wheel) as zf: + sbom_name = next(n for n in zf.namelist() if n.endswith(".spdx3.json")) + data = json.loads(zf.read(sbom_name)) +graph = data["@graph"] +print(f"Total elements: {len(graph)}") +by_type = {} +for e in graph: + by_type.setdefault(e.get("type", "?"), 0) + by_type[e["type"]] += 1 +for t, n in sorted(by_type.items()): + print(f" {t}: {n}") +PY +``` + +A successful run typically prints something like: + +```text +Total elements: 28 + CreationInfo: 1 + Person: 1 + Relationship: 8 + SpdxDocument: 1 + Tool: 1 + ai_AIPackage: 2 + dataset_DatasetPackage: 7 + software_Package: 5 + software_Sbom: 2 +``` + +--- + +## Why this matters + +A real AI SBOM cannot be produced by a single tool reading a single +file. The information is genuinely **distributed**: + +- *Dataset lineage and preprocessing* are only knowable to the people + running the preprocessing code, while it runs. 
+- *Hyperparameters and the training -> dataset relationship* are only + knowable to the training script, while training happens. +- *Evaluation results* are only knowable to the eval script. +- *Concrete model facts* (file hash, embedding dimension, vocabulary + size, output labels) live inside the model artefact and can only be + recovered by reading that file. +- *Software dependencies and packaging metadata* live in + `pyproject.toml` and are only fully resolved at build time. + +This example demonstrates the pattern of letting **each producer +contribute the fragment it actually knows about**, and then having a +single build-time consumer weave them together. That is the +"Author - Distributor - Consumer" workflow recommended by the CISA +SBOM Sharing Lifecycle report, applied to AI systems. + +--- + +## Further reading + +- Pitloom design doc on fragments: + [`docs/design/sbom-fragments.md`](../../docs/design/sbom-fragments.md) +- Pitloom Hatchling hook docs: + [`docs/design/hatchling-build-hook.md`](../../docs/design/hatchling-build-hook.md) +- Linux Foundation AI BOM guide: + +- PEP 770 - SBOM metadata in Python packages: + +- STAV - Standards Taxonomy for AI Vocabularies: + diff --git a/examples/sentimentdemo-aibom/data/raw/neg.txt b/examples/sentimentdemo-aibom/data/raw/neg.txt new file mode 100644 index 0000000..c9131e9 --- /dev/null +++ b/examples/sentimentdemo-aibom/data/raw/neg.txt @@ -0,0 +1,15 @@ +I really hate the new design, it is terrible to use +The movie was boring and a complete waste of my time +Awful experience, I will never come back here again +Worst purchase I have made, totally not worth the money +The service was slow and the staff were unfriendly +Disappointing quality, broke after just a few days +This update is buggy and has made the product worse +Poorly written and uninspired, I could not finish reading +Terrible customer support, they ignored my complaint +I regret buying this, it does not work as advertised +Bland, dull, and 
forgettable, would not recommend +The interface is confusing and the documentation is bad +Late delivery, damaged packaging, and a broken item +Very loud, uncomfortable, and the design is ugly +What a frustrating evening, every minute felt too long diff --git a/examples/sentimentdemo-aibom/data/raw/pos.txt b/examples/sentimentdemo-aibom/data/raw/pos.txt new file mode 100644 index 0000000..50bf4cd --- /dev/null +++ b/examples/sentimentdemo-aibom/data/raw/pos.txt @@ -0,0 +1,15 @@ +I absolutely love this product, it works perfectly +The film was excellent and very entertaining from start to finish +Great experience overall, I would recommend it to anyone +Wonderful service and the staff were super friendly +Best purchase I have made this year, totally worth it +The new update is fantastic, lots of improvements +Amazing quality and the price is very reasonable +Truly a delightful evening, every minute was enjoyable +Outstanding performance by the whole team, bravo +Fast shipping, well packaged, and the product is excellent +I am very happy with how it turned out, exceeded expectations +This is the best book I have read in a long time +Beautiful design and intuitive to use, love it +Superb customer support, they resolved my issue quickly +What a fun and uplifting story, I smiled the whole way through diff --git a/examples/sentimentdemo-aibom/pyproject.toml b/examples/sentimentdemo-aibom/pyproject.toml new file mode 100644 index 0000000..22b11a5 --- /dev/null +++ b/examples/sentimentdemo-aibom/pyproject.toml @@ -0,0 +1,67 @@ +# SPDX-FileCopyrightText: 2026-present Arthit Suriyawongkul +# SPDX-License-Identifier: Apache-2.0 + +[build-system] +# Pitloom is listed as a build requirement so that the Hatchling build +# back-end can load Pitloom's hatch hook at build time. 
+requires = ["hatchling>=1.28.0", "pitloom"] +build-backend = "hatchling.build" + +[project] +name = "sentimentdemo" +dynamic = ["version"] +description = "A tiny sentiment classifier used as the worked example for the Pitloom AI SBOM walkthrough." +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +authors = [{ name = "Arthit Suriyawongkul" }] +keywords = ["aibom", "sbom", "spdx", "sentiment-analysis", "fasttext"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +# Real runtime dependencies of the trained model. These are picked up by +# Pitloom from [project.dependencies] during the build step. +dependencies = [ + "fasttext>=0.9.3", + "stav>=0.4.1", +] + +[project.urls] +Homepage = "https://github.com/bact/pitloom" +Source = "https://github.com/bact/pitloom/tree/main/examples/sentimentdemo-aibom" + +# --------------------------------------------------------------------------- +# Hatch configuration +# --------------------------------------------------------------------------- +[tool.hatch.version] +path = "src/sentimentdemo/__about__.py" + +[tool.hatch.build.targets.wheel] +packages = ["src/sentimentdemo"] + +# --------------------------------------------------------------------------- +# Pitloom Hatchling hook +# +# When ``hatch build`` (or ``python -m build``) runs, Pitloom's build hook: +# 1. Extracts project metadata from this pyproject.toml (PEP 621 +# ``[project]`` table, dynamic version from ``__about__.py``). +# 2. Merges in every SBOM fragment listed in ``fragments`` below. +# 3. Writes a single composite ``sentimentdemo.spdx3.json`` into the +# wheel under ``.dist-info/sboms/`` (PEP 770). 
+# --------------------------------------------------------------------------- +[tool.hatch.build.hooks.pitloom] +enabled = true +sbom-basename = "sentimentdemo" +creator-name = "Sentiment Demo CI" +fragments = [ + "fragments/01_preprocess.spdx3.json", # produced by `python -m sentimentdemo.preprocess` + "fragments/02_train.spdx3.json", # produced by `python -m sentimentdemo.train` + "fragments/03_evaluate.spdx3.json", # produced by `python -m sentimentdemo.evaluate` + "fragments/04_model_file.spdx3.json", # produced by `loom -m models/sentiment.bin` +] + +[tool.pitloom] +pretty = true diff --git a/examples/sentimentdemo-aibom/scripts/run_pipeline.sh b/examples/sentimentdemo-aibom/scripts/run_pipeline.sh new file mode 100755 index 0000000..0f39a5e --- /dev/null +++ b/examples/sentimentdemo-aibom/scripts/run_pipeline.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: 2026-present Arthit Suriyawongkul +# SPDX-License-Identifier: Apache-2.0 +# +# Runs the full Sentiment Demo AI lifecycle end-to-end, producing one SBOM +# fragment per stage plus the final composite AI SBOM embedded in the wheel. +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" +cd "$ROOT" + +export PYTHONPATH="$ROOT/src${PYTHONPATH:+:$PYTHONPATH}" +mkdir -p fragments models + +echo "================================================================" +echo "Stage 1/5 - Data preprocessing (loom decorator)" +echo "================================================================" +python -m sentimentdemo.preprocess + +echo +echo "================================================================" +echo "Stage 2/5 - Model training (loom context manager)" +echo "================================================================" +python -m sentimentdemo.train + +echo +echo "================================================================" +echo "Stage 3/5 - Model evaluation (loom context manager)" +echo "================================================================" +python -m sentimentdemo.evaluate + +echo +echo "================================================================" +echo "Stage 4/5 - Direct AI model extraction (loom -m)" +echo "================================================================" +loom -m models/sentiment.bin -o fragments/04_model_file.spdx3.json --pretty + +echo +echo "================================================================" +echo "Stage 5/5 - Hatchling build with Pitloom hook" +echo "================================================================" +rm -rf dist/ +python -m build --wheel --no-isolation + +echo +echo "================================================================" +echo "Done. Final composite AI SBOM lives inside the wheel:" +echo "================================================================" +WHEEL=$(ls dist/*.whl | head -1) +python - < tuple[int, float, float]: + """Score the trained model on the test set and write an SBOM fragment. + + Records the model identity and a ``testedOn`` SPDX relationship to the + validation dataset in the fragment file. + + Returns: + A 3-tuple of (n_samples, precision_at_1, recall_at_1). 
+ """ + with loom.run(FRAGMENT_PATH, pretty=True): + loom.set_model( + name="sentimentdemo", + model_type=str(stav_dpv.ai.AITechnique.SupervisedLearning), + ) + # Creates a ``testedOn`` SPDX relationship from the model to this dataset. + loom.add_validation_dataset( + "data/processed/test.txt", + dataset_type=stav_spdx.dataset.DatasetType.text.name, + ) + + model = fasttext.load_model(str(MODEL_PATH)) + n, p_at_1, r_at_1 = model.test(str(TEST_FILE)) + print(f"[evaluate] n={n} precision@1={p_at_1:.3f} recall@1={r_at_1:.3f}") + + return n, p_at_1, r_at_1 + + +if __name__ == "__main__": + evaluate() + print(f"[evaluate] SBOM fragment -> {FRAGMENT_PATH}") diff --git a/examples/sentimentdemo-aibom/src/sentimentdemo/preprocess.py b/examples/sentimentdemo-aibom/src/sentimentdemo/preprocess.py new file mode 100644 index 0000000..550c489 --- /dev/null +++ b/examples/sentimentdemo-aibom/src/sentimentdemo/preprocess.py @@ -0,0 +1,111 @@ +# SPDX-FileCopyrightText: 2026-present Arthit Suriyawongkul +# SPDX-License-Identifier: Apache-2.0 + +"""Stage 1: Data preprocessing. + +Reads the raw labelled text files (``data/raw/pos.txt`` and +``data/raw/neg.txt``), normalises and tokenises them, and writes a +fastText-style train/test split to ``data/processed/``. + +Wrapped with the :func:`pitloom.loom.run` decorator so that the +dataset lineage (raw inputs -> processed outputs, with a list of +preprocessing operations) is recorded as an SPDX 3 SBOM fragment at +``fragments/01_preprocess.spdx3.json``. + +The preprocessing operation names come from STAV's standardised +vocabulary (https://github.com/bact/stav). We pull SPDX dataset-type +strings from ``stav.vocab.spdx.dataset`` so the fragment uses the same +IRIs everyone else in the AI SBOM ecosystem uses. 
+""" + +from __future__ import annotations + +import random +import re +from pathlib import Path + +from stav.vocab import spdx as stav_spdx + +from pitloom import loom + +ROOT = Path(__file__).resolve().parents[2] +RAW_DIR = ROOT / "data" / "raw" +PROCESSED_DIR = ROOT / "data" / "processed" +FRAGMENT_PATH = ROOT / "fragments" / "01_preprocess.spdx3.json" + +# Stable STAV IRIs that flow verbatim into SPDX ``dataset_dataPreprocessing``. +PREPROCESSING_STEPS: list[str] = [ + "https://w3id.org/stav/vocab/data#CaseFolding", + "https://w3id.org/stav/vocab/data#WhitespaceTokenization", + "https://w3id.org/stav/vocab/data#FastTextLabelling", + "https://w3id.org/stav/vocab/data#TrainTestSplit", +] + + +def _tokenize(line: str) -> str: + """Lowercase, strip punctuation, and collapse whitespace in *line*.""" + line = line.strip().lower() + line = re.sub(r"[^a-z0-9\s]", " ", line) + return re.sub(r"\s+", " ", line).strip() + + +def _read_labelled(path: Path, label: str) -> list[str]: + """Return fastText-formatted lines (``__label__