From 69d8c409869bdfd7b003d0ad6069a2e6f4f945fa Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 5 Mar 2026 17:56:03 +0100
Subject: [PATCH 01/63] chore: vscode excluded by gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 38d49af..2d0c882 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 __pycache__/
 dist/
 .claude/
+
+.vscode/

From 5e9499df2765940a0a7bb3d78586ca68f5fdc8cf Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Tue, 17 Mar 2026 16:01:25 +0100
Subject: [PATCH 02/63] fix: run ingestion in executor to avoid blocking event
 loop

---
 .../url_handler/ingestion_web_page_scrape_url_handler.py   | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/wordlift_sdk/workflow/url_handler/ingestion_web_page_scrape_url_handler.py b/wordlift_sdk/workflow/url_handler/ingestion_web_page_scrape_url_handler.py
index a228e26..6c7a719 100644
--- a/wordlift_sdk/workflow/url_handler/ingestion_web_page_scrape_url_handler.py
+++ b/wordlift_sdk/workflow/url_handler/ingestion_web_page_scrape_url_handler.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import asyncio
+import functools
 import json
 import logging
 import re
@@ -43,7 +45,10 @@ def __init__(
 
     async def __call__(self, url: Url) -> None:
         settings = self._build_settings(url)
-        result = run_ingestion(settings)
+        loop = asyncio.get_event_loop()
+        result = await loop.run_in_executor(
+            None, functools.partial(run_ingestion, settings)
+        )
 
         if not result.pages:
             failed = [

From 87d5a149f2a047c5baa2baa4adf8db1ebaf8e2a4 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Tue, 17 Mar 2026 16:05:45 +0100
Subject: [PATCH 03/63] fix: offload postprocessors and validation to executor
 to prevent blocking event loop

---
 wordlift_sdk/kg_build/protocol.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index fd8ddc0..6bf88b8 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import hashlib
 import logging
 import os
@@ -95,6 +96,7 @@ def __init__(
         self._mapping_cache: dict[Path, str] = {}
         self._static_templates_patched = False
         self._static_templates_lock = asyncio.Lock()
+        self._postprocessor_lock = asyncio.Lock()
         canonical_id_strategy = (
             str(
                 self.profile.settings.get(
@@ -209,7 +211,18 @@ async def callback(
 
         if existing_web_page_id:
             self._reconcile_root_id(graph, existing_web_page_id)
-        graph = self._apply_postprocessors(graph, url, response, existing_web_page_id)
+        loop = asyncio.get_event_loop()
+        async with self._postprocessor_lock:
+            graph = await loop.run_in_executor(
+                None,
+                functools.partial(
+                    self._apply_postprocessors,
+                    graph,
+                    url,
+                    response,
+                    existing_web_page_id,
+                ),
+            )
         # Canonical IDs must run after custom postprocessors so any nodes minted
         # by local logic are normalized before graph sync patching.
         graph = self._core_ids.process_graph(
@@ -225,7 +238,9 @@ async def callback(
             )
             self._write_debug_graph(graph, url)
 
-        validation_payload = self._validate_graph_if_enabled(graph, url)
+        validation_payload = await loop.run_in_executor(
+            None, functools.partial(self._validate_graph_if_enabled, graph, url)
+        )
         graph_metrics = self._kpi.graph_metrics(graph)
         self._emit_progress(
             {
@@ -284,8 +299,14 @@ async def _patch_static_templates_once(self) -> None:
 
             self._ensure_templates_loaded()
             if self._template_graph and len(self._template_graph) > 0:
-                validation_payload = self._validate_graph_if_enabled(
-                    self._template_graph, "static_templates"
+                _loop = asyncio.get_event_loop()
+                validation_payload = await _loop.run_in_executor(
+                    None,
+                    functools.partial(
+                        self._validate_graph_if_enabled,
+                        self._template_graph,
+                        "static_templates",
+                    ),
                 )
                 self._emit_progress(
                     {

From 9514e4580c14e74cd1b93f7acf3e398fdf3eec25 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Tue, 17 Mar 2026 16:39:24 +0100
Subject: [PATCH 04/63] feat: use postprocessor pool for true concurrent
 processing

---
 wordlift_sdk/kg_build/protocol.py | 57 +++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 6bf88b8..9aaecb4 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -96,7 +96,6 @@ def __init__(
         self._mapping_cache: dict[Path, str] = {}
         self._static_templates_patched = False
         self._static_templates_lock = asyncio.Lock()
-        self._postprocessor_lock = asyncio.Lock()
         canonical_id_strategy = (
             str(
                 self.profile.settings.get(
@@ -117,11 +116,25 @@ def __init__(
             self._postprocessor_runtime,
             self.profile.origins.get("postprocessor_runtime", "default"),
         )
-        self._postprocessors = load_postprocessors_for_profile(
-            root_dir=self.root_dir,
-            profile_name=self.profile.name,
-            runtime=self._postprocessor_runtime,
+        _pool_size = int(
+            self.profile.settings.get(
+                "concurrency", self.profile.settings.get("CONCURRENCY", 4)
+            )
         )
+        logger.info(
+            "Postprocessor pool size for profile '%s': %d",
+            self.profile.name,
+            _pool_size,
+        )
+        self._postprocessors_queue: asyncio.Queue = asyncio.Queue()
+        for _ in range(_pool_size):
+            self._postprocessors_queue.put_nowait(
+                load_postprocessors_for_profile(
+                    root_dir=self.root_dir,
+                    profile_name=self.profile.name,
+                    runtime=self._postprocessor_runtime,
+                )
+            )
         self._shacl_mode = self._resolve_validation_mode(
             self.profile.settings.get(
                 "shacl_validate_mode",
@@ -212,17 +225,21 @@ async def callback(
         if existing_web_page_id:
             self._reconcile_root_id(graph, existing_web_page_id)
         loop = asyncio.get_event_loop()
-        async with self._postprocessor_lock:
+        _postprocessors = await self._postprocessors_queue.get()
+        try:
             graph = await loop.run_in_executor(
                 None,
                 functools.partial(
-                    self._apply_postprocessors,
+                    self._apply_postprocessors_with,
                     graph,
                     url,
                     response,
                     existing_web_page_id,
+                    _postprocessors,
                 ),
             )
+        finally:
+            self._postprocessors_queue.put_nowait(_postprocessors)
         # Canonical IDs must run after custom postprocessors so any nodes minted
         # by local logic are normalized before graph sync patching.
         graph = self._core_ids.process_graph(
@@ -262,7 +279,11 @@ async def callback(
         logger.info("Wrote %s triples for %s", len(graph), url)
 
     def close(self) -> None:
-        close_loaded_postprocessors(self._postprocessors)
+        while not self._postprocessors_queue.empty():
+            try:
+                close_loaded_postprocessors(self._postprocessors_queue.get_nowait())
+            except asyncio.QueueEmpty:
+                break
 
     def get_kpi_summary(self) -> dict[str, object]:
         return self._kpi.summary(self.profile.name)
@@ -485,7 +506,23 @@ def _apply_postprocessors(
         response: WebPageScrapeResponse,
         existing_web_page_id: str | None,
     ) -> Graph:
-        if not self._postprocessors:
+        return self._apply_postprocessors_with(
+            graph,
+            url,
+            response,
+            existing_web_page_id,
+            list(self._postprocessors_queue._queue),  # type: ignore[attr-defined]
+        )
+
+    def _apply_postprocessors_with(
+        self,
+        graph: Graph,
+        url: str,
+        response: WebPageScrapeResponse,
+        existing_web_page_id: str | None,
+        postprocessors: list,
+    ) -> Graph:
+        if not postprocessors:
             return graph
 
         pp_context = self._build_pp_context(url, response, existing_web_page_id)
@@ -495,7 +532,7 @@ def _apply_postprocessors(
                 "'api_key', WORDLIFT_KEY, or WORDLIFT_API_KEY."
             )
 
-        for processor in self._postprocessors:
+        for processor in postprocessors:
             graph = processor.run(graph, pp_context)
             logger.info("Applied postprocessor '%s' for %s", processor.name, url)
         return graph

From 6706fcc018d689ea491b922306e3ec6ce3d5fefb Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Tue, 17 Mar 2026 17:12:06 +0100
Subject: [PATCH 05/63] fix: raise postprocessor startup timeout cap from 10s
 to 60s

---
 wordlift_sdk/kg_build/postprocessors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py
index b67de6b..a41af15 100644
--- a/wordlift_sdk/kg_build/postprocessors.py
+++ b/wordlift_sdk/kg_build/postprocessors.py
@@ -181,7 +181,7 @@ def _ensure_started(self) -> subprocess.Popen[str]:
 
         try:
             ready = self._read_message(
-                process, timeout_seconds=min(self._spec.timeout_seconds, 10)
+                process, timeout_seconds=min(self._spec.timeout_seconds, 60)
             )
         except Exception:
             self._terminate(process)

From 1ceeef78cdde90b9337c5b9ad6c792e8c9899e57 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Tue, 17 Mar 2026 17:41:51 +0100
Subject: [PATCH 06/63] debug: add timing instrumentation to mapping,
 postprocessor, and validation stages

---
 wordlift_sdk/kg_build/protocol.py | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 9aaecb4..4b63047 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -6,6 +6,7 @@
 import logging
 import os
 import tempfile
+import time
 from dataclasses import asdict
 from pathlib import Path
 from types import SimpleNamespace
@@ -210,6 +211,7 @@ async def callback(
         mapping_response = self._mapping_response(response, existing_web_page_id)
         debug_output: dict[str, str] | None = {} if self.debug_dir else None
 
+        _t0 = time.perf_counter()
         graph = await self.rml_service.apply_mapping(
             html=response.web_page.html,
             url=url,
@@ -218,6 +220,7 @@ async def callback(
             response=mapping_response,
             debug_output=debug_output,
         )
+        _t_mapping = int((time.perf_counter() - _t0) * 1000)
         if not graph or len(graph) == 0:
             logger.warning("No triples produced for %s", url)
             return
@@ -225,8 +228,11 @@ async def callback(
         if existing_web_page_id:
             self._reconcile_root_id(graph, existing_web_page_id)
         loop = asyncio.get_event_loop()
+        _t1 = time.perf_counter()
         _postprocessors = await self._postprocessors_queue.get()
+        _t_queue_wait = int((time.perf_counter() - _t1) * 1000)
         try:
+            _t2 = time.perf_counter()
             graph = await loop.run_in_executor(
                 None,
                 functools.partial(
@@ -238,6 +244,7 @@ async def callback(
                     _postprocessors,
                 ),
             )
+            _t_postprocessors = int((time.perf_counter() - _t2) * 1000)
         finally:
             self._postprocessors_queue.put_nowait(_postprocessors)
         # Canonical IDs must run after custom postprocessors so any nodes minted
@@ -255,9 +262,11 @@ async def callback(
             )
             self._write_debug_graph(graph, url)
 
+        _t3 = time.perf_counter()
         validation_payload = await loop.run_in_executor(
             None, functools.partial(self._validate_graph_if_enabled, graph, url)
         )
+        _t_validation = int((time.perf_counter() - _t3) * 1000)
         graph_metrics = self._kpi.graph_metrics(graph)
         self._emit_progress(
             {
@@ -276,7 +285,15 @@ async def callback(
         ):
             raise RuntimeError(f"SHACL validation failed for {url} in fail mode.")
         await self._write_graph(graph)
-        logger.info("Wrote %s triples for %s", len(graph), url)
+        logger.info(
+            "Wrote %s triples for %s [mapping=%dms queue_wait=%dms postprocessors=%dms validation=%dms]",
+            len(graph),
+            url,
+            _t_mapping,
+            _t_queue_wait,
+            _t_postprocessors,
+            _t_validation,
+        )
 
     def close(self) -> None:
         while not self._postprocessors_queue.empty():
@@ -533,8 +550,14 @@ def _apply_postprocessors_with(
             )
 
         for processor in postprocessors:
+            _tp = time.perf_counter()
             graph = processor.run(graph, pp_context)
-            logger.info("Applied postprocessor '%s' for %s", processor.name, url)
+            logger.info(
+                "Applied postprocessor '%s' for %s [%dms]",
+                processor.name,
+                url,
+                int((time.perf_counter() - _tp) * 1000),
+            )
         return graph
 
     def _build_pp_context(

From 6b99e6e7464fec7623b38f5c708a0815dac22c28 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 09:03:14 +0100
Subject: [PATCH 07/63] refactor: pre-load SHACL shapes and validate in-memory
 to avoid temp-file I/O

---
 wordlift_sdk/kg_build/protocol.py | 60 ++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 4b63047..d6b7c65 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -5,7 +5,7 @@
 import hashlib
 import logging
 import os
-import tempfile
+
 import time
 from dataclasses import asdict
 from pathlib import Path
@@ -19,10 +19,13 @@
 from wordlift_sdk.protocol.web_page_import_protocol import (
     WebPageImportProtocolInterface,
 )
+from pyshacl import validate as pyshacl_validate
+from rdflib.namespace import SH
 from wordlift_sdk.validation.shacl import (
     ValidationResult,
+    _load_shapes_graph,
+    _normalize_schema_org_uris,
     resolve_shape_specs,
-    validate_file,
 )
 
 from .config import ProfileDefinition
@@ -164,6 +167,24 @@ def __init__(
             exclude_builtin_shapes=shacl_exclude_builtin_shapes or None,
             extra_shapes=shacl_extra_shapes or None,
         )
+        _shacl_validate_mode_for_preload = self._resolve_validation_mode(
+            self.profile.settings.get(
+                "shacl_validate_mode",
+                self.profile.settings.get("SHACL_VALIDATE_MODE", "warn"),
+            )
+        )
+        if _shacl_validate_mode_for_preload != "off":
+            self._shacl_shapes_graph, self._shacl_source_map = _load_shapes_graph(
+                self._shacl_shape_specs if self._shacl_shape_specs else None
+            )
+            logger.info(
+                "Pre-loaded %d SHACL shape triples for profile '%s'",
+                len(self._shacl_shapes_graph),
+                self.profile.name,
+            )
+        else:
+            self._shacl_shapes_graph = None
+            self._shacl_source_map = {}
         self._import_hash_mode = self._resolve_import_hash_mode(
             self.profile.settings.get(
                 "import_hash_mode",
@@ -745,21 +766,26 @@ def _validate_graph_if_enabled(
         return summary
 
     def _validate_graph(self, graph: Graph) -> ValidationResult:
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".ttl", delete=False) as f:
-            tmp = Path(f.name)
-        try:
-            graph.serialize(destination=tmp, format="turtle")
-            return validate_file(
-                str(tmp),
-                shape_specs=self._shacl_shape_specs
-                if self._shacl_shape_specs
-                else None,
-            )
-        finally:
-            try:
-                tmp.unlink(missing_ok=True)
-            except Exception:
-                logger.debug("Failed to remove temporary SHACL graph file: %s", tmp)
+        data_graph = _normalize_schema_org_uris(graph)
+        conforms, report_graph, report_text = pyshacl_validate(
+            data_graph,
+            shacl_graph=self._shacl_shapes_graph,
+            inference="rdfs",
+            abort_on_first=False,
+            allow_infos=True,
+            allow_warnings=True,
+        )
+        warning_count = sum(
+            1 for _ in report_graph.subjects(SH.resultSeverity, SH.Warning)
+        )
+        return ValidationResult(
+            conforms=conforms,
+            report_text=report_text,
+            report_graph=report_graph,
+            data_graph=data_graph,
+            shape_source_map=self._shacl_source_map,
+            warning_count=warning_count,
+        )
 
     def _summarize_validation(self, result: ValidationResult) -> dict[str, Any]:
         sh = URIRef("http://www.w3.org/ns/shacl#")

From 030fbaa559c6404ade4cd9b4f6c8bedc677f04c3 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 10:38:21 +0100
Subject: [PATCH 08/63] feat: run SHACL validation in a process pool to bypass
 GIL and parallelize across CPUs

---
 wordlift_sdk/kg_build/protocol.py | 119 ++++++++++++++++++++++++------
 1 file changed, 95 insertions(+), 24 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index d6b7c65..4149cc3 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -5,8 +5,8 @@
 import hashlib
 import logging
 import os
-
 import time
+from concurrent.futures import ProcessPoolExecutor
 from dataclasses import asdict
 from pathlib import Path
 from types import SimpleNamespace
@@ -50,6 +50,59 @@ def _path_contains_part(path: str, part: str) -> bool:
     return part in Path(path).parts
 
 
+# Module-level state for SHACL worker processes (one copy per process)
+_shacl_worker_shapes_graph: Graph | None = None
+_shacl_worker_source_map: dict = {}
+
+
+def _init_shacl_worker(shape_specs: list[str] | None) -> None:
+    global _shacl_worker_shapes_graph, _shacl_worker_source_map
+    _shacl_worker_shapes_graph, _shacl_worker_source_map = _load_shapes_graph(
+        shape_specs
+    )
+
+
+def _shacl_validate_in_worker(ntriples: str) -> dict:
+    data_graph = Graph()
+    data_graph.parse(data=ntriples, format="nt")
+    data_graph = _normalize_schema_org_uris(data_graph)
+    conforms, report_graph, _ = pyshacl_validate(
+        data_graph,
+        shacl_graph=_shacl_worker_shapes_graph,
+        inference="rdfs",
+        abort_on_first=False,
+        allow_infos=True,
+        allow_warnings=True,
+    )
+    warning_sources: dict[str, int] = {}
+    error_sources: dict[str, int] = {}
+    warning_count = 0
+    error_count = 0
+    for node in report_graph.subjects(SH.resultSeverity, SH.Warning):
+        warning_count += 1
+        shape = next(report_graph.objects(node, SH.sourceShape), None)
+        label = _shacl_worker_source_map.get(shape, "unknown")
+        warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1
+    for node in report_graph.subjects(SH.resultSeverity, SH.Violation):
+        error_count += 1
+        shape = next(report_graph.objects(node, SH.sourceShape), None)
+        label = _shacl_worker_source_map.get(shape, "unknown")
+        error_sources[str(label)] = error_sources.get(str(label), 0) + 1
+    return {
+        "total": 1,
+        "pass": bool(conforms),
+        "fail": not bool(conforms),
+        "warnings": {
+            "count": warning_count,
+            "sources": dict(sorted(warning_sources.items())),
+        },
+        "errors": {
+            "count": error_count,
+            "sources": dict(sorted(error_sources.items())),
+        },
+    }
+
+
 def _resolve_postprocessor_runtime(settings: dict[str, Any]) -> str:
     value = settings.get("postprocessor_runtime")
     if value is None:
@@ -167,24 +220,21 @@ def __init__(
             exclude_builtin_shapes=shacl_exclude_builtin_shapes or None,
             extra_shapes=shacl_extra_shapes or None,
         )
-        _shacl_validate_mode_for_preload = self._resolve_validation_mode(
-            self.profile.settings.get(
-                "shacl_validate_mode",
-                self.profile.settings.get("SHACL_VALIDATE_MODE", "warn"),
-            )
-        )
-        if _shacl_validate_mode_for_preload != "off":
-            self._shacl_shapes_graph, self._shacl_source_map = _load_shapes_graph(
-                self._shacl_shape_specs if self._shacl_shape_specs else None
+        if self._shacl_mode != "off":
+            self._process_executor: ProcessPoolExecutor | None = ProcessPoolExecutor(
+                max_workers=_pool_size,
+                initializer=_init_shacl_worker,
+                initargs=(
+                    self._shacl_shape_specs if self._shacl_shape_specs else None,
+                ),
             )
             logger.info(
-                "Pre-loaded %d SHACL shape triples for profile '%s'",
-                len(self._shacl_shapes_graph),
+                "Created SHACL process pool with %d workers for profile '%s'",
+                _pool_size,
                 self.profile.name,
             )
         else:
-            self._shacl_shapes_graph = None
-            self._shacl_source_map = {}
+            self._process_executor = None
         self._import_hash_mode = self._resolve_import_hash_mode(
             self.profile.settings.get(
                 "import_hash_mode",
@@ -284,9 +334,7 @@ async def callback(
             self._write_debug_graph(graph, url)
 
         _t3 = time.perf_counter()
-        validation_payload = await loop.run_in_executor(
-            None, functools.partial(self._validate_graph_if_enabled, graph, url)
-        )
+        validation_payload = await self._async_validate_if_enabled(loop, graph, url)
         _t_validation = int((time.perf_counter() - _t3) * 1000)
         graph_metrics = self._kpi.graph_metrics(graph)
         self._emit_progress(
@@ -322,6 +370,8 @@ def close(self) -> None:
                 close_loaded_postprocessors(self._postprocessors_queue.get_nowait())
             except asyncio.QueueEmpty:
                 break
+        if self._process_executor is not None:
+            self._process_executor.shutdown(wait=False)
 
     def get_kpi_summary(self) -> dict[str, object]:
         return self._kpi.summary(self.profile.name)
@@ -359,13 +409,8 @@ async def _patch_static_templates_once(self) -> None:
             self._ensure_templates_loaded()
             if self._template_graph and len(self._template_graph) > 0:
                 _loop = asyncio.get_event_loop()
-                validation_payload = await _loop.run_in_executor(
-                    None,
-                    functools.partial(
-                        self._validate_graph_if_enabled,
-                        self._template_graph,
-                        "static_templates",
-                    ),
+                validation_payload = await self._async_validate_if_enabled(
+                    _loop, self._template_graph, "static_templates"
                 )
                 self._emit_progress(
                     {
@@ -742,6 +787,32 @@ def _mapping_response(
             web_page=response.web_page,
         )
 
+    async def _async_validate_if_enabled(
+        self, loop: Any, graph: Graph, url: str
+    ) -> dict[str, Any] | None:
+        if self._shacl_mode == "off":
+            return None
+        ntriples = graph.serialize(format="nt")
+        summary = await loop.run_in_executor(
+            self._process_executor,
+            functools.partial(_shacl_validate_in_worker, ntriples),
+        )
+        self._kpi.record_validation(
+            passed=summary["pass"],
+            warning_count=summary["warnings"]["count"],
+            error_count=summary["errors"]["count"],
+            warning_sources=summary["warnings"]["sources"],
+            error_sources=summary["errors"]["sources"],
+        )
+        logger.info(
+            "SHACL validation for %s: pass=%s warnings=%s errors=%s",
+            url,
+            summary["pass"],
+            summary["warnings"]["count"],
+            summary["errors"]["count"],
+        )
+        return summary
+
     def _validate_graph_if_enabled(
         self, graph: Graph, url: str
     ) -> dict[str, Any] | None:

From d06ebc31765f03cf4c9b3b58a2a9de3fb8d2fb4c Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 10:59:56 +0100
Subject: [PATCH 09/63] feat: add separate pool size settings for
 postprocessors and SHACL validation

---
 wordlift_sdk/kg_build/protocol.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 4149cc3..a32238c 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -178,13 +178,20 @@ def __init__(
                 "concurrency", self.profile.settings.get("CONCURRENCY", 4)
             )
         )
+        _pp_pool_size = int(
+            self.profile.settings.get(
+                "postprocessor_pool_size",
+                self.profile.settings.get("POSTPROCESSOR_POOL_SIZE", _pool_size),
+            )
+        )
         logger.info(
-            "Postprocessor pool size for profile '%s': %d",
+            "Postprocessor pool size for profile '%s': %d (concurrency=%d)",
             self.profile.name,
+            _pp_pool_size,
             _pool_size,
         )
         self._postprocessors_queue: asyncio.Queue = asyncio.Queue()
-        for _ in range(_pool_size):
+        for _ in range(_pp_pool_size):
             self._postprocessors_queue.put_nowait(
                 load_postprocessors_for_profile(
                     root_dir=self.root_dir,
@@ -221,8 +228,16 @@ def __init__(
             extra_shapes=shacl_extra_shapes or None,
         )
         if self._shacl_mode != "off":
+            _shacl_pool_size = int(
+                self.profile.settings.get(
+                    "shacl_pool_size",
+                    self.profile.settings.get(
+                        "SHACL_POOL_SIZE", max(2, _pool_size // 2)
+                    ),
+                )
+            )
             self._process_executor: ProcessPoolExecutor | None = ProcessPoolExecutor(
-                max_workers=_pool_size,
+                max_workers=_shacl_pool_size,
                 initializer=_init_shacl_worker,
                 initargs=(
                     self._shacl_shape_specs if self._shacl_shape_specs else None,
@@ -230,7 +245,7 @@ def __init__(
             )
             logger.info(
                 "Created SHACL process pool with %d workers for profile '%s'",
-                _pool_size,
+                _shacl_pool_size,
                 self.profile.name,
             )
         else:

From c6591e5891b6c7628394676a1fa89003d103519f Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 11:44:53 +0100
Subject: [PATCH 10/63] feat: track SHACL process pool queue wait and execution
 time separately in timing log

---
 wordlift_sdk/kg_build/protocol.py | 49 ++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index a32238c..989d889 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -62,7 +62,9 @@ def _init_shacl_worker(shape_specs: list[str] | None) -> None:
     )
 
 
-def _shacl_validate_in_worker(ntriples: str) -> dict:
+def _shacl_validate_in_worker(ntriples: str, submit_time: float) -> dict:
+    _queue_wait_ms = int((time.time() - submit_time) * 1000)
+    _t_start = time.perf_counter()
     data_graph = Graph()
     data_graph.parse(data=ntriples, format="nt")
     data_graph = _normalize_schema_org_uris(data_graph)
@@ -100,6 +102,8 @@ def _shacl_validate_in_worker(ntriples: str) -> dict:
             "count": error_count,
             "sources": dict(sorted(error_sources.items())),
         },
+        "_queue_wait_ms": _queue_wait_ms,
+        "_validation_ms": int((time.perf_counter() - _t_start) * 1000),
     }
 
 
@@ -348,9 +352,11 @@ async def callback(
             )
             self._write_debug_graph(graph, url)
 
-        _t3 = time.perf_counter()
-        validation_payload = await self._async_validate_if_enabled(loop, graph, url)
-        _t_validation = int((time.perf_counter() - _t3) * 1000)
+        (
+            validation_payload,
+            _t_validation_wait,
+            _t_validation_actual,
+        ) = await self._async_validate_if_enabled(loop, graph, url)
         graph_metrics = self._kpi.graph_metrics(graph)
         self._emit_progress(
             {
@@ -370,13 +376,14 @@ async def callback(
             raise RuntimeError(f"SHACL validation failed for {url} in fail mode.")
         await self._write_graph(graph)
         logger.info(
-            "Wrote %s triples for %s [mapping=%dms queue_wait=%dms postprocessors=%dms validation=%dms]",
+            "Wrote %s triples for %s [mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]",
             len(graph),
             url,
             _t_mapping,
             _t_queue_wait,
             _t_postprocessors,
-            _t_validation,
+            _t_validation_wait,
+            _t_validation_actual,
         )
 
     def close(self) -> None:
@@ -424,7 +431,7 @@ async def _patch_static_templates_once(self) -> None:
             self._ensure_templates_loaded()
             if self._template_graph and len(self._template_graph) > 0:
                 _loop = asyncio.get_event_loop()
-                validation_payload = await self._async_validate_if_enabled(
+                validation_payload, _, _ = await self._async_validate_if_enabled(
                     _loop, self._template_graph, "static_templates"
                 )
                 self._emit_progress(
@@ -804,29 +811,31 @@ def _mapping_response(
 
     async def _async_validate_if_enabled(
         self, loop: Any, graph: Graph, url: str
-    ) -> dict[str, Any] | None:
+    ) -> tuple[dict[str, Any] | None, int, int]:
         if self._shacl_mode == "off":
-            return None
+            return None, 0, 0
         ntriples = graph.serialize(format="nt")
-        summary = await loop.run_in_executor(
+        result = await loop.run_in_executor(
             self._process_executor,
-            functools.partial(_shacl_validate_in_worker, ntriples),
+            functools.partial(_shacl_validate_in_worker, ntriples, time.time()),
         )
+        validation_queue_wait_ms = result.pop("_queue_wait_ms", 0)
+        validation_ms = result.pop("_validation_ms", 0)
         self._kpi.record_validation(
-            passed=summary["pass"],
-            warning_count=summary["warnings"]["count"],
-            error_count=summary["errors"]["count"],
-            warning_sources=summary["warnings"]["sources"],
-            error_sources=summary["errors"]["sources"],
+            passed=result["pass"],
+            warning_count=result["warnings"]["count"],
+            error_count=result["errors"]["count"],
+            warning_sources=result["warnings"]["sources"],
+            error_sources=result["errors"]["sources"],
         )
         logger.info(
             "SHACL validation for %s: pass=%s warnings=%s errors=%s",
             url,
-            summary["pass"],
-            summary["warnings"]["count"],
-            summary["errors"]["count"],
+            result["pass"],
+            result["warnings"]["count"],
+            result["errors"]["count"],
         )
-        return summary
+        return result, validation_queue_wait_ms, validation_ms
 
     def _validate_graph_if_enabled(
         self, graph: Graph, url: str

From 5338b9c384712b01af83b48e4c18ace9a9feecc2 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 12:13:08 +0100
Subject: [PATCH 11/63] feat: add inprocess postprocessor runtime for running
 processors in the same process

---
 wordlift_sdk/kg_build/postprocessors.py | 65 +++++++++++++++++--------
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py
index a41af15..4e5a079 100644
--- a/wordlift_sdk/kg_build/postprocessors.py
+++ b/wordlift_sdk/kg_build/postprocessors.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+import asyncio
+import importlib
+import inspect
 import json
 import logging
 import select
@@ -21,6 +24,7 @@
 
 _RUNTIME_ONESHOT = "oneshot"
 _RUNTIME_PERSISTENT = "persistent"
+_RUNTIME_INPROCESS = "inprocess"
 
 
 @dataclass(frozen=True)
@@ -373,6 +377,23 @@ def _run_persistent(
         )
 
 
+@dataclass(frozen=True)
+class InProcessPostprocessor:
+    class_path: str
+
+    def process_graph(
+        self, graph: Graph, context: PostprocessorContext
+    ) -> Graph | None:
+        module_name, class_name = self.class_path.split(":", 1)
+        module = importlib.import_module(module_name)
+        klass = getattr(module, class_name)
+        processor = klass()
+        result = processor.process_graph(graph, context)
+        if inspect.isawaitable(result):
+            result = asyncio.run(result)
+        return result
+
+
 def _as_bool(value: Any, default: bool) -> bool:
     if value is None:
         return default
@@ -399,8 +420,10 @@ def _as_positive_int(value: Any, default: int) -> int:
 
 def _normalize_runtime(value: str | None) -> str:
     runtime = (value or _RUNTIME_ONESHOT).strip().lower()
-    if runtime not in {_RUNTIME_ONESHOT, _RUNTIME_PERSISTENT}:
-        raise ValueError("POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent.")
+    if runtime not in {_RUNTIME_ONESHOT, _RUNTIME_PERSISTENT, _RUNTIME_INPROCESS}:
+        raise ValueError(
+            "POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent, inprocess."
+        )
     return runtime
 
 
@@ -510,16 +533,17 @@ def load_postprocessors_for_profile(
     for spec in specs:
         if not spec.enabled:
             continue
-        loaded.append(
-            LoadedPostprocessor(
-                name=spec.class_path,
-                handler=SubprocessPostprocessor(
-                    spec=spec,
-                    root_dir=root_dir,
-                    runtime=resolved_runtime,
-                ),
+        if resolved_runtime == _RUNTIME_INPROCESS:
+            handler: GraphPostprocessor = InProcessPostprocessor(
+                class_path=spec.class_path
             )
-        )
+        else:
+            handler = SubprocessPostprocessor(
+                spec=spec,
+                root_dir=root_dir,
+                runtime=resolved_runtime,
+            )
+        loaded.append(LoadedPostprocessor(name=spec.class_path, handler=handler))
 
     logger.info(
         "Loaded %s postprocessors for profile '%s' from manifest: %s (runtime=%s)",
@@ -550,16 +574,17 @@ def load_postprocessors(
     for spec in specs:
         if not spec.enabled:
             continue
-        loaded.append(
-            LoadedPostprocessor(
-                name=spec.class_path,
-                handler=SubprocessPostprocessor(
-                    spec=spec,
-                    root_dir=root_dir,
-                    runtime=resolved_runtime,
-                ),
+        if resolved_runtime == _RUNTIME_INPROCESS:
+            handler: GraphPostprocessor = InProcessPostprocessor(
+                class_path=spec.class_path
             )
-        )
+        else:
+            handler = SubprocessPostprocessor(
+                spec=spec,
+                root_dir=root_dir,
+                runtime=resolved_runtime,
+            )
+        loaded.append(LoadedPostprocessor(name=spec.class_path, handler=handler))
     return loaded
 
 

From 1dc472a7a24d992f7f8b4dd4365fe2740b7502db Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 12:32:08 +0100
Subject: [PATCH 12/63] feat: run postprocessors on a dedicated thread pool
 instead of the default executor

---
 wordlift_sdk/kg_build/protocol.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 989d889..69e17e7 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -6,7 +6,7 @@
 import logging
 import os
 import time
-from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import asdict
 from pathlib import Path
 from types import SimpleNamespace
@@ -194,6 +194,9 @@ def __init__(
             _pp_pool_size,
             _pool_size,
         )
+        self._pp_executor = ThreadPoolExecutor(
+            max_workers=_pp_pool_size, thread_name_prefix="worai_pp"
+        )
         self._postprocessors_queue: asyncio.Queue = asyncio.Queue()
         for _ in range(_pp_pool_size):
             self._postprocessors_queue.put_nowait(
@@ -324,7 +327,7 @@ async def callback(
         try:
             _t2 = time.perf_counter()
             graph = await loop.run_in_executor(
-                None,
+                self._pp_executor,
                 functools.partial(
                     self._apply_postprocessors_with,
                     graph,
@@ -392,6 +395,7 @@ def close(self) -> None:
                 close_loaded_postprocessors(self._postprocessors_queue.get_nowait())
             except asyncio.QueueEmpty:
                 break
+        self._pp_executor.shutdown(wait=False)
         if self._process_executor is not None:
             self._process_executor.shutdown(wait=False)
 

From 85dd88620cce244a128557cad0de80736218709d Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 12:54:15 +0100
Subject: [PATCH 13/63] fix: handle SHACL process pool timeout and broken
 executor errors gracefully

---
 wordlift_sdk/kg_build/protocol.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 69e17e7..a7fcaf6 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -6,6 +6,7 @@
 import logging
 import os
 import time
+import concurrent.futures
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import asdict
 from pathlib import Path
@@ -819,10 +820,22 @@ async def _async_validate_if_enabled(
         if self._shacl_mode == "off":
             return None, 0, 0
         ntriples = graph.serialize(format="nt")
-        result = await loop.run_in_executor(
-            self._process_executor,
-            functools.partial(_shacl_validate_in_worker, ntriples, time.time()),
-        )
+        try:
+            result = await asyncio.wait_for(
+                loop.run_in_executor(
+                    self._process_executor,
+                    functools.partial(_shacl_validate_in_worker, ntriples, time.time()),
+                ),
+                timeout=120.0,
+            )
+        except (asyncio.TimeoutError, concurrent.futures.BrokenExecutor) as exc:
+            logger.warning(
+                "SHACL validation skipped for %s: %s (%s)",
+                url,
+                type(exc).__name__,
+                exc,
+            )
+            return None, 0, 0
         validation_queue_wait_ms = result.pop("_queue_wait_ms", 0)
         validation_ms = result.pop("_validation_ms", 0)
         self._kpi.record_validation(

From fb10f70a20ced6aa629b33917f957f59a2af51c8 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 13:51:58 +0100
Subject: [PATCH 14/63] fix: offload graph hashing to executor to avoid
 blocking the event loop

---
 wordlift_sdk/protocol/graph/graph_queue.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/wordlift_sdk/protocol/graph/graph_queue.py b/wordlift_sdk/protocol/graph/graph_queue.py
index 1ad8e33..f104777 100644
--- a/wordlift_sdk/protocol/graph/graph_queue.py
+++ b/wordlift_sdk/protocol/graph/graph_queue.py
@@ -39,7 +39,8 @@ def __init__(self, client_configuration: Configuration):
         reraise=True,
     )
     async def put(self, graph: Graph) -> None:
-        hash = GraphQueue.hash_graph(graph)
+        loop = asyncio.get_event_loop()
+        hash = await loop.run_in_executor(None, GraphQueue.hash_graph, graph)
         if hash not in self.hashes:
             self.hashes.add(hash)
 

From 1e6c540f28ab1dfdac2fc67525b3a7a4b770f198 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 13:52:47 +0100
Subject: [PATCH 15/63] fix: enable stop_after_attempt(5) retry limit on graph
 queue put

---
 wordlift_sdk/protocol/graph/graph_queue.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/wordlift_sdk/protocol/graph/graph_queue.py b/wordlift_sdk/protocol/graph/graph_queue.py
index f104777..ff14016 100644
--- a/wordlift_sdk/protocol/graph/graph_queue.py
+++ b/wordlift_sdk/protocol/graph/graph_queue.py
@@ -8,7 +8,13 @@
 from rdflib import Graph
 from rdflib.compare import to_isomorphic
 from wordlift_client import Configuration
-from tenacity import retry, retry_if_exception_type, wait_fixed, after_log
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    wait_fixed,
+    after_log,
+    stop_after_attempt,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -22,7 +28,7 @@ def __init__(self, client_configuration: Configuration):
         self.hashes = set()
 
     @retry(
-        # stop=stop_after_attempt(5),  # Retry up to 5 times
+        stop=stop_after_attempt(5),
         retry=retry_if_exception_type(
             asyncio.TimeoutError
             | aiohttp.client_exceptions.ServerDisconnectedError

From a8190ca7096660a8de12c68bf3c2581ff23861fb Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 14:47:54 +0100
Subject: [PATCH 16/63] fix: disable morph_kgc internal multiprocessing to
 prevent fork deadlocks in threaded context

---
 wordlift_sdk/structured_data/engine.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/wordlift_sdk/structured_data/engine.py b/wordlift_sdk/structured_data/engine.py
index 2fec73d..5107852 100644
--- a/wordlift_sdk/structured_data/engine.py
+++ b/wordlift_sdk/structured_data/engine.py
@@ -1351,6 +1351,10 @@ def _materialize_graph(mapping_path: Path) -> Graph:
     config = (
         "[CONFIGURATION]\n"
         "output_format = N-TRIPLES\n"
+        # Disable morph_kgc internal multiprocessing: on Linux it uses fork() which
+        # deadlocks when the parent process already has threads running (asyncio pool,
+        # SHACL ProcessPoolExecutor). The outer pipeline handles concurrency.
+        "number_of_processes = 1\n"
         "\n"
         "[DataSource1]\n"
         f"mappings = {mapping_path}\n"

From 7905d7db8273d59ef3ec62194ea864e86932c5a0 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 15:01:39 +0100
Subject: [PATCH 17/63] fix: offload RML mapping to dedicated thread pool to
 prevent blocking the event loop

---
 wordlift_sdk/kg_build/protocol.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index a7fcaf6..04cb0e4 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -198,6 +198,11 @@ def __init__(
         self._pp_executor = ThreadPoolExecutor(
             max_workers=_pp_pool_size, thread_name_prefix="worai_pp"
         )
+        # Dedicated executor for RML mapping (morph_kgc is CPU-bound and has no
+        # async I/O — running it directly on the event loop thread blocks everything).
+        self._mapping_executor = ThreadPoolExecutor(
+            max_workers=_pool_size, thread_name_prefix="worai_ml"
+        )
         self._postprocessors_queue: asyncio.Queue = asyncio.Queue()
         for _ in range(_pp_pool_size):
             self._postprocessors_queue.put_nowait(
@@ -306,13 +311,23 @@ async def callback(
         debug_output: dict[str, str] | None = {} if self.debug_dir else None
 
         _t0 = time.perf_counter()
-        graph = await self.rml_service.apply_mapping(
-            html=response.web_page.html,
-            url=url,
-            mapping_file_path=mapping_path,
-            mapping_content=rendered_mapping,
-            response=mapping_response,
-            debug_output=debug_output,
+        # apply_mapping is async def but contains no awaits — it runs morph_kgc
+        # (pure Python CPU work) synchronously. Running it directly on the event
+        # loop blocks all other coroutines for ~450ms per URL. Offload to a thread
+        # so the event loop stays free to schedule I/O for other concurrent URLs.
+        _mapping_loop = asyncio.get_event_loop()
+        graph = await _mapping_loop.run_in_executor(
+            self._mapping_executor,
+            lambda: asyncio.run(
+                self.rml_service.apply_mapping(
+                    html=response.web_page.html,
+                    url=url,
+                    mapping_file_path=mapping_path,
+                    mapping_content=rendered_mapping,
+                    response=mapping_response,
+                    debug_output=debug_output,
+                )
+            ),
         )
         _t_mapping = int((time.perf_counter() - _t0) * 1000)
         if not graph or len(graph) == 0:
@@ -397,6 +412,7 @@ def close(self) -> None:
             except asyncio.QueueEmpty:
                 break
         self._pp_executor.shutdown(wait=False)
+        self._mapping_executor.shutdown(wait=False)
         if self._process_executor is not None:
             self._process_executor.shutdown(wait=False)
 

From df23fe7701fad4bc1012dd44469c0373921fb93d Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 15:11:18 +0100
Subject: [PATCH 18/63] fix: serialize morph_kgc calls with a lock to prevent
 thread-safety issues in pyparsing

---
 wordlift_sdk/structured_data/engine.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/wordlift_sdk/structured_data/engine.py b/wordlift_sdk/structured_data/engine.py
index 5107852..cb59f15 100644
--- a/wordlift_sdk/structured_data/engine.py
+++ b/wordlift_sdk/structured_data/engine.py
@@ -7,6 +7,7 @@
 import json
 import logging
 import re
+import threading
 from dataclasses import dataclass
 from importlib import resources
 from pathlib import Path
@@ -28,6 +29,9 @@
 from wordlift_sdk.utils.ssl_ca_bundle import resolve_ssl_ca_cert
 from wordlift_sdk.validation.shacl import ValidationResult, validate_file
 
+# morph_kgc uses rdflib's SPARQL parser (pyparsing) which has global state and
+# is NOT thread-safe. Serialize all morph_kgc calls with a module-level lock.
+_morph_kgc_lock = threading.Lock()
 
 _SCHEMA_BASE = "https://schema.org"
 _SCHEMA_HTTP = "http://schema.org/"
@@ -1360,7 +1364,8 @@ def _materialize_graph(mapping_path: Path) -> Graph:
         f"mappings = {mapping_path}\n"
     )
     try:
-        return morph_kgc.materialize(config)
+        with _morph_kgc_lock:
+            return morph_kgc.materialize(config)
     except RuntimeError:
         raise
     except Exception as exc:

From bee7dd802cfa496a29189bef38f2761cbffb94e3 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 15:22:20 +0100
Subject: [PATCH 19/63] Revert "fix: offload RML mapping to dedicated thread
 pool to prevent blocking the event loop"

This reverts commit 783c7eac16038aeb60e85e34829212cfb3e546b7 (PATCH 17/63 in
this series, exported as 7905d7db8273d59ef3ec62194ea864e86932c5a0).
---
 wordlift_sdk/kg_build/protocol.py | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 04cb0e4..a7fcaf6 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -198,11 +198,6 @@ def __init__(
         self._pp_executor = ThreadPoolExecutor(
             max_workers=_pp_pool_size, thread_name_prefix="worai_pp"
         )
-        # Dedicated executor for RML mapping (morph_kgc is CPU-bound and has no
-        # async I/O — running it directly on the event loop thread blocks everything).
-        self._mapping_executor = ThreadPoolExecutor(
-            max_workers=_pool_size, thread_name_prefix="worai_ml"
-        )
         self._postprocessors_queue: asyncio.Queue = asyncio.Queue()
         for _ in range(_pp_pool_size):
             self._postprocessors_queue.put_nowait(
@@ -311,23 +306,13 @@ async def callback(
         debug_output: dict[str, str] | None = {} if self.debug_dir else None
 
         _t0 = time.perf_counter()
-        # apply_mapping is async def but contains no awaits — it runs morph_kgc
-        # (pure Python CPU work) synchronously. Running it directly on the event
-        # loop blocks all other coroutines for ~450ms per URL. Offload to a thread
-        # so the event loop stays free to schedule I/O for other concurrent URLs.
-        _mapping_loop = asyncio.get_event_loop()
-        graph = await _mapping_loop.run_in_executor(
-            self._mapping_executor,
-            lambda: asyncio.run(
-                self.rml_service.apply_mapping(
-                    html=response.web_page.html,
-                    url=url,
-                    mapping_file_path=mapping_path,
-                    mapping_content=rendered_mapping,
-                    response=mapping_response,
-                    debug_output=debug_output,
-                )
-            ),
+        graph = await self.rml_service.apply_mapping(
+            html=response.web_page.html,
+            url=url,
+            mapping_file_path=mapping_path,
+            mapping_content=rendered_mapping,
+            response=mapping_response,
+            debug_output=debug_output,
         )
         _t_mapping = int((time.perf_counter() - _t0) * 1000)
         if not graph or len(graph) == 0:
@@ -412,7 +397,6 @@ def close(self) -> None:
             except asyncio.QueueEmpty:
                 break
         self._pp_executor.shutdown(wait=False)
-        self._mapping_executor.shutdown(wait=False)
         if self._process_executor is not None:
             self._process_executor.shutdown(wait=False)
 

From 7044deede01c2e59fbe4e4d5411e5ebe6baeb646 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 15:46:37 +0100
Subject: [PATCH 20/63] perf: run morph_kgc in a subprocess pool for true
 parallelism without pyparsing lock contention

---
 wordlift_sdk/kg_build/protocol.py      | 30 +++++++++++----
 wordlift_sdk/structured_data/engine.py | 51 +++++++++++++++++++-------
 2 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index a7fcaf6..ba4205f 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -198,6 +198,12 @@ def __init__(
         self._pp_executor = ThreadPoolExecutor(
             max_workers=_pp_pool_size, thread_name_prefix="worai_pp"
         )
+        # Wraps apply_mapping calls so they run in a thread rather than blocking
+        # the asyncio event loop. The thread itself blocks on the morph_kgc
+        # ProcessPoolExecutor slot, leaving the event loop free for I/O.
+        self._mapping_executor = ThreadPoolExecutor(
+            max_workers=_pool_size, thread_name_prefix="worai_ml"
+        )
         self._postprocessors_queue: asyncio.Queue = asyncio.Queue()
         for _ in range(_pp_pool_size):
             self._postprocessors_queue.put_nowait(
@@ -306,13 +312,22 @@ async def callback(
         debug_output: dict[str, str] | None = {} if self.debug_dir else None
 
         _t0 = time.perf_counter()
-        graph = await self.rml_service.apply_mapping(
-            html=response.web_page.html,
-            url=url,
-            mapping_file_path=mapping_path,
-            mapping_content=rendered_mapping,
-            response=mapping_response,
-            debug_output=debug_output,
+        # apply_mapping has no awaits — all work is synchronous (morph_kgc).
+        # Run it in a thread so the event loop stays free for I/O while the
+        # thread waits for its morph_kgc subprocess slot to become available.
+        _loop = asyncio.get_event_loop()
+        graph = await _loop.run_in_executor(
+            self._mapping_executor,
+            lambda: asyncio.run(
+                self.rml_service.apply_mapping(
+                    html=response.web_page.html,
+                    url=url,
+                    mapping_file_path=mapping_path,
+                    mapping_content=rendered_mapping,
+                    response=mapping_response,
+                    debug_output=debug_output,
+                )
+            ),
         )
         _t_mapping = int((time.perf_counter() - _t0) * 1000)
         if not graph or len(graph) == 0:
@@ -397,6 +412,7 @@ def close(self) -> None:
             except asyncio.QueueEmpty:
                 break
         self._pp_executor.shutdown(wait=False)
+        self._mapping_executor.shutdown(wait=False)
         if self._process_executor is not None:
             self._process_executor.shutdown(wait=False)
 
diff --git a/wordlift_sdk/structured_data/engine.py b/wordlift_sdk/structured_data/engine.py
index cb59f15..9543330 100644
--- a/wordlift_sdk/structured_data/engine.py
+++ b/wordlift_sdk/structured_data/engine.py
@@ -6,8 +6,10 @@
 import hashlib
 import json
 import logging
+import multiprocessing
+import os
 import re
-import threading
+from concurrent.futures import ProcessPoolExecutor
 from dataclasses import dataclass
 from importlib import resources
 from pathlib import Path
@@ -29,9 +31,34 @@
 from wordlift_sdk.utils.ssl_ca_bundle import resolve_ssl_ca_cert
 from wordlift_sdk.validation.shacl import ValidationResult, validate_file
 
-# morph_kgc uses rdflib's SPARQL parser (pyparsing) which has global state and
-# is NOT thread-safe. Serialize all morph_kgc calls with a module-level lock.
-_morph_kgc_lock = threading.Lock()
+
+# Top-level worker — must be module-level to be picklable for ProcessPoolExecutor.
+# Each subprocess has its own Python interpreter so pyparsing state is isolated;
+# no lock needed and genuine parallelism is possible.
+def _morph_kgc_worker(config: str) -> str:
+    import morph_kgc as _mkgc
+
+    return _mkgc.materialize(config).serialize(format="nt")
+
+
+# Lazy process pool — created on first use in the main process only.
+# Worker subprocesses import this module but never call _get_morph_kgc_pool(),
+# so they do NOT create their own pools (no recursive process explosion).
+_morph_kgc_pool: ProcessPoolExecutor | None = None
+
+
+def _get_morph_kgc_pool() -> ProcessPoolExecutor:
+    global _morph_kgc_pool
+    if _morph_kgc_pool is None:
+        # Use "spawn" context to start workers cleanly without inheriting any
+        # locks or file descriptors from the parent process.
+        ctx = multiprocessing.get_context("spawn")
+        _morph_kgc_pool = ProcessPoolExecutor(
+            max_workers=os.cpu_count() or 4,
+            mp_context=ctx,
+        )
+    return _morph_kgc_pool
+
 
 _SCHEMA_BASE = "https://schema.org"
 _SCHEMA_HTTP = "http://schema.org/"
@@ -1345,13 +1372,6 @@ def _normalize_materialization_error(error: Exception) -> RuntimeError:
 
 
 def _materialize_graph(mapping_path: Path) -> Graph:
-    try:
-        import morph_kgc
-    except ImportError as exc:
-        raise RuntimeError(
-            "morph-kgc is required. Install with: pip install morph-kgc"
-        ) from exc
-
     config = (
         "[CONFIGURATION]\n"
         "output_format = N-TRIPLES\n"
@@ -1364,8 +1384,13 @@ def _materialize_graph(mapping_path: Path) -> Graph:
         f"mappings = {mapping_path}\n"
     )
     try:
-        with _morph_kgc_lock:
-            return morph_kgc.materialize(config)
+        # Submit to subprocess pool — each worker has isolated pyparsing state,
+        # so calls are genuinely parallel across CPU cores with no lock needed.
+        # .result() blocks the calling thread (not the asyncio event loop).
+        ntriples = _get_morph_kgc_pool().submit(_morph_kgc_worker, config).result()
+        graph = Graph()
+        graph.parse(data=ntriples, format="nt")
+        return graph
     except RuntimeError:
         raise
     except Exception as exc:

From bec6f276a07ba85c1fee4a497e3faa516d52ba11 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 15:56:18 +0100
Subject: [PATCH 21/63] perf: expose morph_kgc pool size setting and track
 subprocess queue wait in timing log

---
 wordlift_sdk/kg_build/protocol.py      | 17 +++++++++-
 wordlift_sdk/structured_data/engine.py | 43 +++++++++++++++++++++-----
 2 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index ba4205f..ba653fe 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -41,6 +41,7 @@
 )
 from .rml_mapping import RmlMappingService
 from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer
+from wordlift_sdk.structured_data.engine import init_morph_kgc_pool, _morph_kgc_tls
 
 logger = logging.getLogger(__name__)
 SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source")
@@ -198,6 +199,18 @@ def __init__(
         self._pp_executor = ThreadPoolExecutor(
             max_workers=_pp_pool_size, thread_name_prefix="worai_pp"
         )
+        _mapping_pool_size = int(
+            self.profile.settings.get(
+                "mapping_pool_size",
+                self.profile.settings.get("MAPPING_POOL_SIZE", os.cpu_count() or 4),
+            )
+        )
+        logger.info(
+            "Mapping pool size for profile '%s': %d",
+            self.profile.name,
+            _mapping_pool_size,
+        )
+        init_morph_kgc_pool(_mapping_pool_size)
         # Wraps apply_mapping calls so they run in a thread rather than blocking
         # the asyncio event loop. The thread itself blocks on the morph_kgc
         # ProcessPoolExecutor slot, leaving the event loop free for I/O.
@@ -330,6 +343,7 @@ async def callback(
             ),
         )
         _t_mapping = int((time.perf_counter() - _t0) * 1000)
+        _t_mapping_wait = getattr(_morph_kgc_tls, "mapping_wait_ms", 0)
         if not graph or len(graph) == 0:
             logger.warning("No triples produced for %s", url)
             return
@@ -395,9 +409,10 @@ async def callback(
             raise RuntimeError(f"SHACL validation failed for {url} in fail mode.")
         await self._write_graph(graph)
         logger.info(
-            "Wrote %s triples for %s [mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]",
+            "Wrote %s triples for %s [mapping_wait=%dms mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]",
             len(graph),
             url,
+            _t_mapping_wait,
             _t_mapping,
             _t_queue_wait,
             _t_postprocessors,
diff --git a/wordlift_sdk/structured_data/engine.py b/wordlift_sdk/structured_data/engine.py
index 9543330..c62ad75 100644
--- a/wordlift_sdk/structured_data/engine.py
+++ b/wordlift_sdk/structured_data/engine.py
@@ -32,26 +32,48 @@
 from wordlift_sdk.validation.shacl import ValidationResult, validate_file
 
 
+import threading
+import time as _time
+
+
 # Top-level worker — must be module-level to be picklable for ProcessPoolExecutor.
-# Each subprocess has its own Python interpreter so pyparsing state is isolated;
-# no lock needed and genuine parallelism is possible.
-def _morph_kgc_worker(config: str) -> str:
+# Accepts submit_time so it can measure queue wait (time spent waiting for a
+# free subprocess slot). Returns (ntriples, queue_wait_ms).
+def _morph_kgc_worker(config: str, submit_time: float) -> tuple[str, int]:
     import morph_kgc as _mkgc
+    import time as _t
 
-    return _mkgc.materialize(config).serialize(format="nt")
+    queue_wait_ms = int((_t.time() - submit_time) * 1000)
+    ntriples = _mkgc.materialize(config).serialize(format="nt")
+    return ntriples, queue_wait_ms
 
 
+# Thread-local used to pass mapping_wait_ms back to the protocol layer without
+# changing the return type of _materialize_graph / apply_mapping.
+_morph_kgc_tls = threading.local()
+
 # Lazy process pool — created on first use in the main process only.
 # Worker subprocesses import this module but never call _get_morph_kgc_pool(),
 # so they do NOT create their own pools (no recursive process explosion).
 _morph_kgc_pool: ProcessPoolExecutor | None = None
 
 
+def init_morph_kgc_pool(max_workers: int) -> None:
+    """Pre-create the morph_kgc process pool with a specific worker count.
+    Call once from the protocol __init__ before any mapping work starts.
+    Subsequent calls are no-ops (pool is only created once).
+    """
+    global _morph_kgc_pool
+    if _morph_kgc_pool is not None:
+        return
+    ctx = multiprocessing.get_context("spawn")
+    _morph_kgc_pool = ProcessPoolExecutor(max_workers=max_workers, mp_context=ctx)
+
+
 def _get_morph_kgc_pool() -> ProcessPoolExecutor:
     global _morph_kgc_pool
     if _morph_kgc_pool is None:
-        # Use "spawn" context to start workers cleanly without inheriting any
-        # locks or file descriptors from the parent process.
+        # Fallback if init_morph_kgc_pool was never called.
         ctx = multiprocessing.get_context("spawn")
         _morph_kgc_pool = ProcessPoolExecutor(
             max_workers=os.cpu_count() or 4,
@@ -1387,7 +1409,14 @@ def _materialize_graph(mapping_path: Path) -> Graph:
         # Submit to subprocess pool — each worker has isolated pyparsing state,
         # so calls are genuinely parallel across CPU cores with no lock needed.
         # .result() blocks the calling thread (not the asyncio event loop).
-        ntriples = _get_morph_kgc_pool().submit(_morph_kgc_worker, config).result()
+        ntriples, queue_wait_ms = (
+            _get_morph_kgc_pool()
+            .submit(_morph_kgc_worker, config, _time.time())
+            .result()
+        )
+        # Store wait time in thread-local so protocol.py can read it without
+        # changing the return type of this function.
+        _morph_kgc_tls.mapping_wait_ms = queue_wait_ms
         graph = Graph()
         graph.parse(data=ntriples, format="nt")
         return graph

From c8ed0602f137eb36815b869bb568f5d3448b412d Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 16:46:42 +0100
Subject: [PATCH 22/63] fix: read morph_kgc queue wait from worker thread via
 closure to avoid thread-local race

---
 wordlift_sdk/kg_build/protocol.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index ba653fe..5d8bcfe 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -328,10 +328,13 @@ async def callback(
         # apply_mapping has no awaits — all work is synchronous (morph_kgc).
         # Run it in a thread so the event loop stays free for I/O while the
         # thread waits for its morph_kgc subprocess slot to become available.
-        _loop = asyncio.get_event_loop()
-        graph = await _loop.run_in_executor(
-            self._mapping_executor,
-            lambda: asyncio.run(
+        # _morph_kgc_tls is thread-local: capture it inside the worker thread
+        # and pass the value back via a closure dict.
+        _timing: dict[str, int] = {}
+
+        def _run_mapping() -> Graph | None:
+            _t_start = time.perf_counter()
+            result = asyncio.run(
                 self.rml_service.apply_mapping(
                     html=response.web_page.html,
                     url=url,
@@ -340,10 +343,18 @@ async def callback(
                     response=mapping_response,
                     debug_output=debug_output,
                 )
-            ),
-        )
-        _t_mapping = int((time.perf_counter() - _t0) * 1000)
-        _t_mapping_wait = getattr(_morph_kgc_tls, "mapping_wait_ms", 0)
+            )
+            mw = getattr(_morph_kgc_tls, "mapping_wait_ms", 0)
+            _timing["mapping_wait_ms"] = mw
+            # Subtract queue-wait so mapping= shows actual execution time only,
+            # consistent with how validation_wait/validation are reported.
+            _timing["mapping_ms"] = int((time.perf_counter() - _t_start) * 1000) - mw
+            return result
+
+        _loop = asyncio.get_event_loop()
+        graph = await _loop.run_in_executor(self._mapping_executor, _run_mapping)
+        _t_mapping = _timing.get("mapping_ms", int((time.perf_counter() - _t0) * 1000))
+        _t_mapping_wait = _timing.get("mapping_wait_ms", 0)
         if not graph or len(graph) == 0:
             logger.warning("No triples produced for %s", url)
             return

From 8a0a34e6771ee2c6a11b282998e5531cfb8abb4a Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 17:10:13 +0100
Subject: [PATCH 23/63] perf: reuse a single persistent ApiClient across
 requests instead of creating one per graph

---
 wordlift_sdk/protocol/graph/graph_queue.py | 53 +++++++++++++++++-----
 1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/wordlift_sdk/protocol/graph/graph_queue.py b/wordlift_sdk/protocol/graph/graph_queue.py
index ff14016..053360e 100644
--- a/wordlift_sdk/protocol/graph/graph_queue.py
+++ b/wordlift_sdk/protocol/graph/graph_queue.py
@@ -26,6 +26,37 @@ class GraphQueue:
     def __init__(self, client_configuration: Configuration):
         self.client_configuration = client_configuration
         self.hashes = set()
+        self._api_client: wordlift_client.ApiClient | None = None
+        self._api_client_lock: asyncio.Lock | None = None
+
+    async def _get_api_client(self) -> wordlift_client.ApiClient:
+        # Lazy-init the lock (must be created on the event loop).
+        if self._api_client_lock is None:
+            self._api_client_lock = asyncio.Lock()
+        if self._api_client is not None:
+            return self._api_client
+        async with self._api_client_lock:
+            if self._api_client is None:
+                # ApiClient.__init__ calls ssl.create_default_context() synchronously.
+                # Run it in a thread so the event loop isn't blocked during cert loading.
+                loop = asyncio.get_event_loop()
+                client = await loop.run_in_executor(
+                    None,
+                    lambda: wordlift_client.ApiClient(
+                        configuration=self.client_configuration
+                    ),
+                )
+                await client.__aenter__()
+                self._api_client = client
+        return self._api_client
+
+    async def close(self) -> None:
+        if self._api_client is not None:
+            try:
+                await self._api_client.__aexit__(None, None, None)
+            except Exception:
+                pass
+            self._api_client = None
 
     @retry(
         stop=stop_after_attempt(5),
@@ -50,19 +81,17 @@ async def put(self, graph: Graph) -> None:
         if hash not in self.hashes:
             self.hashes.add(hash)
 
-            async with wordlift_client.ApiClient(
-                configuration=self.client_configuration
-            ) as api_client:
-                api_instance = wordlift_client.EntitiesApi(api_client)
+            api_client = await self._get_api_client()
+            api_instance = wordlift_client.EntitiesApi(api_client)
 
-                try:
-                    await api_instance.create_or_update_entities(
-                        graph.serialize(format="turtle"),
-                        _content_type="text/turtle",
-                    )
-                except Exception as e:
-                    logger.error(f"Failed to create entities: {e}", exc_info=e)
-                    raise e
+            try:
+                await api_instance.create_or_update_entities(
+                    graph.serialize(format="turtle"),
+                    _content_type="text/turtle",
+                )
+            except Exception as e:
+                logger.error(f"Failed to create entities: {e}", exc_info=e)
+                raise e
 
     @staticmethod
     def hash_graph(graph: Graph) -> str:

From a05328e8ef1f3e2058a90d08a053eecd45142945 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Wed, 18 Mar 2026 17:19:42 +0100
Subject: [PATCH 24/63] fix: create ApiClient directly on the event loop thread
 instead of in an executor

---
 wordlift_sdk/protocol/graph/graph_queue.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/wordlift_sdk/protocol/graph/graph_queue.py b/wordlift_sdk/protocol/graph/graph_queue.py
index 053360e..56e818a 100644
--- a/wordlift_sdk/protocol/graph/graph_queue.py
+++ b/wordlift_sdk/protocol/graph/graph_queue.py
@@ -37,14 +37,11 @@ async def _get_api_client(self) -> wordlift_client.ApiClient:
             return self._api_client
         async with self._api_client_lock:
             if self._api_client is None:
-                # ApiClient.__init__ calls ssl.create_default_context() synchronously.
-                # Run it in a thread so the event loop isn't blocked during cert loading.
-                loop = asyncio.get_event_loop()
-                client = await loop.run_in_executor(
-                    None,
-                    lambda: wordlift_client.ApiClient(
-                        configuration=self.client_configuration
-                    ),
+                # ApiClient.__init__ calls ssl.create_default_context() synchronously
+                # and must run on the event loop thread (it calls asyncio internals).
+                # Creating it once and caching avoids repeated SSL cert loading per put().
+                client = wordlift_client.ApiClient(
+                    configuration=self.client_configuration
                 )
                 await client.__aenter__()
                 self._api_client = client

From b358a10f387251286da5c108cf9545f4b6b66ebc Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 10:43:28 +0100
Subject: [PATCH 25/63] refactor: make load_shapes_graph and
 normalize_schema_org_uris public in shacl module

---
 wordlift_sdk/graph/audit/_entity_matrix.py |  4 ++--
 wordlift_sdk/kg_build/protocol.py          | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/wordlift_sdk/graph/audit/_entity_matrix.py b/wordlift_sdk/graph/audit/_entity_matrix.py
index 1c8101e..23b3048 100644
--- a/wordlift_sdk/graph/audit/_entity_matrix.py
+++ b/wordlift_sdk/graph/audit/_entity_matrix.py
@@ -15,7 +15,7 @@
     _find_webpage_urls,
 )
 from wordlift_sdk.validation.shacl import (
-    _normalize_schema_org_uris,  # type: ignore[attr-defined]
+    normalize_schema_org_uris,
 )
 
 _SCHEMA_ORG_PREFIXES = ("http://schema.org/", "https://schema.org/")
@@ -120,7 +120,7 @@ def build_entity_matrix(
     excl: set[str] = set(exclude_types or [])
 
     load_result = load_graph(path)
-    normalized = _normalize_schema_org_uris(load_result.graph)
+    normalized = normalize_schema_org_uris(load_result.graph)
     webpage_urls = _find_webpage_urls(normalized)
 
     if not webpage_urls:
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 5d8bcfe..104e5af 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -24,8 +24,8 @@
 from rdflib.namespace import SH
 from wordlift_sdk.validation.shacl import (
     ValidationResult,
-    _load_shapes_graph,
-    _normalize_schema_org_uris,
+    load_shapes_graph,
+    normalize_schema_org_uris,
     resolve_shape_specs,
 )
 
@@ -59,7 +59,7 @@ def _path_contains_part(path: str, part: str) -> bool:
 
 def _init_shacl_worker(shape_specs: list[str] | None) -> None:
     global _shacl_worker_shapes_graph, _shacl_worker_source_map
-    _shacl_worker_shapes_graph, _shacl_worker_source_map = _load_shapes_graph(
+    _shacl_worker_shapes_graph, _shacl_worker_source_map = load_shapes_graph(
         shape_specs
     )
 
@@ -69,7 +69,7 @@ def _shacl_validate_in_worker(ntriples: str, submit_time: float) -> dict:
     _t_start = time.perf_counter()
     data_graph = Graph()
     data_graph.parse(data=ntriples, format="nt")
-    data_graph = _normalize_schema_org_uris(data_graph)
+    data_graph = normalize_schema_org_uris(data_graph)
     conforms, report_graph, _ = pyshacl_validate(
         data_graph,
         shacl_graph=_shacl_worker_shapes_graph,
@@ -920,7 +920,7 @@ def _validate_graph_if_enabled(
         return summary
 
     def _validate_graph(self, graph: Graph) -> ValidationResult:
-        data_graph = _normalize_schema_org_uris(graph)
+        data_graph = normalize_schema_org_uris(graph)
         conforms, report_graph, report_text = pyshacl_validate(
             data_graph,
             shacl_graph=self._shacl_shapes_graph,

From 61c0e367fc247602742145bb4ef3ff8efc818c6f Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 11:17:54 +0100
Subject: [PATCH 26/63] feat: add ShaclValidationService to validation package

---
 .../validation/shacl_validation_service.py    | 166 ++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 wordlift_sdk/validation/shacl_validation_service.py

diff --git a/wordlift_sdk/validation/shacl_validation_service.py b/wordlift_sdk/validation/shacl_validation_service.py
new file mode 100644
index 0000000..60e01f7
--- /dev/null
+++ b/wordlift_sdk/validation/shacl_validation_service.py
@@ -0,0 +1,166 @@
+from __future__ import annotations
+
+import asyncio
+import concurrent.futures
+import functools
+import logging
+import time
+from concurrent.futures import ProcessPoolExecutor
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any
+
+from pyshacl import validate as pyshacl_validate
+from rdflib import Graph
+from rdflib.namespace import SH
+
+from wordlift_sdk.validation.shacl import load_shapes_graph, normalize_schema_org_uris
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_VALIDATION_TIMEOUT_SECONDS = 120.0
+
+
+class ValidationMode(str, Enum):
+    OFF = "off"
+    WARN = "warn"
+    FAIL = "fail"
+
+
+# Module-level worker state — one copy per subprocess, initialised by _init_worker.
+# The worker functions below are module-level so ProcessPoolExecutor can pickle them.
+_worker_shapes_graph: Graph | None = None
+_worker_source_map: dict = {}
+
+
+def _init_worker(shape_specs: list[str] | None) -> None:
+    global _worker_shapes_graph, _worker_source_map
+    _worker_shapes_graph, _worker_source_map = load_shapes_graph(shape_specs)
+
+
+def _validate_in_worker(ntriples: str, submit_time: float) -> dict:
+    queue_wait_ms = int((time.time() - submit_time) * 1000)
+    t_start = time.perf_counter()
+
+    data_graph = Graph()
+    data_graph.parse(data=ntriples, format="nt")
+    data_graph = normalize_schema_org_uris(data_graph)
+
+    conforms, report_graph, _ = pyshacl_validate(
+        data_graph,
+        shacl_graph=_worker_shapes_graph,
+        inference="rdfs",
+        abort_on_first=False,
+        allow_infos=True,
+        allow_warnings=True,
+    )
+
+    warning_sources: dict[str, int] = {}
+    error_sources: dict[str, int] = {}
+    for node in report_graph.subjects(SH.resultSeverity, SH.Warning):
+        shape = next(report_graph.objects(node, SH.sourceShape), None)
+        label = _worker_source_map.get(shape, "unknown")
+        warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1
+    for node in report_graph.subjects(SH.resultSeverity, SH.Violation):
+        shape = next(report_graph.objects(node, SH.sourceShape), None)
+        label = _worker_source_map.get(shape, "unknown")
+        error_sources[str(label)] = error_sources.get(str(label), 0) + 1
+
+    return {
+        "passed": bool(conforms),
+        "warning_sources": dict(sorted(warning_sources.items())),
+        "error_sources": dict(sorted(error_sources.items())),
+        "queue_wait_ms": queue_wait_ms,
+        "validation_ms": int((time.perf_counter() - t_start) * 1000),
+    }
+
+
+@dataclass
+class ValidationOutcome:
+    passed: bool
+    warning_sources: dict[str, int]
+    error_sources: dict[str, int]
+    queue_wait_ms: int
+    validation_ms: int
+
+    @property
+    def failed(self) -> bool:
+        return not self.passed
+
+    @property
+    def warning_count(self) -> int:
+        return sum(self.warning_sources.values())
+
+    @property
+    def error_count(self) -> int:
+        return sum(self.error_sources.values())
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "pass": self.passed,
+            "fail": self.failed,
+            "warnings": {"count": self.warning_count, "sources": self.warning_sources},
+            "errors": {"count": self.error_count, "sources": self.error_sources},
+        }
+
+
+class ShaclValidationService:
+    def __init__(
+        self,
+        shape_specs: list[str] | None,
+        mode: ValidationMode,
+        pool_size: int = 1,
+        timeout_seconds: float = DEFAULT_VALIDATION_TIMEOUT_SECONDS,
+    ) -> None:
+        self._mode = mode
+        self._timeout_seconds = timeout_seconds
+        self._executor: ProcessPoolExecutor | None = None
+        if mode != ValidationMode.OFF:
+            self._executor = ProcessPoolExecutor(
+                max_workers=pool_size,
+                initializer=_init_worker,
+                initargs=(shape_specs,),
+            )
+            logger.info(
+                "Created SHACL process pool with %d workers (mode=%s)",
+                pool_size,
+                mode,
+            )
+
+    @property
+    def mode(self) -> ValidationMode:
+        return self._mode
+
+    async def validate(self, graph: Graph) -> ValidationOutcome | None:
+        """Validate *graph* against the configured SHACL shapes.
+
+        Returns ``None`` when validation is disabled (mode=off) or skipped due
+        to a timeout or broken executor.
+        """
+        if self._mode == ValidationMode.OFF or self._executor is None:
+            return None
+        ntriples = graph.serialize(format="nt")
+        loop = asyncio.get_event_loop()
+        try:
+            result = await asyncio.wait_for(
+                loop.run_in_executor(
+                    self._executor,
+                    functools.partial(_validate_in_worker, ntriples, time.time()),
+                ),
+                timeout=self._timeout_seconds,
+            )
+        except (asyncio.TimeoutError, concurrent.futures.BrokenExecutor) as exc:
+            logger.warning("SHACL validation skipped: %s (%s)", type(exc).__name__, exc)
+            return None
+        return ValidationOutcome(
+            passed=result["passed"],
+            warning_sources=result["warning_sources"],
+            error_sources=result["error_sources"],
+            queue_wait_ms=result["queue_wait_ms"],
+            validation_ms=result["validation_ms"],
+        )
+
+    def close(self) -> None:
+        if self._executor is not None:
+            self._executor.shutdown(wait=False)
+            self._executor = None

From 635dd51d70369a9a2b2b2b159d3cf8c0467b0579 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 11:34:04 +0100
Subject: [PATCH 27/63] refactor: wire ShaclValidationService into
 ProfileImportProtocol

---
 wordlift_sdk/kg_build/protocol.py | 319 +++++++-----------------------
 1 file changed, 70 insertions(+), 249 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 104e5af..e36026c 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -6,8 +6,7 @@
 import logging
 import os
 import time
-import concurrent.futures
-from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import asdict
 from pathlib import Path
 from types import SimpleNamespace
@@ -20,13 +19,11 @@
 from wordlift_sdk.protocol.web_page_import_protocol import (
     WebPageImportProtocolInterface,
 )
-from pyshacl import validate as pyshacl_validate
-from rdflib.namespace import SH
-from wordlift_sdk.validation.shacl import (
-    ValidationResult,
-    load_shapes_graph,
-    normalize_schema_org_uris,
-    resolve_shape_specs,
+from wordlift_sdk.validation.shacl import resolve_shape_specs
+from wordlift_sdk.validation.shacl_validation_service import (
+    ShaclValidationService,
+    ValidationMode,
+    ValidationOutcome,
 )
 
 from .config import ProfileDefinition
@@ -52,63 +49,6 @@ def _path_contains_part(path: str, part: str) -> bool:
     return part in Path(path).parts
 
 
-# Module-level state for SHACL worker processes (one copy per process)
-_shacl_worker_shapes_graph: Graph | None = None
-_shacl_worker_source_map: dict = {}
-
-
-def _init_shacl_worker(shape_specs: list[str] | None) -> None:
-    global _shacl_worker_shapes_graph, _shacl_worker_source_map
-    _shacl_worker_shapes_graph, _shacl_worker_source_map = load_shapes_graph(
-        shape_specs
-    )
-
-
-def _shacl_validate_in_worker(ntriples: str, submit_time: float) -> dict:
-    _queue_wait_ms = int((time.time() - submit_time) * 1000)
-    _t_start = time.perf_counter()
-    data_graph = Graph()
-    data_graph.parse(data=ntriples, format="nt")
-    data_graph = normalize_schema_org_uris(data_graph)
-    conforms, report_graph, _ = pyshacl_validate(
-        data_graph,
-        shacl_graph=_shacl_worker_shapes_graph,
-        inference="rdfs",
-        abort_on_first=False,
-        allow_infos=True,
-        allow_warnings=True,
-    )
-    warning_sources: dict[str, int] = {}
-    error_sources: dict[str, int] = {}
-    warning_count = 0
-    error_count = 0
-    for node in report_graph.subjects(SH.resultSeverity, SH.Warning):
-        warning_count += 1
-        shape = next(report_graph.objects(node, SH.sourceShape), None)
-        label = _shacl_worker_source_map.get(shape, "unknown")
-        warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1
-    for node in report_graph.subjects(SH.resultSeverity, SH.Violation):
-        error_count += 1
-        shape = next(report_graph.objects(node, SH.sourceShape), None)
-        label = _shacl_worker_source_map.get(shape, "unknown")
-        error_sources[str(label)] = error_sources.get(str(label), 0) + 1
-    return {
-        "total": 1,
-        "pass": bool(conforms),
-        "fail": not bool(conforms),
-        "warnings": {
-            "count": warning_count,
-            "sources": dict(sorted(warning_sources.items())),
-        },
-        "errors": {
-            "count": error_count,
-            "sources": dict(sorted(error_sources.items())),
-        },
-        "_queue_wait_ms": _queue_wait_ms,
-        "_validation_ms": int((time.perf_counter() - _t_start) * 1000),
-    }
-
-
 def _resolve_postprocessor_runtime(settings: dict[str, Any]) -> str:
     value = settings.get("postprocessor_runtime")
     if value is None:
@@ -226,7 +166,7 @@ def __init__(
                     runtime=self._postprocessor_runtime,
                 )
             )
-        self._shacl_mode = self._resolve_validation_mode(
+        shacl_mode = self._resolve_validation_mode(
             self.profile.settings.get(
                 "shacl_validate_mode",
                 self.profile.settings.get("SHACL_VALIDATE_MODE", "warn"),
@@ -254,29 +194,17 @@ def __init__(
             exclude_builtin_shapes=shacl_exclude_builtin_shapes or None,
             extra_shapes=shacl_extra_shapes or None,
         )
-        if self._shacl_mode != "off":
-            _shacl_pool_size = int(
-                self.profile.settings.get(
-                    "shacl_pool_size",
-                    self.profile.settings.get(
-                        "SHACL_POOL_SIZE", max(2, _pool_size // 2)
-                    ),
-                )
-            )
-            self._process_executor: ProcessPoolExecutor | None = ProcessPoolExecutor(
-                max_workers=_shacl_pool_size,
-                initializer=_init_shacl_worker,
-                initargs=(
-                    self._shacl_shape_specs if self._shacl_shape_specs else None,
-                ),
-            )
-            logger.info(
-                "Created SHACL process pool with %d workers for profile '%s'",
-                _shacl_pool_size,
-                self.profile.name,
+        _shacl_pool_size = int(
+            self.profile.settings.get(
+                "shacl_pool_size",
+                self.profile.settings.get("SHACL_POOL_SIZE", max(2, _pool_size // 2)),
             )
-        else:
-            self._process_executor = None
+        )
+        self._shacl_validator = ShaclValidationService(
+            shape_specs=self._shacl_shape_specs or None,
+            mode=shacl_mode,
+            pool_size=_shacl_pool_size,
+        )
         self._import_hash_mode = self._resolve_import_hash_mode(
             self.profile.settings.get(
                 "import_hash_mode",
@@ -285,7 +213,7 @@ def __init__(
         )
         self._kpi = KgBuildKpiCollector(
             dataset_uri=getattr(self.context.account, "dataset_uri", None),
-            validation_enabled=self._shacl_mode != "off",
+            validation_enabled=self._shacl_validator.mode != ValidationMode.OFF,
         )
         logger.debug(
             "Resolved mappings for profile '%s': effective_dir=%s (origin=%s), routes=%s (origin=%s), overlay_dirs=%s",
@@ -396,11 +324,24 @@ def _run_mapping() -> Graph | None:
             )
             self._write_debug_graph(graph, url)
 
-        (
-            validation_payload,
-            _t_validation_wait,
-            _t_validation_actual,
-        ) = await self._async_validate_if_enabled(loop, graph, url)
+        outcome: ValidationOutcome | None = await self._shacl_validator.validate(graph)
+        if outcome is not None:
+            logger.info(
+                "SHACL validation for %s: pass=%s warnings=%d errors=%d",
+                url,
+                outcome.passed,
+                outcome.warning_count,
+                outcome.error_count,
+            )
+            self._kpi.record_validation(
+                passed=outcome.passed,
+                warning_count=outcome.warning_count,
+                error_count=outcome.error_count,
+                warning_sources=outcome.warning_sources,
+                error_sources=outcome.error_sources,
+            )
+        _t_validation_wait = outcome.queue_wait_ms if outcome else 0
+        _t_validation_actual = outcome.validation_ms if outcome else 0
         graph_metrics = self._kpi.graph_metrics(graph)
         self._emit_progress(
             {
@@ -408,14 +349,14 @@ def _run_mapping() -> Graph | None:
                 "profile": self.profile.name,
                 "url": url,
                 "graph": graph_metrics,
-                "validation": validation_payload,
+                "validation": outcome.to_dict() if outcome else None,
             }
         )
         self._kpi.record_graph(graph)
         if (
-            validation_payload is not None
-            and self._shacl_mode == "fail"
-            and not validation_payload["pass"]
+            outcome is not None
+            and self._shacl_validator.mode == ValidationMode.FAIL
+            and outcome.failed
         ):
             raise RuntimeError(f"SHACL validation failed for {url} in fail mode.")
         await self._write_graph(graph)
@@ -439,8 +380,7 @@ def close(self) -> None:
                 break
         self._pp_executor.shutdown(wait=False)
         self._mapping_executor.shutdown(wait=False)
-        if self._process_executor is not None:
-            self._process_executor.shutdown(wait=False)
+        self._shacl_validator.close()
 
     def get_kpi_summary(self) -> dict[str, object]:
         return self._kpi.summary(self.profile.name)
@@ -477,23 +417,34 @@ async def _patch_static_templates_once(self) -> None:
 
             self._ensure_templates_loaded()
             if self._template_graph and len(self._template_graph) > 0:
-                _loop = asyncio.get_event_loop()
-                validation_payload, _, _ = await self._async_validate_if_enabled(
-                    _loop, self._template_graph, "static_templates"
-                )
+                outcome = await self._shacl_validator.validate(self._template_graph)
+                if outcome is not None:
+                    logger.info(
+                        "SHACL validation for static_templates: pass=%s warnings=%d errors=%d",
+                        outcome.passed,
+                        outcome.warning_count,
+                        outcome.error_count,
+                    )
+                    self._kpi.record_validation(
+                        passed=outcome.passed,
+                        warning_count=outcome.warning_count,
+                        error_count=outcome.error_count,
+                        warning_sources=outcome.warning_sources,
+                        error_sources=outcome.error_sources,
+                    )
                 self._emit_progress(
                     {
                         "kind": "static_templates",
                         "profile": self.profile.name,
                         "graph": self._kpi.graph_metrics(self._template_graph),
-                        "validation": validation_payload,
+                        "validation": outcome.to_dict() if outcome else None,
                     }
                 )
                 self._kpi.record_graph(self._template_graph)
                 if (
-                    validation_payload is not None
-                    and self._shacl_mode == "fail"
-                    and not validation_payload["pass"]
+                    outcome is not None
+                    and self._shacl_validator.mode == ValidationMode.FAIL
+                    and outcome.failed
                 ):
                     raise RuntimeError(
                         "SHACL validation failed for static templates in fail mode."
@@ -856,139 +807,6 @@ def _mapping_response(
             web_page=response.web_page,
         )
 
-    async def _async_validate_if_enabled(
-        self, loop: Any, graph: Graph, url: str
-    ) -> tuple[dict[str, Any] | None, int, int]:
-        if self._shacl_mode == "off":
-            return None, 0, 0
-        ntriples = graph.serialize(format="nt")
-        try:
-            result = await asyncio.wait_for(
-                loop.run_in_executor(
-                    self._process_executor,
-                    functools.partial(_shacl_validate_in_worker, ntriples, time.time()),
-                ),
-                timeout=120.0,
-            )
-        except (asyncio.TimeoutError, concurrent.futures.BrokenExecutor) as exc:
-            logger.warning(
-                "SHACL validation skipped for %s: %s (%s)",
-                url,
-                type(exc).__name__,
-                exc,
-            )
-            return None, 0, 0
-        validation_queue_wait_ms = result.pop("_queue_wait_ms", 0)
-        validation_ms = result.pop("_validation_ms", 0)
-        self._kpi.record_validation(
-            passed=result["pass"],
-            warning_count=result["warnings"]["count"],
-            error_count=result["errors"]["count"],
-            warning_sources=result["warnings"]["sources"],
-            error_sources=result["errors"]["sources"],
-        )
-        logger.info(
-            "SHACL validation for %s: pass=%s warnings=%s errors=%s",
-            url,
-            result["pass"],
-            result["warnings"]["count"],
-            result["errors"]["count"],
-        )
-        return result, validation_queue_wait_ms, validation_ms
-
-    def _validate_graph_if_enabled(
-        self, graph: Graph, url: str
-    ) -> dict[str, Any] | None:
-        if self._shacl_mode == "off":
-            return None
-        result = self._validate_graph(graph)
-        summary = self._summarize_validation(result)
-        self._kpi.record_validation(
-            passed=summary["pass"],
-            warning_count=summary["warnings"]["count"],
-            error_count=summary["errors"]["count"],
-            warning_sources=summary["warnings"]["sources"],
-            error_sources=summary["errors"]["sources"],
-        )
-        logger.info(
-            "SHACL validation for %s: pass=%s warnings=%s errors=%s",
-            url,
-            summary["pass"],
-            summary["warnings"]["count"],
-            summary["errors"]["count"],
-        )
-        return summary
-
-    def _validate_graph(self, graph: Graph) -> ValidationResult:
-        data_graph = normalize_schema_org_uris(graph)
-        conforms, report_graph, report_text = pyshacl_validate(
-            data_graph,
-            shacl_graph=self._shacl_shapes_graph,
-            inference="rdfs",
-            abort_on_first=False,
-            allow_infos=True,
-            allow_warnings=True,
-        )
-        warning_count = sum(
-            1 for _ in report_graph.subjects(SH.resultSeverity, SH.Warning)
-        )
-        return ValidationResult(
-            conforms=conforms,
-            report_text=report_text,
-            report_graph=report_graph,
-            data_graph=data_graph,
-            shape_source_map=self._shacl_source_map,
-            warning_count=warning_count,
-        )
-
-    def _summarize_validation(self, result: ValidationResult) -> dict[str, Any]:
-        sh = URIRef("http://www.w3.org/ns/shacl#")
-        sh_warning = URIRef(f"{sh}Warning")
-        sh_violation = URIRef(f"{sh}Violation")
-        sh_source_shape = URIRef(f"{sh}sourceShape")
-
-        warning_sources: dict[str, int] = {}
-        error_sources: dict[str, int] = {}
-        warning_count = 0
-        error_count = 0
-
-        for report_node in result.report_graph.subjects(
-            URIRef(f"{sh}resultSeverity"), sh_warning
-        ):
-            warning_count += 1
-            shape = next(
-                result.report_graph.objects(report_node, sh_source_shape), None
-            )
-            label = result.shape_source_map.get(shape, "unknown")
-            warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1
-
-        for report_node in result.report_graph.subjects(
-            URIRef(f"{sh}resultSeverity"), sh_violation
-        ):
-            error_count += 1
-            shape = next(
-                result.report_graph.objects(report_node, sh_source_shape), None
-            )
-            label = result.shape_source_map.get(shape, "unknown")
-            error_sources[str(label)] = error_sources.get(str(label), 0) + 1
-
-        return {
-            "total": 1,
-            "pass": bool(result.conforms),
-            "fail": not bool(result.conforms),
-            "warnings": {
-                "count": warning_count,
-                "sources": dict(
-                    sorted(warning_sources.items(), key=lambda item: item[0])
-                ),
-            },
-            "errors": {
-                "count": error_count,
-                "sources": dict(
-                    sorted(error_sources.items(), key=lambda item: item[0])
-                ),
-            },
-        }
 
     def _emit_progress(self, payload: dict[str, Any]) -> None:
         if not callable(self._on_progress):
@@ -1012,19 +830,22 @@ def _resolve_list_setting(self, value: Any) -> list[str]:
             return specs
         return [str(value).strip()] if str(value).strip() else []
 
-    def _resolve_validation_mode(self, value: Any) -> str:
+    def _resolve_validation_mode(self, value: Any) -> ValidationMode:
         if value is None:
-            return "warn"
+            return ValidationMode.WARN
         mode = str(value).strip().lower()
         if mode == "strict":
             logger.warning(
                 "Deprecated SHACL validation mode 'strict' detected; using 'fail'."
             )
-            return "fail"
-        if mode in {"off", "warn", "fail"}:
-            return mode
-        logger.warning("Unsupported SHACL validation mode '%s'; using 'warn'.", mode)
-        return "warn"
+            return ValidationMode.FAIL
+        try:
+            return ValidationMode(mode)
+        except ValueError:
+            logger.warning(
+                "Unsupported SHACL validation mode '%s'; using 'warn'.", mode
+            )
+            return ValidationMode.WARN
 
     def _resolve_import_hash_mode(self, value: Any) -> str:
         if value is None:

From e5afb05963b55c5a13eb19714086570255178338 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 12:09:36 +0100
Subject: [PATCH 28/63] refactor: delegate load_postprocessors_for_profile to
 load_postprocessors

---
 wordlift_sdk/kg_build/postprocessors.py | 82 +++++++++++--------------
 wordlift_sdk/kg_build/protocol.py       | 21 +++----
 wordlift_sdk/kg_build/rml_mapping.py    | 21 +++++--
 wordlift_sdk/structured_data/engine.py  |  6 +-
 4 files changed, 63 insertions(+), 67 deletions(-)

diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py
index 4e5a079..f9b7116 100644
--- a/wordlift_sdk/kg_build/postprocessors.py
+++ b/wordlift_sdk/kg_build/postprocessors.py
@@ -418,6 +418,14 @@ def _as_positive_int(value: Any, default: int) -> int:
     return value
 
 
+def _build_handler(
+    spec: PostprocessorSpec, root_dir: Path, runtime: str
+) -> GraphPostprocessor:
+    if runtime == _RUNTIME_INPROCESS:
+        return InProcessPostprocessor(class_path=spec.class_path)
+    return SubprocessPostprocessor(spec=spec, root_dir=root_dir, runtime=runtime)
+
+
 def _normalize_runtime(value: str | None) -> str:
     runtime = (value or _RUNTIME_ONESHOT).strip().lower()
     if runtime not in {_RUNTIME_ONESHOT, _RUNTIME_PERSISTENT, _RUNTIME_INPROCESS}:
@@ -509,6 +517,21 @@ def _build_runner_payload(context: PostprocessorContext) -> dict[str, Any]:
     }
 
 
+def _load_from_specs(
+    specs: list[PostprocessorSpec],
+    root_dir: Path,
+    runtime: str,
+) -> list[LoadedPostprocessor]:
+    return [
+        LoadedPostprocessor(
+            name=spec.class_path,
+            handler=_build_handler(spec, root_dir, runtime),
+        )
+        for spec in specs
+        if spec.enabled
+    ]
+
+
 def load_postprocessors_for_profile(
     *,
     root_dir: Path,
@@ -518,73 +541,38 @@ def load_postprocessors_for_profile(
     base_manifest = root_dir / "profiles" / "_base" / "postprocessors.toml"
     profile_manifest = root_dir / "profiles" / profile_name / "postprocessors.toml"
 
-    selected_manifest: Path | None
     if profile_manifest.exists():
-        selected_manifest = profile_manifest
+        selected_manifest: Path | None = profile_manifest
     elif base_manifest.exists():
         selected_manifest = base_manifest
     else:
         selected_manifest = None
 
-    specs = _load_manifest_specs(selected_manifest) if selected_manifest else []
-
-    resolved_runtime = _normalize_runtime(runtime)
-    loaded: list[LoadedPostprocessor] = []
-    for spec in specs:
-        if not spec.enabled:
-            continue
-        if resolved_runtime == _RUNTIME_INPROCESS:
-            handler: GraphPostprocessor = InProcessPostprocessor(
-                class_path=spec.class_path
-            )
-        else:
-            handler = SubprocessPostprocessor(
-                spec=spec,
-                root_dir=root_dir,
-                runtime=resolved_runtime,
-            )
-        loaded.append(LoadedPostprocessor(name=spec.class_path, handler=handler))
-
-    logger.info(
-        "Loaded %s postprocessors for profile '%s' from manifest: %s (runtime=%s)",
-        len(loaded),
-        profile_name,
-        selected_manifest or "none",
-        resolved_runtime,
-    )
     logger.debug(
-        "Postprocessor manifest precedence for profile '%s': selected=%s base=%s chosen=%s",
+        "Postprocessor manifest precedence for profile '%s': profile=%s base=%s chosen=%s",
         profile_name,
         profile_manifest,
         base_manifest,
         selected_manifest or "none",
     )
-    return loaded
+    return load_postprocessors(selected_manifest, root_dir=root_dir, runtime=runtime)
 
 
 def load_postprocessors(
-    manifest_path: Path,
+    manifest_path: Path | None,
     *,
     root_dir: Path,
     runtime: str | None = None,
 ) -> list[LoadedPostprocessor]:
-    specs = _load_manifest_specs(manifest_path)
+    specs = _load_manifest_specs(manifest_path) if manifest_path else []
     resolved_runtime = _normalize_runtime(runtime)
-    loaded: list[LoadedPostprocessor] = []
-    for spec in specs:
-        if not spec.enabled:
-            continue
-        if resolved_runtime == _RUNTIME_INPROCESS:
-            handler: GraphPostprocessor = InProcessPostprocessor(
-                class_path=spec.class_path
-            )
-        else:
-            handler = SubprocessPostprocessor(
-                spec=spec,
-                root_dir=root_dir,
-                runtime=resolved_runtime,
-            )
-        loaded.append(LoadedPostprocessor(name=spec.class_path, handler=handler))
+    loaded = _load_from_specs(specs, root_dir, resolved_runtime)
+    logger.info(
+        "Loaded %s postprocessors from manifest: %s (runtime=%s)",
+        len(loaded),
+        manifest_path or "none",
+        resolved_runtime,
+    )
     return loaded
 
 
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index e36026c..47065f8 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -36,9 +36,9 @@
     close_loaded_postprocessors,
     load_postprocessors_for_profile,
 )
-from .rml_mapping import RmlMappingService
+from .rml_mapping import MappingResult, RmlMappingService
 from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer
-from wordlift_sdk.structured_data.engine import init_morph_kgc_pool, _morph_kgc_tls
+from wordlift_sdk.structured_data.engine import init_morph_kgc_pool
 
 logger = logging.getLogger(__name__)
 SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source")
@@ -252,17 +252,13 @@ async def callback(
         mapping_response = self._mapping_response(response, existing_web_page_id)
         debug_output: dict[str, str] | None = {} if self.debug_dir else None
 
-        _t0 = time.perf_counter()
         # apply_mapping has no awaits — all work is synchronous (morph_kgc).
         # Run it in a thread so the event loop stays free for I/O while the
         # thread waits for its morph_kgc subprocess slot to become available.
-        # _morph_kgc_tls is thread-local: capture it inside the worker thread
-        # and pass the value back via a closure dict.
         _timing: dict[str, int] = {}
 
         def _run_mapping() -> Graph | None:
-            _t_start = time.perf_counter()
-            result = asyncio.run(
+            mapping: MappingResult = asyncio.run(
                 self.rml_service.apply_mapping(
                     html=response.web_page.html,
                     url=url,
@@ -272,16 +268,13 @@ def _run_mapping() -> Graph | None:
                     debug_output=debug_output,
                 )
             )
-            mw = getattr(_morph_kgc_tls, "mapping_wait_ms", 0)
-            _timing["mapping_wait_ms"] = mw
-            # Subtract queue-wait so mapping= shows actual execution time only,
-            # consistent with how validation_wait/validation are reported.
-            _timing["mapping_ms"] = int((time.perf_counter() - _t_start) * 1000) - mw
-            return result
+            _timing["mapping_wait_ms"] = mapping.queue_wait_ms
+            _timing["mapping_ms"] = mapping.mapping_ms
+            return mapping.graph
 
         _loop = asyncio.get_event_loop()
         graph = await _loop.run_in_executor(self._mapping_executor, _run_mapping)
-        _t_mapping = _timing.get("mapping_ms", int((time.perf_counter() - _t0) * 1000))
+        _t_mapping = _timing.get("mapping_ms", 0)
         _t_mapping_wait = _timing.get("mapping_wait_ms", 0)
         if not graph or len(graph) == 0:
             logger.warning("No triples produced for %s", url)
diff --git a/wordlift_sdk/kg_build/rml_mapping.py b/wordlift_sdk/kg_build/rml_mapping.py
index 5b40a91..b666ab6 100644
--- a/wordlift_sdk/kg_build/rml_mapping.py
+++ b/wordlift_sdk/kg_build/rml_mapping.py
@@ -4,17 +4,27 @@
 import logging
 import os
 import tempfile
+import time
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
 from rdflib import Graph
 from wordlift_sdk.protocol import Context
+from wordlift_sdk.structured_data.engine import _morph_kgc_tls
 from wordlift_sdk.structured_data.materialization import MaterializationPipeline
 from wordlift_sdk.utils.html_converter import HtmlConverter
 
 logger = logging.getLogger(__name__)
 
 
+@dataclass(frozen=True)
+class MappingResult:
+    graph: Graph | None
+    queue_wait_ms: int
+    mapping_ms: int
+
+
 class RmlMappingService:
     def __init__(self, context: Context) -> None:
         self._context = context
@@ -32,7 +42,9 @@ async def apply_mapping(
         mapping_content: str | None = None,
         response: object | None = None,
         debug_output: dict[str, str] | None = None,
-    ) -> Graph | None:
+    ) -> MappingResult:
+        queue_wait_ms = 0
+        _t_start = time.perf_counter()
         try:
             xhtml_str = xhtml or self.to_xhtml(html)
             if debug_output is not None:
@@ -50,7 +62,7 @@ async def apply_mapping(
                             resolved_mapping_content = f.read()
                     except FileNotFoundError:
                         logger.error("Mapping file not found: %s", mapping_file_path)
-                        return None
+                        return MappingResult(graph=None, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000))
 
                 dataset_uri = getattr(self._context.account, "dataset_uri", None)
                 if not dataset_uri:
@@ -70,6 +82,7 @@ async def apply_mapping(
                     url=url,
                     response=response,
                 )
+                queue_wait_ms = getattr(_morph_kgc_tls, "mapping_wait_ms", 0)
                 jsonld_data = pipeline.postprocess(
                     jsonld_raw,
                     mappings,
@@ -93,7 +106,7 @@ async def apply_mapping(
                         "No triples generated from mapping %s.", mapping_file_path
                     )
 
-                return graph
+                return MappingResult(graph=graph, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000) - queue_wait_ms)
 
         except Exception as exc:
             logger.error(
@@ -102,7 +115,7 @@ async def apply_mapping(
                 exc,
                 exc_info=True,
             )
-            return None
+            return MappingResult(graph=None, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000))
 
     def _normalize_schema_uris(self, payload: Any):
         if isinstance(payload, dict):
diff --git a/wordlift_sdk/structured_data/engine.py b/wordlift_sdk/structured_data/engine.py
index c62ad75..3d27952 100644
--- a/wordlift_sdk/structured_data/engine.py
+++ b/wordlift_sdk/structured_data/engine.py
@@ -48,8 +48,10 @@ def _morph_kgc_worker(config: str, submit_time: float) -> tuple[str, int]:
     return ntriples, queue_wait_ms
 
 
-# Thread-local used to pass mapping_wait_ms back to the protocol layer without
-# changing the return type of _materialize_graph / apply_mapping.
+# Thread-local used to pass mapping_wait_ms out of _materialize_graph without
+# changing the return type of the public materialization API.
+# Consumed by rml_mapping.RmlMappingService.apply_mapping — callers above that
+# layer receive the timing as a regular return value.
 _morph_kgc_tls = threading.local()
 
 # Lazy process pool — created on first use in the main process only.

From bb1c123a369d6ea8ee3ad1f5a0ac0254af24c6bf Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 12:13:37 +0100
Subject: [PATCH 29/63] refactor: replace runtime string constants with
 PostprocessorRuntime enum

---
 wordlift_sdk/kg_build/postprocessors.py | 28 ++++++++++++++-----------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py
index f9b7116..c73574a 100644
--- a/wordlift_sdk/kg_build/postprocessors.py
+++ b/wordlift_sdk/kg_build/postprocessors.py
@@ -10,6 +10,7 @@
 import subprocess
 import tempfile
 from dataclasses import dataclass, field
+from enum import Enum
 from pathlib import Path
 from typing import Any, Protocol, runtime_checkable
 
@@ -22,9 +23,11 @@
 except ModuleNotFoundError:  # pragma: no cover
     import tomli as tomllib
 
-_RUNTIME_ONESHOT = "oneshot"
-_RUNTIME_PERSISTENT = "persistent"
-_RUNTIME_INPROCESS = "inprocess"
+
+class PostprocessorRuntime(str, Enum):
+    ONESHOT = "oneshot"
+    PERSISTENT = "persistent"
+    INPROCESS = "inprocess"
 
 
 @dataclass(frozen=True)
@@ -256,7 +259,7 @@ def _terminate(self, process: subprocess.Popen[str]) -> None:
 class SubprocessPostprocessor:
     spec: PostprocessorSpec
     root_dir: Path
-    runtime: str = _RUNTIME_ONESHOT
+    runtime: PostprocessorRuntime = PostprocessorRuntime.ONESHOT
     _persistent_client: PersistentPostprocessorClient | None = field(
         init=False,
         default=None,
@@ -285,7 +288,7 @@ def process_graph(
                 encoding="utf-8",
             )
 
-            if self.runtime == _RUNTIME_PERSISTENT:
+            if self.runtime == PostprocessorRuntime.PERSISTENT:
                 self._run_persistent(
                     input_graph_path=input_graph_path,
                     output_graph_path=output_graph_path,
@@ -419,20 +422,21 @@ def _as_positive_int(value: Any, default: int) -> int:
 
 
 def _build_handler(
-    spec: PostprocessorSpec, root_dir: Path, runtime: str
+    spec: PostprocessorSpec, root_dir: Path, runtime: PostprocessorRuntime
 ) -> GraphPostprocessor:
-    if runtime == _RUNTIME_INPROCESS:
+    if runtime == PostprocessorRuntime.INPROCESS:
         return InProcessPostprocessor(class_path=spec.class_path)
     return SubprocessPostprocessor(spec=spec, root_dir=root_dir, runtime=runtime)
 
 
-def _normalize_runtime(value: str | None) -> str:
-    runtime = (value or _RUNTIME_ONESHOT).strip().lower()
-    if runtime not in {_RUNTIME_ONESHOT, _RUNTIME_PERSISTENT, _RUNTIME_INPROCESS}:
+def _normalize_runtime(value: str | None) -> PostprocessorRuntime:
+    raw = (value or PostprocessorRuntime.ONESHOT.value).strip().lower()
+    try:
+        return PostprocessorRuntime(raw)
+    except ValueError:
         raise ValueError(
             "POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent, inprocess."
         )
-    return runtime
 
 
 def _load_manifest_specs(manifest_path: Path) -> list[PostprocessorSpec]:
@@ -520,7 +524,7 @@ def _build_runner_payload(context: PostprocessorContext) -> dict[str, Any]:
 def _load_from_specs(
     specs: list[PostprocessorSpec],
     root_dir: Path,
-    runtime: str,
+    runtime: PostprocessorRuntime,
 ) -> list[LoadedPostprocessor]:
     return [
         LoadedPostprocessor(

From d2494001bb00dc18d477f6e6c5194f9c772eaf10 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 12:15:08 +0100
Subject: [PATCH 30/63] fix: remove to_xhtml from public contract of
 RmlMappingService

---
 wordlift_sdk/kg_build/rml_mapping.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/wordlift_sdk/kg_build/rml_mapping.py b/wordlift_sdk/kg_build/rml_mapping.py
index b666ab6..316ff11 100644
--- a/wordlift_sdk/kg_build/rml_mapping.py
+++ b/wordlift_sdk/kg_build/rml_mapping.py
@@ -30,7 +30,7 @@ def __init__(self, context: Context) -> None:
         self._context = context
         self._html_converter = HtmlConverter()
 
-    def to_xhtml(self, html: str) -> str:
+    def _to_xhtml(self, html: str) -> str:
         return self._html_converter.convert(html)
 
     async def apply_mapping(
@@ -46,7 +46,7 @@ async def apply_mapping(
         queue_wait_ms = 0
         _t_start = time.perf_counter()
         try:
-            xhtml_str = xhtml or self.to_xhtml(html)
+            xhtml_str = xhtml or self._to_xhtml(html)
             if debug_output is not None:
                 debug_output["xhtml"] = xhtml_str
 
@@ -62,7 +62,11 @@ async def apply_mapping(
                             resolved_mapping_content = f.read()
                     except FileNotFoundError:
                         logger.error("Mapping file not found: %s", mapping_file_path)
-                        return MappingResult(graph=None, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000))
+                        return MappingResult(
+                            graph=None,
+                            queue_wait_ms=queue_wait_ms,
+                            mapping_ms=int((time.perf_counter() - _t_start) * 1000),
+                        )
 
                 dataset_uri = getattr(self._context.account, "dataset_uri", None)
                 if not dataset_uri:
@@ -106,7 +110,12 @@ async def apply_mapping(
                         "No triples generated from mapping %s.", mapping_file_path
                     )
 
-                return MappingResult(graph=graph, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000) - queue_wait_ms)
+                return MappingResult(
+                    graph=graph,
+                    queue_wait_ms=queue_wait_ms,
+                    mapping_ms=int((time.perf_counter() - _t_start) * 1000)
+                    - queue_wait_ms,
+                )
 
         except Exception as exc:
             logger.error(
@@ -115,7 +124,11 @@ async def apply_mapping(
                 exc,
                 exc_info=True,
             )
-            return MappingResult(graph=None, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000))
+            return MappingResult(
+                graph=None,
+                queue_wait_ms=queue_wait_ms,
+                mapping_ms=int((time.perf_counter() - _t_start) * 1000),
+            )
 
     def _normalize_schema_uris(self, payload: Any):
         if isinstance(payload, dict):

From 092050ebadcc8a50ce639695b870bae9dfaac65c Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 12:20:46 +0100
Subject: [PATCH 31/63] refactor: extract Closeable protocol and use isinstance
 check in close_loaded_postprocessors

---
 wordlift_sdk/kg_build/postprocessors.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py
index c73574a..5843143 100644
--- a/wordlift_sdk/kg_build/postprocessors.py
+++ b/wordlift_sdk/kg_build/postprocessors.py
@@ -43,6 +43,11 @@ class PostprocessorContext:
     ids: Any | None = None
 
 
+@runtime_checkable
+class Closeable(Protocol):
+    def close(self) -> None: ...
+
+
 @runtime_checkable
 class GraphPostprocessor(Protocol):
     def process_graph(
@@ -582,9 +587,8 @@ def load_postprocessors(
 
 def close_loaded_postprocessors(postprocessors: list[LoadedPostprocessor]) -> None:
     for processor in postprocessors:
-        close = getattr(processor.handler, "close", None)
-        if callable(close):
-            close()
+        if isinstance(processor.handler, Closeable):
+            processor.handler.close()
 
 
 def _write_graph_nquads(graph: Graph, path: Path) -> None:

From 6a4df850a8f49a5fcf97e96f8c0e4edf14594734 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 12:31:42 +0100
Subject: [PATCH 32/63] refactor: split SubprocessPostprocessor into Oneshot
 and Persistent variants

---
 wordlift_sdk/kg_build/postprocessors.py | 171 ++++++++++++++----------
 1 file changed, 101 insertions(+), 70 deletions(-)

diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py
index 5843143..7e16618 100644
--- a/wordlift_sdk/kg_build/postprocessors.py
+++ b/wordlift_sdk/kg_build/postprocessors.py
@@ -43,6 +43,17 @@ class PostprocessorContext:
     ids: Any | None = None
 
 
+class _SubprocessRunner(Protocol):
+    def __call__(
+        self,
+        *,
+        input_graph_path: Path,
+        output_graph_path: Path,
+        context_path: Path,
+        context_payload: dict[str, Any],
+    ) -> None: ...
+
+
 @runtime_checkable
 class Closeable(Protocol):
     def close(self) -> None: ...
@@ -260,83 +271,79 @@ def _terminate(self, process: subprocess.Popen[str]) -> None:
                 pass
 
 
-@dataclass
-class SubprocessPostprocessor:
+def _run_subprocess(
+    spec: PostprocessorSpec,
+    root_dir: Path,
+    graph: Graph,
+    payload: dict[str, Any],
+    runner: _SubprocessRunner,
+) -> Graph | None:
+    """Shared scaffolding for subprocess-based postprocessors.
+
+    Handles temp-dir lifecycle, graph serialization, output verification,
+    and debug-copy on failure. *runner* is called with the prepared paths
+    and is responsible only for the actual subprocess execution step.
+    """
+    temp_dir_path = Path(tempfile.mkdtemp(prefix="worai_pp_"))
+    failed = False
+    try:
+        input_graph_path = temp_dir_path / "input_graph.nq"
+        output_graph_path = temp_dir_path / "output_graph.nq"
+        context_path = temp_dir_path / "context.json"
+
+        _write_graph_nquads(graph, input_graph_path)
+        context_path.write_text(
+            json.dumps(payload, ensure_ascii=True, default=str),
+            encoding="utf-8",
+        )
+
+        runner(
+            input_graph_path=input_graph_path,
+            output_graph_path=output_graph_path,
+            context_path=context_path,
+            context_payload=payload,
+        )
+
+        if not output_graph_path.exists():
+            failed = True
+            raise RuntimeError(
+                f"Postprocessor did not produce output graph: {spec.class_path}"
+            )
+
+        return _read_graph_nquads(output_graph_path)
+    except Exception:
+        failed = True
+        raise
+    finally:
+        if failed and spec.keep_temp_on_error:
+            debug_dir = root_dir / "output" / "postprocessor_debug"
+            debug_dir.mkdir(parents=True, exist_ok=True)
+            target = debug_dir / (spec.class_path.replace(":", "_").replace(".", "_"))
+            if target.exists():
+                shutil.rmtree(target)
+            shutil.copytree(temp_dir_path, target)
+            _redact_debug_context(target / "context.json")
+        if temp_dir_path.exists():
+            shutil.rmtree(temp_dir_path, ignore_errors=True)
+
+
+@dataclass(frozen=True)
+class OneshotSubprocessPostprocessor:
     spec: PostprocessorSpec
     root_dir: Path
-    runtime: PostprocessorRuntime = PostprocessorRuntime.ONESHOT
-    _persistent_client: PersistentPostprocessorClient | None = field(
-        init=False,
-        default=None,
-        repr=False,
-    )
-
-    def close(self) -> None:
-        if self._persistent_client is not None:
-            self._persistent_client.close()
-            self._persistent_client = None
 
     def process_graph(
         self, graph: Graph, context: PostprocessorContext
     ) -> Graph | None:
-        payload = _build_runner_payload(context)
-        temp_dir_path = Path(tempfile.mkdtemp(prefix="worai_pp_"))
-        failed = False
-        try:
-            input_graph_path = temp_dir_path / "input_graph.nq"
-            output_graph_path = temp_dir_path / "output_graph.nq"
-            context_path = temp_dir_path / "context.json"
-
-            _write_graph_nquads(graph, input_graph_path)
-            context_path.write_text(
-                json.dumps(payload, ensure_ascii=True, default=str),
-                encoding="utf-8",
-            )
+        return _run_subprocess(self.spec, self.root_dir, graph, _build_runner_payload(context), self._run)
 
-            if self.runtime == PostprocessorRuntime.PERSISTENT:
-                self._run_persistent(
-                    input_graph_path=input_graph_path,
-                    output_graph_path=output_graph_path,
-                    context_payload=payload,
-                )
-            else:
-                self._run_oneshot(
-                    input_graph_path=input_graph_path,
-                    output_graph_path=output_graph_path,
-                    context_path=context_path,
-                )
-
-            if not output_graph_path.exists():
-                failed = True
-                raise RuntimeError(
-                    "Postprocessor did not produce output graph: "
-                    f"{self.spec.class_path}"
-                )
-
-            return _read_graph_nquads(output_graph_path)
-        except Exception:
-            failed = True
-            raise
-        finally:
-            if failed and self.spec.keep_temp_on_error:
-                debug_dir = self.root_dir / "output" / "postprocessor_debug"
-                debug_dir.mkdir(parents=True, exist_ok=True)
-                target = debug_dir / (
-                    self.spec.class_path.replace(":", "_").replace(".", "_")
-                )
-                if target.exists():
-                    shutil.rmtree(target)
-                shutil.copytree(temp_dir_path, target)
-                _redact_debug_context(target / "context.json")
-            if temp_dir_path.exists():
-                shutil.rmtree(temp_dir_path, ignore_errors=True)
-
-    def _run_oneshot(
+    def _run(
         self,
         *,
         input_graph_path: Path,
         output_graph_path: Path,
         context_path: Path,
+        **_: Any,
     ) -> None:
         cmd = [
             self.spec.python,
@@ -366,19 +373,41 @@ def _run_oneshot(
                 f"(exit={completed.returncode})" + (f"\n{stderr}" if stderr else "")
             )
 
-    def _run_persistent(
+
+@dataclass
+class PersistentSubprocessPostprocessor:
+    spec: PostprocessorSpec
+    root_dir: Path
+    _client: PersistentPostprocessorClient | None = field(
+        init=False,
+        default=None,
+        repr=False,
+    )
+
+    def close(self) -> None:
+        if self._client is not None:
+            self._client.close()
+            self._client = None
+
+    def process_graph(
+        self, graph: Graph, context: PostprocessorContext
+    ) -> Graph | None:
+        return _run_subprocess(self.spec, self.root_dir, graph, _build_runner_payload(context), self._run)
+
+    def _run(
         self,
         *,
         input_graph_path: Path,
         output_graph_path: Path,
         context_payload: dict[str, Any],
+        **_: Any,
     ) -> None:
-        if self._persistent_client is None:
-            self._persistent_client = PersistentPostprocessorClient(
+        if self._client is None:
+            self._client = PersistentPostprocessorClient(
                 spec=self.spec,
                 root_dir=self.root_dir,
             )
-        self._persistent_client.process_graph(
+        self._client.process_graph(
             input_graph_path=input_graph_path,
             output_graph_path=output_graph_path,
             context_payload=context_payload,
@@ -431,7 +460,9 @@ def _build_handler(
 ) -> GraphPostprocessor:
     if runtime == PostprocessorRuntime.INPROCESS:
         return InProcessPostprocessor(class_path=spec.class_path)
-    return SubprocessPostprocessor(spec=spec, root_dir=root_dir, runtime=runtime)
+    if runtime == PostprocessorRuntime.PERSISTENT:
+        return PersistentSubprocessPostprocessor(spec=spec, root_dir=root_dir)
+    return OneshotSubprocessPostprocessor(spec=spec, root_dir=root_dir)
 
 
 def _normalize_runtime(value: str | None) -> PostprocessorRuntime:

From 5ed1ba7d6ad092e393578c6577c46983180968e5 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 12:38:25 +0100
Subject: [PATCH 33/63] refactor: introduce PostprocessorResult and remove dead
 _apply_postprocessors

---
 wordlift_sdk/kg_build/postprocessors.py | 15 +++++++--
 wordlift_sdk/kg_build/protocol.py       | 43 +++++++++++--------------
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py
index 7e16618..d29b8f5 100644
--- a/wordlift_sdk/kg_build/postprocessors.py
+++ b/wordlift_sdk/kg_build/postprocessors.py
@@ -66,6 +66,13 @@ def process_graph(
     ) -> Graph | None: ...
 
 
+@dataclass(frozen=True)
+class PostprocessorResult:
+    graph: Graph
+    queue_wait_ms: int
+    postprocessors_ms: int
+
+
 @dataclass(frozen=True)
 class LoadedPostprocessor:
     name: str
@@ -335,7 +342,9 @@ class OneshotSubprocessPostprocessor:
     def process_graph(
         self, graph: Graph, context: PostprocessorContext
     ) -> Graph | None:
-        return _run_subprocess(self.spec, self.root_dir, graph, _build_runner_payload(context), self._run)
+        return _run_subprocess(
+            self.spec, self.root_dir, graph, _build_runner_payload(context), self._run
+        )
 
     def _run(
         self,
@@ -392,7 +401,9 @@ def close(self) -> None:
     def process_graph(
         self, graph: Graph, context: PostprocessorContext
     ) -> Graph | None:
-        return _run_subprocess(self.spec, self.root_dir, graph, _build_runner_payload(context), self._run)
+        return _run_subprocess(
+            self.spec, self.root_dir, graph, _build_runner_payload(context), self._run
+        )
 
     def _run(
         self,
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 47065f8..85fe5c2 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -33,6 +33,7 @@
 from .kpi import KgBuildKpiCollector
 from .postprocessors import (
     PostprocessorContext,
+    PostprocessorResult,
     close_loaded_postprocessors,
     load_postprocessors_for_profile,
 )
@@ -285,10 +286,9 @@ def _run_mapping() -> Graph | None:
         loop = asyncio.get_event_loop()
         _t1 = time.perf_counter()
         _postprocessors = await self._postprocessors_queue.get()
-        _t_queue_wait = int((time.perf_counter() - _t1) * 1000)
+        _queue_wait_ms = int((time.perf_counter() - _t1) * 1000)
         try:
-            _t2 = time.perf_counter()
-            graph = await loop.run_in_executor(
+            pp_result: PostprocessorResult = await loop.run_in_executor(
                 self._pp_executor,
                 functools.partial(
                     self._apply_postprocessors_with,
@@ -297,11 +297,12 @@ def _run_mapping() -> Graph | None:
                     response,
                     existing_web_page_id,
                     _postprocessors,
+                    _queue_wait_ms,
                 ),
             )
-            _t_postprocessors = int((time.perf_counter() - _t2) * 1000)
         finally:
             self._postprocessors_queue.put_nowait(_postprocessors)
+        graph = pp_result.graph
         # Canonical IDs must run after custom postprocessors so any nodes minted
         # by local logic are normalized before graph sync patching.
         graph = self._core_ids.process_graph(
@@ -359,8 +360,8 @@ def _run_mapping() -> Graph | None:
             url,
             _t_mapping_wait,
             _t_mapping,
-            _t_queue_wait,
-            _t_postprocessors,
+            pp_result.queue_wait_ms,
+            pp_result.postprocessors_ms,
             _t_validation_wait,
             _t_validation_actual,
         )
@@ -595,21 +596,6 @@ def _prepare_graph_for_put(self, graph: Graph) -> bool:
             and existing_hash == import_hash
         )
 
-    def _apply_postprocessors(
-        self,
-        graph: Graph,
-        url: str,
-        response: WebPageScrapeResponse,
-        existing_web_page_id: str | None,
-    ) -> Graph:
-        return self._apply_postprocessors_with(
-            graph,
-            url,
-            response,
-            existing_web_page_id,
-            list(self._postprocessors_queue._queue),  # type: ignore[attr-defined]
-        )
-
     def _apply_postprocessors_with(
         self,
         graph: Graph,
@@ -617,9 +603,13 @@ def _apply_postprocessors_with(
         response: WebPageScrapeResponse,
         existing_web_page_id: str | None,
         postprocessors: list,
-    ) -> Graph:
+        queue_wait_ms: int,
+    ) -> PostprocessorResult:
+        _t_start = time.perf_counter()
         if not postprocessors:
-            return graph
+            return PostprocessorResult(
+                graph=graph, queue_wait_ms=queue_wait_ms, postprocessors_ms=0
+            )
 
         pp_context = self._build_pp_context(url, response, existing_web_page_id)
         if not pp_context.account_key:
@@ -637,7 +627,11 @@ def _apply_postprocessors_with(
                 url,
                 int((time.perf_counter() - _tp) * 1000),
             )
-        return graph
+        return PostprocessorResult(
+            graph=graph,
+            queue_wait_ms=queue_wait_ms,
+            postprocessors_ms=int((time.perf_counter() - _t_start) * 1000),
+        )
 
     def _build_pp_context(
         self,
@@ -800,7 +794,6 @@ def _mapping_response(
             web_page=response.web_page,
         )
 
-
     def _emit_progress(self, payload: dict[str, Any]) -> None:
         if not callable(self._on_progress):
             return

From 310e5dcb09a5e4e4bcfc2df46b27cc75a0105a70 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 12:46:32 +0100
Subject: [PATCH 34/63] refactor: extract PostprocessorService from
 ProfileImportProtocol

---
 .../kg_build/postprocessor_service.py         | 195 ++++++++++++++++++
 wordlift_sdk/kg_build/protocol.py             | 164 ++-------------
 2 files changed, 212 insertions(+), 147 deletions(-)
 create mode 100644 wordlift_sdk/kg_build/postprocessor_service.py

diff --git a/wordlift_sdk/kg_build/postprocessor_service.py b/wordlift_sdk/kg_build/postprocessor_service.py
new file mode 100644
index 0000000..cd8c21d
--- /dev/null
+++ b/wordlift_sdk/kg_build/postprocessor_service.py
@@ -0,0 +1,195 @@
+from __future__ import annotations
+
+import asyncio
+import functools
+import logging
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any
+
+from rdflib import Graph
+from wordlift_client.models.web_page_scrape_response import WebPageScrapeResponse
+from wordlift_sdk.protocol import Context
+
+from .config import ProfileDefinition
+from .id_allocator import IdAllocator
+from .postprocessors import (
+    PostprocessorContext,
+    PostprocessorResult,
+    close_loaded_postprocessors,
+    load_postprocessors_for_profile,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _clean_key(value: Any) -> str | None:
+    if value is None:
+        return None
+    key = str(value).strip()
+    return key or None
+
+
+class PostprocessorService:
+    def __init__(
+        self,
+        *,
+        root_dir: Path,
+        profile: ProfileDefinition,
+        context: Context,
+        pool_size: int,
+        runtime: str,
+    ) -> None:
+        self._profile = profile
+        self._context = context
+        self._executor = ThreadPoolExecutor(
+            max_workers=pool_size, thread_name_prefix="worai_pp"
+        )
+        self._queue: asyncio.Queue = asyncio.Queue()
+        for _ in range(pool_size):
+            self._queue.put_nowait(
+                load_postprocessors_for_profile(
+                    root_dir=root_dir,
+                    profile_name=profile.name,
+                    runtime=runtime,
+                )
+            )
+        logger.info(
+            "Created postprocessor pool for profile '%s' (pool_size=%d runtime=%s)",
+            profile.name,
+            pool_size,
+            runtime,
+        )
+
+    async def apply(
+        self,
+        graph: Graph,
+        url: str,
+        response: WebPageScrapeResponse,
+        existing_web_page_id: str | None,
+        exports: dict[str, Any],
+    ) -> PostprocessorResult:
+        _t1 = time.perf_counter()
+        postprocessors = await self._queue.get()
+        queue_wait_ms = int((time.perf_counter() - _t1) * 1000)
+        loop = asyncio.get_event_loop()
+        try:
+            return await loop.run_in_executor(
+                self._executor,
+                functools.partial(
+                    self._run,
+                    graph,
+                    url,
+                    response,
+                    existing_web_page_id,
+                    postprocessors,
+                    queue_wait_ms,
+                    exports,
+                ),
+            )
+        finally:
+            self._queue.put_nowait(postprocessors)
+
+    def build_context(
+        self,
+        url: str,
+        response: WebPageScrapeResponse,
+        existing_web_page_id: str | None,
+        exports: dict[str, Any],
+    ) -> PostprocessorContext:
+        dataset_uri = str(getattr(self._context.account, "dataset_uri", "")).rstrip("/")
+        ids = IdAllocator(dataset_uri) if dataset_uri else None
+        profile_payload = asdict(self._profile)
+        profile_settings = dict(profile_payload.get("settings", {}) or {})
+        profile_settings.setdefault("api_url", "https://api.wordlift.io")
+        profile_payload["settings"] = profile_settings
+        return PostprocessorContext(
+            profile_name=self._profile.name,
+            profile=profile_payload,
+            url=url,
+            account=self._context.account,
+            account_key=self._resolve_account_key(),
+            exports=exports,
+            response=response,
+            existing_web_page_id=existing_web_page_id,
+            ids=ids,
+        )
+
+    def close(self) -> None:
+        while not self._queue.empty():
+            try:
+                close_loaded_postprocessors(self._queue.get_nowait())
+            except asyncio.QueueEmpty:
+                break
+        self._executor.shutdown(wait=False)
+
+    def _run(
+        self,
+        graph: Graph,
+        url: str,
+        response: WebPageScrapeResponse,
+        existing_web_page_id: str | None,
+        postprocessors: list,
+        queue_wait_ms: int,
+        exports: dict[str, Any],
+    ) -> PostprocessorResult:
+        _t_start = time.perf_counter()
+        if not postprocessors:
+            return PostprocessorResult(
+                graph=graph, queue_wait_ms=queue_wait_ms, postprocessors_ms=0
+            )
+
+        pp_context = self.build_context(url, response, existing_web_page_id, exports)
+        if not pp_context.account_key:
+            raise RuntimeError(
+                "Postprocessor runtime requires an API key. Configure one via profile "
+                "'api_key', WORDLIFT_KEY, or WORDLIFT_API_KEY."
+            )
+
+        for processor in postprocessors:
+            _tp = time.perf_counter()
+            graph = processor.run(graph, pp_context)
+            logger.info(
+                "Applied postprocessor '%s' for %s [%dms]",
+                processor.name,
+                url,
+                int((time.perf_counter() - _tp) * 1000),
+            )
+        return PostprocessorResult(
+            graph=graph,
+            queue_wait_ms=queue_wait_ms,
+            postprocessors_ms=int((time.perf_counter() - _t_start) * 1000),
+        )
+
+    def _resolve_account_key(self) -> str | None:
+        profile_key = _clean_key(self._profile.api_key)
+        if profile_key:
+            return profile_key
+
+        client_config = getattr(self._context, "client_configuration", None)
+        if client_config is not None:
+            api_key_map = getattr(client_config, "api_key", None)
+            if isinstance(api_key_map, dict):
+                runtime_key = _clean_key(api_key_map.get("ApiKey"))
+                if runtime_key:
+                    return runtime_key
+
+        provider = getattr(self._context, "configuration_provider", None)
+        if provider is not None:
+            for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"):
+                try:
+                    key = _clean_key(provider.get_value(name))
+                except Exception:
+                    key = None
+                if key:
+                    return key
+
+        for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"):
+            key = _clean_key(os.getenv(name))
+            if key:
+                return key
+
+        return None
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 85fe5c2..b998811 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -1,13 +1,10 @@
 from __future__ import annotations
 
 import asyncio
-import functools
 import hashlib
 import logging
 import os
-import time
 from concurrent.futures import ThreadPoolExecutor
-from dataclasses import asdict
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any
@@ -28,15 +25,9 @@
 
 from .config import ProfileDefinition
 from .entity_patcher import EntityPatcher
-from .id_allocator import IdAllocator
 from .id_postprocessor import CanonicalIdsPostprocessor
 from .kpi import KgBuildKpiCollector
-from .postprocessors import (
-    PostprocessorContext,
-    PostprocessorResult,
-    close_loaded_postprocessors,
-    load_postprocessors_for_profile,
-)
+from .postprocessor_service import PostprocessorService
 from .rml_mapping import MappingResult, RmlMappingService
 from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer
 from wordlift_sdk.structured_data.engine import init_morph_kgc_pool
@@ -111,13 +102,13 @@ def __init__(
             .lower()
         )
         self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy)
-        self._postprocessor_runtime = _resolve_postprocessor_runtime(
+        _postprocessor_runtime = _resolve_postprocessor_runtime(
             dict(self.profile.settings)
         )
         logger.info(
             "Resolved postprocessor runtime for profile '%s': %s (origin=%s)",
             self.profile.name,
-            self._postprocessor_runtime,
+            _postprocessor_runtime,
             self.profile.origins.get("postprocessor_runtime", "default"),
         )
         _pool_size = int(
@@ -137,8 +128,12 @@ def __init__(
             _pp_pool_size,
             _pool_size,
         )
-        self._pp_executor = ThreadPoolExecutor(
-            max_workers=_pp_pool_size, thread_name_prefix="worai_pp"
+        self._postprocessor_service = PostprocessorService(
+            root_dir=self.root_dir,
+            profile=self.profile,
+            context=context,
+            pool_size=_pp_pool_size,
+            runtime=_postprocessor_runtime,
         )
         _mapping_pool_size = int(
             self.profile.settings.get(
@@ -158,15 +153,6 @@ def __init__(
         self._mapping_executor = ThreadPoolExecutor(
             max_workers=_pool_size, thread_name_prefix="worai_ml"
         )
-        self._postprocessors_queue: asyncio.Queue = asyncio.Queue()
-        for _ in range(_pp_pool_size):
-            self._postprocessors_queue.put_nowait(
-                load_postprocessors_for_profile(
-                    root_dir=self.root_dir,
-                    profile_name=self.profile.name,
-                    runtime=self._postprocessor_runtime,
-                )
-            )
         shacl_mode = self._resolve_validation_mode(
             self.profile.settings.get(
                 "shacl_validate_mode",
@@ -283,30 +269,17 @@ def _run_mapping() -> Graph | None:
 
         if existing_web_page_id:
             self._reconcile_root_id(graph, existing_web_page_id)
-        loop = asyncio.get_event_loop()
-        _t1 = time.perf_counter()
-        _postprocessors = await self._postprocessors_queue.get()
-        _queue_wait_ms = int((time.perf_counter() - _t1) * 1000)
-        try:
-            pp_result: PostprocessorResult = await loop.run_in_executor(
-                self._pp_executor,
-                functools.partial(
-                    self._apply_postprocessors_with,
-                    graph,
-                    url,
-                    response,
-                    existing_web_page_id,
-                    _postprocessors,
-                    _queue_wait_ms,
-                ),
-            )
-        finally:
-            self._postprocessors_queue.put_nowait(_postprocessors)
+        pp_result = await self._postprocessor_service.apply(
+            graph, url, response, existing_web_page_id, self._template_exports or {}
+        )
         graph = pp_result.graph
         # Canonical IDs must run after custom postprocessors so any nodes minted
         # by local logic are normalized before graph sync patching.
         graph = self._core_ids.process_graph(
-            graph, self._build_pp_context(url, response, existing_web_page_id)
+            graph,
+            self._postprocessor_service.build_context(
+                url, response, existing_web_page_id, self._template_exports or {}
+            ),
         )
         self._set_source(graph, existing_web_page_id)
         self._set_existing_import_hash(graph, existing_import_hash)
@@ -367,12 +340,7 @@ def _run_mapping() -> Graph | None:
         )
 
     def close(self) -> None:
-        while not self._postprocessors_queue.empty():
-            try:
-                close_loaded_postprocessors(self._postprocessors_queue.get_nowait())
-            except asyncio.QueueEmpty:
-                break
-        self._pp_executor.shutdown(wait=False)
+        self._postprocessor_service.close()
         self._mapping_executor.shutdown(wait=False)
         self._shacl_validator.close()
 
@@ -596,104 +564,6 @@ def _prepare_graph_for_put(self, graph: Graph) -> bool:
             and existing_hash == import_hash
         )
 
-    def _apply_postprocessors_with(
-        self,
-        graph: Graph,
-        url: str,
-        response: WebPageScrapeResponse,
-        existing_web_page_id: str | None,
-        postprocessors: list,
-        queue_wait_ms: int,
-    ) -> PostprocessorResult:
-        _t_start = time.perf_counter()
-        if not postprocessors:
-            return PostprocessorResult(
-                graph=graph, queue_wait_ms=queue_wait_ms, postprocessors_ms=0
-            )
-
-        pp_context = self._build_pp_context(url, response, existing_web_page_id)
-        if not pp_context.account_key:
-            raise RuntimeError(
-                "Postprocessor runtime requires an API key. Configure one via profile "
-                "'api_key', WORDLIFT_KEY, or WORDLIFT_API_KEY."
-            )
-
-        for processor in postprocessors:
-            _tp = time.perf_counter()
-            graph = processor.run(graph, pp_context)
-            logger.info(
-                "Applied postprocessor '%s' for %s [%dms]",
-                processor.name,
-                url,
-                int((time.perf_counter() - _tp) * 1000),
-            )
-        return PostprocessorResult(
-            graph=graph,
-            queue_wait_ms=queue_wait_ms,
-            postprocessors_ms=int((time.perf_counter() - _t_start) * 1000),
-        )
-
-    def _build_pp_context(
-        self,
-        url: str,
-        response: WebPageScrapeResponse,
-        existing_web_page_id: str | None,
-    ) -> PostprocessorContext:
-        dataset_uri = str(getattr(self.context.account, "dataset_uri", "")).rstrip("/")
-        ids = IdAllocator(dataset_uri) if dataset_uri else None
-        profile_payload = asdict(self.profile)
-        profile_settings = dict(profile_payload.get("settings", {}) or {})
-        profile_settings.setdefault("api_url", "https://api.wordlift.io")
-        profile_payload["settings"] = profile_settings
-        return PostprocessorContext(
-            profile_name=self.profile.name,
-            profile=profile_payload,
-            url=url,
-            account=self.context.account,
-            account_key=self._resolve_postprocessor_account_key(),
-            exports=self._template_exports or {},
-            response=response,
-            existing_web_page_id=existing_web_page_id,
-            ids=ids,
-        )
-
-    def _resolve_postprocessor_account_key(self) -> str | None:
-        profile_key = self._clean_key(self.profile.api_key)
-        if profile_key:
-            return profile_key
-
-        client_config = getattr(self.context, "client_configuration", None)
-        if client_config is not None:
-            api_key_map = getattr(client_config, "api_key", None)
-            if isinstance(api_key_map, dict):
-                runtime_key = self._clean_key(api_key_map.get("ApiKey"))
-                if runtime_key:
-                    return runtime_key
-
-        provider = getattr(self.context, "configuration_provider", None)
-        if provider is not None:
-            for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"):
-                try:
-                    key = self._clean_key(provider.get_value(name))
-                except Exception:
-                    key = None
-                if key:
-                    return key
-
-        for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"):
-            key = self._clean_key(os.getenv(name))
-            if key:
-                return key
-
-        return None
-
-    @staticmethod
-    def _clean_key(value: Any) -> str | None:
-        if value is None:
-            return None
-        key = str(value).strip()
-        return key or None
-
     def _write_debug_graph(self, graph: Graph, url: str) -> None:
         assert self.debug_dir is not None
         self.debug_dir.mkdir(parents=True, exist_ok=True)

From e422d008b25a79ad4c505fffea84dcb6c2ee391e Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 12:49:47 +0100
Subject: [PATCH 35/63] fix+refactor: inject
 MaterializationPipeline/HtmlConverter and fix TLS attribute name

---
 wordlift_sdk/kg_build/rml_mapping.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/wordlift_sdk/kg_build/rml_mapping.py b/wordlift_sdk/kg_build/rml_mapping.py
index 316ff11..6d9ef39 100644
--- a/wordlift_sdk/kg_build/rml_mapping.py
+++ b/wordlift_sdk/kg_build/rml_mapping.py
@@ -26,9 +26,15 @@ class MappingResult:
 
 
 class RmlMappingService:
-    def __init__(self, context: Context) -> None:
+    def __init__(
+        self,
+        context: Context,
+        pipeline: MaterializationPipeline | None = None,
+        html_converter: HtmlConverter | None = None,
+    ) -> None:
         self._context = context
-        self._html_converter = HtmlConverter()
+        self._pipeline = pipeline or MaterializationPipeline()
+        self._html_converter = html_converter or HtmlConverter()
 
     def _to_xhtml(self, html: str) -> str:
         return self._html_converter.convert(html)
@@ -72,22 +78,21 @@ async def apply_mapping(
                 if not dataset_uri:
                     raise RuntimeError("Dataset URI not available on context.account.")
 
-                pipeline = MaterializationPipeline()
-                normalized_yarrrml, mappings = pipeline.normalize(
+                normalized_yarrrml, mappings = self._pipeline.normalize(
                     resolved_mapping_content,
                     url,
                     Path(data_path),
                     response=response,
                 )
-                jsonld_raw = pipeline.materialize(
+                jsonld_raw = self._pipeline.materialize(
                     normalized_yarrrml,
                     Path(data_path),
                     Path(temp_dir),
                     url=url,
                     response=response,
                 )
-                queue_wait_ms = getattr(_morph_kgc_tls, "queue_wait_ms", 0)
-                jsonld_data = pipeline.postprocess(
+                queue_wait_ms = getattr(_morph_kgc_tls, "mapping_wait_ms", 0)
+                jsonld_data = self._pipeline.postprocess(
                     jsonld_raw,
                     mappings,
                     xhtml_str,

From 23a1a1478059d74e0e0bb4e6d69fcb434d81f046 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 14:01:08 +0100
Subject: [PATCH 36/63] refactor: wire _setting helper into __init__ to
 eliminate nested .get() chains

---
 wordlift_sdk/kg_build/protocol.py | 75 +++++++++++++++----------------
 1 file changed, 35 insertions(+), 40 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index b998811..9b117d8 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -41,11 +41,20 @@ def _path_contains_part(path: str, part: str) -> bool:
     return part in Path(path).parts
 
 
+def _setting(settings: dict, name: str, fallback: str, default: Any) -> Any:
+    """Read a profile setting by snake_case name, falling back to UPPER_CASE, then default."""
+    v = settings.get(name)
+    if v is None:
+        v = settings.get(fallback)
+    return default if v is None else v
+
+
 def _resolve_postprocessor_runtime(settings: dict[str, Any]) -> str:
-    value = settings.get("postprocessor_runtime")
-    if value is None:
-        value = settings.get("POSTPROCESSOR_RUNTIME")
-    return str(value or "persistent")
+    return str(
+        _setting(
+            settings, "postprocessor_runtime", "POSTPROCESSOR_RUNTIME", "persistent"
+        )
+    )
 
 
 class ProfileImportProtocol(WebPageImportProtocolInterface):
@@ -91,35 +100,32 @@ def __init__(
         self._mapping_cache: dict[Path, str] = {}
         self._static_templates_patched = False
         self._static_templates_lock = asyncio.Lock()
+
+        settings = dict(self.profile.settings)
         canonical_id_strategy = (
             str(
-                self.profile.settings.get(
-                    "canonical_id_strategy",
-                    self.profile.settings.get("CANONICAL_ID_STRATEGY", "legacy"),
+                _setting(
+                    settings, "canonical_id_strategy", "CANONICAL_ID_STRATEGY", "legacy"
                 )
             )
             .strip()
             .lower()
         )
         self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy)
-        _postprocessor_runtime = _resolve_postprocessor_runtime(
-            dict(self.profile.settings)
-        )
+        _postprocessor_runtime = _resolve_postprocessor_runtime(settings)
         logger.info(
             "Resolved postprocessor runtime for profile '%s': %s (origin=%s)",
             self.profile.name,
             _postprocessor_runtime,
             self.profile.origins.get("postprocessor_runtime", "default"),
         )
-        _pool_size = int(
-            self.profile.settings.get(
-                "concurrency", self.profile.settings.get("CONCURRENCY", 4)
-            )
-        )
+        _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4))
         _pp_pool_size = int(
-            self.profile.settings.get(
+            _setting(
+                settings,
                 "postprocessor_pool_size",
-                self.profile.settings.get("POSTPROCESSOR_POOL_SIZE", _pool_size),
+                "POSTPROCESSOR_POOL_SIZE",
+                _pool_size,
             )
         )
         logger.info(
@@ -136,9 +142,8 @@ def __init__(
             runtime=_postprocessor_runtime,
         )
         _mapping_pool_size = int(
-            self.profile.settings.get(
-                "mapping_pool_size",
-                self.profile.settings.get("MAPPING_POOL_SIZE", os.cpu_count() or 4),
+            _setting(
+                settings, "mapping_pool_size", "MAPPING_POOL_SIZE", os.cpu_count() or 4
             )
         )
         logger.info(
@@ -154,27 +159,21 @@ def __init__(
             max_workers=_pool_size, thread_name_prefix="worai_ml"
         )
         shacl_mode = self._resolve_validation_mode(
-            self.profile.settings.get(
-                "shacl_validate_mode",
-                self.profile.settings.get("SHACL_VALIDATE_MODE", "warn"),
-            )
+            _setting(settings, "shacl_validate_mode", "SHACL_VALIDATE_MODE", "warn")
         )
         shacl_builtin_shapes = self._resolve_list_setting(
-            self.profile.settings.get(
-                "shacl_builtin_shapes",
-                self.profile.settings.get("SHACL_BUILTIN_SHAPES"),
-            )
+            _setting(settings, "shacl_builtin_shapes", "SHACL_BUILTIN_SHAPES", None)
         )
         shacl_exclude_builtin_shapes = self._resolve_list_setting(
-            self.profile.settings.get(
+            _setting(
+                settings,
                 "shacl_exclude_builtin_shapes",
-                self.profile.settings.get("SHACL_EXCLUDE_BUILTIN_SHAPES"),
+                "SHACL_EXCLUDE_BUILTIN_SHAPES",
+                None,
             )
         )
         shacl_extra_shapes = self._resolve_list_setting(
-            self.profile.settings.get(
-                "shacl_extra_shapes", self.profile.settings.get("SHACL_EXTRA_SHAPES")
-            )
+            _setting(settings, "shacl_extra_shapes", "SHACL_EXTRA_SHAPES", None)
         )
         self._shacl_shape_specs = resolve_shape_specs(
             builtin_shapes=shacl_builtin_shapes or None,
@@ -182,9 +181,8 @@ def __init__(
             extra_shapes=shacl_extra_shapes or None,
         )
         _shacl_pool_size = int(
-            self.profile.settings.get(
-                "shacl_pool_size",
-                self.profile.settings.get("SHACL_POOL_SIZE", max(2, _pool_size // 2)),
+            _setting(
+                settings, "shacl_pool_size", "SHACL_POOL_SIZE", max(2, _pool_size // 2)
             )
         )
         self._shacl_validator = ShaclValidationService(
@@ -193,10 +191,7 @@ def __init__(
             pool_size=_shacl_pool_size,
         )
         self._import_hash_mode = self._resolve_import_hash_mode(
-            self.profile.settings.get(
-                "import_hash_mode",
-                self.profile.settings.get("IMPORT_HASH_MODE", "on"),
-            )
+            _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on")
         )
         self._kpi = KgBuildKpiCollector(
             dataset_uri=getattr(self.context.account, "dataset_uri", None),

From d68f883fb307040434c601f626c278a4f8433b13 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 14:03:44 +0100
Subject: [PATCH 37/63] refactor: extract
 _init_postprocessor_service/_mapping_executor/_shacl_validator from __init__

---
 wordlift_sdk/kg_build/protocol.py | 91 +++++++++++++++++--------------
 1 file changed, 51 insertions(+), 40 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 9b117d8..6f3e1c9 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -112,36 +112,61 @@ def __init__(
             .lower()
         )
         self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy)
-        _postprocessor_runtime = _resolve_postprocessor_runtime(settings)
+        _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4))
+        self._init_postprocessor_service(settings, context, _pool_size)
+        self._init_mapping_executor(settings, _pool_size)
+        self._init_shacl_validator(settings, _pool_size)
+        self._import_hash_mode = self._resolve_import_hash_mode(
+            _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on")
+        )
+        self._kpi = KgBuildKpiCollector(
+            dataset_uri=getattr(self.context.account, "dataset_uri", None),
+            validation_enabled=self._shacl_validator.mode != ValidationMode.OFF,
+        )
+        logger.debug(
+            "Resolved mappings for profile '%s': effective_dir=%s (origin=%s), routes=%s (origin=%s), overlay_dirs=%s",
+            self.profile.name,
+            self.mappings_dir,
+            self.profile.origins.get("mappings_dir", "default"),
+            len(self.profile.routes),
+            self.profile.origins.get("routes", "default"),
+            [str(p) for p in self._mapping_dirs],
+        )
+
+    def _init_postprocessor_service(
+        self, settings: dict, context: Context, pool_size: int
+    ) -> None:
+        runtime = _resolve_postprocessor_runtime(settings)
         logger.info(
             "Resolved postprocessor runtime for profile '%s': %s (origin=%s)",
             self.profile.name,
-            _postprocessor_runtime,
+            runtime,
             self.profile.origins.get("postprocessor_runtime", "default"),
         )
-        _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4))
-        _pp_pool_size = int(
+        pp_pool_size = int(
             _setting(
                 settings,
                 "postprocessor_pool_size",
                 "POSTPROCESSOR_POOL_SIZE",
-                _pool_size,
+                pool_size,
             )
         )
         logger.info(
             "Postprocessor pool size for profile '%s': %d (concurrency=%d)",
             self.profile.name,
-            _pp_pool_size,
-            _pool_size,
+            pp_pool_size,
+            pool_size,
         )
         self._postprocessor_service = PostprocessorService(
             root_dir=self.root_dir,
             profile=self.profile,
             context=context,
-            pool_size=_pp_pool_size,
-            runtime=_postprocessor_runtime,
+            pool_size=pp_pool_size,
+            runtime=runtime,
         )
-        _mapping_pool_size = int(
+
+    def _init_mapping_executor(self, settings: dict, pool_size: int) -> None:
+        mapping_pool_size = int(
             _setting(
                 settings, "mapping_pool_size", "MAPPING_POOL_SIZE", os.cpu_count() or 4
             )
@@ -149,22 +174,24 @@ def __init__(
         logger.info(
             "Mapping pool size for profile '%s': %d",
             self.profile.name,
-            _mapping_pool_size,
+            mapping_pool_size,
         )
-        init_morph_kgc_pool(_mapping_pool_size)
+        init_morph_kgc_pool(mapping_pool_size)
         # Wraps apply_mapping calls so they run in a thread rather than blocking
         # the asyncio event loop. The thread itself blocks on the morph_kgc
         # ProcessPoolExecutor slot, leaving the event loop free for I/O.
         self._mapping_executor = ThreadPoolExecutor(
-            max_workers=_pool_size, thread_name_prefix="worai_ml"
+            max_workers=pool_size, thread_name_prefix="worai_ml"
         )
-        shacl_mode = self._resolve_validation_mode(
+
+    def _init_shacl_validator(self, settings: dict, pool_size: int) -> None:
+        mode = self._resolve_validation_mode(
             _setting(settings, "shacl_validate_mode", "SHACL_VALIDATE_MODE", "warn")
         )
-        shacl_builtin_shapes = self._resolve_list_setting(
+        builtin_shapes = self._resolve_list_setting(
             _setting(settings, "shacl_builtin_shapes", "SHACL_BUILTIN_SHAPES", None)
         )
-        shacl_exclude_builtin_shapes = self._resolve_list_setting(
+        exclude_builtin_shapes = self._resolve_list_setting(
             _setting(
                 settings,
                 "shacl_exclude_builtin_shapes",
@@ -172,39 +199,23 @@ def __init__(
                 None,
             )
         )
-        shacl_extra_shapes = self._resolve_list_setting(
+        extra_shapes = self._resolve_list_setting(
             _setting(settings, "shacl_extra_shapes", "SHACL_EXTRA_SHAPES", None)
         )
         self._shacl_shape_specs = resolve_shape_specs(
-            builtin_shapes=shacl_builtin_shapes or None,
-            exclude_builtin_shapes=shacl_exclude_builtin_shapes or None,
-            extra_shapes=shacl_extra_shapes or None,
+            builtin_shapes=builtin_shapes or None,
+            exclude_builtin_shapes=exclude_builtin_shapes or None,
+            extra_shapes=extra_shapes or None,
         )
-        _shacl_pool_size = int(
+        shacl_pool_size = int(
             _setting(
-                settings, "shacl_pool_size", "SHACL_POOL_SIZE", max(2, _pool_size // 2)
+                settings, "shacl_pool_size", "SHACL_POOL_SIZE", max(2, pool_size // 2)
             )
         )
         self._shacl_validator = ShaclValidationService(
             shape_specs=self._shacl_shape_specs or None,
-            mode=shacl_mode,
-            pool_size=_shacl_pool_size,
-        )
-        self._import_hash_mode = self._resolve_import_hash_mode(
-            _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on")
-        )
-        self._kpi = KgBuildKpiCollector(
-            dataset_uri=getattr(self.context.account, "dataset_uri", None),
-            validation_enabled=self._shacl_validator.mode != ValidationMode.OFF,
-        )
-        logger.debug(
-            "Resolved mappings for profile '%s': effective_dir=%s (origin=%s), routes=%s (origin=%s), overlay_dirs=%s",
-            self.profile.name,
-            self.mappings_dir,
-            self.profile.origins.get("mappings_dir", "default"),
-            len(self.profile.routes),
-            self.profile.origins.get("routes", "default"),
-            [str(p) for p in self._mapping_dirs],
+            mode=mode,
+            pool_size=shacl_pool_size,
         )
 
     async def callback(

From ec977d20fa9ca7e26bc96296cf4729889f6a7ff1 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 14:08:40 +0100
Subject: [PATCH 38/63] refactor: split callback into _run_mapping_stage and
 _run_postprocessing_stage

---
 wordlift_sdk/kg_build/protocol.py | 124 +++++++++++++++++-------------
 1 file changed, 69 insertions(+), 55 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 6f3e1c9..8c03701 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -229,66 +229,26 @@ async def callback(
             if hasattr(response, "web_page") and response.web_page
             else "Unknown URL"
         )
-
         if hasattr(response, "errors") and response.errors:
             logger.error("Cloud callback error for %s: %s", url, response.errors)
             return
-
         if not response.web_page or not response.web_page.html:
             logger.warning("No HTML content for %s, skipping mapping", url)
             return
 
         await self._patch_static_templates_once()
 
-        mapping_path = self._resolve_mapping_path(url)
-        rendered_mapping = self._get_mapping_content(mapping_path)
-        mapping_response = self._mapping_response(response, existing_web_page_id)
         debug_output: dict[str, str] | None = {} if self.debug_dir else None
-
-        # apply_mapping has no awaits — all work is synchronous (morph_kgc).
-        # Run it in a thread so the event loop stays free for I/O while the
-        # thread waits for its morph_kgc subprocess slot to become available.
-        _timing: dict[str, int] = {}
-
-        def _run_mapping() -> Graph | None:
-            mapping: MappingResult = asyncio.run(
-                self.rml_service.apply_mapping(
-                    html=response.web_page.html,
-                    url=url,
-                    mapping_file_path=mapping_path,
-                    mapping_content=rendered_mapping,
-                    response=mapping_response,
-                    debug_output=debug_output,
-                )
-            )
-            _timing["mapping_wait_ms"] = mapping.queue_wait_ms
-            _timing["mapping_ms"] = mapping.mapping_ms
-            return mapping.graph
-
-        _loop = asyncio.get_event_loop()
-        graph = await _loop.run_in_executor(self._mapping_executor, _run_mapping)
-        _t_mapping = _timing.get("mapping_ms", 0)
-        _t_mapping_wait = _timing.get("mapping_wait_ms", 0)
-        if not graph or len(graph) == 0:
+        mapping = await self._run_mapping_stage(
+            response, url, existing_web_page_id, debug_output
+        )
+        if not mapping.graph or len(mapping.graph) == 0:
             logger.warning("No triples produced for %s", url)
             return
 
-        if existing_web_page_id:
-            self._reconcile_root_id(graph, existing_web_page_id)
-        pp_result = await self._postprocessor_service.apply(
-            graph, url, response, existing_web_page_id, self._template_exports or {}
+        graph, pp_result = await self._run_postprocessing_stage(
+            mapping.graph, url, response, existing_web_page_id, existing_import_hash
         )
-        graph = pp_result.graph
-        # Canonical IDs must run after custom postprocessors so any nodes minted
-        # by local logic are normalized before graph sync patching.
-        graph = self._core_ids.process_graph(
-            graph,
-            self._postprocessor_service.build_context(
-                url, response, existing_web_page_id, self._template_exports or {}
-            ),
-        )
-        self._set_source(graph, existing_web_page_id)
-        self._set_existing_import_hash(graph, existing_import_hash)
 
         if self.debug_dir:
             xhtml = (debug_output or {}).get("xhtml")
@@ -313,19 +273,16 @@ def _run_mapping() -> Graph | None:
                 warning_sources=outcome.warning_sources,
                 error_sources=outcome.error_sources,
             )
-        _t_validation_wait = outcome.queue_wait_ms if outcome else 0
-        _t_validation_actual = outcome.validation_ms if outcome else 0
-        graph_metrics = self._kpi.graph_metrics(graph)
+        self._kpi.record_graph(graph)
         self._emit_progress(
             {
                 "kind": "graph",
                 "profile": self.profile.name,
                 "url": url,
-                "graph": graph_metrics,
+                "graph": self._kpi.graph_metrics(graph),
                 "validation": outcome.to_dict() if outcome else None,
             }
         )
-        self._kpi.record_graph(graph)
         if (
             outcome is not None
             and self._shacl_validator.mode == ValidationMode.FAIL
@@ -337,12 +294,12 @@ def _run_mapping() -> Graph | None:
             "Wrote %s triples for %s [mapping_wait=%dms mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]",
             len(graph),
             url,
-            _t_mapping_wait,
-            _t_mapping,
+            mapping.queue_wait_ms,
+            mapping.mapping_ms,
             pp_result.queue_wait_ms,
             pp_result.postprocessors_ms,
-            _t_validation_wait,
-            _t_validation_actual,
+            outcome.queue_wait_ms if outcome else 0,
+            outcome.validation_ms if outcome else 0,
         )
 
     def close(self) -> None:
@@ -353,6 +310,63 @@ def close(self) -> None:
     def get_kpi_summary(self) -> dict[str, object]:
         return self._kpi.summary(self.profile.name)
 
+    async def _run_mapping_stage(
+        self,
+        response: WebPageScrapeResponse,
+        url: str,
+        existing_web_page_id: str | None,
+        debug_output: dict[str, str] | None,
+    ) -> MappingResult:
+        mapping_path = self._resolve_mapping_path(url)
+        rendered_mapping = self._get_mapping_content(mapping_path)
+        mapping_response = self._mapping_response(response, existing_web_page_id)
+
+        def _run() -> MappingResult:
+            # apply_mapping has no awaits — all work is synchronous (morph_kgc).
+            # Run in a thread so the event loop stays free for I/O while the
+            # thread waits for its morph_kgc subprocess slot.
+            return asyncio.run(
+                self.rml_service.apply_mapping(
+                    html=response.web_page.html,
+                    url=url,
+                    mapping_file_path=mapping_path,
+                    mapping_content=rendered_mapping,
+                    response=mapping_response,
+                    debug_output=debug_output,
+                )
+            )
+
+        return await asyncio.get_event_loop().run_in_executor(
+            self._mapping_executor, _run
+        )
+
+    async def _run_postprocessing_stage(
+        self,
+        graph: Graph,
+        url: str,
+        response: WebPageScrapeResponse,
+        existing_web_page_id: str | None,
+        existing_import_hash: str | None,
+    ) -> tuple[Graph, Any]:
+        if existing_web_page_id:
+            self._reconcile_root_id(graph, existing_web_page_id)
+        exports = self._template_exports or {}
+        pp_result = await self._postprocessor_service.apply(
+            graph, url, response, existing_web_page_id, exports
+        )
+        graph = pp_result.graph
+        # Canonical IDs must run after custom postprocessors so any nodes minted
+        # by local logic are normalised before graph sync patching.
+        graph = self._core_ids.process_graph(
+            graph,
+            self._postprocessor_service.build_context(
+                url, response, existing_web_page_id, exports
+            ),
+        )
+        self._set_source(graph, existing_web_page_id)
+        self._set_existing_import_hash(graph, existing_import_hash)
+        return graph, pp_result
+
     def _resolve_path(self, raw_path: str) -> Path:
         path = Path(raw_path)
         if path.is_absolute():

From 2f141e96a0dd8117f60d8fae8b1f8d06055efdc1 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 14:12:46 +0100
Subject: [PATCH 39/63] refactor: move canonical_id_strategy and _core_ids into
 _init_postprocessor_service

---
 wordlift_sdk/kg_build/protocol.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 8c03701..87bbc72 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -102,16 +102,6 @@ def __init__(
         self._static_templates_lock = asyncio.Lock()
 
         settings = dict(self.profile.settings)
-        canonical_id_strategy = (
-            str(
-                _setting(
-                    settings, "canonical_id_strategy", "CANONICAL_ID_STRATEGY", "legacy"
-                )
-            )
-            .strip()
-            .lower()
-        )
-        self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy)
         _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4))
         self._init_postprocessor_service(settings, context, _pool_size)
         self._init_mapping_executor(settings, _pool_size)
@@ -136,6 +126,16 @@ def __init__(
     def _init_postprocessor_service(
         self, settings: dict, context: Context, pool_size: int
     ) -> None:
+        canonical_id_strategy = (
+            str(
+                _setting(
+                    settings, "canonical_id_strategy", "CANONICAL_ID_STRATEGY", "legacy"
+                )
+            )
+            .strip()
+            .lower()
+        )
+        self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy)
         runtime = _resolve_postprocessor_runtime(settings)
         logger.info(
             "Resolved postprocessor runtime for profile '%s': %s (origin=%s)",

From a7dddad1bb7b4a032d992cbf1156823bef79d80f Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 14:15:46 +0100
Subject: [PATCH 40/63] refactor: move RmlMappingService construction into
 _init_mapping_service

---
 wordlift_sdk/kg_build/protocol.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 87bbc72..9b1e943 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -90,7 +90,6 @@ def __init__(
             self.profile.mapping_overlay_dirs or (self.profile.mappings_dir,)
         )
 
-        self.rml_service = RmlMappingService(context)
         self.patcher = EntityPatcher(context)
         self.template_reifier = JinjaRdfTemplateReifier(self._template_dirs)
         self.text_renderer = TemplateTextRenderer()
@@ -104,7 +103,7 @@ def __init__(
         settings = dict(self.profile.settings)
         _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4))
         self._init_postprocessor_service(settings, context, _pool_size)
-        self._init_mapping_executor(settings, _pool_size)
+        self._init_mapping_service(settings, context, _pool_size)
         self._init_shacl_validator(settings, _pool_size)
         self._import_hash_mode = self._resolve_import_hash_mode(
             _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on")
@@ -165,7 +164,10 @@ def _init_postprocessor_service(
             runtime=runtime,
         )
 
-    def _init_mapping_executor(self, settings: dict, pool_size: int) -> None:
+    def _init_mapping_service(
+        self, settings: dict, context: Context, pool_size: int
+    ) -> None:
+        self.rml_service = RmlMappingService(context)
         mapping_pool_size = int(
             _setting(
                 settings, "mapping_pool_size", "MAPPING_POOL_SIZE", os.cpu_count() or 4

From 1fd0273be3fb5714e3b9db4a846200dc1bfdbb81 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 14:17:45 +0100
Subject: [PATCH 41/63] refactor: extract _init_graph_writer, moving patcher
 and import_hash_mode together

---
 wordlift_sdk/kg_build/protocol.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 9b1e943..1ee81c1 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -90,7 +90,6 @@ def __init__(
             self.profile.mapping_overlay_dirs or (self.profile.mappings_dir,)
         )
 
-        self.patcher = EntityPatcher(context)
         self.template_reifier = JinjaRdfTemplateReifier(self._template_dirs)
         self.text_renderer = TemplateTextRenderer()
 
@@ -105,9 +104,7 @@ def __init__(
         self._init_postprocessor_service(settings, context, _pool_size)
         self._init_mapping_service(settings, context, _pool_size)
         self._init_shacl_validator(settings, _pool_size)
-        self._import_hash_mode = self._resolve_import_hash_mode(
-            _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on")
-        )
+        self._init_graph_writer(settings, context)
         self._kpi = KgBuildKpiCollector(
             dataset_uri=getattr(self.context.account, "dataset_uri", None),
             validation_enabled=self._shacl_validator.mode != ValidationMode.OFF,
@@ -220,6 +217,12 @@ def _init_shacl_validator(self, settings: dict, pool_size: int) -> None:
             pool_size=shacl_pool_size,
         )
 
+    def _init_graph_writer(self, settings: dict, context: Context) -> None:
+        self.patcher = EntityPatcher(context)
+        self._import_hash_mode = self._resolve_import_hash_mode(
+            _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on")
+        )
+
     async def callback(
         self,
         response: WebPageScrapeResponse,

From 31437f1088f5d93e29981ac45679a835ee56dfc4 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 14:29:09 +0100
Subject: [PATCH 42/63] refactor(kg_build): decompose
 ProfileImportProtocol.__init__ and reduce class surface

---
 wordlift_sdk/kg_build/protocol.py | 169 +++++++++++++++---------------
 1 file changed, 82 insertions(+), 87 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 1ee81c1..f642b7a 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -27,7 +27,7 @@
 from .entity_patcher import EntityPatcher
 from .id_postprocessor import CanonicalIdsPostprocessor
 from .kpi import KgBuildKpiCollector
-from .postprocessor_service import PostprocessorService
+from .postprocessor_service import PostprocessorService, PostprocessorResult
 from .rml_mapping import MappingResult, RmlMappingService
 from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer
 from wordlift_sdk.structured_data.engine import init_morph_kgc_pool
@@ -41,6 +41,59 @@ def _path_contains_part(path: str, part: str) -> bool:
     return part in Path(path).parts
 
 
+def _find_web_page_iri(graph: Graph) -> URIRef | None:
+    for subject in graph.subjects(RDF.type, URIRef("http://schema.org/WebPage")):
+        return subject
+    for subject in graph.subjects(RDF.type, URIRef("https://schema.org/WebPage")):
+        return subject
+    return None
+
+
+def _swap_iris(graph: Graph, old_iri: URIRef, new_iri: URIRef) -> None:
+    for subject, predicate, obj in list(graph.triples((old_iri, None, None))):
+        graph.remove((subject, predicate, obj))
+        graph.add((new_iri, predicate, obj))
+    for subject, predicate, obj in list(graph.triples((None, None, old_iri))):
+        graph.remove((subject, predicate, obj))
+        graph.add((subject, predicate, new_iri))
+
+
+def _resolve_list_setting(value: Any) -> list[str]:
+    if value is None:
+        return []
+    if isinstance(value, str):
+        return [part.strip() for part in value.split(",") if part.strip()]
+    if isinstance(value, (list, tuple)):
+        return [text for item in value if (text := str(item).strip())]
+    return [str(value).strip()] if str(value).strip() else []
+
+
+def _resolve_validation_mode(value: Any) -> ValidationMode:
+    if value is None:
+        return ValidationMode.WARN
+    mode = str(value).strip().lower()
+    if mode == "strict":
+        logger.warning(
+            "Deprecated SHACL validation mode 'strict' detected; using 'fail'."
+        )
+        return ValidationMode.FAIL
+    try:
+        return ValidationMode(mode)
+    except ValueError:
+        logger.warning("Unsupported SHACL validation mode '%s'; using 'warn'.", mode)
+        return ValidationMode.WARN
+
+
+def _resolve_import_hash_mode(value: Any) -> str:
+    if value is None:
+        return "on"
+    mode = str(value).strip().lower()
+    if mode in {"on", "write", "off"}:
+        return mode
+    logger.warning("Unsupported import hash mode '%s'; using 'on'.", mode)
+    return "on"
+
+
 def _setting(settings: dict, name: str, fallback: str, default: Any) -> Any:
     """Read a profile setting by snake_case name, falling back to UPPER_CASE, then default."""
     v = settings.get(name)
@@ -81,23 +134,6 @@ def __init__(
         self._graph_write_strategy = graph_write_strategy
 
         self.profile_dir = self.root_dir / "profiles" / self.profile.name
-        self.templates_dir = self._resolve_path(self.profile.templates_dir)
-        self.mappings_dir = self._resolve_path(self.profile.mappings_dir)
-        self._template_dirs = self._resolve_overlay_paths(
-            self.profile.template_overlay_dirs or (self.profile.templates_dir,)
-        )
-        self._mapping_dirs = self._resolve_overlay_paths(
-            self.profile.mapping_overlay_dirs or (self.profile.mappings_dir,)
-        )
-
-        self.template_reifier = JinjaRdfTemplateReifier(self._template_dirs)
-        self.text_renderer = TemplateTextRenderer()
-
-        self._template_graph: Graph | None = None
-        self._template_exports: dict[str, Any] | None = None
-        self._mapping_cache: dict[Path, str] = {}
-        self._static_templates_patched = False
-        self._static_templates_lock = asyncio.Lock()
 
         settings = dict(self.profile.settings)
         _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4))
@@ -164,6 +200,21 @@ def _init_postprocessor_service(
     def _init_mapping_service(
         self, settings: dict, context: Context, pool_size: int
     ) -> None:
+        self.templates_dir = self._resolve_path(self.profile.templates_dir)
+        self.mappings_dir = self._resolve_path(self.profile.mappings_dir)
+        self._template_dirs = self._resolve_overlay_paths(
+            self.profile.template_overlay_dirs or (self.profile.templates_dir,)
+        )
+        self._mapping_dirs = self._resolve_overlay_paths(
+            self.profile.mapping_overlay_dirs or (self.profile.mappings_dir,)
+        )
+        self.template_reifier = JinjaRdfTemplateReifier(self._template_dirs)
+        self.text_renderer = TemplateTextRenderer()
+        self._template_graph: Graph | None = None
+        self._template_exports: dict[str, Any] | None = None
+        self._mapping_cache: dict[Path, str] = {}
+        self._static_templates_patched = False
+        self._static_templates_lock = asyncio.Lock()
         self.rml_service = RmlMappingService(context)
         mapping_pool_size = int(
             _setting(
@@ -184,13 +235,13 @@ def _init_mapping_service(
         )
 
     def _init_shacl_validator(self, settings: dict, pool_size: int) -> None:
-        mode = self._resolve_validation_mode(
+        mode = _resolve_validation_mode(
             _setting(settings, "shacl_validate_mode", "SHACL_VALIDATE_MODE", "warn")
         )
-        builtin_shapes = self._resolve_list_setting(
+        builtin_shapes = _resolve_list_setting(
             _setting(settings, "shacl_builtin_shapes", "SHACL_BUILTIN_SHAPES", None)
         )
-        exclude_builtin_shapes = self._resolve_list_setting(
+        exclude_builtin_shapes = _resolve_list_setting(
             _setting(
                 settings,
                 "shacl_exclude_builtin_shapes",
@@ -198,10 +249,10 @@ def _init_shacl_validator(self, settings: dict, pool_size: int) -> None:
                 None,
             )
         )
-        extra_shapes = self._resolve_list_setting(
+        extra_shapes = _resolve_list_setting(
             _setting(settings, "shacl_extra_shapes", "SHACL_EXTRA_SHAPES", None)
         )
-        self._shacl_shape_specs = resolve_shape_specs(
+        shape_specs = resolve_shape_specs(
             builtin_shapes=builtin_shapes or None,
             exclude_builtin_shapes=exclude_builtin_shapes or None,
             extra_shapes=extra_shapes or None,
@@ -212,14 +263,14 @@ def _init_shacl_validator(self, settings: dict, pool_size: int) -> None:
             )
         )
         self._shacl_validator = ShaclValidationService(
-            shape_specs=self._shacl_shape_specs or None,
+            shape_specs=shape_specs or None,
             mode=mode,
             pool_size=shacl_pool_size,
         )
 
     def _init_graph_writer(self, settings: dict, context: Context) -> None:
         self.patcher = EntityPatcher(context)
-        self._import_hash_mode = self._resolve_import_hash_mode(
+        self._import_hash_mode = _resolve_import_hash_mode(
             _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on")
         )
 
@@ -352,7 +403,7 @@ async def _run_postprocessing_stage(
         response: WebPageScrapeResponse,
         existing_web_page_id: str | None,
         existing_import_hash: str | None,
-    ) -> tuple[Graph, Any]:
+    ) -> tuple[Graph, PostprocessorResult]:
         if existing_web_page_id:
             self._reconcile_root_id(graph, existing_web_page_id)
         exports = self._template_exports or {}
@@ -368,7 +419,7 @@ async def _run_postprocessing_stage(
                 url, response, existing_web_page_id, exports
             ),
         )
-        self._set_source(graph, existing_web_page_id)
+        self._set_source(graph)
         self._set_existing_import_hash(graph, existing_import_hash)
         return graph, pp_result
 
@@ -609,27 +660,11 @@ def _write_debug_source_documents(
             xhtml_file.write_text(xhtml, encoding="utf-8")
 
     def _reconcile_root_id(self, graph: Graph, root_id: str) -> None:
-        old_iri = self._find_web_page_iri(graph)
+        old_iri = _find_web_page_iri(graph)
         if old_iri and str(old_iri) != root_id:
-            self._swap_iris(graph, old_iri, URIRef(root_id))
-
-    def _find_web_page_iri(self, graph: Graph) -> URIRef | None:
-        for subject in graph.subjects(RDF.type, URIRef("http://schema.org/WebPage")):
-            return subject
-        for subject in graph.subjects(RDF.type, URIRef("https://schema.org/WebPage")):
-            return subject
-        return None
-
-    def _swap_iris(self, graph: Graph, old_iri: URIRef, new_iri: URIRef) -> None:
-        for subject, predicate, obj in list(graph.triples((old_iri, None, None))):
-            graph.remove((subject, predicate, obj))
-            graph.add((new_iri, predicate, obj))
-        for subject, predicate, obj in list(graph.triples((None, None, old_iri))):
-            graph.remove((subject, predicate, obj))
-            graph.add((subject, predicate, new_iri))
-
-    def _set_source(self, graph: Graph, existing_web_page_id: str | None) -> None:
-        del existing_web_page_id
+            _swap_iris(graph, old_iri, URIRef(root_id))
+
+    def _set_source(self, graph: Graph) -> None:
         for subject in self._first_level_subjects(graph):
             graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import")))
 
@@ -696,43 +731,3 @@ def _emit_progress(self, payload: dict[str, Any]) -> None:
             self._on_progress(payload)
         except Exception:
             logger.warning("Failed to emit kg_build progress payload.", exc_info=True)
-
-    def _resolve_list_setting(self, value: Any) -> list[str]:
-        if value is None:
-            return []
-        if isinstance(value, str):
-            return [part.strip() for part in value.split(",") if part.strip()]
-        if isinstance(value, (list, tuple)):
-            specs: list[str] = []
-            for item in value:
-                text = str(item).strip()
-                if text:
-                    specs.append(text)
-            return specs
-        return [str(value).strip()] if str(value).strip() else []
-
-    def _resolve_validation_mode(self, value: Any) -> ValidationMode:
-        if value is None:
-            return ValidationMode.WARN
-        mode = str(value).strip().lower()
-        if mode == "strict":
-            logger.warning(
-                "Deprecated SHACL validation mode 'strict' detected; using 'fail'."
-            )
-            return ValidationMode.FAIL
-        try:
-            return ValidationMode(mode)
-        except ValueError:
-            logger.warning(
-                "Unsupported SHACL validation mode '%s'; using 'warn'.", mode
-            )
-            return ValidationMode.WARN
-
-    def _resolve_import_hash_mode(self, value: Any) -> str:
-        if value is None:
-            return "on"
-        mode = str(value).strip().lower()
-        if mode in {"on", "write", "off"}:
-            return mode
-        logger.warning("Unsupported import hash mode '%s'; using 'on'.", mode)
-        return "on"

From f72e457204ccb68df33365aa46a8bdafbcb8c5a6 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 15:00:34 +0100
Subject: [PATCH 43/63] refactor: extract RootIdReconcilerPostprocessor from
 protocol._reconcile_root_id

---
 wordlift_sdk/kg_build/id_postprocessor.py | 38 ++++++++++++++++++++++-
 wordlift_sdk/kg_build/protocol.py         | 31 +++---------------
 2 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/wordlift_sdk/kg_build/id_postprocessor.py b/wordlift_sdk/kg_build/id_postprocessor.py
index 7660a13..ae4326e 100644
--- a/wordlift_sdk/kg_build/id_postprocessor.py
+++ b/wordlift_sdk/kg_build/id_postprocessor.py
@@ -1,11 +1,47 @@
 from __future__ import annotations
 
-from rdflib import Graph
+from rdflib import Graph, RDF, URIRef
 
 from .id_generator import CanonicalIdGenerator
 from .iri_lookup import IriLookup
 
 
+def _find_web_page_iri(graph: Graph) -> URIRef | None:
+    for subject in graph.subjects(RDF.type, URIRef("http://schema.org/WebPage")):
+        return subject
+    for subject in graph.subjects(RDF.type, URIRef("https://schema.org/WebPage")):
+        return subject
+    return None
+
+
+def _swap_iris(graph: Graph, old_iri: URIRef, new_iri: URIRef) -> None:
+    for subject, predicate, obj in list(graph.triples((old_iri, None, None))):
+        graph.remove((subject, predicate, obj))
+        graph.add((new_iri, predicate, obj))
+    for subject, predicate, obj in list(graph.triples((None, None, old_iri))):
+        graph.remove((subject, predicate, obj))
+        graph.add((subject, predicate, new_iri))
+
+
+class RootIdReconcilerPostprocessor:
+    """Rewrites the WebPage node IRI to match the existing web page ID.
+
+    When a page has been imported before, the mapping may generate a different
+    IRI than the one already stored. This postprocessor swaps all triples
+    referencing the old IRI to use the canonical one from the system.
+    Runs before custom postprocessors so they always see the correct subject.
+    """
+
+    def process_graph(self, graph: Graph, context) -> Graph:
+        root_id = getattr(context, "existing_web_page_id", None)
+        if not root_id:
+            return graph
+        old_iri = _find_web_page_iri(graph)
+        if old_iri and str(old_iri) != root_id:
+            _swap_iris(graph, old_iri, URIRef(root_id))
+        return graph
+
+
 class CanonicalIdsPostprocessor:
     """Postprocessor adapter that applies canonical ID generation to a graph."""
 
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index f642b7a..02e4e54 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -10,7 +10,7 @@
 from typing import Any
 
 from jinja2 import UndefinedError
-from rdflib import Graph, Literal, RDF, URIRef
+from rdflib import Graph, Literal, URIRef
 from wordlift_client.models.web_page_scrape_response import WebPageScrapeResponse
 from wordlift_sdk.protocol import Context
 from wordlift_sdk.protocol.web_page_import_protocol import (
@@ -25,7 +25,7 @@
 
 from .config import ProfileDefinition
 from .entity_patcher import EntityPatcher
-from .id_postprocessor import CanonicalIdsPostprocessor
+from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor
 from .kpi import KgBuildKpiCollector
 from .postprocessor_service import PostprocessorService, PostprocessorResult
 from .rml_mapping import MappingResult, RmlMappingService
@@ -41,23 +41,6 @@ def _path_contains_part(path: str, part: str) -> bool:
     return part in Path(path).parts
 
 
-def _find_web_page_iri(graph: Graph) -> URIRef | None:
-    for subject in graph.subjects(RDF.type, URIRef("http://schema.org/WebPage")):
-        return subject
-    for subject in graph.subjects(RDF.type, URIRef("https://schema.org/WebPage")):
-        return subject
-    return None
-
-
-def _swap_iris(graph: Graph, old_iri: URIRef, new_iri: URIRef) -> None:
-    for subject, predicate, obj in list(graph.triples((old_iri, None, None))):
-        graph.remove((subject, predicate, obj))
-        graph.add((new_iri, predicate, obj))
-    for subject, predicate, obj in list(graph.triples((None, None, old_iri))):
-        graph.remove((subject, predicate, obj))
-        graph.add((subject, predicate, new_iri))
-
-
 def _resolve_list_setting(value: Any) -> list[str]:
     if value is None:
         return []
@@ -404,8 +387,9 @@ async def _run_postprocessing_stage(
         existing_web_page_id: str | None,
         existing_import_hash: str | None,
     ) -> tuple[Graph, PostprocessorResult]:
-        if existing_web_page_id:
-            self._reconcile_root_id(graph, existing_web_page_id)
+        graph = RootIdReconcilerPostprocessor().process_graph(
+            graph, SimpleNamespace(existing_web_page_id=existing_web_page_id)
+        )
         exports = self._template_exports or {}
         pp_result = await self._postprocessor_service.apply(
             graph, url, response, existing_web_page_id, exports
@@ -659,11 +643,6 @@ def _write_debug_source_documents(
             xhtml_file = self.debug_dir / f"{safe_name}.xhtml"
             xhtml_file.write_text(xhtml, encoding="utf-8")
 
-    def _reconcile_root_id(self, graph: Graph, root_id: str) -> None:
-        old_iri = _find_web_page_iri(graph)
-        if old_iri and str(old_iri) != root_id:
-            _swap_iris(graph, old_iri, URIRef(root_id))
-
     def _set_source(self, graph: Graph) -> None:
         for subject in self._first_level_subjects(graph):
             graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import")))

From 9be8fd57297aa1070413cc3b42794952f59f0e73 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 15:04:04 +0100
Subject: [PATCH 44/63] refactor: extract ImportAnnotationPostprocessor

---
 wordlift_sdk/kg_build/graph_annotation.py | 60 ++++++++++++++++++++++
 wordlift_sdk/kg_build/protocol.py         | 62 ++++-------------------
 2 files changed, 70 insertions(+), 52 deletions(-)
 create mode 100644 wordlift_sdk/kg_build/graph_annotation.py

diff --git a/wordlift_sdk/kg_build/graph_annotation.py b/wordlift_sdk/kg_build/graph_annotation.py
new file mode 100644
index 0000000..731d57c
--- /dev/null
+++ b/wordlift_sdk/kg_build/graph_annotation.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from rdflib import Graph, Literal, URIRef
+
+SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source")
+SEOVOC_IMPORT_HASH = URIRef("https://w3id.org/seovoc/importHash")
+
+
+def _first_level_subjects(graph: Graph, dataset_uri: str) -> set[URIRef]:
+    subjects = {s for s in graph.subjects() if isinstance(s, URIRef)}
+    if dataset_uri:
+        first_level_by_id = {
+            s
+            for s in subjects
+            if str(s).startswith(f"{dataset_uri}/")
+            and len([p for p in str(s)[len(dataset_uri) + 1 :].split("/") if p]) == 2
+        }
+        if first_level_by_id:
+            return first_level_by_id
+
+    referenced = {
+        obj
+        for _, _, obj in graph.triples((None, None, None))
+        if isinstance(obj, URIRef) and obj in subjects
+    }
+    first_level = subjects - referenced
+    return first_level or subjects
+
+
+class ImportAnnotationPostprocessor:
+    """Stamps first-level graph subjects with web-page-import provenance metadata.
+
+    Sets seovoc:source to 'web-page-import' on every first-level subject, and
+    optionally propagates the existing import hash to all URIRef subjects when
+    import_hash_mode is not 'off'. Both are needed before graph persistence so
+    the KG can track provenance and skip unchanged imports.
+
+    Reads from context:
+      - account.dataset_uri  — for first-level subject resolution
+      - existing_import_hash — hash from a prior import of the same page
+      - import_hash_mode     — 'on' | 'write' | 'off'
+    """
+
+    def process_graph(self, graph: Graph, context) -> Graph:
+        dataset_uri = str(
+            getattr(getattr(context, "account", None), "dataset_uri", "") or ""
+        ).rstrip("/")
+        for subject in _first_level_subjects(graph, dataset_uri):
+            graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import")))
+
+        import_hash_mode = getattr(context, "import_hash_mode", "on")
+        if import_hash_mode == "off":
+            return graph
+        existing_import_hash = getattr(context, "existing_import_hash", None)
+        if not existing_import_hash:
+            return graph
+        # Snapshot distinct subjects first: graph.set() mutates the graph.
+        for subject in {s for s in graph.subjects() if isinstance(s, URIRef)}:
+            graph.set((subject, SEOVOC_IMPORT_HASH, Literal(existing_import_hash)))
+        return graph
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 02e4e54..b06844d 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -10,7 +10,7 @@
 from typing import Any
 
 from jinja2 import UndefinedError
-from rdflib import Graph, Literal, URIRef
+from rdflib import Graph, URIRef
 from wordlift_client.models.web_page_scrape_response import WebPageScrapeResponse
 from wordlift_sdk.protocol import Context
 from wordlift_sdk.protocol.web_page_import_protocol import (
@@ -25,6 +25,7 @@
 
 from .config import ProfileDefinition
 from .entity_patcher import EntityPatcher
+from .graph_annotation import ImportAnnotationPostprocessor
 from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor
 from .kpi import KgBuildKpiCollector
 from .postprocessor_service import PostprocessorService, PostprocessorResult
@@ -33,8 +34,6 @@
 from wordlift_sdk.structured_data.engine import init_morph_kgc_pool
 
 logger = logging.getLogger(__name__)
-SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source")
-SEOVOC_IMPORT_HASH = URIRef("https://w3id.org/seovoc/importHash")
 
 
 def _path_contains_part(path: str, part: str) -> bool:
@@ -403,8 +402,14 @@ async def _run_postprocessing_stage(
                 url, response, existing_web_page_id, exports
             ),
         )
-        self._set_source(graph)
-        self._set_existing_import_hash(graph, existing_import_hash)
+        graph = ImportAnnotationPostprocessor().process_graph(
+            graph,
+            SimpleNamespace(
+                account=self.context.account,
+                existing_import_hash=existing_import_hash,
+                import_hash_mode=self._import_hash_mode,
+            ),
+        )
         return graph, pp_result
 
     def _resolve_path(self, raw_path: str) -> Path:
@@ -643,53 +648,6 @@ def _write_debug_source_documents(
             xhtml_file = self.debug_dir / f"{safe_name}.xhtml"
             xhtml_file.write_text(xhtml, encoding="utf-8")
 
-    def _set_source(self, graph: Graph) -> None:
-        for subject in self._first_level_subjects(graph):
-            graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import")))
-
-    def _set_existing_import_hash(self, graph: Graph, import_hash: str | None) -> None:
-        if self._import_hash_mode == "off":
-            return
-        if not import_hash:
-            return
-        subjects = {
-            subject for subject in graph.subjects() if isinstance(subject, URIRef)
-        }
-        for subject in subjects:
-            graph.set((subject, SEOVOC_IMPORT_HASH, Literal(import_hash)))
-
-    def _first_level_subjects(self, graph: Graph) -> set[URIRef]:
-        subjects = {
-            subject for subject in graph.subjects() if isinstance(subject, URIRef)
-        }
-        dataset_uri = str(
-            getattr(self.context.account, "dataset_uri", "") or ""
-        ).rstrip("/")
-        if dataset_uri:
-            first_level_by_id = {
-                subject
-                for subject in subjects
-                if str(subject).startswith(f"{dataset_uri}/")
-                and len(
-                    [
-                        part
-                        for part in str(subject)[len(dataset_uri) + 1 :].split("/")
-                        if part
-                    ]
-                )
-                == 2
-            }
-            if first_level_by_id:
-                return first_level_by_id
-
-        referenced = {
-            obj
-            for _, _, obj in graph.triples((None, None, None))
-            if isinstance(obj, URIRef) and obj in subjects
-        }
-        first_level = subjects - referenced
-        return first_level or subjects
-
     def _mapping_response(
         self,
         response: WebPageScrapeResponse,

From bb020adcda7d380ba21f8d74fc8342cd19823585 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 15:21:47 +0100
Subject: [PATCH 45/63] refactor: make PostprocessorService profile-agnostic,
 unify postprocessor pipeline

---
 .../kg_build/postprocessor_service.py         | 137 +++---------------
 wordlift_sdk/kg_build/postprocessors.py       |   2 +
 wordlift_sdk/kg_build/protocol.py             | 128 ++++++++++++----
 3 files changed, 116 insertions(+), 151 deletions(-)

diff --git a/wordlift_sdk/kg_build/postprocessor_service.py b/wordlift_sdk/kg_build/postprocessor_service.py
index cd8c21d..c9d4d82 100644
--- a/wordlift_sdk/kg_build/postprocessor_service.py
+++ b/wordlift_sdk/kg_build/postprocessor_service.py
@@ -3,74 +3,48 @@
 import asyncio
 import functools
 import logging
-import os
 import time
 from concurrent.futures import ThreadPoolExecutor
-from dataclasses import asdict
-from pathlib import Path
-from typing import Any
+from collections.abc import Iterable
+from typing import Callable
 
 from rdflib import Graph
-from wordlift_client.models.web_page_scrape_response import WebPageScrapeResponse
-from wordlift_sdk.protocol import Context
 
-from .config import ProfileDefinition
-from .id_allocator import IdAllocator
 from .postprocessors import (
+    LoadedPostprocessor,
     PostprocessorContext,
     PostprocessorResult,
     close_loaded_postprocessors,
-    load_postprocessors_for_profile,
 )
 
 logger = logging.getLogger(__name__)
 
 
-def _clean_key(value: Any) -> str | None:
-    if value is None:
-        return None
-    key = str(value).strip()
-    return key or None
+class PostprocessorService:
+    """Executes an ordered list of postprocessors against a graph.
 
+    Completely agnostic to profiles and pipeline composition — callers are
+    responsible for assembling the postprocessor list and building the context.
+    """
 
-class PostprocessorService:
     def __init__(
         self,
         *,
-        root_dir: Path,
-        profile: ProfileDefinition,
-        context: Context,
+        postprocessors_factory: Callable[[], Iterable[LoadedPostprocessor]],
         pool_size: int,
-        runtime: str,
     ) -> None:
-        self._profile = profile
-        self._context = context
         self._executor = ThreadPoolExecutor(
             max_workers=pool_size, thread_name_prefix="worai_pp"
         )
         self._queue: asyncio.Queue = asyncio.Queue()
         for _ in range(pool_size):
-            self._queue.put_nowait(
-                load_postprocessors_for_profile(
-                    root_dir=root_dir,
-                    profile_name=profile.name,
-                    runtime=runtime,
-                )
-            )
-        logger.info(
-            "Created postprocessor pool for profile '%s' (pool_size=%d runtime=%s)",
-            profile.name,
-            pool_size,
-            runtime,
-        )
+            self._queue.put_nowait(postprocessors_factory())
+        logger.info("Created postprocessor pool (pool_size=%d)", pool_size)
 
     async def apply(
         self,
         graph: Graph,
-        url: str,
-        response: WebPageScrapeResponse,
-        existing_web_page_id: str | None,
-        exports: dict[str, Any],
+        context: PostprocessorContext,
     ) -> PostprocessorResult:
         _t1 = time.perf_counter()
         postprocessors = await self._queue.get()
@@ -80,44 +54,12 @@ async def apply(
             return await loop.run_in_executor(
                 self._executor,
                 functools.partial(
-                    self._run,
-                    graph,
-                    url,
-                    response,
-                    existing_web_page_id,
-                    postprocessors,
-                    queue_wait_ms,
-                    exports,
+                    self._run, graph, context, postprocessors, queue_wait_ms
                 ),
             )
         finally:
             self._queue.put_nowait(postprocessors)
 
-    def build_context(
-        self,
-        url: str,
-        response: WebPageScrapeResponse,
-        existing_web_page_id: str | None,
-        exports: dict[str, Any],
-    ) -> PostprocessorContext:
-        dataset_uri = str(getattr(self._context.account, "dataset_uri", "")).rstrip("/")
-        ids = IdAllocator(dataset_uri) if dataset_uri else None
-        profile_payload = asdict(self._profile)
-        profile_settings = dict(profile_payload.get("settings", {}) or {})
-        profile_settings.setdefault("api_url", "https://api.wordlift.io")
-        profile_payload["settings"] = profile_settings
-        return PostprocessorContext(
-            profile_name=self._profile.name,
-            profile=profile_payload,
-            url=url,
-            account=self._context.account,
-            account_key=self._resolve_account_key(),
-            exports=exports,
-            response=response,
-            existing_web_page_id=existing_web_page_id,
-            ids=ids,
-        )
-
     def close(self) -> None:
         while not self._queue.empty():
             try:
@@ -129,33 +71,18 @@ def close(self) -> None:
     def _run(
         self,
         graph: Graph,
-        url: str,
-        response: WebPageScrapeResponse,
-        existing_web_page_id: str | None,
-        postprocessors: list,
+        context: PostprocessorContext,
+        postprocessors: Iterable[LoadedPostprocessor],
         queue_wait_ms: int,
-        exports: dict[str, Any],
     ) -> PostprocessorResult:
         _t_start = time.perf_counter()
-        if not postprocessors:
-            return PostprocessorResult(
-                graph=graph, queue_wait_ms=queue_wait_ms, postprocessors_ms=0
-            )
-
-        pp_context = self.build_context(url, response, existing_web_page_id, exports)
-        if not pp_context.account_key:
-            raise RuntimeError(
-                "Postprocessor runtime requires an API key. Configure one via profile "
-                "'api_key', WORDLIFT_KEY, or WORDLIFT_API_KEY."
-            )
-
         for processor in postprocessors:
             _tp = time.perf_counter()
-            graph = processor.run(graph, pp_context)
+            graph = processor.run(graph, context)
             logger.info(
                 "Applied postprocessor '%s' for %s [%dms]",
                 processor.name,
-                url,
+                context.url,
                 int((time.perf_counter() - _tp) * 1000),
             )
         return PostprocessorResult(
@@ -163,33 +90,3 @@ def _run(
             queue_wait_ms=queue_wait_ms,
             postprocessors_ms=int((time.perf_counter() - _t_start) * 1000),
         )
-
-    def _resolve_account_key(self) -> str | None:
-        profile_key = _clean_key(self._profile.api_key)
-        if profile_key:
-            return profile_key
-
-        client_config = getattr(self._context, "client_configuration", None)
-        if client_config is not None:
-            api_key_map = getattr(client_config, "api_key", None)
-            if isinstance(api_key_map, dict):
-                runtime_key = _clean_key(api_key_map.get("ApiKey"))
-                if runtime_key:
-                    return runtime_key
-
-        provider = getattr(self._context, "configuration_provider", None)
-        if provider is not None:
-            for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"):
-                try:
-                    key = _clean_key(provider.get_value(name))
-                except Exception:
-                    key = None
-                if key:
-                    return key
-
-        for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"):
-            key = _clean_key(os.getenv(name))
-            if key:
-                return key
-
-        return None
diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py
index d29b8f5..0acd84a 100644
--- a/wordlift_sdk/kg_build/postprocessors.py
+++ b/wordlift_sdk/kg_build/postprocessors.py
@@ -40,6 +40,8 @@ class PostprocessorContext:
     exports: dict[str, Any]
     response: Any
     existing_web_page_id: str | None
+    existing_import_hash: str | None = None
+    import_hash_mode: str = "on"
     ids: Any | None = None
 
 
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index b06844d..0c91671 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -5,6 +5,7 @@
 import logging
 import os
 from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any
@@ -26,9 +27,16 @@
 from .config import ProfileDefinition
 from .entity_patcher import EntityPatcher
 from .graph_annotation import ImportAnnotationPostprocessor
+from .id_allocator import IdAllocator
 from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor
 from .kpi import KgBuildKpiCollector
-from .postprocessor_service import PostprocessorService, PostprocessorResult
+from .postprocessor_service import PostprocessorService
+from .postprocessors import (
+    LoadedPostprocessor,
+    PostprocessorContext,
+    PostprocessorResult,
+    load_postprocessors_for_profile,
+)
 from .rml_mapping import MappingResult, RmlMappingService
 from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer
 from wordlift_sdk.structured_data.engine import init_morph_kgc_pool
@@ -40,6 +48,34 @@ def _path_contains_part(path: str, part: str) -> bool:
     return part in Path(path).parts
 
 
+def _clean_key(value: Any) -> str | None:
+    key = str(value).strip() if value is not None else ""
+    return key or None
+
+
+def _resolve_account_key(profile: Any, context: Any) -> str | None:
+    if key := _clean_key(getattr(profile, "api_key", None)):
+        return key
+    api_key_map = getattr(
+        getattr(context, "client_configuration", None), "api_key", None
+    )
+    if isinstance(api_key_map, dict):
+        if key := _clean_key(api_key_map.get("ApiKey")):
+            return key
+    provider = getattr(context, "configuration_provider", None)
+    if provider is not None:
+        for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"):
+            try:
+                if key := _clean_key(provider.get_value(name)):
+                    return key
+            except Exception:
+                pass
+    for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"):
+        if key := _clean_key(os.getenv(name)):
+            return key
+    return None
+
+
 def _resolve_list_setting(value: Any) -> list[str]:
     if value is None:
         return []
@@ -149,7 +185,7 @@ def _init_postprocessor_service(
             .strip()
             .lower()
         )
-        self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy)
+        core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy)
         runtime = _resolve_postprocessor_runtime(settings)
         logger.info(
             "Resolved postprocessor runtime for profile '%s': %s (origin=%s)",
@@ -171,12 +207,35 @@ def _init_postprocessor_service(
             pp_pool_size,
             pool_size,
         )
+        account_key = _resolve_account_key(self.profile, context)
+        root_dir = self.root_dir
+        profile = self.profile
+
+        def _postprocessors_factory() -> list[LoadedPostprocessor]:
+            leading = [
+                LoadedPostprocessor(
+                    name="root_id_reconciler",
+                    handler=RootIdReconcilerPostprocessor(),
+                )
+            ]
+            custom = load_postprocessors_for_profile(
+                root_dir=root_dir,
+                profile_name=profile.name,
+                runtime=runtime,
+            )
+            trailing = [
+                LoadedPostprocessor(name="canonical_ids", handler=core_ids),
+                LoadedPostprocessor(
+                    name="import_annotation",
+                    handler=ImportAnnotationPostprocessor(),
+                ),
+            ]
+            return leading + custom + trailing
+
+        self._account_key = account_key
         self._postprocessor_service = PostprocessorService(
-            root_dir=self.root_dir,
-            profile=self.profile,
-            context=context,
+            postprocessors_factory=_postprocessors_factory,
             pool_size=pp_pool_size,
-            runtime=runtime,
         )
 
     def _init_mapping_service(
@@ -386,31 +445,38 @@ async def _run_postprocessing_stage(
         existing_web_page_id: str | None,
         existing_import_hash: str | None,
     ) -> tuple[Graph, PostprocessorResult]:
-        graph = RootIdReconcilerPostprocessor().process_graph(
-            graph, SimpleNamespace(existing_web_page_id=existing_web_page_id)
-        )
-        exports = self._template_exports or {}
-        pp_result = await self._postprocessor_service.apply(
-            graph, url, response, existing_web_page_id, exports
-        )
-        graph = pp_result.graph
-        # Canonical IDs must run after custom postprocessors so any nodes minted
-        # by local logic are normalised before graph sync patching.
-        graph = self._core_ids.process_graph(
-            graph,
-            self._postprocessor_service.build_context(
-                url, response, existing_web_page_id, exports
-            ),
-        )
-        graph = ImportAnnotationPostprocessor().process_graph(
-            graph,
-            SimpleNamespace(
-                account=self.context.account,
-                existing_import_hash=existing_import_hash,
-                import_hash_mode=self._import_hash_mode,
-            ),
-        )
-        return graph, pp_result
+        context = self._build_pp_context(
+            url, response, existing_web_page_id, existing_import_hash
+        )
+        pp_result = await self._postprocessor_service.apply(graph, context)
+        return pp_result.graph, pp_result
+
+    def _build_pp_context(
+        self,
+        url: str,
+        response: WebPageScrapeResponse,
+        existing_web_page_id: str | None,
+        existing_import_hash: str | None,
+    ) -> PostprocessorContext:
+        dataset_uri = str(getattr(self.context.account, "dataset_uri", "")).rstrip("/")
+        ids = IdAllocator(dataset_uri) if dataset_uri else None
+        profile_payload = asdict(self.profile)
+        profile_settings = dict(profile_payload.get("settings", {}) or {})
+        profile_settings.setdefault("api_url", "https://api.wordlift.io")
+        profile_payload["settings"] = profile_settings
+        return PostprocessorContext(
+            profile_name=self.profile.name,
+            profile=profile_payload,
+            url=url,
+            account=self.context.account,
+            account_key=self._account_key,
+            exports=self._template_exports or {},
+            response=response,
+            existing_web_page_id=existing_web_page_id,
+            existing_import_hash=existing_import_hash,
+            import_hash_mode=self._import_hash_mode,
+            ids=ids,
+        )
 
     def _resolve_path(self, raw_path: str) -> Path:
         path = Path(raw_path)

From d60a654e665fe0fa9649a2c6a79c482cd4b90f56 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 15:34:44 +0100
Subject: [PATCH 46/63] refactor: extract first_level_subjects into graph_utils
 helper

---
 wordlift_sdk/kg_build/graph_annotation.py | 25 +++---------------
 wordlift_sdk/kg_build/graph_utils.py      | 31 +++++++++++++++++++++++
 wordlift_sdk/kg_build/protocol.py         | 11 ++++----
 3 files changed, 40 insertions(+), 27 deletions(-)
 create mode 100644 wordlift_sdk/kg_build/graph_utils.py

diff --git a/wordlift_sdk/kg_build/graph_annotation.py b/wordlift_sdk/kg_build/graph_annotation.py
index 731d57c..281cee0 100644
--- a/wordlift_sdk/kg_build/graph_annotation.py
+++ b/wordlift_sdk/kg_build/graph_annotation.py
@@ -2,31 +2,12 @@
 
 from rdflib import Graph, Literal, URIRef
 
+from .graph_utils import first_level_subjects
+
 SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source")
 SEOVOC_IMPORT_HASH = URIRef("https://w3id.org/seovoc/importHash")
 
 
-def _first_level_subjects(graph: Graph, dataset_uri: str) -> set[URIRef]:
-    subjects = {s for s in graph.subjects() if isinstance(s, URIRef)}
-    if dataset_uri:
-        first_level_by_id = {
-            s
-            for s in subjects
-            if str(s).startswith(f"{dataset_uri}/")
-            and len([p for p in str(s)[len(dataset_uri) + 1 :].split("/") if p]) == 2
-        }
-        if first_level_by_id:
-            return first_level_by_id
-
-    referenced = {
-        obj
-        for _, _, obj in graph.triples((None, None, None))
-        if isinstance(obj, URIRef) and obj in subjects
-    }
-    first_level = subjects - referenced
-    return first_level or subjects
-
-
 class ImportAnnotationPostprocessor:
     """Stamps first-level graph subjects with web-page-import provenance metadata.
 
@@ -45,7 +26,7 @@ def process_graph(self, graph: Graph, context) -> Graph:
         dataset_uri = str(
             getattr(getattr(context, "account", None), "dataset_uri", "") or ""
         ).rstrip("/")
-        for subject in _first_level_subjects(graph, dataset_uri):
+        for subject in first_level_subjects(graph, dataset_uri):
             graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import")))
 
         import_hash_mode = getattr(context, "import_hash_mode", "on")
diff --git a/wordlift_sdk/kg_build/graph_utils.py b/wordlift_sdk/kg_build/graph_utils.py
new file mode 100644
index 0000000..df35268
--- /dev/null
+++ b/wordlift_sdk/kg_build/graph_utils.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from rdflib import Graph, URIRef
+
+
+def first_level_subjects(graph: Graph, dataset_uri: str) -> set[URIRef]:
+    """Return the first-level URIRef subjects of *graph*.
+
+    When *dataset_uri* is set, first-level subjects are those whose IRI matches
+    ``<dataset_uri>/<type>/<id>`` (exactly two non-empty path segments after the
+    base URI).  Falls back to subjects that are not referenced as objects by any
+    other triple; if every subject is referenced, returns all subjects.
+    """
+    subjects = {s for s in graph.subjects() if isinstance(s, URIRef)}
+    if dataset_uri:
+        first_level_by_id = {
+            s
+            for s in subjects
+            if str(s).startswith(f"{dataset_uri}/")
+            and len([p for p in str(s)[len(dataset_uri) + 1 :].split("/") if p]) == 2
+        }
+        if first_level_by_id:
+            return first_level_by_id
+
+    referenced = {
+        obj
+        for _, _, obj in graph.triples((None, None, None))
+        if isinstance(obj, URIRef) and obj in subjects
+    }
+    first_level = subjects - referenced
+    return first_level or subjects
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 0c91671..7f49aad 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -27,6 +27,7 @@
 from .config import ProfileDefinition
 from .entity_patcher import EntityPatcher
 from .graph_annotation import ImportAnnotationPostprocessor
+from .graph_utils import first_level_subjects
 from .id_allocator import IdAllocator
 from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor
 from .kpi import KgBuildKpiCollector
@@ -670,23 +671,23 @@ def _prepare_graph_for_put(self, graph: Graph) -> bool:
         if not subjects:
             return False
 
-        first_level_subjects = {
+        page_subjects = {
             subject
-            for subject in self._first_level_subjects(graph)
+            for subject in first_level_subjects(graph, dataset_uri)
             if subject in subjects
         }
-        if not first_level_subjects:
+        if not page_subjects:
             return False
 
         if self._import_hash_mode == "off":
             return True
 
-        representative = next(iter(first_level_subjects))
+        representative = next(iter(page_subjects))
         existing_hash = self.patcher._existing_import_hash(representative, graph)
         import_hash = self.patcher._compute_import_hash(
             representative, graph, dataset_uri
         )
-        for subject in first_level_subjects:
+        for subject in page_subjects:
             self.patcher._set_import_hash(subject, graph, import_hash)
 
         return not (

From 37d928b5e3d0dff8082818718401cc5ee82d49fd Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 15:36:48 +0100
Subject: [PATCH 47/63] refactor: drop redundant tuple from
 _run_postprocessing_stage

---
 wordlift_sdk/kg_build/protocol.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 7f49aad..015a45d 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -344,7 +344,7 @@ async def callback(
             logger.warning("No triples produced for %s", url)
             return
 
-        graph, pp_result = await self._run_postprocessing_stage(
+        pp_result = await self._run_postprocessing_stage(
             mapping.graph, url, response, existing_web_page_id, existing_import_hash
         )
 
@@ -353,9 +353,11 @@ async def callback(
             self._write_debug_source_documents(
                 url=url, html=response.web_page.html, xhtml=xhtml
             )
-            self._write_debug_graph(graph, url)
+            self._write_debug_graph(pp_result.graph, url)
 
-        outcome: ValidationOutcome | None = await self._shacl_validator.validate(graph)
+        outcome: ValidationOutcome | None = await self._shacl_validator.validate(
+            pp_result.graph
+        )
         if outcome is not None:
             logger.info(
                 "SHACL validation for %s: pass=%s warnings=%d errors=%d",
@@ -371,13 +373,13 @@ async def callback(
                 warning_sources=outcome.warning_sources,
                 error_sources=outcome.error_sources,
             )
-        self._kpi.record_graph(graph)
+        self._kpi.record_graph(pp_result.graph)
         self._emit_progress(
             {
                 "kind": "graph",
                 "profile": self.profile.name,
                 "url": url,
-                "graph": self._kpi.graph_metrics(graph),
+                "graph": self._kpi.graph_metrics(pp_result.graph),
                 "validation": outcome.to_dict() if outcome else None,
             }
         )
@@ -387,10 +389,10 @@ async def callback(
             and outcome.failed
         ):
             raise RuntimeError(f"SHACL validation failed for {url} in fail mode.")
-        await self._write_graph(graph)
+        await self._write_graph(pp_result.graph)
         logger.info(
             "Wrote %s triples for %s [mapping_wait=%dms mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]",
-            len(graph),
+            len(pp_result.graph),
             url,
             mapping.queue_wait_ms,
             mapping.mapping_ms,
@@ -445,12 +447,11 @@ async def _run_postprocessing_stage(
         response: WebPageScrapeResponse,
         existing_web_page_id: str | None,
         existing_import_hash: str | None,
-    ) -> tuple[Graph, PostprocessorResult]:
+    ) -> PostprocessorResult:
         context = self._build_pp_context(
             url, response, existing_web_page_id, existing_import_hash
         )
-        pp_result = await self._postprocessor_service.apply(graph, context)
-        return pp_result.graph, pp_result
+        return await self._postprocessor_service.apply(graph, context)
 
     def _build_pp_context(
         self,

From bb3826d57251af9e9a85e14406b08ae01a9b3b75 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 15:38:50 +0100
Subject: [PATCH 48/63] refactor: extract _dataset_uri property and _url_hash
 helper

---
 wordlift_sdk/kg_build/protocol.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 015a45d..44f93ff 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -410,6 +410,14 @@ def close(self) -> None:
     def get_kpi_summary(self) -> dict[str, object]:
         return self._kpi.summary(self.profile.name)
 
+    @property
+    def _dataset_uri(self) -> str:
+        return str(getattr(self.context.account, "dataset_uri", "") or "").rstrip("/")
+
+    @staticmethod
+    def _url_hash(url: str) -> str:
+        return hashlib.sha256(url.encode("utf-8")).hexdigest()
+
     async def _run_mapping_stage(
         self,
         response: WebPageScrapeResponse,
@@ -460,7 +468,7 @@ def _build_pp_context(
         existing_web_page_id: str | None,
         existing_import_hash: str | None,
     ) -> PostprocessorContext:
-        dataset_uri = str(getattr(self.context.account, "dataset_uri", "")).rstrip("/")
+        dataset_uri = self._dataset_uri
         ids = IdAllocator(dataset_uri) if dataset_uri else None
         profile_payload = asdict(self.profile)
         profile_settings = dict(profile_payload.get("settings", {}) or {})
@@ -561,13 +569,13 @@ def _ensure_templates_loaded(self) -> None:
         if self._template_graph is not None and self._template_exports is not None:
             return
 
-        dataset_uri = getattr(self.context.account, "dataset_uri", None)
+        dataset_uri = self._dataset_uri
         if not dataset_uri:
             raise RuntimeError("Dataset URI not available on context.account.")
 
         base_context = {
             "account": self.context.account,
-            "dataset_uri": str(dataset_uri).rstrip("/"),
+            "dataset_uri": dataset_uri,
         }
         exports, exports_summary = self.text_renderer.load_exports_with_summary(
             self._template_dirs, base_context
@@ -633,7 +641,7 @@ def _get_mapping_content(self, mapping_path: Path) -> str:
         if cached is not None:
             return cached
 
-        dataset_uri = getattr(self.context.account, "dataset_uri", None)
+        dataset_uri = self._dataset_uri
         if not dataset_uri:
             raise RuntimeError("Dataset URI not available on context.account.")
 
@@ -641,7 +649,7 @@ def _get_mapping_content(self, mapping_path: Path) -> str:
 
         context = {
             "account": self.context.account,
-            "dataset_uri": str(dataset_uri).rstrip("/"),
+            "dataset_uri": dataset_uri,
             "exports": self._template_exports or {},
         }
         template_path = self.text_renderer.resolve_mapping_template(mapping_path)
@@ -658,9 +666,7 @@ async def _write_graph(self, graph: Graph) -> None:
         await self.patcher.patch_all(graph, import_hash_mode=self._import_hash_mode)
 
     def _prepare_graph_for_put(self, graph: Graph) -> bool:
-        dataset_uri = str(
-            getattr(self.context.account, "dataset_uri", "") or ""
-        ).rstrip("/")
+        dataset_uri = self._dataset_uri
         if not dataset_uri:
             return False
 
@@ -700,7 +706,7 @@ def _prepare_graph_for_put(self, graph: Graph) -> bool:
     def _write_debug_graph(self, graph: Graph, url: str) -> None:
         assert self.debug_dir is not None
         self.debug_dir.mkdir(parents=True, exist_ok=True)
-        safe_name = hashlib.sha256(url.encode("utf-8")).hexdigest()
+        safe_name = self._url_hash(url)
         debug_file = self.debug_dir / f"{safe_name}.ttl"
         graph.serialize(destination=debug_file, format="turtle")
 
@@ -709,7 +715,7 @@ def _write_debug_source_documents(
     ) -> None:
         assert self.debug_dir is not None
         self.debug_dir.mkdir(parents=True, exist_ok=True)
-        safe_name = hashlib.sha256(url.encode("utf-8")).hexdigest()
+        safe_name = self._url_hash(url)
         html_file = self.debug_dir / f"{safe_name}.html"
         html_file.write_text(html, encoding="utf-8")
         if xhtml:

From 5f62d496dbef2c553002ee321b37f583c0f57aa5 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 15:43:37 +0100
Subject: [PATCH 49/63] refactor: simplify record_validation to accept
 ValidationOutcome directly

---
 wordlift_sdk/kg_build/kpi.py      | 26 ++++++++++----------------
 wordlift_sdk/kg_build/protocol.py | 16 ++--------------
 2 files changed, 12 insertions(+), 30 deletions(-)

diff --git a/wordlift_sdk/kg_build/kpi.py b/wordlift_sdk/kg_build/kpi.py
index 5edea07..f6c0822 100644
--- a/wordlift_sdk/kg_build/kpi.py
+++ b/wordlift_sdk/kg_build/kpi.py
@@ -6,6 +6,8 @@
 
 from rdflib import Graph, RDF, URIRef
 
+from wordlift_sdk.validation.shacl_validation_service import ValidationOutcome
+
 
 @dataclass
 class KgBuildKpiCollector:
@@ -98,26 +100,18 @@ def record_graph(self, graph: Graph) -> None:
             self._property_assertions_total += 1
             self._properties_by_predicate[str(predicate)] += 1
 
-    def record_validation(
-        self,
-        *,
-        passed: bool,
-        warning_count: int,
-        error_count: int,
-        warning_sources: dict[str, int] | Counter[str] | None = None,
-        error_sources: dict[str, int] | Counter[str] | None = None,
-    ) -> None:
+    def record_validation(self, outcome: ValidationOutcome) -> None:
         self._validation_total += 1
-        if passed:
+        if outcome.passed:
             self._validation_pass += 1
         else:
             self._validation_fail += 1
-        self._warning_count += warning_count
-        self._error_count += error_count
-        if warning_sources:
-            self._warning_sources.update(warning_sources)
-        if error_sources:
-            self._error_sources.update(error_sources)
+        self._warning_count += outcome.warning_count
+        self._error_count += outcome.error_count
+        if outcome.warning_sources:
+            self._warning_sources.update(outcome.warning_sources)
+        if outcome.error_sources:
+            self._error_sources.update(outcome.error_sources)
 
     def summary(self, profile_name: str) -> dict[str, object]:
         entities_by_type = {
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 44f93ff..7cb025e 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -366,13 +366,7 @@ async def callback(
                 outcome.warning_count,
                 outcome.error_count,
             )
-            self._kpi.record_validation(
-                passed=outcome.passed,
-                warning_count=outcome.warning_count,
-                error_count=outcome.error_count,
-                warning_sources=outcome.warning_sources,
-                error_sources=outcome.error_sources,
-            )
+            self._kpi.record_validation(outcome)
         self._kpi.record_graph(pp_result.graph)
         self._emit_progress(
             {
@@ -528,13 +522,7 @@ async def _patch_static_templates_once(self) -> None:
                         outcome.warning_count,
                         outcome.error_count,
                     )
-                    self._kpi.record_validation(
-                        passed=outcome.passed,
-                        warning_count=outcome.warning_count,
-                        error_count=outcome.error_count,
-                        warning_sources=outcome.warning_sources,
-                        error_sources=outcome.error_sources,
-                    )
+                    self._kpi.record_validation(outcome)
                 self._emit_progress(
                     {
                         "kind": "static_templates",

From a5be4d9196f10ba7889f8c75531d189e9fe2e5e5 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 16:00:04 +0100
Subject: [PATCH 50/63] refactor: reorganise into postprocessors/ subpackage

---
 tests/kg_build/test_id_allocator.py                 |  7 +++++--
 tests/kg_build/test_id_postprocessor.py             |  4 +++-
 tests/kg_build/test_kg_build_id_generator.py        |  4 +++-
 tests/kg_build/test_postprocessors.py               |  4 ++--
 .../__init__.py}                                    |  4 ++--
 .../kg_build/postprocessors/processors/__init__.py  |  0
 .../processors}/graph_annotation.py                 |  2 +-
 .../{ => postprocessors/processors}/id_allocator.py |  2 +-
 .../{ => postprocessors/processors}/id_generator.py |  4 ++--
 .../processors}/id_postprocessor.py                 |  2 +-
 .../runner.py}                                      |  6 +++---
 .../service.py}                                     |  2 +-
 .../worker.py}                                      |  2 +-
 wordlift_sdk/kg_build/protocol.py                   | 13 ++++++++-----
 14 files changed, 33 insertions(+), 23 deletions(-)
 rename wordlift_sdk/kg_build/{postprocessors.py => postprocessors/__init__.py} (99%)
 create mode 100644 wordlift_sdk/kg_build/postprocessors/processors/__init__.py
 rename wordlift_sdk/kg_build/{ => postprocessors/processors}/graph_annotation.py (97%)
 rename wordlift_sdk/kg_build/{ => postprocessors/processors}/id_allocator.py (99%)
 rename wordlift_sdk/kg_build/{ => postprocessors/processors}/id_generator.py (99%)
 rename wordlift_sdk/kg_build/{ => postprocessors/processors}/id_postprocessor.py (98%)
 rename wordlift_sdk/kg_build/{postprocessor_runner.py => postprocessors/runner.py} (96%)
 rename wordlift_sdk/kg_build/{postprocessor_service.py => postprocessors/service.py} (98%)
 rename wordlift_sdk/kg_build/{postprocessor_worker.py => postprocessors/worker.py} (98%)

diff --git a/tests/kg_build/test_id_allocator.py b/tests/kg_build/test_id_allocator.py
index bbd77a6..c420626 100644
--- a/tests/kg_build/test_id_allocator.py
+++ b/tests/kg_build/test_id_allocator.py
@@ -2,8 +2,11 @@
 
 from rdflib import Graph, Literal, RDF, URIRef
 
-import wordlift_sdk.kg_build.id_allocator as id_allocator_module
-from wordlift_sdk.kg_build.id_allocator import IdAllocator, normalize_slug
+import wordlift_sdk.kg_build.postprocessors.processors.id_allocator as id_allocator_module
+from wordlift_sdk.kg_build.postprocessors.processors.id_allocator import (
+    IdAllocator,
+    normalize_slug,
+)
 
 
 def _graph(subject: URIRef) -> Graph:
diff --git a/tests/kg_build/test_id_postprocessor.py b/tests/kg_build/test_id_postprocessor.py
index 9f2a0c5..d5a94ee 100644
--- a/tests/kg_build/test_id_postprocessor.py
+++ b/tests/kg_build/test_id_postprocessor.py
@@ -4,7 +4,9 @@
 
 from rdflib import Graph, Literal, RDF, URIRef
 
-from wordlift_sdk.kg_build.id_postprocessor import CanonicalIdsPostprocessor
+from wordlift_sdk.kg_build.postprocessors.processors.id_postprocessor import (
+    CanonicalIdsPostprocessor,
+)
 
 
 def test_id_postprocessor_no_dataset_uri_returns_original_graph() -> None:
diff --git a/tests/kg_build/test_kg_build_id_generator.py b/tests/kg_build/test_kg_build_id_generator.py
index d59d394..b46acfa 100644
--- a/tests/kg_build/test_kg_build_id_generator.py
+++ b/tests/kg_build/test_kg_build_id_generator.py
@@ -3,7 +3,9 @@
 from rdflib import Graph, Literal, RDF, URIRef
 from rdflib.namespace import XSD
 
-from wordlift_sdk.kg_build.id_generator import CanonicalIdGenerator
+from wordlift_sdk.kg_build.postprocessors.processors.id_generator import (
+    CanonicalIdGenerator,
+)
 from wordlift_sdk.kg_build.iri_lookup import IriLookup
 from wordlift_sdk.kg_build.id_policy import DEFAULT_ID_POLICY, IdPolicy
 
diff --git a/tests/kg_build/test_postprocessors.py b/tests/kg_build/test_postprocessors.py
index 92afadc..a6f5aac 100644
--- a/tests/kg_build/test_postprocessors.py
+++ b/tests/kg_build/test_postprocessors.py
@@ -12,7 +12,7 @@
 import pytest
 from rdflib import Dataset, Graph, Literal, URIRef
 
-from wordlift_sdk.kg_build.postprocessor_runner import (
+from wordlift_sdk.kg_build.postprocessors.runner import (
     _build_context,
     _read_graph_nquads,
 )
@@ -471,7 +471,7 @@ def process_graph(self, graph, context):
         [
             sys.executable,
             "-m",
-            "wordlift_sdk.kg_build.postprocessor_runner",
+            "wordlift_sdk.kg_build.postprocessors.runner",
             "--class",
             "test_pp:AddRunnerTriple",
             "--input-graph",
diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors/__init__.py
similarity index 99%
rename from wordlift_sdk/kg_build/postprocessors.py
rename to wordlift_sdk/kg_build/postprocessors/__init__.py
index 0acd84a..8dfee63 100644
--- a/wordlift_sdk/kg_build/postprocessors.py
+++ b/wordlift_sdk/kg_build/postprocessors/__init__.py
@@ -197,7 +197,7 @@ def _ensure_started(self) -> subprocess.Popen[str]:
         cmd = [
             self._spec.python,
             "-m",
-            "wordlift_sdk.kg_build.postprocessor_worker",
+            "wordlift_sdk.kg_build.postprocessors.worker",
             "--class",
             self._spec.class_path,
         ]
@@ -359,7 +359,7 @@ def _run(
         cmd = [
             self.spec.python,
             "-m",
-            "wordlift_sdk.kg_build.postprocessor_runner",
+            "wordlift_sdk.kg_build.postprocessors.runner",
             "--class",
             self.spec.class_path,
             "--input-graph",
diff --git a/wordlift_sdk/kg_build/postprocessors/processors/__init__.py b/wordlift_sdk/kg_build/postprocessors/processors/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/wordlift_sdk/kg_build/graph_annotation.py b/wordlift_sdk/kg_build/postprocessors/processors/graph_annotation.py
similarity index 97%
rename from wordlift_sdk/kg_build/graph_annotation.py
rename to wordlift_sdk/kg_build/postprocessors/processors/graph_annotation.py
index 281cee0..5769615 100644
--- a/wordlift_sdk/kg_build/graph_annotation.py
+++ b/wordlift_sdk/kg_build/postprocessors/processors/graph_annotation.py
@@ -2,7 +2,7 @@
 
 from rdflib import Graph, Literal, URIRef
 
-from .graph_utils import first_level_subjects
+from ...graph_utils import first_level_subjects
 
 SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source")
 SEOVOC_IMPORT_HASH = URIRef("https://w3id.org/seovoc/importHash")
diff --git a/wordlift_sdk/kg_build/id_allocator.py b/wordlift_sdk/kg_build/postprocessors/processors/id_allocator.py
similarity index 99%
rename from wordlift_sdk/kg_build/id_allocator.py
rename to wordlift_sdk/kg_build/postprocessors/processors/id_allocator.py
index d597272..de4e6ce 100644
--- a/wordlift_sdk/kg_build/id_allocator.py
+++ b/wordlift_sdk/kg_build/postprocessors/processors/id_allocator.py
@@ -6,7 +6,7 @@
 
 from rdflib import Graph, Literal, RDF, URIRef
 
-from .id_policy import DEFAULT_ID_POLICY, IdPolicy
+from ...id_policy import DEFAULT_ID_POLICY, IdPolicy
 
 SCHEMA = "http://schema.org/"
 
diff --git a/wordlift_sdk/kg_build/id_generator.py b/wordlift_sdk/kg_build/postprocessors/processors/id_generator.py
similarity index 99%
rename from wordlift_sdk/kg_build/id_generator.py
rename to wordlift_sdk/kg_build/postprocessors/processors/id_generator.py
index 3741c6f..d063f59 100644
--- a/wordlift_sdk/kg_build/id_generator.py
+++ b/wordlift_sdk/kg_build/postprocessors/processors/id_generator.py
@@ -7,8 +7,8 @@
 
 from rdflib import Graph, Literal, RDF, URIRef
 
-from .id_policy import DEFAULT_ID_POLICY, IdPolicy
-from .iri_lookup import IriLookup
+from ...id_policy import DEFAULT_ID_POLICY, IdPolicy
+from ...iri_lookup import IriLookup
 
 SCHEMA = "http://schema.org/"
 
diff --git a/wordlift_sdk/kg_build/id_postprocessor.py b/wordlift_sdk/kg_build/postprocessors/processors/id_postprocessor.py
similarity index 98%
rename from wordlift_sdk/kg_build/id_postprocessor.py
rename to wordlift_sdk/kg_build/postprocessors/processors/id_postprocessor.py
index ae4326e..ae51a92 100644
--- a/wordlift_sdk/kg_build/id_postprocessor.py
+++ b/wordlift_sdk/kg_build/postprocessors/processors/id_postprocessor.py
@@ -3,7 +3,7 @@
 from rdflib import Graph, RDF, URIRef
 
 from .id_generator import CanonicalIdGenerator
-from .iri_lookup import IriLookup
+from ...iri_lookup import IriLookup
 
 
 def _find_web_page_iri(graph: Graph) -> URIRef | None:
diff --git a/wordlift_sdk/kg_build/postprocessor_runner.py b/wordlift_sdk/kg_build/postprocessors/runner.py
similarity index 96%
rename from wordlift_sdk/kg_build/postprocessor_runner.py
rename to wordlift_sdk/kg_build/postprocessors/runner.py
index f85fce6..7601a5d 100644
--- a/wordlift_sdk/kg_build/postprocessor_runner.py
+++ b/wordlift_sdk/kg_build/postprocessors/runner.py
@@ -10,8 +10,8 @@
 
 from rdflib import Dataset, Graph
 
-from .id_allocator import IdAllocator
-from .postprocessors import PostprocessorContext
+from . import PostprocessorContext
+from .processors.id_allocator import IdAllocator
 
 
 def _build_context(payload: dict[str, Any]) -> PostprocessorContext:
@@ -90,7 +90,7 @@ def main() -> None:
         output_graph = graph if result is None else result
         _write_graph_nquads(output_graph, Path(args.output_graph))
     except Exception as exc:  # pragma: no cover - process boundary
-        print(f"[postprocessor_runner] {exc}", file=sys.stderr)
+        print(f"[postprocessors.runner] {exc}", file=sys.stderr)
         raise SystemExit(1) from exc
 
 
diff --git a/wordlift_sdk/kg_build/postprocessor_service.py b/wordlift_sdk/kg_build/postprocessors/service.py
similarity index 98%
rename from wordlift_sdk/kg_build/postprocessor_service.py
rename to wordlift_sdk/kg_build/postprocessors/service.py
index c9d4d82..a2b266a 100644
--- a/wordlift_sdk/kg_build/postprocessor_service.py
+++ b/wordlift_sdk/kg_build/postprocessors/service.py
@@ -10,7 +10,7 @@
 
 from rdflib import Graph
 
-from .postprocessors import (
+from . import (
     LoadedPostprocessor,
     PostprocessorContext,
     PostprocessorResult,
diff --git a/wordlift_sdk/kg_build/postprocessor_worker.py b/wordlift_sdk/kg_build/postprocessors/worker.py
similarity index 98%
rename from wordlift_sdk/kg_build/postprocessor_worker.py
rename to wordlift_sdk/kg_build/postprocessors/worker.py
index 3a62fbb..a1dd25c 100644
--- a/wordlift_sdk/kg_build/postprocessor_worker.py
+++ b/wordlift_sdk/kg_build/postprocessors/worker.py
@@ -12,7 +12,7 @@
 
 from rdflib import Dataset, Graph
 
-from .postprocessor_runner import _build_context
+from .runner import _build_context
 
 
 def _load_class(class_path: str):
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 7cb025e..d6cb99b 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -26,18 +26,21 @@
 
 from .config import ProfileDefinition
 from .entity_patcher import EntityPatcher
-from .graph_annotation import ImportAnnotationPostprocessor
-from .graph_utils import first_level_subjects
-from .id_allocator import IdAllocator
-from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor
 from .kpi import KgBuildKpiCollector
-from .postprocessor_service import PostprocessorService
 from .postprocessors import (
     LoadedPostprocessor,
     PostprocessorContext,
     PostprocessorResult,
     load_postprocessors_for_profile,
 )
+from .postprocessors.processors.graph_annotation import ImportAnnotationPostprocessor
+from .graph_utils import first_level_subjects
+from .postprocessors.processors.id_allocator import IdAllocator
+from .postprocessors.processors.id_postprocessor import (
+    CanonicalIdsPostprocessor,
+    RootIdReconcilerPostprocessor,
+)
+from .postprocessors.service import PostprocessorService
 from .rml_mapping import MappingResult, RmlMappingService
 from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer
 from wordlift_sdk.structured_data.engine import init_morph_kgc_pool

From 9abfdbd909d1ed4158d33239f32ee1e72761def8 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 16:10:16 +0100
Subject: [PATCH 51/63] =?UTF-8?q?rename:=20runner.py=20=E2=86=92=20oneshot?=
 =?UTF-8?q?.py,=20worker.py=20=E2=86=92=20persistent.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
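Content-Transfer-Encoding: 8bit

Besides the two renames, this patch moves most of
postprocessors/__init__.py into the new graph_io.py, subprocess.py and
types.py modules; __init__.py now imports those names from them.

---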
 tests/kg_build/test_postprocessors.py         |   2 +-
 .../kg_build/postprocessors/__init__.py       | 591 ++----------------
 .../kg_build/postprocessors/graph_io.py       | 101 +++
 .../postprocessors/{runner.py => oneshot.py}  |   2 +-
 .../{worker.py => persistent.py}              |   2 +-
 .../kg_build/postprocessors/service.py        |   8 +-
 .../kg_build/postprocessors/subprocess.py     | 396 ++++++++++++
 wordlift_sdk/kg_build/postprocessors/types.py |  86 +++
 8 files changed, 625 insertions(+), 563 deletions(-)
 create mode 100644 wordlift_sdk/kg_build/postprocessors/graph_io.py
 rename wordlift_sdk/kg_build/postprocessors/{runner.py => oneshot.py} (98%)
 rename wordlift_sdk/kg_build/postprocessors/{worker.py => persistent.py} (99%)
 create mode 100644 wordlift_sdk/kg_build/postprocessors/subprocess.py
 create mode 100644 wordlift_sdk/kg_build/postprocessors/types.py

diff --git a/tests/kg_build/test_postprocessors.py b/tests/kg_build/test_postprocessors.py
index a6f5aac..6ce1284 100644
--- a/tests/kg_build/test_postprocessors.py
+++ b/tests/kg_build/test_postprocessors.py
@@ -12,7 +12,7 @@
 import pytest
 from rdflib import Dataset, Graph, Literal, URIRef
 
-from wordlift_sdk.kg_build.postprocessors.runner import (
+from wordlift_sdk.kg_build.postprocessors.oneshot import (
     _build_context,
     _read_graph_nquads,
 )
diff --git a/wordlift_sdk/kg_build/postprocessors/__init__.py b/wordlift_sdk/kg_build/postprocessors/__init__.py
index 8dfee63..b05f6f4 100644
--- a/wordlift_sdk/kg_build/postprocessors/__init__.py
+++ b/wordlift_sdk/kg_build/postprocessors/__init__.py
@@ -1,20 +1,24 @@
 from __future__ import annotations
 
-import asyncio
-import importlib
-import inspect
-import json
 import logging
-import select
-import shutil
-import subprocess
-import tempfile
-from dataclasses import dataclass, field
-from enum import Enum
 from pathlib import Path
-from typing import Any, Protocol, runtime_checkable
 
-from rdflib import Dataset, Graph
+from .graph_io import close_loaded_postprocessors
+from .subprocess import (
+    _build_handler,
+    _normalize_runtime,
+)
+from .types import (
+    Closeable,
+    GraphPostprocessor,
+    LoadedPostprocessor,
+    PostprocessorContext,
+    PostprocessorResult,
+    PostprocessorRuntime,
+    PostprocessorSpec,
+    PersistentWorkerJobError,
+    PersistentWorkerTransportError,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -24,427 +28,7 @@
     import tomli as tomllib
 
 
-class PostprocessorRuntime(str, Enum):
-    ONESHOT = "oneshot"
-    PERSISTENT = "persistent"
-    INPROCESS = "inprocess"
-
-
-@dataclass(frozen=True)
-class PostprocessorContext:
-    profile_name: str
-    profile: dict[str, Any]
-    url: str
-    account: Any
-    account_key: str | None
-    exports: dict[str, Any]
-    response: Any
-    existing_web_page_id: str | None
-    existing_import_hash: str | None = None
-    import_hash_mode: str = "on"
-    ids: Any | None = None
-
-
-class _SubprocessRunner(Protocol):
-    def __call__(
-        self,
-        *,
-        input_graph_path: Path,
-        output_graph_path: Path,
-        context_path: Path,
-        context_payload: dict[str, Any],
-    ) -> None: ...
-
-
-@runtime_checkable
-class Closeable(Protocol):
-    def close(self) -> None: ...
-
-
-@runtime_checkable
-class GraphPostprocessor(Protocol):
-    def process_graph(
-        self, graph: Graph, context: PostprocessorContext
-    ) -> Graph | None: ...
-
-
-@dataclass(frozen=True)
-class PostprocessorResult:
-    graph: Graph
-    queue_wait_ms: int
-    postprocessors_ms: int
-
-
-@dataclass(frozen=True)
-class LoadedPostprocessor:
-    name: str
-    handler: GraphPostprocessor
-
-    def run(self, graph: Graph, context: PostprocessorContext) -> Graph:
-        result = self.handler.process_graph(graph, context)
-        return graph if result is None else result
-
-
-@dataclass(frozen=True)
-class PostprocessorSpec:
-    class_path: str
-    python: str
-    timeout_seconds: int
-    enabled: bool
-    keep_temp_on_error: bool
-
-
-class PersistentWorkerTransportError(RuntimeError):
-    pass
-
-
-class PersistentWorkerJobError(RuntimeError):
-    pass
-
-
-class PersistentPostprocessorClient:
-    def __init__(self, *, spec: PostprocessorSpec, root_dir: Path) -> None:
-        self._spec = spec
-        self._root_dir = root_dir
-        self._process: subprocess.Popen[str] | None = None
-        self._next_job_id = 0
-
-    def close(self) -> None:
-        process = self._process
-        self._process = None
-        if process is None:
-            return
-
-        try:
-            if process.poll() is None and process.stdin is not None:
-                process.stdin.write(json.dumps({"op": "shutdown"}) + "\n")
-                process.stdin.flush()
-        except Exception:
-            pass
-
-        self._terminate(process)
-
-    def process_graph(
-        self,
-        *,
-        input_graph_path: Path,
-        output_graph_path: Path,
-        context_payload: dict[str, Any],
-    ) -> None:
-        for attempt in range(2):
-            try:
-                self._process_graph_once(
-                    input_graph_path=input_graph_path,
-                    output_graph_path=output_graph_path,
-                    context_payload=context_payload,
-                )
-                return
-            except PersistentWorkerTransportError:
-                self.close()
-                if attempt == 1:
-                    raise
-
-    def _process_graph_once(
-        self,
-        *,
-        input_graph_path: Path,
-        output_graph_path: Path,
-        context_payload: dict[str, Any],
-    ) -> None:
-        process = self._ensure_started()
-        self._next_job_id += 1
-        job_id = self._next_job_id
-
-        payload = {
-            "op": "process",
-            "id": job_id,
-            "input_graph": str(input_graph_path),
-            "output_graph": str(output_graph_path),
-            "context": context_payload,
-        }
-
-        try:
-            assert process.stdin is not None
-            process.stdin.write(
-                json.dumps(payload, ensure_ascii=True, default=str) + "\n"
-            )
-            process.stdin.flush()
-        except Exception as exc:
-            raise PersistentWorkerTransportError(
-                f"Postprocessor worker stdin failed: {self._spec.class_path}"
-            ) from exc
-
-        message = self._read_message(
-            process, timeout_seconds=self._spec.timeout_seconds
-        )
-        if message.get("id") != job_id:
-            raise PersistentWorkerTransportError(
-                f"Postprocessor worker returned invalid response id for {self._spec.class_path}."
-            )
-        if message.get("ok") is True:
-            return
-
-        error = str(message.get("error") or "unknown worker error")
-        raise PersistentWorkerJobError(
-            f"Postprocessor failed: {self._spec.class_path}\n{error}".strip()
-        )
-
-    def _ensure_started(self) -> subprocess.Popen[str]:
-        process = self._process
-        if process is not None and process.poll() is None:
-            return process
-
-        cmd = [
-            self._spec.python,
-            "-m",
-            "wordlift_sdk.kg_build.postprocessors.worker",
-            "--class",
-            self._spec.class_path,
-        ]
-        process = subprocess.Popen(
-            cmd,
-            text=True,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            cwd=str(self._root_dir),
-            bufsize=1,
-        )
-
-        try:
-            ready = self._read_message(
-                process, timeout_seconds=min(self._spec.timeout_seconds, 60)
-            )
-        except Exception:
-            self._terminate(process)
-            raise
-
-        if ready.get("op") != "ready" or ready.get("ok") is not True:
-            stderr = self._read_stderr(process)
-            self._terminate(process)
-            raise PersistentWorkerTransportError(
-                f"Postprocessor worker failed to start: {self._spec.class_path}"
-                + (f"\n{stderr}" if stderr else "")
-            )
-
-        self._process = process
-        return process
-
-    def _read_message(
-        self,
-        process: subprocess.Popen[str],
-        *,
-        timeout_seconds: int,
-    ) -> dict[str, Any]:
-        if process.stdout is None:
-            raise PersistentWorkerTransportError("Worker stdout is unavailable.")
-
-        ready, _, _ = select.select([process.stdout], [], [], timeout_seconds)
-        if not ready:
-            self._terminate(process)
-            cmd = (
-                process.args if isinstance(process.args, list) else [str(process.args)]
-            )
-            raise subprocess.TimeoutExpired(cmd=cmd, timeout=timeout_seconds)
-
-        line = process.stdout.readline()
-        if not line:
-            stderr = self._read_stderr(process)
-            self._terminate(process)
-            raise PersistentWorkerTransportError(
-                f"Postprocessor worker exited unexpectedly: {self._spec.class_path}"
-                + (f"\n{stderr}" if stderr else "")
-            )
-
-        try:
-            return json.loads(line)
-        except json.JSONDecodeError as exc:
-            raise PersistentWorkerTransportError(
-                "Postprocessor worker returned invalid JSON response."
-            ) from exc
-
-    def _read_stderr(self, process: subprocess.Popen[str]) -> str:
-        if process.stderr is None:
-            return ""
-        try:
-            return (process.stderr.read() or "").strip()
-        except Exception:
-            return ""
-
-    def _terminate(self, process: subprocess.Popen[str]) -> None:
-        if process.poll() is None:
-            process.kill()
-            try:
-                process.wait(timeout=5)
-            except subprocess.TimeoutExpired:
-                pass
-
-
-def _run_subprocess(
-    spec: PostprocessorSpec,
-    root_dir: Path,
-    graph: Graph,
-    payload: dict[str, Any],
-    runner: _SubprocessRunner,
-) -> Graph | None:
-    """Shared scaffolding for subprocess-based postprocessors.
-
-    Handles temp-dir lifecycle, graph serialization, output verification,
-    and debug-copy on failure. *runner* is called with the prepared paths
-    and is responsible only for the actual subprocess execution step.
-    """
-    temp_dir_path = Path(tempfile.mkdtemp(prefix="worai_pp_"))
-    failed = False
-    try:
-        input_graph_path = temp_dir_path / "input_graph.nq"
-        output_graph_path = temp_dir_path / "output_graph.nq"
-        context_path = temp_dir_path / "context.json"
-
-        _write_graph_nquads(graph, input_graph_path)
-        context_path.write_text(
-            json.dumps(payload, ensure_ascii=True, default=str),
-            encoding="utf-8",
-        )
-
-        runner(
-            input_graph_path=input_graph_path,
-            output_graph_path=output_graph_path,
-            context_path=context_path,
-            context_payload=payload,
-        )
-
-        if not output_graph_path.exists():
-            failed = True
-            raise RuntimeError(
-                f"Postprocessor did not produce output graph: {spec.class_path}"
-            )
-
-        return _read_graph_nquads(output_graph_path)
-    except Exception:
-        failed = True
-        raise
-    finally:
-        if failed and spec.keep_temp_on_error:
-            debug_dir = root_dir / "output" / "postprocessor_debug"
-            debug_dir.mkdir(parents=True, exist_ok=True)
-            target = debug_dir / (spec.class_path.replace(":", "_").replace(".", "_"))
-            if target.exists():
-                shutil.rmtree(target)
-            shutil.copytree(temp_dir_path, target)
-            _redact_debug_context(target / "context.json")
-        if temp_dir_path.exists():
-            shutil.rmtree(temp_dir_path, ignore_errors=True)
-
-
-@dataclass(frozen=True)
-class OneshotSubprocessPostprocessor:
-    spec: PostprocessorSpec
-    root_dir: Path
-
-    def process_graph(
-        self, graph: Graph, context: PostprocessorContext
-    ) -> Graph | None:
-        return _run_subprocess(
-            self.spec, self.root_dir, graph, _build_runner_payload(context), self._run
-        )
-
-    def _run(
-        self,
-        *,
-        input_graph_path: Path,
-        output_graph_path: Path,
-        context_path: Path,
-        **_: Any,
-    ) -> None:
-        cmd = [
-            self.spec.python,
-            "-m",
-            "wordlift_sdk.kg_build.postprocessors.runner",
-            "--class",
-            self.spec.class_path,
-            "--input-graph",
-            str(input_graph_path),
-            "--output-graph",
-            str(output_graph_path),
-            "--context",
-            str(context_path),
-        ]
-        completed = subprocess.run(
-            cmd,
-            text=True,
-            capture_output=True,
-            cwd=str(self.root_dir),
-            timeout=self.spec.timeout_seconds,
-            check=False,
-        )
-        if completed.returncode != 0:
-            stderr = (completed.stderr or "").strip()
-            raise RuntimeError(
-                f"Postprocessor failed: {self.spec.class_path} "
-                f"(exit={completed.returncode})" + (f"\n{stderr}" if stderr else "")
-            )
-
-
-@dataclass
-class PersistentSubprocessPostprocessor:
-    spec: PostprocessorSpec
-    root_dir: Path
-    _client: PersistentPostprocessorClient | None = field(
-        init=False,
-        default=None,
-        repr=False,
-    )
-
-    def close(self) -> None:
-        if self._client is not None:
-            self._client.close()
-            self._client = None
-
-    def process_graph(
-        self, graph: Graph, context: PostprocessorContext
-    ) -> Graph | None:
-        return _run_subprocess(
-            self.spec, self.root_dir, graph, _build_runner_payload(context), self._run
-        )
-
-    def _run(
-        self,
-        *,
-        input_graph_path: Path,
-        output_graph_path: Path,
-        context_payload: dict[str, Any],
-        **_: Any,
-    ) -> None:
-        if self._client is None:
-            self._client = PersistentPostprocessorClient(
-                spec=self.spec,
-                root_dir=self.root_dir,
-            )
-        self._client.process_graph(
-            input_graph_path=input_graph_path,
-            output_graph_path=output_graph_path,
-            context_payload=context_payload,
-        )
-
-
-@dataclass(frozen=True)
-class InProcessPostprocessor:
-    class_path: str
-
-    def process_graph(
-        self, graph: Graph, context: PostprocessorContext
-    ) -> Graph | None:
-        module_name, class_name = self.class_path.split(":", 1)
-        module = importlib.import_module(module_name)
-        klass = getattr(module, class_name)
-        processor = klass()
-        result = processor.process_graph(graph, context)
-        if inspect.isawaitable(result):
-            result = asyncio.run(result)
-        return result
-
-
-def _as_bool(value: Any, default: bool) -> bool:
+def _as_bool(value, default: bool) -> bool:
     if value is None:
         return default
     if isinstance(value, bool):
@@ -452,7 +36,7 @@ def _as_bool(value: Any, default: bool) -> bool:
     raise TypeError("Expected boolean value.")
 
 
-def _as_str(value: Any, default: str) -> str:
+def _as_str(value, default: str) -> str:
     if value is None:
         return default
     if not isinstance(value, str) or not value.strip():
@@ -460,7 +44,7 @@ def _as_str(value: Any, default: str) -> str:
     return value
 
 
-def _as_positive_int(value: Any, default: int) -> int:
+def _as_positive_int(value, default: int) -> int:
     if value is None:
         return default
     if not isinstance(value, int) or value <= 0:
@@ -468,26 +52,6 @@ def _as_positive_int(value: Any, default: int) -> int:
     return value
 
 
-def _build_handler(
-    spec: PostprocessorSpec, root_dir: Path, runtime: PostprocessorRuntime
-) -> GraphPostprocessor:
-    if runtime == PostprocessorRuntime.INPROCESS:
-        return InProcessPostprocessor(class_path=spec.class_path)
-    if runtime == PostprocessorRuntime.PERSISTENT:
-        return PersistentSubprocessPostprocessor(spec=spec, root_dir=root_dir)
-    return OneshotSubprocessPostprocessor(spec=spec, root_dir=root_dir)
-
-
-def _normalize_runtime(value: str | None) -> PostprocessorRuntime:
-    raw = (value or PostprocessorRuntime.ONESHOT.value).strip().lower()
-    try:
-        return PostprocessorRuntime(raw)
-    except ValueError:
-        raise ValueError(
-            "POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent, inprocess."
-        )
-
-
 def _load_manifest_specs(manifest_path: Path) -> list[PostprocessorSpec]:
     if not manifest_path.exists():
         return []
@@ -519,57 +83,16 @@ def _load_manifest_specs(manifest_path: Path) -> list[PostprocessorSpec]:
                 f"{manifest_path}: postprocessors[{index}].class must be "
                 "'package.module:ClassName'."
             )
-        spec = PostprocessorSpec(
+        specs.append(PostprocessorSpec(
             class_path=class_path.strip(),
             python=_as_str(row.get("python"), default_python),
-            timeout_seconds=_as_positive_int(
-                row.get("timeout_seconds"), default_timeout
-            ),
+            timeout_seconds=_as_positive_int(row.get("timeout_seconds"), default_timeout),
             enabled=_as_bool(row.get("enabled"), default_enabled),
-            keep_temp_on_error=_as_bool(
-                row.get("keep_temp_on_error"), default_keep_temp
-            ),
-        )
-        specs.append(spec)
+            keep_temp_on_error=_as_bool(row.get("keep_temp_on_error"), default_keep_temp),
+        ))
     return specs
 
 
-def _build_runner_payload(context: PostprocessorContext) -> dict[str, Any]:
-    account = getattr(context, "account", None)
-    dataset_uri = str(getattr(account, "dataset_uri", "")).rstrip("/")
-    country_code = str(getattr(account, "country_code", "")).strip().lower()
-    account_key = (
-        str(context.account_key).strip()
-        if getattr(context, "account_key", None) is not None
-        else ""
-    )
-    profile = dict(getattr(context, "profile", {}) or {})
-    if "settings" not in profile or not isinstance(profile.get("settings"), dict):
-        profile["settings"] = {}
-    profile_settings = dict(profile.get("settings", {}) or {})
-    profile_settings.setdefault("api_url", "https://api.wordlift.io")
-    profile["settings"] = profile_settings
-    response = getattr(context, "response", None)
-    web_page = getattr(response, "web_page", None) if response else None
-    return {
-        "profile_name": context.profile_name,
-        "profile": profile,
-        "url": context.url,
-        "dataset_uri": dataset_uri,
-        "country_code": country_code,
-        "account_key": account_key or None,
-        "exports": context.exports,
-        "existing_web_page_id": context.existing_web_page_id,
-        "response": {
-            "id": getattr(response, "id", None) or context.existing_web_page_id,
-            "web_page": {
-                "url": getattr(web_page, "url", None),
-                "html": getattr(web_page, "html", None),
-            },
-        },
-    }
-
-
 def _load_from_specs(
     specs: list[PostprocessorSpec],
     root_dir: Path,
@@ -629,57 +152,17 @@ def load_postprocessors(
     return loaded
 
 
-def close_loaded_postprocessors(postprocessors: list[LoadedPostprocessor]) -> None:
-    for processor in postprocessors:
-        if isinstance(processor.handler, Closeable):
-            processor.handler.close()
-
-
-def _write_graph_nquads(graph: Graph, path: Path) -> None:
-    dataset = Dataset()
-    for triple in graph:
-        dataset.add(triple)
-    dataset.serialize(destination=path, format="nquads")
-
-
-def _read_graph_nquads(path: Path) -> Graph:
-    dataset = Dataset()
-    dataset.parse(path, format="nquads")
-    graph = Graph()
-    for triple in dataset.triples((None, None, None)):
-        graph.add(triple)
-    return graph
-
-
-def _redact_debug_context(path: Path) -> None:
-    if not path.exists():
-        return
-    try:
-        payload = json.loads(path.read_text(encoding="utf-8"))
-    except Exception:
-        return
-    if not isinstance(payload, dict):
-        return
-    if payload.get("account_key"):
-        payload["account_key"] = "***REDACTED***"
-    profile = payload.get("profile")
-    if isinstance(profile, dict) and profile.get("api_key"):
-        profile["api_key"] = "***REDACTED***"
-    settings = (
-        profile.get("settings")
-        if isinstance(profile, dict) and isinstance(profile.get("settings"), dict)
-        else None
-    )
-    if settings and settings.get("api_key"):
-        settings["api_key"] = "***REDACTED***"
-    if settings and settings.get("wordlift_key"):
-        settings["wordlift_key"] = "***REDACTED***"
-    if settings and settings.get("WORDLIFT_KEY"):
-        settings["WORDLIFT_KEY"] = "***REDACTED***"
-    if settings and settings.get("WORDLIFT_API_KEY"):
-        settings["WORDLIFT_API_KEY"] = "***REDACTED***"
-    payload["profile"] = profile
-    path.write_text(
-        json.dumps(payload, ensure_ascii=True, default=str),
-        encoding="utf-8",
-    )
+__all__ = [
+    "Closeable",
+    "GraphPostprocessor",
+    "LoadedPostprocessor",
+    "PostprocessorContext",
+    "PostprocessorResult",
+    "PostprocessorRuntime",
+    "PostprocessorSpec",
+    "PersistentWorkerJobError",
+    "PersistentWorkerTransportError",
+    "close_loaded_postprocessors",
+    "load_postprocessors",
+    "load_postprocessors_for_profile",
+]
diff --git a/wordlift_sdk/kg_build/postprocessors/graph_io.py b/wordlift_sdk/kg_build/postprocessors/graph_io.py
new file mode 100644
index 0000000..866189c
--- /dev/null
+++ b/wordlift_sdk/kg_build/postprocessors/graph_io.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from rdflib import Dataset, Graph
+
+from .types import Closeable, LoadedPostprocessor, PostprocessorContext
+
+
+def _build_runner_payload(context: PostprocessorContext) -> dict[str, Any]:
+    account = getattr(context, "account", None)
+    dataset_uri = str(getattr(account, "dataset_uri", "")).rstrip("/")
+    country_code = str(getattr(account, "country_code", "")).strip().lower()
+    account_key = (
+        str(context.account_key).strip()
+        if getattr(context, "account_key", None) is not None
+        else ""
+    )
+    profile = dict(getattr(context, "profile", {}) or {})
+    if "settings" not in profile or not isinstance(profile.get("settings"), dict):
+        profile["settings"] = {}
+    profile_settings = dict(profile.get("settings", {}) or {})
+    profile_settings.setdefault("api_url", "https://api.wordlift.io")
+    profile["settings"] = profile_settings
+    response = getattr(context, "response", None)
+    web_page = getattr(response, "web_page", None) if response else None
+    return {
+        "profile_name": context.profile_name,
+        "profile": profile,
+        "url": context.url,
+        "dataset_uri": dataset_uri,
+        "country_code": country_code,
+        "account_key": account_key or None,
+        "exports": context.exports,
+        "existing_web_page_id": context.existing_web_page_id,
+        "response": {
+            "id": getattr(response, "id", None) or context.existing_web_page_id,
+            "web_page": {
+                "url": getattr(web_page, "url", None),
+                "html": getattr(web_page, "html", None),
+            },
+        },
+    }
+
+
+def close_loaded_postprocessors(postprocessors: list[LoadedPostprocessor]) -> None:
+    for processor in postprocessors:
+        if isinstance(processor.handler, Closeable):
+            processor.handler.close()
+
+
+def _write_graph_nquads(graph: Graph, path: Path) -> None:
+    dataset = Dataset()
+    for triple in graph:
+        dataset.add(triple)
+    dataset.serialize(destination=path, format="nquads")
+
+
+def _read_graph_nquads(path: Path) -> Graph:
+    dataset = Dataset()
+    dataset.parse(path, format="nquads")
+    graph = Graph()
+    for triple in dataset.triples((None, None, None)):
+        graph.add(triple)
+    return graph
+
+
+def _redact_debug_context(path: Path) -> None:
+    if not path.exists():
+        return
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except Exception:
+        return
+    if not isinstance(payload, dict):
+        return
+    if payload.get("account_key"):
+        payload["account_key"] = "***REDACTED***"
+    profile = payload.get("profile")
+    if isinstance(profile, dict) and profile.get("api_key"):
+        profile["api_key"] = "***REDACTED***"
+    settings = (
+        profile.get("settings")
+        if isinstance(profile, dict) and isinstance(profile.get("settings"), dict)
+        else None
+    )
+    if settings and settings.get("api_key"):
+        settings["api_key"] = "***REDACTED***"
+    if settings and settings.get("wordlift_key"):
+        settings["wordlift_key"] = "***REDACTED***"
+    if settings and settings.get("WORDLIFT_KEY"):
+        settings["WORDLIFT_KEY"] = "***REDACTED***"
+    if settings and settings.get("WORDLIFT_API_KEY"):
+        settings["WORDLIFT_API_KEY"] = "***REDACTED***"
+    payload["profile"] = profile
+    path.write_text(
+        json.dumps(payload, ensure_ascii=True, default=str),
+        encoding="utf-8",
+    )
diff --git a/wordlift_sdk/kg_build/postprocessors/runner.py b/wordlift_sdk/kg_build/postprocessors/oneshot.py
similarity index 98%
rename from wordlift_sdk/kg_build/postprocessors/runner.py
rename to wordlift_sdk/kg_build/postprocessors/oneshot.py
index 7601a5d..6b8ceda 100644
--- a/wordlift_sdk/kg_build/postprocessors/runner.py
+++ b/wordlift_sdk/kg_build/postprocessors/oneshot.py
@@ -10,7 +10,7 @@
 
 from rdflib import Dataset, Graph
 
-from . import PostprocessorContext
+from .types import PostprocessorContext
 from .processors.id_allocator import IdAllocator
 
 
diff --git a/wordlift_sdk/kg_build/postprocessors/worker.py b/wordlift_sdk/kg_build/postprocessors/persistent.py
similarity index 99%
rename from wordlift_sdk/kg_build/postprocessors/worker.py
rename to wordlift_sdk/kg_build/postprocessors/persistent.py
index a1dd25c..eb04efb 100644
--- a/wordlift_sdk/kg_build/postprocessors/worker.py
+++ b/wordlift_sdk/kg_build/postprocessors/persistent.py
@@ -12,7 +12,7 @@
 
 from rdflib import Dataset, Graph
 
-from .runner import _build_context
+from .oneshot import _build_context
 
 
 def _load_class(class_path: str):
diff --git a/wordlift_sdk/kg_build/postprocessors/service.py b/wordlift_sdk/kg_build/postprocessors/service.py
index a2b266a..f508bb6 100644
--- a/wordlift_sdk/kg_build/postprocessors/service.py
+++ b/wordlift_sdk/kg_build/postprocessors/service.py
@@ -10,12 +10,8 @@
 
 from rdflib import Graph
 
-from . import (
-    LoadedPostprocessor,
-    PostprocessorContext,
-    PostprocessorResult,
-    close_loaded_postprocessors,
-)
+from .graph_io import close_loaded_postprocessors
+from .types import LoadedPostprocessor, PostprocessorContext, PostprocessorResult
 
 logger = logging.getLogger(__name__)
 
diff --git a/wordlift_sdk/kg_build/postprocessors/subprocess.py b/wordlift_sdk/kg_build/postprocessors/subprocess.py
new file mode 100644
index 0000000..52b1dfe
--- /dev/null
+++ b/wordlift_sdk/kg_build/postprocessors/subprocess.py
@@ -0,0 +1,396 @@
+from __future__ import annotations
+
+import asyncio
+import importlib
+import inspect
+import json
+import logging
+import select
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from rdflib import Graph
+
+from .types import (
+    Closeable,
+    GraphPostprocessor,
+    LoadedPostprocessor,
+    PostprocessorContext,
+    PostprocessorRuntime,
+    PostprocessorSpec,
+    PersistentWorkerJobError,
+    PersistentWorkerTransportError,
+    _SubprocessRunner,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class PersistentPostprocessorClient:
+    def __init__(self, *, spec: PostprocessorSpec, root_dir: Path) -> None:
+        self._spec = spec
+        self._root_dir = root_dir
+        self._process: subprocess.Popen[str] | None = None
+        self._next_job_id = 0
+
+    def close(self) -> None:
+        process = self._process
+        self._process = None
+        if process is None:
+            return
+
+        try:
+            if process.poll() is None and process.stdin is not None:
+                process.stdin.write(json.dumps({"op": "shutdown"}) + "\n")
+                process.stdin.flush()
+        except Exception:
+            pass
+
+        self._terminate(process)
+
+    def process_graph(
+        self,
+        *,
+        input_graph_path: Path,
+        output_graph_path: Path,
+        context_payload: dict[str, Any],
+    ) -> None:
+        for attempt in range(2):
+            try:
+                self._process_graph_once(
+                    input_graph_path=input_graph_path,
+                    output_graph_path=output_graph_path,
+                    context_payload=context_payload,
+                )
+                return
+            except PersistentWorkerTransportError:
+                self.close()
+                if attempt == 1:
+                    raise
+
+    def _process_graph_once(
+        self,
+        *,
+        input_graph_path: Path,
+        output_graph_path: Path,
+        context_payload: dict[str, Any],
+    ) -> None:
+        process = self._ensure_started()
+        self._next_job_id += 1
+        job_id = self._next_job_id
+
+        payload = {
+            "op": "process",
+            "id": job_id,
+            "input_graph": str(input_graph_path),
+            "output_graph": str(output_graph_path),
+            "context": context_payload,
+        }
+
+        try:
+            assert process.stdin is not None
+            process.stdin.write(
+                json.dumps(payload, ensure_ascii=True, default=str) + "\n"
+            )
+            process.stdin.flush()
+        except Exception as exc:
+            raise PersistentWorkerTransportError(
+                f"Postprocessor worker stdin failed: {self._spec.class_path}"
+            ) from exc
+
+        message = self._read_message(
+            process, timeout_seconds=self._spec.timeout_seconds
+        )
+        if message.get("id") != job_id:
+            raise PersistentWorkerTransportError(
+                f"Postprocessor worker returned invalid response id for {self._spec.class_path}."
+            )
+        if message.get("ok") is True:
+            return
+
+        error = str(message.get("error") or "unknown worker error")
+        raise PersistentWorkerJobError(
+            f"Postprocessor failed: {self._spec.class_path}\n{error}".strip()
+        )
+
+    def _ensure_started(self) -> subprocess.Popen[str]:
+        process = self._process
+        if process is not None and process.poll() is None:
+            return process
+
+        cmd = [
+            self._spec.python,
+            "-m",
+            "wordlift_sdk.kg_build.postprocessors.persistent",
+            "--class",
+            self._spec.class_path,
+        ]
+        process = subprocess.Popen(
+            cmd,
+            text=True,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            cwd=str(self._root_dir),
+            bufsize=1,
+        )
+
+        try:
+            ready = self._read_message(
+                process, timeout_seconds=min(self._spec.timeout_seconds, 60)
+            )
+        except Exception:
+            self._terminate(process)
+            raise
+
+        if ready.get("op") != "ready" or ready.get("ok") is not True:
+            stderr = self._read_stderr(process)
+            self._terminate(process)
+            raise PersistentWorkerTransportError(
+                f"Postprocessor worker failed to start: {self._spec.class_path}"
+                + (f"\n{stderr}" if stderr else "")
+            )
+
+        self._process = process
+        return process
+
+    def _read_message(
+        self,
+        process: subprocess.Popen[str],
+        *,
+        timeout_seconds: int,
+    ) -> dict[str, Any]:
+        if process.stdout is None:
+            raise PersistentWorkerTransportError("Worker stdout is unavailable.")
+
+        ready, _, _ = select.select([process.stdout], [], [], timeout_seconds)
+        if not ready:
+            self._terminate(process)
+            cmd = (
+                process.args if isinstance(process.args, list) else [str(process.args)]
+            )
+            raise subprocess.TimeoutExpired(cmd=cmd, timeout=timeout_seconds)
+
+        line = process.stdout.readline()
+        if not line:
+            stderr = self._read_stderr(process)
+            self._terminate(process)
+            raise PersistentWorkerTransportError(
+                f"Postprocessor worker exited unexpectedly: {self._spec.class_path}"
+                + (f"\n{stderr}" if stderr else "")
+            )
+
+        try:
+            return json.loads(line)
+        except json.JSONDecodeError as exc:
+            raise PersistentWorkerTransportError(
+                "Postprocessor worker returned invalid JSON response."
+            ) from exc
+
+    def _read_stderr(self, process: subprocess.Popen[str]) -> str:
+        if process.stderr is None:
+            return ""
+        try:
+            return (process.stderr.read() or "").strip()
+        except Exception:
+            return ""
+
+    def _terminate(self, process: subprocess.Popen[str]) -> None:
+        if process.poll() is None:
+            process.kill()
+            try:
+                process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                pass
+
+
+def _run_subprocess(
+    spec: PostprocessorSpec,
+    root_dir: Path,
+    graph: Graph,
+    payload: dict[str, Any],
+    runner: _SubprocessRunner,
+) -> Graph | None:
+    """Shared scaffolding for subprocess-based postprocessors.
+
+    Handles temp-dir lifecycle, graph serialization, output verification,
+    and debug-copy on failure. *runner* is called with the prepared paths
+    and is responsible only for the actual subprocess execution step.
+    """
+    from .graph_io import _redact_debug_context, _read_graph_nquads, _write_graph_nquads
+
+    temp_dir_path = Path(tempfile.mkdtemp(prefix="worai_pp_"))
+    failed = False
+    try:
+        input_graph_path = temp_dir_path / "input_graph.nq"
+        output_graph_path = temp_dir_path / "output_graph.nq"
+        context_path = temp_dir_path / "context.json"
+
+        _write_graph_nquads(graph, input_graph_path)
+        context_path.write_text(
+            json.dumps(payload, ensure_ascii=True, default=str),
+            encoding="utf-8",
+        )
+
+        runner(
+            input_graph_path=input_graph_path,
+            output_graph_path=output_graph_path,
+            context_path=context_path,
+            context_payload=payload,
+        )
+
+        if not output_graph_path.exists():
+            failed = True
+            raise RuntimeError(
+                f"Postprocessor did not produce output graph: {spec.class_path}"
+            )
+
+        return _read_graph_nquads(output_graph_path)
+    except Exception:
+        failed = True
+        raise
+    finally:
+        if failed and spec.keep_temp_on_error:
+            debug_dir = root_dir / "output" / "postprocessor_debug"
+            debug_dir.mkdir(parents=True, exist_ok=True)
+            target = debug_dir / (spec.class_path.replace(":", "_").replace(".", "_"))
+            if target.exists():
+                shutil.rmtree(target)
+            shutil.copytree(temp_dir_path, target)
+            _redact_debug_context(target / "context.json")
+        if temp_dir_path.exists():
+            shutil.rmtree(temp_dir_path, ignore_errors=True)
+
+
+@dataclass(frozen=True)
+class OneshotSubprocessPostprocessor:
+    spec: PostprocessorSpec
+    root_dir: Path
+
+    def process_graph(
+        self, graph: Graph, context: PostprocessorContext
+    ) -> Graph | None:
+        from .graph_io import _build_runner_payload
+        return _run_subprocess(
+            self.spec, self.root_dir, graph, _build_runner_payload(context), self._run
+        )
+
+    def _run(
+        self,
+        *,
+        input_graph_path: Path,
+        output_graph_path: Path,
+        context_path: Path,
+        **_: Any,
+    ) -> None:
+        cmd = [
+            self.spec.python,
+            "-m",
+            "wordlift_sdk.kg_build.postprocessors.oneshot",
+            "--class",
+            self.spec.class_path,
+            "--input-graph",
+            str(input_graph_path),
+            "--output-graph",
+            str(output_graph_path),
+            "--context",
+            str(context_path),
+        ]
+        completed = subprocess.run(
+            cmd,
+            text=True,
+            capture_output=True,
+            cwd=str(self.root_dir),
+            timeout=self.spec.timeout_seconds,
+            check=False,
+        )
+        if completed.returncode != 0:
+            stderr = (completed.stderr or "").strip()
+            raise RuntimeError(
+                f"Postprocessor failed: {self.spec.class_path} "
+                f"(exit={completed.returncode})" + (f"\n{stderr}" if stderr else "")
+            )
+
+
+@dataclass
+class PersistentSubprocessPostprocessor:
+    spec: PostprocessorSpec
+    root_dir: Path
+    _client: PersistentPostprocessorClient | None = field(
+        init=False,
+        default=None,
+        repr=False,
+    )
+
+    def close(self) -> None:
+        if self._client is not None:
+            self._client.close()
+            self._client = None
+
+    def process_graph(
+        self, graph: Graph, context: PostprocessorContext
+    ) -> Graph | None:
+        from .graph_io import _build_runner_payload
+        return _run_subprocess(
+            self.spec, self.root_dir, graph, _build_runner_payload(context), self._run
+        )
+
+    def _run(
+        self,
+        *,
+        input_graph_path: Path,
+        output_graph_path: Path,
+        context_payload: dict[str, Any],
+        **_: Any,
+    ) -> None:
+        if self._client is None:
+            self._client = PersistentPostprocessorClient(
+                spec=self.spec,
+                root_dir=self.root_dir,
+            )
+        self._client.process_graph(
+            input_graph_path=input_graph_path,
+            output_graph_path=output_graph_path,
+            context_payload=context_payload,
+        )
+
+
+@dataclass(frozen=True)
+class InProcessPostprocessor:
+    class_path: str
+
+    def process_graph(
+        self, graph: Graph, context: PostprocessorContext
+    ) -> Graph | None:
+        module_name, class_name = self.class_path.split(":", 1)
+        module = importlib.import_module(module_name)
+        klass = getattr(module, class_name)
+        processor = klass()
+        result = processor.process_graph(graph, context)
+        if inspect.isawaitable(result):
+            result = asyncio.run(result)
+        return result
+
+
+def _build_handler(
+    spec: PostprocessorSpec, root_dir: Path, runtime: PostprocessorRuntime
+) -> GraphPostprocessor:
+    if runtime == PostprocessorRuntime.INPROCESS:
+        return InProcessPostprocessor(class_path=spec.class_path)
+    if runtime == PostprocessorRuntime.PERSISTENT:
+        return PersistentSubprocessPostprocessor(spec=spec, root_dir=root_dir)
+    return OneshotSubprocessPostprocessor(spec=spec, root_dir=root_dir)
+
+
+def _normalize_runtime(value: str | None) -> PostprocessorRuntime:
+    raw = (value or PostprocessorRuntime.ONESHOT.value).strip().lower()
+    try:
+        return PostprocessorRuntime(raw)
+    except ValueError:
+        raise ValueError(
+            "POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent, inprocess."
+        )
diff --git a/wordlift_sdk/kg_build/postprocessors/types.py b/wordlift_sdk/kg_build/postprocessors/types.py
new file mode 100644
index 0000000..a9323bd
--- /dev/null
+++ b/wordlift_sdk/kg_build/postprocessors/types.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Any, Protocol, runtime_checkable
+
+from rdflib import Graph
+
+
+class PostprocessorRuntime(str, Enum):
+    ONESHOT = "oneshot"
+    PERSISTENT = "persistent"
+    INPROCESS = "inprocess"
+
+
+@dataclass(frozen=True)
+class PostprocessorContext:
+    profile_name: str
+    profile: dict[str, Any]
+    url: str
+    account: Any
+    account_key: str | None
+    exports: dict[str, Any]
+    response: Any
+    existing_web_page_id: str | None
+    existing_import_hash: str | None = None
+    import_hash_mode: str = "on"
+    ids: Any | None = None
+
+
+@dataclass(frozen=True)
+class PostprocessorSpec:
+    class_path: str
+    python: str
+    timeout_seconds: int
+    enabled: bool
+    keep_temp_on_error: bool
+
+
+class _SubprocessRunner(Protocol):
+    def __call__(
+        self,
+        *,
+        input_graph_path: Path,
+        output_graph_path: Path,
+        context_path: Path,
+        context_payload: dict[str, Any],
+    ) -> None: ...
+
+
+@runtime_checkable
+class Closeable(Protocol):
+    def close(self) -> None: ...
+
+
+@runtime_checkable
+class GraphPostprocessor(Protocol):
+    def process_graph(
+        self, graph: Graph, context: PostprocessorContext
+    ) -> Graph | None: ...
+
+
+@dataclass(frozen=True)
+class PostprocessorResult:
+    graph: Graph
+    queue_wait_ms: int
+    postprocessors_ms: int
+
+
+@dataclass(frozen=True)
+class LoadedPostprocessor:
+    name: str
+    handler: GraphPostprocessor
+
+    def run(self, graph: Graph, context: PostprocessorContext) -> Graph:
+        result = self.handler.process_graph(graph, context)
+        return graph if result is None else result
+
+
+class PersistentWorkerTransportError(RuntimeError):
+    pass
+
+
+class PersistentWorkerJobError(RuntimeError):
+    pass

From 2af8809a4a315521124bac917f4d1709f94ef160 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 17:12:02 +0100
Subject: [PATCH 52/63] fix: close GraphQueue ApiClient on protocol shutdown

---
 wordlift_sdk/kg_build/protocol.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index d6cb99b..1295f64 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -399,10 +399,11 @@ async def callback(
             outcome.validation_ms if outcome else 0,
         )
 
-    def close(self) -> None:
+    async def close(self) -> None:
         self._postprocessor_service.close()
         self._mapping_executor.shutdown(wait=False)
         self._shacl_validator.close()
+        await self.context.graph_queue.close()
 
     def get_kpi_summary(self) -> dict[str, object]:
         return self._kpi.summary(self.profile.name)

From 1b7fa15a8dd5c543b6e5d9e635a9e9fd49f9f167 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Thu, 19 Mar 2026 19:13:14 +0100
Subject: [PATCH 53/63] feat(tests): align kg_build tests with refactored
 protocol and postprocessor API

---
 tests/kg_build/test_graph_utils.py            | 100 ++++
 tests/kg_build/test_kpi.py                    |  13 +-
 .../test_postprocessor_runner_helpers.py      |   2 +-
 .../test_postprocessor_runner_main.py         |   2 +-
 tests/kg_build/test_postprocessor_service.py  | 164 ++++++
 tests/kg_build/test_postprocessor_worker.py   |   2 +-
 tests/kg_build/test_postprocessors.py         |  54 +-
 tests/kg_build/test_profile_inheritance.py    |   3 +-
 tests/kg_build/test_protocol.py               | 521 +++++++++---------
 tests/kg_build/test_rml_mapping.py            |  31 +-
 .../kg_build/postprocessors/subprocess.py     |   7 +-
 11 files changed, 575 insertions(+), 324 deletions(-)
 create mode 100644 tests/kg_build/test_graph_utils.py
 create mode 100644 tests/kg_build/test_postprocessor_service.py

diff --git a/tests/kg_build/test_graph_utils.py b/tests/kg_build/test_graph_utils.py
new file mode 100644
index 0000000..4f1dec4
--- /dev/null
+++ b/tests/kg_build/test_graph_utils.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+from rdflib import Graph, Literal, URIRef
+
+from wordlift_sdk.kg_build.graph_utils import first_level_subjects
+
+DATASET = "https://data.example.com"
+
+
+def _uri(path: str) -> URIRef:
+    return URIRef(f"{DATASET}/{path}")
+
+
+def _ext(path: str) -> URIRef:
+    return URIRef(f"https://external.example.com/{path}")
+
+
+def test_empty_graph_returns_empty_set() -> None:
+    assert first_level_subjects(Graph(), DATASET) == set()
+
+
+def test_dataset_uri_match_returns_two_segment_subjects() -> None:
+    g = Graph()
+    canonical = _uri("articles/my-article")  # 2 segments → first-level
+    deep = _uri("articles/my-article/comments/1")  # 4 segments → not first-level
+    g.add((canonical, URIRef("https://schema.org/name"), Literal("Article")))
+    g.add((deep, URIRef("https://schema.org/name"), Literal("Comment")))
+
+    result = first_level_subjects(g, DATASET)
+    assert canonical in result
+    assert deep not in result
+
+
+def test_dataset_uri_match_ignores_single_segment() -> None:
+    g = Graph()
+    one_seg = _uri("articles")  # 1 segment → not first-level by-id
+    two_seg = _uri("articles/slug")  # 2 segments → first-level
+    g.add((one_seg, URIRef("https://schema.org/name"), Literal("Collection")))
+    g.add((two_seg, URIRef("https://schema.org/name"), Literal("Item")))
+
+    result = first_level_subjects(g, DATASET)
+    assert two_seg in result
+    assert one_seg not in result
+
+
+def test_fallback_to_unreferenced_subjects_when_no_dataset_match() -> None:
+    g = Graph()
+    root = _ext("root")
+    child = _ext("child")
+    # child is referenced by root, so root is the unreferenced subject
+    g.add((root, URIRef("https://schema.org/hasPart"), child))
+    g.add((child, URIRef("https://schema.org/name"), Literal("Child")))
+
+    # Blank dataset_uri: no prefix to match, so the "not referenced" fallback applies
+    result = first_level_subjects(g, "")
+    assert root in result
+    assert child not in result
+
+
+def test_fallback_returns_all_when_everything_is_referenced() -> None:
+    g = Graph()
+    a = _ext("a")
+    b = _ext("b")
+    # mutual references: both are referenced
+    g.add((a, URIRef("https://schema.org/hasPart"), b))
+    g.add((b, URIRef("https://schema.org/hasPart"), a))
+
+    result = first_level_subjects(g, "")
+    assert result == {a, b}
+
+
+def test_blank_dataset_uri_uses_reference_fallback() -> None:
+    g = Graph()
+    page = _ext("page")
+    product = _ext("product")
+    g.add((page, URIRef("https://schema.org/mentions"), product))
+    g.add((product, URIRef("https://schema.org/name"), Literal("Product")))
+
+    result = first_level_subjects(g, "")
+    assert page in result
+    assert product not in result
+
+
+def test_dataset_uri_prefix_no_match_falls_back_gracefully() -> None:
+    g = Graph()
+    ext_subject = _ext("item")
+    g.add((ext_subject, URIRef("https://schema.org/name"), Literal("External")))
+
+    # dataset_uri set but no subject matches the prefix
+    result = first_level_subjects(g, DATASET)
+    assert ext_subject in result
+
+
+def test_literal_objects_are_not_counted_as_subjects() -> None:
+    """Only the subject is returned; the Literal object never appears in the set."""
+    g = Graph()
+    s = _uri("things/item")
+    g.add((s, URIRef("https://schema.org/name"), Literal("Name")))
+
+    assert first_level_subjects(g, DATASET) == {s}
diff --git a/tests/kg_build/test_kpi.py b/tests/kg_build/test_kpi.py
index 905a54b..69ef8e2 100644
--- a/tests/kg_build/test_kpi.py
+++ b/tests/kg_build/test_kpi.py
@@ -1,6 +1,7 @@
 from rdflib import Graph, Literal, RDF, URIRef
 
 from wordlift_sdk.kg_build.kpi import KgBuildKpiCollector
+from wordlift_sdk.validation.shacl_validation_service import ValidationOutcome
 
 
 def test_kpi_collector_records_graph_and_validation() -> None:
@@ -14,11 +15,13 @@ def test_kpi_collector_records_graph_and_validation() -> None:
 
     collector.record_graph(graph)
     collector.record_validation(
-        passed=False,
-        warning_count=2,
-        error_count=1,
-        warning_sources={"google-article": 2},
-        error_sources={"google-product": 1},
+        ValidationOutcome(
+            passed=False,
+            warning_sources={"google-article": 2},
+            error_sources={"google-product": 1},
+            queue_wait_ms=0,
+            validation_ms=0,
+        )
     )
     summary = collector.summary("demo")
 
diff --git a/tests/kg_build/test_postprocessor_runner_helpers.py b/tests/kg_build/test_postprocessor_runner_helpers.py
index 1083eaa..1034147 100644
--- a/tests/kg_build/test_postprocessor_runner_helpers.py
+++ b/tests/kg_build/test_postprocessor_runner_helpers.py
@@ -4,7 +4,7 @@
 
 from rdflib import Graph, Literal, URIRef
 
-from wordlift_sdk.kg_build import postprocessor_runner as runner
+from wordlift_sdk.kg_build.postprocessors import oneshot as runner
 
 
 def test_load_class_variants(monkeypatch) -> None:
diff --git a/tests/kg_build/test_postprocessor_runner_main.py b/tests/kg_build/test_postprocessor_runner_main.py
index 0b3bc8e..224010f 100644
--- a/tests/kg_build/test_postprocessor_runner_main.py
+++ b/tests/kg_build/test_postprocessor_runner_main.py
@@ -6,7 +6,7 @@
 
 from rdflib import Graph, Literal, URIRef
 
-from wordlift_sdk.kg_build import postprocessor_runner as runner
+from wordlift_sdk.kg_build.postprocessors import oneshot as runner
 
 
 def _graph() -> Graph:
diff --git a/tests/kg_build/test_postprocessor_service.py b/tests/kg_build/test_postprocessor_service.py
new file mode 100644
index 0000000..df0cf49
--- /dev/null
+++ b/tests/kg_build/test_postprocessor_service.py
@@ -0,0 +1,164 @@
+from __future__ import annotations
+
+import asyncio
+from types import SimpleNamespace
+
+import pytest
+from rdflib import Graph, Literal, URIRef
+
+from wordlift_sdk.kg_build.postprocessors.service import PostprocessorService
+from wordlift_sdk.kg_build.postprocessors.types import (
+    LoadedPostprocessor,
+    PostprocessorContext,
+)
+
+
+def _sample_graph() -> Graph:
+    g = Graph()
+    g.add(
+        (
+            URIRef("https://example.com/s"),
+            URIRef("https://example.com/p"),
+            Literal("v"),
+        )
+    )
+    return g
+
+
+def _sample_context() -> PostprocessorContext:
+    return PostprocessorContext(
+        profile_name="test",
+        profile={},
+        url="https://example.com/page",
+        account=SimpleNamespace(dataset_uri="https://data.example.com"),
+        account_key=None,
+        exports={},
+        response=SimpleNamespace(
+            id=None, web_page=SimpleNamespace(url=None, html=None)
+        ),
+        existing_web_page_id=None,
+    )
+
+
+def _make_service(pool_size: int = 1, processors=None) -> PostprocessorService:
+    if processors is None:
+
+        class _Passthrough:
+            def process_graph(self, graph: Graph, context) -> Graph:
+                return graph
+
+        processors = [LoadedPostprocessor(name="passthrough", handler=_Passthrough())]
+
+    return PostprocessorService(
+        postprocessors_factory=lambda: processors,
+        pool_size=pool_size,
+    )
+
+
+def test_apply_returns_result_with_graph_and_timings() -> None:
+    service = _make_service()
+    result = asyncio.run(service.apply(_sample_graph(), _sample_context()))
+    service.close()
+
+    assert isinstance(result.graph, Graph)
+    assert len(result.graph) == 1
+    assert result.queue_wait_ms >= 0
+    assert result.postprocessors_ms >= 0
+
+
+def test_apply_runs_processors_in_order() -> None:
+    additions: list[int] = []
+
+    class _Mark:
+        def __init__(self, n: int) -> None:
+            self._n = n
+
+        def process_graph(self, graph: Graph, context) -> Graph:
+            additions.append(self._n)
+            graph.add(
+                (
+                    URIRef(f"https://example.com/s{self._n}"),
+                    URIRef("https://example.com/p"),
+                    Literal(self._n),
+                )
+            )
+            return graph
+
+    processors = [
+        LoadedPostprocessor(name="first", handler=_Mark(1)),
+        LoadedPostprocessor(name="second", handler=_Mark(2)),
+    ]
+    service = PostprocessorService(
+        postprocessors_factory=lambda: processors,
+        pool_size=1,
+    )
+    result = asyncio.run(service.apply(_sample_graph(), _sample_context()))
+    service.close()
+
+    assert additions == [1, 2]
+    assert len(result.graph) == 3  # original + 2 added
+
+
+def test_close_calls_close_on_closeable_handlers() -> None:
+    class _Closeable:
+        def __init__(self) -> None:
+            self.closed = False
+
+        def close(self) -> None:
+            self.closed = True
+
+        def process_graph(self, graph: Graph, context) -> Graph:
+            return graph
+
+    handler = _Closeable()
+    service = PostprocessorService(
+        postprocessors_factory=lambda: [LoadedPostprocessor(name="c", handler=handler)],
+        pool_size=1,
+    )
+    service.close()
+
+    assert handler.closed is True
+
+
+def test_pool_isolates_slots() -> None:
+    """Each slot in the pool should be an independent list of processors."""
+    slot_ids: list[int] = []
+
+    class _Recorder:
+        def __init__(self, slot_id: int) -> None:
+            self._slot_id = slot_id
+
+        def process_graph(self, graph: Graph, context) -> Graph:
+            slot_ids.append(self._slot_id)
+            return graph
+
+    slot_counter = [0]
+
+    def factory() -> list[LoadedPostprocessor]:
+        slot_counter[0] += 1
+        sid = slot_counter[0]
+        return [LoadedPostprocessor(name=f"slot-{sid}", handler=_Recorder(sid))]
+
+    pool_size = 2
+    service = PostprocessorService(postprocessors_factory=factory, pool_size=pool_size)
+    try:
+        # Two sequential jobs; the asserts below expect each to hit a different slot
+        asyncio.run(service.apply(_sample_graph(), _sample_context()))
+        asyncio.run(service.apply(_sample_graph(), _sample_context()))
+    finally:
+        service.close()
+
+    # Both slots should have been used (order may vary but both IDs present)
+    assert len(slot_ids) == 2
+    assert set(slot_ids) == {1, 2}
+
+
+@pytest.mark.asyncio
+async def test_apply_async_returns_correct_graph() -> None:
+    service = _make_service()
+    graph = _sample_graph()
+    result = await service.apply(graph, _sample_context())
+    service.close()
+
+    assert isinstance(result.graph, Graph)
+    assert len(result.graph) == 1
diff --git a/tests/kg_build/test_postprocessor_worker.py b/tests/kg_build/test_postprocessor_worker.py
index 7f359a3..0cfe3b7 100644
--- a/tests/kg_build/test_postprocessor_worker.py
+++ b/tests/kg_build/test_postprocessor_worker.py
@@ -7,7 +7,7 @@
 
 from rdflib import Graph, Literal, URIRef
 
-from wordlift_sdk.kg_build import postprocessor_worker as worker
+from wordlift_sdk.kg_build.postprocessors import persistent as worker
 
 
 def _graph() -> Graph:
diff --git a/tests/kg_build/test_postprocessors.py b/tests/kg_build/test_postprocessors.py
index 6ce1284..ed812ab 100644
--- a/tests/kg_build/test_postprocessors.py
+++ b/tests/kg_build/test_postprocessors.py
@@ -20,11 +20,14 @@
     LoadedPostprocessor,
     PostprocessorContext,
     PostprocessorSpec,
-    SubprocessPostprocessor,
-    _build_runner_payload,
     close_loaded_postprocessors,
     load_postprocessors_for_profile,
 )
+from wordlift_sdk.kg_build.postprocessors.graph_io import _build_runner_payload
+from wordlift_sdk.kg_build.postprocessors.subprocess import (
+    OneshotSubprocessPostprocessor,
+    PersistentSubprocessPostprocessor,
+)
 
 PROJECT_ROOT = Path(__file__).resolve().parents[2]
 _current_pythonpath = os.environ.get("PYTHONPATH", "")
@@ -162,8 +165,8 @@ class = "test_pp:ProfileTwo"
 
     first = loaded[0].handler
     second = loaded[1].handler
-    assert isinstance(second, SubprocessPostprocessor)
-    assert isinstance(first, SubprocessPostprocessor)
+    assert isinstance(second, OneshotSubprocessPostprocessor)
+    assert isinstance(first, OneshotSubprocessPostprocessor)
     assert first.spec.python == "/profile/python"
     assert first.spec.timeout_seconds == 17
     assert first.spec.keep_temp_on_error is True
@@ -190,7 +193,7 @@ class = "test_pp:BaseOne"
     assert [item.name for item in loaded] == ["test_pp:BaseOne"]
 
     first = loaded[0].handler
-    assert isinstance(first, SubprocessPostprocessor)
+    assert isinstance(first, OneshotSubprocessPostprocessor)
     assert first.spec.python == "/base/python"
     assert first.spec.timeout_seconds == 11
     assert first.spec.keep_temp_on_error is False
@@ -219,8 +222,7 @@ class = "test_pp:ProfileOne"
         runtime="persistent",
     )
     assert len(loaded) == 1
-    assert isinstance(loaded[0].handler, SubprocessPostprocessor)
-    assert loaded[0].handler.runtime == "persistent"
+    assert isinstance(loaded[0].handler, PersistentSubprocessPostprocessor)
 
 
 def test_subprocess_execution_and_nquads_exchange(tmp_path: Path) -> None:
@@ -249,7 +251,7 @@ def process_graph(self, graph, context):
         enabled=True,
         keep_temp_on_error=False,
     )
-    processor = SubprocessPostprocessor(spec=spec, root_dir=root)
+    processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root)
 
     output = processor.process_graph(_sample_graph(), _sample_context())
     assert output is not None
@@ -291,11 +293,7 @@ def process_graph(self, graph, context):
         enabled=True,
         keep_temp_on_error=False,
     )
-    processor = SubprocessPostprocessor(
-        spec=spec,
-        root_dir=root,
-        runtime="persistent",
-    )
+    processor = PersistentSubprocessPostprocessor(spec=spec, root_dir=root)
 
     first = processor.process_graph(_sample_graph(), _sample_context())
     second = processor.process_graph(_sample_graph(), _sample_context())
@@ -351,7 +349,12 @@ def process_graph(self, graph, context):
         enabled=True,
         keep_temp_on_error=False,
     )
-    processor = SubprocessPostprocessor(spec=spec, root_dir=root, runtime=runtime)
+    cls = (
+        PersistentSubprocessPostprocessor
+        if runtime == "persistent"
+        else OneshotSubprocessPostprocessor
+    )
+    processor = cls(spec=spec, root_dir=root)
     try:
         output = processor.process_graph(
             _sample_graph(),
@@ -405,7 +408,12 @@ def process_graph(self, graph, context):
         enabled=True,
         keep_temp_on_error=False,
     )
-    processor = SubprocessPostprocessor(spec=spec, root_dir=root, runtime=runtime)
+    cls = (
+        PersistentSubprocessPostprocessor
+        if runtime == "persistent"
+        else OneshotSubprocessPostprocessor
+    )
+    processor = cls(spec=spec, root_dir=root)
     try:
         output = processor.process_graph(
             _sample_graph(),
@@ -471,7 +479,7 @@ def process_graph(self, graph, context):
         [
             sys.executable,
             "-m",
-            "wordlift_sdk.kg_build.postprocessors.runner",
+            "wordlift_sdk.kg_build.postprocessors.oneshot",
             "--class",
             "test_pp:AddRunnerTriple",
             "--input-graph",
@@ -517,7 +525,7 @@ def process_graph(self, graph, context):
         enabled=True,
         keep_temp_on_error=False,
     )
-    processor = SubprocessPostprocessor(spec=spec, root_dir=root)
+    processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root)
 
     with pytest.raises(subprocess.TimeoutExpired):
         processor.process_graph(_sample_graph(), _sample_context())
@@ -543,11 +551,7 @@ def process_graph(self, graph, context):
         enabled=True,
         keep_temp_on_error=False,
     )
-    processor = SubprocessPostprocessor(
-        spec=spec,
-        root_dir=root,
-        runtime="persistent",
-    )
+    processor = PersistentSubprocessPostprocessor(spec=spec, root_dir=root)
 
     with pytest.raises(subprocess.TimeoutExpired):
         processor.process_graph(_sample_graph(), _sample_context())
@@ -571,7 +575,7 @@ def process_graph(self, graph, context):
         enabled=True,
         keep_temp_on_error=True,
     )
-    processor = SubprocessPostprocessor(spec=spec, root_dir=root)
+    processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root)
 
     with pytest.raises(RuntimeError):
         processor.process_graph(_sample_graph(), _sample_context())
@@ -607,7 +611,7 @@ def process_graph(self, graph, context):
         enabled=True,
         keep_temp_on_error=True,
     )
-    processor = SubprocessPostprocessor(spec=spec, root_dir=root)
+    processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root)
     secret = "top-secret-key"
 
     with pytest.raises(RuntimeError):
@@ -683,7 +687,7 @@ def test_subprocess_uses_inherited_environment_without_pythonpath_injection(
         enabled=True,
         keep_temp_on_error=False,
     )
-    processor = SubprocessPostprocessor(spec=spec, root_dir=root)
+    processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root)
     captured: dict[str, object] = {}
 
     def fake_run(*args, **kwargs):
diff --git a/tests/kg_build/test_profile_inheritance.py b/tests/kg_build/test_profile_inheritance.py
index 8bb90e1..fbdea15 100644
--- a/tests/kg_build/test_profile_inheritance.py
+++ b/tests/kg_build/test_profile_inheritance.py
@@ -45,7 +45,8 @@ def test_runtime_inherits_from_base_when_selected_missing(tmp_path: Path) -> Non
     )
 
     assert profile.settings["postprocessor_runtime"] == "persistent"
-    assert protocol._postprocessor_runtime == "persistent"
+    # Verify the protocol accepted the inherited runtime (service is initialised without error)
+    assert protocol._postprocessor_service is not None
 
 
 def test_validation_settings_parse_into_profile_settings(tmp_path: Path) -> None:
diff --git a/tests/kg_build/test_protocol.py b/tests/kg_build/test_protocol.py
index 3e46fed..43efe7e 100644
--- a/tests/kg_build/test_protocol.py
+++ b/tests/kg_build/test_protocol.py
@@ -7,7 +7,6 @@
 from jinja2 import UndefinedError
 from rdflib import BNode, Graph, Literal, RDF, URIRef
 from wordlift_client import WebPage, WebPageScrapeResponse
-from wordlift_sdk.validation.shacl import ValidationResult
 
 from wordlift_sdk.kg_build.config.loader import ProfileDefinition, ProfileMappingRoute
 import wordlift_sdk.kg_build.protocol as protocol_module
@@ -16,6 +15,17 @@
     _path_contains_part,
     _resolve_postprocessor_runtime,
 )
+from wordlift_sdk.kg_build.rml_mapping import MappingResult
+from wordlift_sdk.kg_build.postprocessors.types import PostprocessorResult
+from wordlift_sdk.kg_build.postprocessors.processors.graph_annotation import (
+    ImportAnnotationPostprocessor,
+)
+from wordlift_sdk.kg_build.postprocessors.processors.id_postprocessor import (
+    CanonicalIdsPostprocessor,
+    RootIdReconcilerPostprocessor,
+    _find_web_page_iri as _find_web_page_iri_impl,
+)
+from wordlift_sdk.validation.shacl_validation_service import ValidationOutcome
 
 
 def _make_profile() -> ProfileDefinition:
@@ -60,7 +70,7 @@ def _make_context() -> SimpleNamespace:
     return SimpleNamespace(
         account=SimpleNamespace(dataset_uri="https://data.example.com/dataset"),
         client_configuration=SimpleNamespace(api_key={}),
-        graph_queue=SimpleNamespace(put=AsyncMock()),
+        graph_queue=SimpleNamespace(put=AsyncMock(), close=AsyncMock()),
         configuration_provider=SimpleNamespace(
             get_value=lambda *_args, **_kwargs: None
         ),
@@ -71,13 +81,73 @@ def _make_context_without_dataset() -> SimpleNamespace:
     return SimpleNamespace(
         account=SimpleNamespace(dataset_uri=None),
         client_configuration=SimpleNamespace(api_key={}),
-        graph_queue=SimpleNamespace(put=AsyncMock()),
+        graph_queue=SimpleNamespace(put=AsyncMock(), close=AsyncMock()),
         configuration_provider=SimpleNamespace(
             get_value=lambda *_args, **_kwargs: None
         ),
     )
 
 
+def _make_mapping_result(graph: Graph) -> MappingResult:
+    return MappingResult(graph=graph, queue_wait_ms=0, mapping_ms=0)
+
+
+def _make_validation_outcome(
+    *,
+    passed: bool,
+    warning_sources: dict | None = None,
+    error_sources: dict | None = None,
+) -> ValidationOutcome:
+    return ValidationOutcome(
+        passed=passed,
+        warning_sources=warning_sources or {},
+        error_sources=error_sources or {},
+        queue_wait_ms=0,
+        validation_ms=0,
+    )
+
+
+def _passthrough_pp() -> AsyncMock:
+    return AsyncMock(
+        side_effect=lambda g, url, resp, ewi, eih: PostprocessorResult(
+            graph=g, queue_wait_ms=0, postprocessors_ms=0
+        )
+    )
+
+
+def _annotating_pp(
+    dataset_uri: str = "https://data.example.com/dataset",
+    import_hash_mode: str = "on",
+) -> AsyncMock:
+    async def _stage(graph, url, resp, ewi, eih):
+        ctx = SimpleNamespace(
+            account=SimpleNamespace(dataset_uri=dataset_uri),
+            existing_import_hash=eih,
+            import_hash_mode=import_hash_mode,
+        )
+        g = ImportAnnotationPostprocessor().process_graph(graph, ctx)
+        return PostprocessorResult(graph=g, queue_wait_ms=0, postprocessors_ms=0)
+
+    return AsyncMock(side_effect=_stage)
+
+
+def _reconciling_pp(
+    dataset_uri: str = "https://data.example.com/dataset",
+) -> AsyncMock:
+    async def _stage(graph, url, resp, ewi, eih):
+        ctx = SimpleNamespace(
+            account=SimpleNamespace(dataset_uri=dataset_uri),
+            existing_import_hash=eih,
+            import_hash_mode="on",
+            existing_web_page_id=ewi,
+        )
+        g = RootIdReconcilerPostprocessor().process_graph(graph, ctx)
+        g = ImportAnnotationPostprocessor().process_graph(g, ctx)
+        return PostprocessorResult(graph=g, queue_wait_ms=0, postprocessors_ms=0)
+
+    return AsyncMock(side_effect=_stage)
+
+
 def _make_graph(subject: str) -> Graph:
     graph = Graph()
     s = URIRef(subject)
@@ -130,12 +200,13 @@ async def test_profile_protocol_reconciles_to_existing_id_and_sets_source():
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_graph("https://example.com/mapped-web-page")
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(
+            _make_graph("https://example.com/mapped-web-page")
+        )
     )
+    protocol._run_postprocessing_stage = _reconciling_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -169,12 +240,11 @@ async def test_profile_protocol_put_strategy_writes_to_graph_queue() -> None:
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_dataset_scoped_graph()
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_dataset_scoped_graph())
     )
+    protocol._run_postprocessing_stage = _passthrough_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -197,7 +267,7 @@ async def test_static_templates_use_graph_queue_when_put_strategy_enabled() -> N
     )
     protocol._template_graph = _make_dataset_scoped_graph()
     protocol._template_exports = {}
-    protocol._validate_graph_if_enabled = MagicMock(return_value=None)
+    protocol._shacl_validator.validate = AsyncMock(return_value=None)
     protocol._emit_progress = MagicMock()
     protocol._kpi.record_graph = MagicMock()
     protocol.patcher.patch_all = AsyncMock()
@@ -220,8 +290,7 @@ async def test_profile_protocol_put_strategy_honors_import_hash_write_mode() ->
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
+    protocol._run_postprocessing_stage = _passthrough_pp()
     protocol.patcher.patch_all = AsyncMock()
     graph = _make_dataset_scoped_graph()
     child = URIRef("https://data.example.com/dataset/entities/article-1/faq/1")
@@ -233,7 +302,7 @@ async def test_profile_protocol_put_strategy_honors_import_hash_write_mode() ->
         )
     )
     graph.add((child, RDF.type, URIRef("https://schema.org/Question")))
-    protocol.rml_service.apply_mapping = AsyncMock(return_value=graph)
+    protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph))
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -264,17 +333,22 @@ async def test_profile_protocol_put_strategy_skips_when_import_hash_matches() ->
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
     graph = _make_dataset_scoped_graph()
-    protocol._set_source(graph, existing_web_page_id=None)
+    # Pre-annotate so the expected hash matches what the pipeline will produce
+    ann_ctx = SimpleNamespace(
+        account=context.account,
+        existing_import_hash=None,
+        import_hash_mode="on",
+    )
+    ImportAnnotationPostprocessor().process_graph(graph, ann_ctx)
     expected_hash = protocol.patcher._compute_import_hash(
         URIRef("https://data.example.com/dataset/web-pages/1"),
         graph,
         "https://data.example.com/dataset",
     )
-    protocol.rml_service.apply_mapping = AsyncMock(return_value=graph)
+    protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph))
+    protocol._run_postprocessing_stage = _annotating_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -297,12 +371,11 @@ async def test_profile_protocol_put_strategy_honors_import_hash_off_mode() -> No
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_dataset_scoped_graph()
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_dataset_scoped_graph())
     )
+    protocol._run_postprocessing_stage = _passthrough_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -335,13 +408,12 @@ async def test_profile_protocol_sets_source_on_mapped_subject_when_existing_id_m
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
     mapped_subject = "https://example.com/mapped-web-page"
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_graph(mapped_subject)
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_graph(mapped_subject))
     )
+    protocol._run_postprocessing_stage = _annotating_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -367,12 +439,11 @@ async def test_profile_protocol_sets_source_only_on_first_level_uri_subjects():
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_multi_entity_graph()
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_multi_entity_graph())
     )
+    protocol._run_postprocessing_stage = _annotating_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -423,11 +494,8 @@ async def test_callback_runs_canonical_ids_after_postprocessors() -> None:
             Literal("https://translated.com/developers"),
         )
     )
-    protocol.rml_service.apply_mapping = AsyncMock(return_value=mapped_graph)
 
-    def _inject_service_product_and_fragment_offer(
-        graph: Graph, *_args, **_kwargs
-    ) -> Graph:
+    async def _pp_with_injection(graph, url, resp, ewi, eih):
         graph.add((root, RDF.type, URIRef("http://schema.org/Product")))
         graph.add((root, RDF.type, URIRef("http://schema.org/Service")))
         graph.add(
@@ -444,11 +512,17 @@ def _inject_service_product_and_fragment_offer(
                 URIRef(f"{root}#aggregate-offer-usd"),
             )
         )
-        return graph
+        ctx = SimpleNamespace(
+            account=SimpleNamespace(dataset_uri="https://data.example.com/dataset"),
+            extensions=None,
+        )
+        g = CanonicalIdsPostprocessor().process_graph(graph, ctx)
+        return PostprocessorResult(graph=g, queue_wait_ms=0, postprocessors_ms=0)
 
-    protocol._apply_postprocessors = MagicMock(
-        side_effect=_inject_service_product_and_fragment_offer
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(mapped_graph)
     )
+    protocol._run_postprocessing_stage = AsyncMock(side_effect=_pp_with_injection)
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://translated.com/developers", html="<html></html>")
@@ -484,12 +558,11 @@ async def test_profile_protocol_applies_existing_import_hash_to_all_uri_subjects
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_multi_entity_graph()
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_multi_entity_graph())
     )
+    protocol._run_postprocessing_stage = _annotating_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -520,15 +593,14 @@ async def test_profile_protocol_sets_source_when_web_page_absent_but_uri_subject
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
+    protocol._run_postprocessing_stage = _annotating_pp()
     protocol.patcher.patch_all = AsyncMock()
 
     graph = Graph()
     article = URIRef("https://example.com/entities/article-only")
     graph.add((article, RDF.type, URIRef("http://schema.org/Article")))
     graph.add((article, URIRef("http://schema.org/headline"), Literal("Title")))
-    protocol.rml_service.apply_mapping = AsyncMock(return_value=graph)
+    protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph))
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -554,8 +626,7 @@ async def test_profile_protocol_sets_source_by_dataset_id_depth() -> None:
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
+    protocol._run_postprocessing_stage = _annotating_pp()
     protocol.patcher.patch_all = AsyncMock()
 
     graph = Graph()
@@ -567,7 +638,7 @@ async def test_profile_protocol_sets_source_by_dataset_id_depth() -> None:
     graph.add((entity, RDF.type, URIRef("https://schema.org/Article")))
     graph.add((entity, URIRef("https://schema.org/hasPart"), child))
     graph.add((child, RDF.type, URIRef("https://schema.org/Question")))
-    protocol.rml_service.apply_mapping = AsyncMock(return_value=graph)
+    protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph))
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -598,8 +669,7 @@ async def test_profile_protocol_does_not_set_source_on_blank_nodes():
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
+    protocol._run_postprocessing_stage = _annotating_pp()
     protocol.patcher.patch_all = AsyncMock()
 
     graph = Graph()
@@ -608,7 +678,7 @@ async def test_profile_protocol_does_not_set_source_on_blank_nodes():
     graph.add((article, RDF.type, URIRef("http://schema.org/Article")))
     graph.add((blank, RDF.type, URIRef("http://schema.org/Thing")))
     graph.add((article, URIRef("http://schema.org/mentions"), blank))
-    protocol.rml_service.apply_mapping = AsyncMock(return_value=graph)
+    protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph))
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -640,12 +710,11 @@ def fake_loader(*, root_dir, profile_name, runtime=None):
         return []
 
     monkeypatch.setattr(protocol_module, "load_postprocessors_for_profile", fake_loader)
-    protocol = ProfileImportProtocol(
+    ProfileImportProtocol(
         context=_make_context(),
         profile=_make_profile_with_settings({"POSTPROCESSOR_RUNTIME": "persistent"}),
         root_dir=Path.cwd(),
     )
-    assert protocol._postprocessor_runtime == "persistent"
     assert captured["runtime"] == "persistent"
 
 
@@ -675,7 +744,10 @@ def test_build_pp_context_exposes_resolved_profile_and_account_key() -> None:
     )
 
     context = protocol._build_pp_context(
-        "https://example.com/page", response, existing_web_page_id=None
+        "https://example.com/page",
+        response,
+        existing_web_page_id=None,
+        existing_import_hash=None,
     )
 
     assert context.account_key == "profile-secret"
@@ -699,44 +771,37 @@ def test_build_pp_context_preserves_custom_profile_settings() -> None:
     )
 
     context = protocol._build_pp_context(
-        "https://example.com/page", response, existing_web_page_id=None
+        "https://example.com/page",
+        response,
+        existing_web_page_id=None,
+        existing_import_hash=None,
     )
 
     assert context.profile["settings"]["disable_article_markup"] is True
 
 
-def test_apply_postprocessors_fails_fast_when_account_key_missing() -> None:
+def test_account_key_resolved_from_profile_api_key() -> None:
+    profile = ProfileDefinition(
+        **{
+            **_make_profile().__dict__,
+            "api_key": "profile-secret",
+        }
+    )
     protocol = ProfileImportProtocol(
         context=_make_context(),
-        profile=_make_profile(),
+        profile=profile,
         root_dir=Path.cwd(),
     )
+    assert protocol._account_key == "profile-secret"
 
-    class _NeverRun:
-        name = "never-run"
-        called = False
-
-        def run(self, graph, context):
-            self.called = True
-            return graph
-
-    handler = _NeverRun()
-    protocol._postprocessors = [handler]  # type: ignore[assignment]
 
-    response = WebPageScrapeResponse(
-        web_page=WebPage(url="https://example.com/page", html="<html></html>")
+def test_account_key_is_none_when_no_key_configured() -> None:
+    protocol = ProfileImportProtocol(
+        context=_make_context(),
+        profile=_make_profile(),
+        root_dir=Path.cwd(),
     )
-    graph = _make_graph("https://example.com/mapped-web-page")
-
-    with pytest.raises(RuntimeError, match="Postprocessor runtime requires an API key"):
-        protocol._apply_postprocessors(
-            graph,
-            "https://example.com/page",
-            response,
-            existing_web_page_id=None,
-        )
-
-    assert handler.called is False
+    assert protocol._account_key is None
 
 
 def test_protocol_helpers_runtime_and_path_part() -> None:
@@ -792,7 +857,7 @@ async def test_callback_returns_early_when_mapping_has_no_triples() -> None:
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol.rml_service.apply_mapping = AsyncMock(return_value=Graph())
+    protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(Graph()))
     protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
@@ -803,21 +868,16 @@ async def test_callback_returns_early_when_mapping_has_no_triples() -> None:
     protocol.patcher.patch_all.assert_not_called()
 
 
-def test_close_invokes_postprocessor_cleanup(monkeypatch: pytest.MonkeyPatch) -> None:
-    called: dict[str, object] = {}
-
-    def fake_close(postprocessors):
-        called["value"] = postprocessors
-
-    monkeypatch.setattr(protocol_module, "close_loaded_postprocessors", fake_close)
+def test_close_invokes_postprocessor_service_close() -> None:
     protocol = ProfileImportProtocol(
         context=_make_context(),
         profile=_make_profile(),
         root_dir=Path.cwd(),
     )
-    protocol._postprocessors = ["x"]  # type: ignore[assignment]
-    protocol.close()
-    assert called["value"] == ["x"]
+    mock_close = MagicMock()
+    protocol._postprocessor_service.close = mock_close
+    asyncio.run(protocol.close())
+    mock_close.assert_called_once()
 
 
 def test_resolve_path_and_overlay_paths(tmp_path: Path) -> None:
@@ -1018,88 +1078,69 @@ def test_get_mapping_content_uses_cache_and_requires_dataset() -> None:
         protocol2._get_mapping_content(path)
 
 
-def test_apply_postprocessors_runs_all_processors() -> None:
+def test_postprocessor_factory_builds_required_processors(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Verify the factory used by PostprocessorService includes the standard processors."""
+
+    def fake_loader(*, root_dir, profile_name, runtime=None):
+        return []
+
+    monkeypatch.setattr(protocol_module, "load_postprocessors_for_profile", fake_loader)
     protocol = ProfileImportProtocol(
         context=_make_context(),
-        profile=_make_profile_with_settings({"api_key": "x"}),
+        profile=_make_profile(),
         root_dir=Path.cwd(),
     )
-    response = WebPageScrapeResponse(
-        web_page=WebPage(url="https://example.com/page", html="<html></html>")
-    )
-    graph = _make_graph("https://example.com/page")
-
-    class _P1:
-        name = "p1"
-
-        def run(self, g, _ctx):
-            g.add(
-                (
-                    URIRef("https://example.com/page"),
-                    URIRef("https://schema.org/name"),
-                    Literal("a"),
-                )
-            )
-            return g
-
-    class _P2:
-        name = "p2"
-
-        def run(self, g, _ctx):
-            return g
-
-    protocol._postprocessors = [_P1(), _P2()]  # type: ignore[assignment]
-    protocol._resolve_postprocessor_account_key = MagicMock(return_value="secret")
-    out = protocol._apply_postprocessors(
-        graph, "https://example.com/page", response, None
-    )
-    assert len(out) >= len(graph)
+    # Get one slot from the pool to inspect the processors
+    processors = list(protocol._postprocessor_service._queue.get_nowait())
+    names = [p.name for p in processors]
+    assert "root_id_reconciler" in names
+    assert "canonical_ids" in names
+    assert "import_annotation" in names
 
 
-def test_resolve_postprocessor_account_key_priority(
+def test_resolve_account_key_priority(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    protocol = ProfileImportProtocol(
-        context=_make_context(),
-        profile=_make_profile(),
-        root_dir=Path.cwd(),
+    profile = _make_profile()
+    context = _make_context()
+
+    profile_with_key = ProfileDefinition(
+        **{**profile.__dict__, "api_key": "profile-key"}
     )
-    protocol.profile = ProfileDefinition(
-        **{**protocol.profile.__dict__, "api_key": "profile-key"}
+    assert (
+        protocol_module._resolve_account_key(profile_with_key, context) == "profile-key"
     )
-    assert protocol._resolve_postprocessor_account_key() == "profile-key"
 
-    protocol.profile = ProfileDefinition(
-        **{**protocol.profile.__dict__, "api_key": None}
-    )
-    protocol.context.client_configuration.api_key = {"ApiKey": "runtime-key"}
-    assert protocol._resolve_postprocessor_account_key() == "runtime-key"
+    context.client_configuration.api_key = {"ApiKey": "runtime-key"}
+    assert protocol_module._resolve_account_key(profile, context) == "runtime-key"
 
-    protocol.context.client_configuration.api_key = {}
-    protocol.context.configuration_provider = SimpleNamespace(
+    context.client_configuration.api_key = {}
+    context.configuration_provider = SimpleNamespace(
         get_value=lambda name: "provider-key" if name == "WORDLIFT_KEY" else None
     )
-    assert protocol._resolve_postprocessor_account_key() == "provider-key"
+    assert protocol_module._resolve_account_key(profile, context) == "provider-key"
 
-    protocol.context.configuration_provider = SimpleNamespace(
+    context.configuration_provider = SimpleNamespace(
         get_value=lambda _name: (_ for _ in ()).throw(RuntimeError("nope"))
     )
     monkeypatch.setenv("WORDLIFT_API_KEY", "env-key")
-    assert protocol._resolve_postprocessor_account_key() == "env-key"
+    assert protocol_module._resolve_account_key(profile, context) == "env-key"
     monkeypatch.delenv("WORDLIFT_API_KEY", raising=False)
 
 
 def test_clean_key_write_debug_and_reconcile(tmp_path: Path) -> None:
+    assert protocol_module._clean_key(None) is None
+    assert protocol_module._clean_key("  ") is None
+    assert protocol_module._clean_key(" x ") == "x"
+
     protocol = ProfileImportProtocol(
         context=_make_context(),
         profile=_make_profile(),
         root_dir=tmp_path,
         debug_dir=tmp_path / "debug",
     )
-    assert protocol._clean_key(None) is None
-    assert protocol._clean_key("  ") is None
-    assert protocol._clean_key(" x ") == "x"
-
     graph = _make_graph("https://example.com/old")
     protocol._write_debug_graph(graph, "https://example.com/page")
     protocol._write_debug_source_documents(
@@ -1113,8 +1154,14 @@ def test_clean_key_write_debug_and_reconcile(tmp_path: Path) -> None:
     child = URIRef("https://example.com/child")
     https_graph.add((old, RDF.type, URIRef("https://schema.org/WebPage")))
     https_graph.add((child, URIRef("https://schema.org/about"), old))
-    assert protocol._find_web_page_iri(https_graph) == old
-    protocol._reconcile_root_id(https_graph, str(new))
+    assert _find_web_page_iri_impl(https_graph) == old
+    ctx = SimpleNamespace(
+        existing_web_page_id=str(new),
+        account=SimpleNamespace(dataset_uri=""),
+        existing_import_hash=None,
+        import_hash_mode="on",
+    )
+    RootIdReconcilerPostprocessor().process_graph(https_graph, ctx)
     assert (new, RDF.type, URIRef("https://schema.org/WebPage")) in https_graph
     assert (child, URIRef("https://schema.org/about"), new) in https_graph
 
@@ -1132,17 +1179,15 @@ async def test_callback_writes_html_xhtml_and_ttl_debug_artifacts(
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
 
-    async def _apply_mapping(**kwargs):
-        debug_output = kwargs.get("debug_output")
+    async def _mapping_stage(response, url, ewi, debug_output):
         if isinstance(debug_output, dict):
             debug_output["xhtml"] = "<html><body>Converted</body></html>"
-        return _make_graph("https://example.com/mapped-web-page")
+        return _make_mapping_result(_make_graph("https://example.com/mapped-web-page"))
 
-    protocol.rml_service.apply_mapping = AsyncMock(side_effect=_apply_mapping)
+    protocol._run_mapping_stage = AsyncMock(side_effect=_mapping_stage)
+    protocol._run_postprocessing_stage = _passthrough_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html>Raw</html>")
@@ -1190,14 +1235,10 @@ def test_protocol_setting_parsers_and_progress_error_logging(
         profile=profile,
         root_dir=Path.cwd(),
     )
-    assert protocol._shacl_mode == "warn"
-    assert protocol._shacl_shape_specs == [
-        "google-article.ttl",
-        "https://example.com/custom-shape.ttl",
-    ]
+    assert protocol._shacl_validator.mode.value == "warn"
     assert protocol._import_hash_mode == "write"
-    assert protocol._resolve_list_setting(["a", " ", "b"]) == ["a", "b"]
-    assert protocol._resolve_list_setting(123) == ["123"]
+    assert protocol_module._resolve_list_setting(["a", " ", "b"]) == ["a", "b"]
+    assert protocol_module._resolve_list_setting(123) == ["123"]
 
     protocol._on_progress = lambda _payload: (_ for _ in ()).throw(RuntimeError("boom"))
     with caplog.at_level("WARNING"):
@@ -1265,8 +1306,8 @@ async def test_patch_static_templates_fail_validation_raises() -> None:
     protocol._template_graph = graph
     protocol._template_exports = {}
     protocol.patcher.patch_all = AsyncMock()
-    protocol._validate_graph = MagicMock(
-        return_value=_make_validation_result(conforms=False)
+    protocol._shacl_validator.validate = AsyncMock(
+        return_value=_make_validation_outcome(passed=False)
     )
 
     with pytest.raises(
@@ -1281,11 +1322,6 @@ async def test_patch_static_templates_fail_validation_raises() -> None:
 
 
 def test_find_web_page_iri_returns_none_when_missing() -> None:
-    protocol = ProfileImportProtocol(
-        context=_make_context(),
-        profile=_make_profile(),
-        root_dir=Path.cwd(),
-    )
     graph = Graph()
     graph.add(
         (
@@ -1294,61 +1330,17 @@ def test_find_web_page_iri_returns_none_when_missing() -> None:
             URIRef("https://schema.org/Thing"),
         )
     )
-    assert protocol._find_web_page_iri(graph) is None
+    assert _find_web_page_iri_impl(graph) is None
 
 
-def _make_validation_result(
-    *,
-    conforms: bool,
-    warning_shapes: list[URIRef] | None = None,
-    error_shapes: list[URIRef] | None = None,
-    shape_map: dict[URIRef, str] | None = None,
-) -> ValidationResult:
-    warning_shapes = warning_shapes or []
-    error_shapes = error_shapes or []
-    shape_map = shape_map or {}
-    report = Graph()
-    sh_result_severity = URIRef("http://www.w3.org/ns/shacl#resultSeverity")
-    sh_warning = URIRef("http://www.w3.org/ns/shacl#Warning")
-    sh_violation = URIRef("http://www.w3.org/ns/shacl#Violation")
-    sh_source_shape = URIRef("http://www.w3.org/ns/shacl#sourceShape")
-
-    for index, shape in enumerate(warning_shapes):
-        node = URIRef(f"https://example.com/report/w/{index}")
-        report.add((node, sh_result_severity, sh_warning))
-        report.add((node, sh_source_shape, shape))
-    for index, shape in enumerate(error_shapes):
-        node = URIRef(f"https://example.com/report/e/{index}")
-        report.add((node, sh_result_severity, sh_violation))
-        report.add((node, sh_source_shape, shape))
-
-    return ValidationResult(
-        conforms=conforms,
-        report_text="report",
-        report_graph=report,
-        data_graph=Graph(),
-        shape_source_map=shape_map,
-        warning_count=len(warning_shapes),
-    )
-
-
-def test_summarize_validation_aggregates_sources() -> None:
-    protocol = ProfileImportProtocol(
-        context=_make_context(),
-        profile=_make_profile(),
-        root_dir=Path.cwd(),
-    )
-    article_shape = URIRef("https://shape.example/article")
-    product_shape = URIRef("https://shape.example/product")
-    result = _make_validation_result(
-        conforms=False,
-        warning_shapes=[article_shape],
-        error_shapes=[article_shape, product_shape],
-        shape_map={article_shape: "google-article", product_shape: "google-product"},
+def test_validation_outcome_to_dict_aggregates_sources() -> None:
+    outcome = _make_validation_outcome(
+        passed=False,
+        warning_sources={"google-article": 1},
+        error_sources={"google-article": 1, "google-product": 1},
     )
-    summary = protocol._summarize_validation(result)
+    summary = outcome.to_dict()
     assert summary == {
-        "total": 1,
         "pass": False,
         "fail": True,
         "warnings": {"count": 1, "sources": {"google-article": 1}},
@@ -1376,21 +1368,16 @@ async def test_profile_protocol_emits_progress_and_validation_in_warn_mode() ->
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_dataset_scoped_graph())
+    )
+    protocol._run_postprocessing_stage = _passthrough_pp()
     protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_dataset_scoped_graph()
-    )
-    protocol._validate_graph = MagicMock(
-        return_value=_make_validation_result(
-            conforms=False,
-            warning_shapes=[URIRef("https://shape.example/w")],
-            error_shapes=[URIRef("https://shape.example/e")],
-            shape_map={
-                URIRef("https://shape.example/w"): "google-article",
-                URIRef("https://shape.example/e"): "google-product",
-            },
+    protocol._shacl_validator.validate = AsyncMock(
+        return_value=_make_validation_outcome(
+            passed=False,
+            warning_sources={"google-article": 1},
+            error_sources={"google-product": 1},
         )
     )
 
@@ -1404,7 +1391,6 @@ async def test_profile_protocol_emits_progress_and_validation_in_warn_mode() ->
     assert payload["kind"] == "graph"
     assert payload["url"] == "https://example.com/page"
     assert payload["validation"] == {
-        "total": 1,
         "pass": False,
         "fail": True,
         "warnings": {"count": 1, "sources": {"google-article": 1}},
@@ -1440,14 +1426,13 @@ async def test_profile_protocol_validation_fail_mode_raises() -> None:
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_dataset_scoped_graph()
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_dataset_scoped_graph())
     )
-    protocol._validate_graph = MagicMock(
-        return_value=_make_validation_result(conforms=False)
+    protocol._run_postprocessing_stage = _passthrough_pp()
+    protocol.patcher.patch_all = AsyncMock()
+    protocol._shacl_validator.validate = AsyncMock(
+        return_value=_make_validation_outcome(passed=False)
     )
 
     response = WebPageScrapeResponse(
@@ -1475,12 +1460,11 @@ async def test_profile_protocol_emits_null_validation_when_disabled() -> None:
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_dataset_scoped_graph()
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_dataset_scoped_graph())
     )
+    protocol._run_postprocessing_stage = _passthrough_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -1504,12 +1488,11 @@ async def test_profile_protocol_passes_import_hash_mode_to_patcher() -> None:
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_dataset_scoped_graph()
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_dataset_scoped_graph())
     )
+    protocol._run_postprocessing_stage = _passthrough_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -1535,12 +1518,11 @@ async def test_profile_protocol_emits_graph_and_static_template_events() -> None
     protocol._template_exports = {}
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_dataset_scoped_graph()
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_dataset_scoped_graph())
     )
+    protocol._run_postprocessing_stage = _passthrough_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -1560,12 +1542,11 @@ async def test_profile_protocol_collects_run_level_kpis() -> None:
     protocol._patch_static_templates_once = AsyncMock()
     protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml"))
     protocol._get_mapping_content = MagicMock(return_value="mapping")
-    protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g)
-    protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g)
-    protocol.patcher.patch_all = AsyncMock()
-    protocol.rml_service.apply_mapping = AsyncMock(
-        return_value=_make_dataset_scoped_graph()
+    protocol._run_mapping_stage = AsyncMock(
+        return_value=_make_mapping_result(_make_dataset_scoped_graph())
     )
+    protocol._run_postprocessing_stage = _annotating_pp()
+    protocol.patcher.patch_all = AsyncMock()
 
     response = WebPageScrapeResponse(
         web_page=WebPage(url="https://example.com/page", html="<html></html>")
@@ -1604,7 +1585,7 @@ def test_protocol_validation_mode_normalization_and_deprecation(
             ),
             root_dir=Path.cwd(),
         )
-    assert strict_protocol._shacl_mode == "fail"
+    assert strict_protocol._shacl_validator.mode.value == "fail"
     assert "Deprecated SHACL validation mode 'strict' detected" in caplog.text
 
     with caplog.at_level("WARNING"):
@@ -1615,7 +1596,7 @@ def test_protocol_validation_mode_normalization_and_deprecation(
             ),
             root_dir=Path.cwd(),
         )
-    assert unknown_protocol._shacl_mode == "warn"
+    assert unknown_protocol._shacl_validator.mode.value == "warn"
     assert "Unsupported SHACL validation mode" in caplog.text
 
     with caplog.at_level("WARNING"):
diff --git a/tests/kg_build/test_rml_mapping.py b/tests/kg_build/test_rml_mapping.py
index 7d4d72e..810e31c 100644
--- a/tests/kg_build/test_rml_mapping.py
+++ b/tests/kg_build/test_rml_mapping.py
@@ -7,7 +7,6 @@
 import pytest
 from rdflib import Graph
 
-import wordlift_sdk.kg_build.rml_mapping as rml_module
 from wordlift_sdk.kg_build.rml_mapping import RmlMappingService
 
 
@@ -34,50 +33,46 @@ def _context(dataset_uri: str | None):
 
 
 @pytest.mark.asyncio
-async def test_apply_mapping_from_content_success(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    service = RmlMappingService(_context("https://data.example.com"))
+async def test_apply_mapping_from_content_success() -> None:
+    service = RmlMappingService(
+        _context("https://data.example.com"), pipeline=_Pipeline()
+    )
     service._html_converter.convert = MagicMock(return_value="<html></html>")
-    monkeypatch.setattr(rml_module, "MaterializationPipeline", _Pipeline)
     debug_output: dict[str, str] = {}
 
-    graph = await service.apply_mapping(
+    result = await service.apply_mapping(
         html="<html></html>",
         url="https://example.com/page",
         mapping_file_path="demo.yarrrml",
         mapping_content="m: 1",
         debug_output=debug_output,
     )
-    assert isinstance(graph, Graph)
-    assert len(graph) > 0
+    assert isinstance(result.graph, Graph)
+    assert len(result.graph) > 0
     assert debug_output["xhtml"] == "<html></html>"
 
 
 @pytest.mark.asyncio
 async def test_apply_mapping_file_not_found_returns_none() -> None:
     service = RmlMappingService(_context("https://data.example.com"))
-    out = await service.apply_mapping(
+    result = await service.apply_mapping(
         html="<html></html>",
         url="https://example.com",
         mapping_file_path=Path("/no/such/file.yarrrml"),
     )
-    assert out is None
+    assert result.graph is None
 
 
 @pytest.mark.asyncio
-async def test_apply_mapping_missing_dataset_uri_returns_none(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    service = RmlMappingService(_context(None))
-    monkeypatch.setattr(rml_module, "MaterializationPipeline", _Pipeline)
-    out = await service.apply_mapping(
+async def test_apply_mapping_missing_dataset_uri_returns_none() -> None:
+    service = RmlMappingService(_context(None), pipeline=_Pipeline())
+    result = await service.apply_mapping(
         html="<html></html>",
         url="https://example.com",
         mapping_file_path="x",
         mapping_content="m: 1",
     )
-    assert out is None
+    assert result.graph is None
 
 
 def test_normalize_schema_uris() -> None:
diff --git a/wordlift_sdk/kg_build/postprocessors/subprocess.py b/wordlift_sdk/kg_build/postprocessors/subprocess.py
index 52b1dfe..e5954af 100644
--- a/wordlift_sdk/kg_build/postprocessors/subprocess.py
+++ b/wordlift_sdk/kg_build/postprocessors/subprocess.py
@@ -16,9 +16,7 @@
 from rdflib import Graph
 
 from .types import (
-    Closeable,
     GraphPostprocessor,
-    LoadedPostprocessor,
     PostprocessorContext,
     PostprocessorRuntime,
     PostprocessorSpec,
@@ -275,6 +273,7 @@ def process_graph(
         self, graph: Graph, context: PostprocessorContext
     ) -> Graph | None:
         from .graph_io import _build_runner_payload
+
         return _run_subprocess(
             self.spec, self.root_dir, graph, _build_runner_payload(context), self._run
         )
@@ -315,6 +314,9 @@ def _run(
                 f"(exit={completed.returncode})" + (f"\n{stderr}" if stderr else "")
             )
 
+    def close(self) -> None:
+        pass  # oneshot processors have no persistent resources to release
+
 
 @dataclass
 class PersistentSubprocessPostprocessor:
@@ -335,6 +337,7 @@ def process_graph(
         self, graph: Graph, context: PostprocessorContext
     ) -> Graph | None:
         from .graph_io import _build_runner_payload
+
         return _run_subprocess(
             self.spec, self.root_dir, graph, _build_runner_payload(context), self._run
         )

From e193c5a1f7402d3f8d8d6f9bdf8536dd69b8a0d3 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 09:31:37 +0100
Subject: [PATCH 54/63] feat(tests): reorganise kg_build tests to mirror source
 package structure

---
 tests/kg_build/postprocessors/__init__.py                         | 0
 tests/kg_build/postprocessors/processors/__init__.py              | 0
 .../kg_build/{ => postprocessors/processors}/test_id_allocator.py | 0
 .../processors/test_id_generator.py}                              | 0
 .../{ => postprocessors/processors}/test_id_postprocessor.py      | 0
 .../test_oneshot_helpers.py}                                      | 0
 .../test_oneshot_main.py}                                         | 0
 .../test_persistent.py}                                           | 0
 tests/kg_build/{ => postprocessors}/test_postprocessors.py        | 0
 .../test_service.py}                                              | 0
 tests/{kg_build => workflow}/test_ingestion_bridge_url_handler.py | 0
 tests/{kg_build => workflow}/test_web_page_scrape_url_handler.py  | 0
 12 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/kg_build/postprocessors/__init__.py
 create mode 100644 tests/kg_build/postprocessors/processors/__init__.py
 rename tests/kg_build/{ => postprocessors/processors}/test_id_allocator.py (100%)
 rename tests/kg_build/{test_kg_build_id_generator.py => postprocessors/processors/test_id_generator.py} (100%)
 rename tests/kg_build/{ => postprocessors/processors}/test_id_postprocessor.py (100%)
 rename tests/kg_build/{test_postprocessor_runner_helpers.py => postprocessors/test_oneshot_helpers.py} (100%)
 rename tests/kg_build/{test_postprocessor_runner_main.py => postprocessors/test_oneshot_main.py} (100%)
 rename tests/kg_build/{test_postprocessor_worker.py => postprocessors/test_persistent.py} (100%)
 rename tests/kg_build/{ => postprocessors}/test_postprocessors.py (100%)
 rename tests/kg_build/{test_postprocessor_service.py => postprocessors/test_service.py} (100%)
 rename tests/{kg_build => workflow}/test_ingestion_bridge_url_handler.py (100%)
 rename tests/{kg_build => workflow}/test_web_page_scrape_url_handler.py (100%)

diff --git a/tests/kg_build/postprocessors/__init__.py b/tests/kg_build/postprocessors/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/kg_build/postprocessors/processors/__init__.py b/tests/kg_build/postprocessors/processors/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/kg_build/test_id_allocator.py b/tests/kg_build/postprocessors/processors/test_id_allocator.py
similarity index 100%
rename from tests/kg_build/test_id_allocator.py
rename to tests/kg_build/postprocessors/processors/test_id_allocator.py
diff --git a/tests/kg_build/test_kg_build_id_generator.py b/tests/kg_build/postprocessors/processors/test_id_generator.py
similarity index 100%
rename from tests/kg_build/test_kg_build_id_generator.py
rename to tests/kg_build/postprocessors/processors/test_id_generator.py
diff --git a/tests/kg_build/test_id_postprocessor.py b/tests/kg_build/postprocessors/processors/test_id_postprocessor.py
similarity index 100%
rename from tests/kg_build/test_id_postprocessor.py
rename to tests/kg_build/postprocessors/processors/test_id_postprocessor.py
diff --git a/tests/kg_build/test_postprocessor_runner_helpers.py b/tests/kg_build/postprocessors/test_oneshot_helpers.py
similarity index 100%
rename from tests/kg_build/test_postprocessor_runner_helpers.py
rename to tests/kg_build/postprocessors/test_oneshot_helpers.py
diff --git a/tests/kg_build/test_postprocessor_runner_main.py b/tests/kg_build/postprocessors/test_oneshot_main.py
similarity index 100%
rename from tests/kg_build/test_postprocessor_runner_main.py
rename to tests/kg_build/postprocessors/test_oneshot_main.py
diff --git a/tests/kg_build/test_postprocessor_worker.py b/tests/kg_build/postprocessors/test_persistent.py
similarity index 100%
rename from tests/kg_build/test_postprocessor_worker.py
rename to tests/kg_build/postprocessors/test_persistent.py
diff --git a/tests/kg_build/test_postprocessors.py b/tests/kg_build/postprocessors/test_postprocessors.py
similarity index 100%
rename from tests/kg_build/test_postprocessors.py
rename to tests/kg_build/postprocessors/test_postprocessors.py
diff --git a/tests/kg_build/test_postprocessor_service.py b/tests/kg_build/postprocessors/test_service.py
similarity index 100%
rename from tests/kg_build/test_postprocessor_service.py
rename to tests/kg_build/postprocessors/test_service.py
diff --git a/tests/kg_build/test_ingestion_bridge_url_handler.py b/tests/workflow/test_ingestion_bridge_url_handler.py
similarity index 100%
rename from tests/kg_build/test_ingestion_bridge_url_handler.py
rename to tests/workflow/test_ingestion_bridge_url_handler.py
diff --git a/tests/kg_build/test_web_page_scrape_url_handler.py b/tests/workflow/test_web_page_scrape_url_handler.py
similarity index 100%
rename from tests/kg_build/test_web_page_scrape_url_handler.py
rename to tests/workflow/test_web_page_scrape_url_handler.py

From db3aaf625cf9a1908170c67e97c63b4c599ecfb2 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 09:36:51 +0100
Subject: [PATCH 55/63] fix: update kg_build __init__ export paths after
 postprocessors reorganisation

---
 wordlift_sdk/kg_build/__init__.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/wordlift_sdk/kg_build/__init__.py b/wordlift_sdk/kg_build/__init__.py
index 1692a5e..3539b6f 100644
--- a/wordlift_sdk/kg_build/__init__.py
+++ b/wordlift_sdk/kg_build/__init__.py
@@ -64,13 +64,16 @@
         "wordlift_sdk.kg_build.container",
         "KgBuildApplicationContainer",
     ),
-    "IdAllocator": ("wordlift_sdk.kg_build.id_allocator", "IdAllocator"),
+    "IdAllocator": (
+        "wordlift_sdk.kg_build.postprocessors.processors.id_allocator",
+        "IdAllocator",
+    ),
     "CanonicalIdGenerator": (
-        "wordlift_sdk.kg_build.id_generator",
+        "wordlift_sdk.kg_build.postprocessors.processors.id_generator",
         "CanonicalIdGenerator",
     ),
     "CanonicalIdsPostprocessor": (
-        "wordlift_sdk.kg_build.id_postprocessor",
+        "wordlift_sdk.kg_build.postprocessors.processors.id_postprocessor",
         "CanonicalIdsPostprocessor",
     ),
     "IriLookup": ("wordlift_sdk.kg_build.iri_lookup", "IriLookup"),

From a546ea02250a15c7210ced214823da20fe6e3df8 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 09:57:15 +0100
Subject: [PATCH 56/63] fix: update engine tests to mock at pool level after
 morph_kgc moved to subprocess

---
 ...tructured_data_engine_validation_helpers.py | 18 +-----------------
 ..._structured_data_materialization_generic.py | 17 +++++++++++------
 2 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/tests/test_structured_data_engine_validation_helpers.py b/tests/test_structured_data_engine_validation_helpers.py
index 884b483..f3c7700 100644
--- a/tests/test_structured_data_engine_validation_helpers.py
+++ b/tests/test_structured_data_engine_validation_helpers.py
@@ -977,23 +977,7 @@ def test_normalize_agent_yarrrml_additional_parser_branches(monkeypatch):
     assert any(m["name"] == "main" for m in mappings)
 
 
-def test_materialize_graph_and_xpath_first_text_branches(monkeypatch):
-    real_import = builtins.__import__
-
-    def _missing_morph(name, *args, **kwargs):
-        if name == "morph_kgc":
-            raise ImportError("missing")
-        return real_import(name, *args, **kwargs)
-
-    monkeypatch.setattr(builtins, "__import__", _missing_morph)
-    try:
-        engine._materialize_graph(Path("mapping.yarrrml"))
-        assert False, "expected RuntimeError"
-    except RuntimeError as exc:
-        assert "morph-kgc is required" in str(exc)
-    finally:
-        monkeypatch.setattr(builtins, "__import__", real_import)
-
+def test_materialize_graph_and_xpath_first_text_branches():
     class _Doc:
         def __init__(self):
             self.calls = 0
diff --git a/tests/test_structured_data_materialization_generic.py b/tests/test_structured_data_materialization_generic.py
index 8d387f8..425f228 100644
--- a/tests/test_structured_data_materialization_generic.py
+++ b/tests/test_structured_data_materialization_generic.py
@@ -577,12 +577,17 @@ def test_unsupported_xpath_or_function_raises_actionable_error(
     tmp_path: Path,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    fake_morph = types.SimpleNamespace(
-        materialize=lambda _cfg: (_ for _ in ()).throw(
-            ValueError("XPathEvalError: Unsupported function local-namez()")
-        )
-    )
-    monkeypatch.setitem(sys.modules, "morph_kgc", fake_morph)
+    import wordlift_sdk.structured_data.engine as _engine
+
+    class _FakeFuture:
+        def result(self):
+            raise ValueError("XPathEvalError: Unsupported function local-namez()")
+
+    class _FakePool:
+        def submit(self, fn, *args, **kwargs):
+            return _FakeFuture()
+
+    monkeypatch.setattr(_engine, "_get_morph_kgc_pool", lambda: _FakePool())
 
     mapping = """
 prefixes:

From 6a32384c088bd8abe8d073eb7d41cff58d504dc0 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 12:54:58 +0100
Subject: [PATCH 57/63] fix(slicing): remap stdlib-only lazy exports to modules
 with real deps so the guard fires correctly

---
 wordlift_sdk/kg_build/__init__.py        | 2 +-
 wordlift_sdk/kg_build/protocol.py        | 1 +
 wordlift_sdk/structured_data/__init__.py | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/wordlift_sdk/kg_build/__init__.py b/wordlift_sdk/kg_build/__init__.py
index 3539b6f..49c260d 100644
--- a/wordlift_sdk/kg_build/__init__.py
+++ b/wordlift_sdk/kg_build/__init__.py
@@ -59,7 +59,7 @@
         "wordlift_sdk.kg_build.cloud_flow",
         "get_debug_output_dir",
     ),
-    "run_cloud_workflow": ("wordlift_sdk.kg_build.cloud_flow", "run_cloud_workflow"),
+    "run_cloud_workflow": ("wordlift_sdk.kg_build.protocol", "run_cloud_workflow"),
     "KgBuildApplicationContainer": (
         "wordlift_sdk.kg_build.container",
         "KgBuildApplicationContainer",
diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py
index 1295f64..d05b3e2 100644
--- a/wordlift_sdk/kg_build/protocol.py
+++ b/wordlift_sdk/kg_build/protocol.py
@@ -24,6 +24,7 @@
     ValidationOutcome,
 )
 
+from .cloud_flow import run_cloud_workflow as run_cloud_workflow  # noqa: F401
 from .config import ProfileDefinition
 from .entity_patcher import EntityPatcher
 from .kpi import KgBuildKpiCollector
diff --git a/wordlift_sdk/structured_data/__init__.py b/wordlift_sdk/structured_data/__init__.py
index 1e4d977..c5181c9 100644
--- a/wordlift_sdk/structured_data/__init__.py
+++ b/wordlift_sdk/structured_data/__init__.py
@@ -21,12 +21,12 @@
 
 
 _EXPORTS = {
-    "CreateRequest": ("wordlift_sdk.structured_data.models", "CreateRequest"),
+    "CreateRequest": ("wordlift_sdk.structured_data.orchestrator", "CreateRequest"),
     "CreateWorkflow": (
         "wordlift_sdk.structured_data.orchestrator",
         "CreateWorkflow",
     ),
-    "GenerateRequest": ("wordlift_sdk.structured_data.models", "GenerateRequest"),
+    "GenerateRequest": ("wordlift_sdk.structured_data.orchestrator", "GenerateRequest"),
     "GenerateWorkflow": (
         "wordlift_sdk.structured_data.orchestrator",
         "GenerateWorkflow",

From ae1e7e694df82e7d2bac886ce695e202d492861c Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 12:59:20 +0100
Subject: [PATCH 58/63] fix(tests): make lazy export tests slice-independent by
 stubbing heavy-dep modules

---
 tests/test_lazy_exports.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/tests/test_lazy_exports.py b/tests/test_lazy_exports.py
index 093c5c9..d7343ed 100644
--- a/tests/test_lazy_exports.py
+++ b/tests/test_lazy_exports.py
@@ -12,25 +12,51 @@ def _drop_modules(prefix: str) -> None:
             sys.modules.pop(name, None)
 
 
-def test_root_package_import_is_lazy():
+def test_root_package_import_is_lazy(monkeypatch: pytest.MonkeyPatch):
     _drop_modules("wordlift_sdk")
 
     package = importlib.import_module("wordlift_sdk")
 
     assert "wordlift_sdk.main" not in sys.modules
 
+    import types
+
+    stub_main = types.ModuleType("wordlift_sdk.main")
+    stub_main.run_kg_import_workflow = object()  # type: ignore[attr-defined]
+
+    def fake_import_module(name: str):
+        if name == "wordlift_sdk.main":
+            sys.modules["wordlift_sdk.main"] = stub_main
+            return stub_main
+        return importlib.import_module(name)
+
+    monkeypatch.setattr("wordlift_sdk._lazy_exports.import_module", fake_import_module)
+
     package.run_kg_import_workflow
 
     assert "wordlift_sdk.main" in sys.modules
 
 
-def test_feature_package_import_is_lazy():
+def test_feature_package_import_is_lazy(monkeypatch: pytest.MonkeyPatch):
     _drop_modules("wordlift_sdk.render")
 
     package = importlib.import_module("wordlift_sdk.render")
 
     assert "wordlift_sdk.render.html_renderer" not in sys.modules
 
+    import types
+
+    stub_renderer = types.ModuleType("wordlift_sdk.render.html_renderer")
+    stub_renderer.HtmlRenderer = object()  # type: ignore[attr-defined]
+
+    def fake_import_module(name: str):
+        if name == "wordlift_sdk.render.html_renderer":
+            sys.modules["wordlift_sdk.render.html_renderer"] = stub_renderer
+            return stub_renderer
+        return importlib.import_module(name)
+
+    monkeypatch.setattr("wordlift_sdk._lazy_exports.import_module", fake_import_module)
+
     package.HtmlRenderer
 
     assert "wordlift_sdk.render.html_renderer" in sys.modules

From c95d391a152acdbe4ee7cd198c0997468b78f928 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 13:05:59 +0100
Subject: [PATCH 59/63] fix(slicing): defer legacy import in
 create_google_search_console_data_import to avoid gql at collection time

---
 ..._google_search_console_data_import_helpers.py | 16 ++++++++++------
 .../create_google_search_console_data_import.py  |  3 ++-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/tests/test_google_search_console_data_import_helpers.py b/tests/test_google_search_console_data_import_helpers.py
index 025e148..e6601a1 100644
--- a/tests/test_google_search_console_data_import_helpers.py
+++ b/tests/test_google_search_console_data_import_helpers.py
@@ -2,6 +2,8 @@
 
 import asyncio
 import importlib
+import sys
+import types
 from datetime import datetime, timedelta
 from types import SimpleNamespace
 
@@ -16,6 +18,8 @@
     raise_error_if_account_analytics_not_configured,
 )
 
+_ENTITIES_MOD = "wordlift_sdk.deprecated.create_entities_with_top_query_dataframe"
+
 gsc_import_mod = importlib.import_module(
     "wordlift_sdk.google_search_console.create_google_search_console_data_import"
 )
@@ -43,9 +47,9 @@ async def test_create_google_search_console_data_import_only_imports_stale_rows(
     async def _fake_entities_df(key, url_list):
         return source_df
 
-    monkeypatch.setattr(
-        gsc_import_mod, "create_entities_with_top_query_dataframe", _fake_entities_df
-    )
+    stub = types.ModuleType(_ENTITIES_MOD)
+    stub.create_entities_with_top_query_dataframe = _fake_entities_df  # type: ignore[attr-defined]
+    monkeypatch.setitem(sys.modules, _ENTITIES_MOD, stub)
 
     called_urls: list[str] = []
 
@@ -100,9 +104,9 @@ async def test_create_google_search_console_data_import_skips_when_no_stale(
     async def _fake_entities_df(key, url_list):
         return source_df
 
-    monkeypatch.setattr(
-        gsc_import_mod, "create_entities_with_top_query_dataframe", _fake_entities_df
-    )
+    stub = types.ModuleType(_ENTITIES_MOD)
+    stub.create_entities_with_top_query_dataframe = _fake_entities_df  # type: ignore[attr-defined]
+    monkeypatch.setitem(sys.modules, _ENTITIES_MOD, stub)
 
     calls: dict[str, int] = {"gather": 0}
 
diff --git a/wordlift_sdk/google_search_console/create_google_search_console_data_import.py b/wordlift_sdk/google_search_console/create_google_search_console_data_import.py
index 00bd0fc..41f566d 100644
--- a/wordlift_sdk/google_search_console/create_google_search_console_data_import.py
+++ b/wordlift_sdk/google_search_console/create_google_search_console_data_import.py
@@ -9,7 +9,6 @@
 from twisted.mail.scripts.mailmail import Configuration
 from wordlift_client import AnalyticsImportRequest
 
-from ..deprecated import create_entities_with_top_query_dataframe
 from ..utils import create_delayed
 
 logger = logging.getLogger(__name__)
@@ -19,6 +18,8 @@ async def create_google_search_console_data_import(
     configuration: Configuration, key: str, url_list: list[str]
 ) -> None:
     # Get the entities data with the top query.
+    from ..deprecated import create_entities_with_top_query_dataframe
+
     entities_with_top_query_df = await create_entities_with_top_query_dataframe(
         key=key, url_list=url_list
     )

From 21d2408d0b1d7b2611010fad567941a9cdb2f137 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 13:08:50 +0100
Subject: [PATCH 60/63] =?UTF-8?q?fix(slicing):=20remove=20test=5Fingestion?=
 =?UTF-8?q?=5Fsource=5Fbridge=20from=20ingestion=20slice=20=E2=80=94=20nee?=
 =?UTF-8?q?ds=20legacy=20(gql)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/tools/run_slice_tests.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/tools/run_slice_tests.py b/tests/tools/run_slice_tests.py
index 72e3111..d2ef093 100644
--- a/tests/tools/run_slice_tests.py
+++ b/tests/tools/run_slice_tests.py
@@ -41,7 +41,6 @@
         "tests/ingestion",
         "tests/test_google_sheets_url_provider.py",
         "tests/test_list_url_provider.py",
-        "tests/test_ingestion_source_bridge.py",
         "tests/url_provider/test_sitemap_url_provider.py",
     ],
     "structured-data": [

From 5094647307d6a7e827bf47b8288cbea2b9fa6e6b Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 14:00:07 +0100
Subject: [PATCH 61/63] fix(slicing): add python-liquid to workflow extra
 (needed by graph.ttl_liquid)

---
 poetry.lock    | 22 ++++++++++++++++------
 pyproject.toml |  1 +
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index f0bc229..b3ac36a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.3.1 and should not be changed by hand.
 
 [[package]]
 name = "advertools"
@@ -288,7 +288,7 @@ description = "Internationalization utilities"
 optional = true
 python-versions = ">=3.8"
 groups = ["main"]
-markers = "extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\" or extra == \"ingestion\""
+markers = "extra == \"workflow\" or extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\" or extra == \"ingestion\""
 files = [
     {file = "babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35"},
     {file = "babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d"},
@@ -1702,8 +1702,11 @@ files = [
     {file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"},
     {file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"},
     {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"},
+    {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0"},
     {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"},
+    {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e"},
     {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"},
+    {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188"},
     {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"},
     {file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"},
     {file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"},
@@ -1788,7 +1791,7 @@ description = "Safely add untrusted strings to HTML/XML markup."
 optional = true
 python-versions = ">=3.9"
 groups = ["main"]
-markers = "extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\""
+markers = "extra == \"workflow\" or extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\""
 files = [
     {file = "markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559"},
     {file = "markupsafe-3.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c1493fb6e50ab01d20a22826e57520f1284df32f2d8601fdd90b6304601419"},
@@ -3185,7 +3188,7 @@ description = "A Python engine for the Liquid template language."
 optional = true
 python-versions = ">=3.7"
 groups = ["main"]
-markers = "extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\""
+markers = "extra == \"workflow\" or extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\""
 files = [
     {file = "python_liquid-2.1.0-py3-none-any.whl", hash = "sha256:d3bbcddff4e1a73287b59218df3471613598271e69ac3d17d97e000f4b984e3e"},
     {file = "python_liquid-2.1.0.tar.gz", hash = "sha256:a4c2abb24ac40ded8c9ba844ebbfbe78a3e41c6fe10a7bbe94144582569b73d0"},
@@ -3249,6 +3252,13 @@ optional = false
 python-versions = ">=3.8"
 groups = ["dev"]
 files = [
+    {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"},
+    {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"},
+    {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"},
+    {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"},
     {file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"},
     {file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"},
     {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"},
@@ -4345,9 +4355,9 @@ legacy = ["google-auth", "gql", "gspread", "lxml", "pandas", "playwright", "pyco
 render = ["lxml", "playwright"]
 structured-data = ["advertools", "lxml", "morph-kgc", "playwright", "pyshacl", "rdflib", "requests", "tqdm"]
 validation = ["pyshacl", "rdflib", "requests", "tqdm"]
-workflow = ["advertools", "google-auth", "gql", "gspread", "lxml", "pandas", "playwright", "pydantic-core", "rdflib", "tqdm"]
+workflow = ["advertools", "google-auth", "gql", "gspread", "lxml", "pandas", "playwright", "pydantic-core", "python-liquid", "rdflib", "tqdm"]
 
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10, <3.15"
-content-hash = "0810a8470047131214fc3655380b14044bb11660895b114d5f61fc0e0263d1bc"
+content-hash = "a119ca316866d292b70b03bb5e509c4eded8fb9d581ed3c5e541961e6aee98a8"
diff --git a/pyproject.toml b/pyproject.toml
index 96799a1..548349c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,6 +81,7 @@ workflow = [
     "pandas",
     "playwright",
     "pydantic-core",
+    "python-liquid",
     "rdflib",
     "tqdm",
 ]

From 2c14ab6fcbadafad62f76610a373f25781679912 Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 14:47:10 +0100
Subject: [PATCH 62/63] fix: v7 integration fixes

---
 tests/test_dataset_resolver.py                | 15 ++++----
 tests/test_lazy_exports.py                    | 14 ++++++++
 ...chant_listing_defined_region_validation.py | 23 ++----------
 tests/test_product_snippet_validation.py      | 23 ++----------
 tests/test_recommended_one_of_validation.py   | 27 ++------------
 tests/test_structured_data_engine_class.py    | 15 ++++----
 ...structured_data_materialization_generic.py | 15 ++++----
 tests/test_structured_data_workflows.py       | 15 ++++----
 .../test_ingestion_bridge_url_handler.py      | 22 ++++++++----
 wordlift_sdk/graph/audit/_entity_matrix.py    |  2 +-
 wordlift_sdk/kg_build/__init__.py             | 20 +++++------
 wordlift_sdk/utils/__init__.py                |  4 +--
 wordlift_sdk/utils/{get_me.py => _get_me.py}  |  0
 .../utils/{reset_me.py => _reset_me.py}       |  0
 wordlift_sdk/validation/__init__.py           | 15 ++++++++
 .../validation/shacl_validation_service.py    | 35 +++++++------------
 16 files changed, 107 insertions(+), 138 deletions(-)
 rename wordlift_sdk/utils/{get_me.py => _get_me.py} (100%)
 rename wordlift_sdk/utils/{reset_me.py => _reset_me.py} (100%)

diff --git a/tests/test_dataset_resolver.py b/tests/test_dataset_resolver.py
index e42f838..03c945b 100644
--- a/tests/test_dataset_resolver.py
+++ b/tests/test_dataset_resolver.py
@@ -46,15 +46,16 @@ def __init__(self, *args, **kwargs) -> None:
         sys.modules.setdefault("wordlift_client.models", _models_module)
         sys.modules.setdefault("wordlift_client.models.ask_request", _ask_module)
 
-_pyshacl = types.ModuleType("pyshacl")
+try:
+    import pyshacl as _pyshacl_real  # noqa: F401
+except ImportError:
+    _pyshacl = types.ModuleType("pyshacl")
 
+    def _stub_validate(*_args, **_kwargs):
+        return None, None, None
 
-def _stub_validate(*_args, **_kwargs):
-    return None, None, None
-
-
-_pyshacl.validate = _stub_validate
-sys.modules.setdefault("pyshacl", _pyshacl)
+    _pyshacl.validate = _stub_validate
+    sys.modules["pyshacl"] = _pyshacl
 
 from wordlift_sdk.structured_data.dataset_resolver import DatasetResolver  # noqa: E402
 
diff --git a/tests/test_lazy_exports.py b/tests/test_lazy_exports.py
index d7343ed..e7f020f 100644
--- a/tests/test_lazy_exports.py
+++ b/tests/test_lazy_exports.py
@@ -6,8 +6,22 @@
 import pytest
 
 
+# Modules that own ProcessPoolExecutors must not be evicted — dropping them
+# causes function-identity mismatches when the pool tries to pickle workers.
+_PRESERVE_MODULES = frozenset(
+    [
+        "wordlift_sdk.structured_data.engine",
+        "wordlift_sdk.validation.shacl_validation_service",
+        "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler",
+        "wordlift_sdk.workflow.url_handler.web_page_scrape_url_handler",
+    ]
+)
+
+
 def _drop_modules(prefix: str) -> None:
     for name in list(sys.modules):
+        if name in _PRESERVE_MODULES:
+            continue
         if name == prefix or name.startswith(f"{prefix}."):
             sys.modules.pop(name, None)
 
diff --git a/tests/test_merchant_listing_defined_region_validation.py b/tests/test_merchant_listing_defined_region_validation.py
index d17c799..43d9474 100644
--- a/tests/test_merchant_listing_defined_region_validation.py
+++ b/tests/test_merchant_listing_defined_region_validation.py
@@ -1,30 +1,13 @@
-import importlib.util
 import json
-import sys
 from pathlib import Path
 
-import pytest
-
 from wordlift_sdk.validation import shacl
 from wordlift_sdk.validation.shacl import extract_validation_issues
 
 
-def _load_real_validate(monkeypatch: pytest.MonkeyPatch):
-    if "pyshacl" in sys.modules:
-        monkeypatch.delitem(sys.modules, "pyshacl", raising=False)
-    spec = importlib.util.find_spec("pyshacl")
-    if spec is None or spec.loader is None:
-        raise RuntimeError("pyshacl is required for this test.")
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module.validate
-
-
 def test_merchant_listing_defined_region_address_country_only_is_warning_only(
-    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    tmp_path: Path,
 ) -> None:
-    monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch))
-
     payload = {
         "@context": {"@vocab": "http://schema.org/"},
         "@id": "https://data.wordlift.io/wl1506344/merchant-return-policys/shipping-policy/offer-shipping-details/offer-shipping-details-1/defined-regions/defined-region",
@@ -51,10 +34,8 @@ def test_merchant_listing_defined_region_address_country_only_is_warning_only(
 
 
 def test_defined_region_address_country_only_conforms_with_default_shapes(
-    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    tmp_path: Path,
 ) -> None:
-    monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch))
-
     payload = {
         "@context": {"@vocab": "http://schema.org/"},
         "@id": "https://data.wordlift.io/wl1506344/merchant-return-policys/shipping-policy/offer-shipping-details/offer-shipping-details-1/defined-regions/defined-region",
diff --git a/tests/test_product_snippet_validation.py b/tests/test_product_snippet_validation.py
index 9054125..b202ba1 100644
--- a/tests/test_product_snippet_validation.py
+++ b/tests/test_product_snippet_validation.py
@@ -1,29 +1,12 @@
 from pathlib import Path
-import importlib.util
 import json
-import sys
-
-import pytest
 
 from wordlift_sdk.validation import shacl
 
 
-def _load_real_validate(monkeypatch: pytest.MonkeyPatch):
-    if "pyshacl" in sys.modules:
-        monkeypatch.delitem(sys.modules, "pyshacl", raising=False)
-    spec = importlib.util.find_spec("pyshacl")
-    if spec is None or spec.loader is None:
-        raise RuntimeError("pyshacl is required for this test.")
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module.validate
-
-
 def test_product_snippet_offers_satisfies_one_of(
-    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    tmp_path: Path,
 ) -> None:
-    monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch))
-
     fixture = Path("tests/fixtures/product_snippet_offers.jsonld")
     data = json.loads(fixture.read_text(encoding="utf-8"))
 
@@ -43,10 +26,8 @@ def test_product_snippet_offers_satisfies_one_of(
 
 
 def test_product_snippet_aggregate_offer_satisfies_one_of(
-    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    tmp_path: Path,
 ) -> None:
-    monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch))
-
     fixture = Path("tests/fixtures/product_snippet_aggregate_offer.jsonld")
     data = json.loads(fixture.read_text(encoding="utf-8"))
 
diff --git a/tests/test_recommended_one_of_validation.py b/tests/test_recommended_one_of_validation.py
index f4c89c2..29ce149 100644
--- a/tests/test_recommended_one_of_validation.py
+++ b/tests/test_recommended_one_of_validation.py
@@ -1,25 +1,10 @@
-import importlib.util
 import json
-import sys
 from pathlib import Path
 
-import pytest
-
 from wordlift_sdk.validation import shacl
 from wordlift_sdk.validation.shacl import extract_validation_issues
 
 
-def _load_real_validate(monkeypatch: pytest.MonkeyPatch):
-    if "pyshacl" in sys.modules:
-        monkeypatch.delitem(sys.modules, "pyshacl", raising=False)
-    spec = importlib.util.find_spec("pyshacl")
-    if spec is None or spec.loader is None:
-        raise RuntimeError("pyshacl is required for this test.")
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module.validate
-
-
 def _write_jsonld(tmp_path: Path, name: str, payload: dict) -> Path:
     path = tmp_path / name
     path.write_text(json.dumps(payload), encoding="utf-8")
@@ -31,10 +16,8 @@ def _messages_for(result) -> list[str]:
 
 
 def test_dataset_recommended_either_or_is_warning_only(
-    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    tmp_path: Path,
 ) -> None:
-    monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch))
-
     missing_payload = {
         "@context": {"@vocab": "http://schema.org/"},
         "@type": "Dataset",
@@ -74,10 +57,8 @@ def test_dataset_recommended_either_or_is_warning_only(
 
 
 def test_offer_shipping_details_recommended_either_or_is_warning_only(
-    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    tmp_path: Path,
 ) -> None:
-    monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch))
-
     missing_payload = {
         "@context": {"@vocab": "http://schema.org/"},
         "@type": "OfferShippingDetails",
@@ -116,10 +97,8 @@ def test_offer_shipping_details_recommended_either_or_is_warning_only(
 
 
 def test_product_offer_price_currency_recommended_either_or_is_warning_only(
-    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    tmp_path: Path,
 ) -> None:
-    monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch))
-
     missing_payload = {
         "@context": {"@vocab": "http://schema.org/"},
         "@type": "Product",
diff --git a/tests/test_structured_data_engine_class.py b/tests/test_structured_data_engine_class.py
index f4a272e..b208a78 100644
--- a/tests/test_structured_data_engine_class.py
+++ b/tests/test_structured_data_engine_class.py
@@ -44,15 +44,16 @@ def __init__(self, *args, **kwargs) -> None:
         sys.modules.setdefault("wordlift_client.models", _models_module)
         sys.modules.setdefault("wordlift_client.models.ask_request", _ask_module)
 
-_pyshacl = types.ModuleType("pyshacl")
+try:
+    import pyshacl as _pyshacl_real  # noqa: F401
+except ImportError:
+    _pyshacl = types.ModuleType("pyshacl")
 
+    def _stub_validate(*_args, **_kwargs):
+        return None, None, None
 
-def _stub_validate(*_args, **_kwargs):
-    return None, None, None
-
-
-_pyshacl.validate = _stub_validate
-sys.modules.setdefault("pyshacl", _pyshacl)
+    _pyshacl.validate = _stub_validate
+    sys.modules["pyshacl"] = _pyshacl
 
 from wordlift_sdk.structured_data.structured_data_engine import (  # noqa: E402
     StructuredDataEngine,
diff --git a/tests/test_structured_data_materialization_generic.py b/tests/test_structured_data_materialization_generic.py
index 425f228..f4f492b 100644
--- a/tests/test_structured_data_materialization_generic.py
+++ b/tests/test_structured_data_materialization_generic.py
@@ -50,15 +50,16 @@ def __init__(self, *args, **kwargs) -> None:
         sys.modules.setdefault("wordlift_client.models", _models_module)
         sys.modules.setdefault("wordlift_client.models.ask_request", _ask_module)
 
-_pyshacl = types.ModuleType("pyshacl")
+try:
+    import pyshacl as _pyshacl_real  # noqa: F401
+except ImportError:
+    _pyshacl = types.ModuleType("pyshacl")
 
+    def _stub_validate(*_args, **_kwargs):
+        return None, None, None
 
-def _stub_validate(*_args, **_kwargs):
-    return None, None, None
-
-
-_pyshacl.validate = _stub_validate
-sys.modules.setdefault("pyshacl", _pyshacl)
+    _pyshacl.validate = _stub_validate
+    sys.modules["pyshacl"] = _pyshacl
 
 from wordlift_sdk.structured_data.engine import (  # noqa: E402
     materialize_yarrrml_jsonld,
diff --git a/tests/test_structured_data_workflows.py b/tests/test_structured_data_workflows.py
index 373802c..0b7ecad 100644
--- a/tests/test_structured_data_workflows.py
+++ b/tests/test_structured_data_workflows.py
@@ -52,15 +52,16 @@ def __init__(self, *args, **kwargs) -> None:
         sys.modules.setdefault("wordlift_client.models", _models_module)
         sys.modules.setdefault("wordlift_client.models.ask_request", _ask_module)
 
-_pyshacl = types.ModuleType("pyshacl")
+try:
+    import pyshacl as _pyshacl_real  # noqa: F401
+except ImportError:
+    _pyshacl = types.ModuleType("pyshacl")
 
+    def _stub_validate(*_args, **_kwargs):
+        return None, None, None
 
-def _stub_validate(*_args, **_kwargs):
-    return None, None, None
-
-
-_pyshacl.validate = _stub_validate
-sys.modules.setdefault("pyshacl", _pyshacl)
+    _pyshacl.validate = _stub_validate
+    sys.modules["pyshacl"] = _pyshacl
 
 from wordlift_sdk.structured_data import (  # noqa: E402
     CreateRequest,
diff --git a/tests/workflow/test_ingestion_bridge_url_handler.py b/tests/workflow/test_ingestion_bridge_url_handler.py
index 60fe267..5ad4fca 100644
--- a/tests/workflow/test_ingestion_bridge_url_handler.py
+++ b/tests/workflow/test_ingestion_bridge_url_handler.py
@@ -6,6 +6,7 @@
 
 import pytest
 
+import wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler as _handler_mod
 from wordlift_sdk.ingestion.errors import LoaderRuntimeError
 from wordlift_sdk.ingestion.loaders import PlaywrightLoaderAdapter
 from wordlift_sdk.url_source import Url
@@ -37,7 +38,8 @@ async def test_ingestion_bridge_handler_calls_callback(
     )
 
     monkeypatch.setattr(
-        "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion",
+        _handler_mod,
+        "run_ingestion",
         lambda settings: SimpleNamespace(
             pages=[
                 SimpleNamespace(
@@ -88,7 +90,8 @@ async def test_ingestion_bridge_handler_raises_on_failed_ingestion(
     )
 
     monkeypatch.setattr(
-        "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion",
+        _handler_mod,
+        "run_ingestion",
         lambda settings: SimpleNamespace(
             pages=[],
             events=[
@@ -124,7 +127,8 @@ async def test_ingestion_bridge_handler_raises_and_skips_callback_on_http_404(
     )
 
     monkeypatch.setattr(
-        "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion",
+        _handler_mod,
+        "run_ingestion",
         lambda settings: SimpleNamespace(
             pages=[
                 SimpleNamespace(
@@ -164,7 +168,8 @@ async def test_ingestion_bridge_handler_raises_and_skips_callback_on_http_500(
     )
 
     monkeypatch.setattr(
-        "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion",
+        _handler_mod,
+        "run_ingestion",
         lambda settings: SimpleNamespace(
             pages=[
                 SimpleNamespace(
@@ -205,7 +210,8 @@ async def test_ingestion_bridge_handler_surfaces_failed_meta_diagnostics(
     caplog.set_level("ERROR")
 
     monkeypatch.setattr(
-        "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion",
+        _handler_mod,
+        "run_ingestion",
         lambda settings: SimpleNamespace(
             pages=[],
             events=[
@@ -265,7 +271,8 @@ async def test_ingestion_bridge_handler_meta_fallback_keeps_old_message(
     )
 
     monkeypatch.setattr(
-        "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion",
+        _handler_mod,
+        "run_ingestion",
         lambda settings: SimpleNamespace(
             pages=[],
             events=[
@@ -306,7 +313,8 @@ async def test_ingestion_bridge_handler_truncates_diagnostics_payload(
 
     long_message = "token=abc123 " + ("x" * 10000)
     monkeypatch.setattr(
-        "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion",
+        _handler_mod,
+        "run_ingestion",
         lambda settings: SimpleNamespace(
             pages=[],
             events=[
diff --git a/wordlift_sdk/graph/audit/_entity_matrix.py b/wordlift_sdk/graph/audit/_entity_matrix.py
index 23b3048..1dab968 100644
--- a/wordlift_sdk/graph/audit/_entity_matrix.py
+++ b/wordlift_sdk/graph/audit/_entity_matrix.py
@@ -15,7 +15,7 @@
     _find_webpage_urls,
 )
 from wordlift_sdk.validation.shacl import (
-    normalize_schema_org_uris,
+    _normalize_schema_org_uris as normalize_schema_org_uris,  # type: ignore[attr-defined]
 )
 
 _SCHEMA_ORG_PREFIXES = ("http://schema.org/", "https://schema.org/")
diff --git a/wordlift_sdk/kg_build/__init__.py b/wordlift_sdk/kg_build/__init__.py
index 49c260d..17c9c4d 100644
--- a/wordlift_sdk/kg_build/__init__.py
+++ b/wordlift_sdk/kg_build/__init__.py
@@ -1,8 +1,6 @@
 from __future__ import annotations
 
-from importlib import import_module
-from typing import Any
-
+from .._lazy_exports import resolve_attr
 
 __all__ = [
     "ProfileConfig",
@@ -142,12 +140,10 @@
 }
 
 
-def __getattr__(name: str) -> Any:
-    target = _EXPORTS.get(name)
-    if target is None:
-        raise AttributeError(
-            f"module 'wordlift_sdk.kg_build' has no attribute '{name}'"
-        )
-    module_name, attr_name = target
-    module = import_module(module_name)
-    return getattr(module, attr_name)
+def __getattr__(name: str):
+    return resolve_attr(
+        name=name,
+        module_name="wordlift_sdk.kg_build",
+        exports=_EXPORTS,
+        extra="kg-build",
+    )
diff --git a/wordlift_sdk/utils/__init__.py b/wordlift_sdk/utils/__init__.py
index df6297e..18d34b3 100644
--- a/wordlift_sdk/utils/__init__.py
+++ b/wordlift_sdk/utils/__init__.py
@@ -36,8 +36,8 @@
         "create_entity_patch_request",
     ),
     "create_delayed": ("wordlift_sdk.utils.delayed", "create_delayed"),
-    "get_me": ("wordlift_sdk.utils.get_me", "get_me"),
-    "reset_me": ("wordlift_sdk.utils.reset_me", "reset_me"),
+    "get_me": ("wordlift_sdk.utils._get_me", "get_me"),
+    "reset_me": ("wordlift_sdk.utils._reset_me", "reset_me"),
     "HtmlConverter": ("wordlift_sdk.utils.html_converter", "HtmlConverter"),
     "AutoConcurrencyController": (
         "wordlift_sdk.utils.auto_concurrency",
diff --git a/wordlift_sdk/utils/get_me.py b/wordlift_sdk/utils/_get_me.py
similarity index 100%
rename from wordlift_sdk/utils/get_me.py
rename to wordlift_sdk/utils/_get_me.py
diff --git a/wordlift_sdk/utils/reset_me.py b/wordlift_sdk/utils/_reset_me.py
similarity index 100%
rename from wordlift_sdk/utils/reset_me.py
rename to wordlift_sdk/utils/_reset_me.py
diff --git a/wordlift_sdk/validation/__init__.py b/wordlift_sdk/validation/__init__.py
index bcc616f..40701c1 100644
--- a/wordlift_sdk/validation/__init__.py
+++ b/wordlift_sdk/validation/__init__.py
@@ -17,6 +17,9 @@
     "prepare_shapes",
     "validate_file",
     "validate_jsonld_from_url",
+    "ShaclValidationService",
+    "ValidationMode",
+    "ValidationOutcome",
 ]
 
 
@@ -51,6 +54,18 @@
         "wordlift_sdk.validation.shacl",
         "validate_jsonld_from_url",
     ),
+    "ShaclValidationService": (
+        "wordlift_sdk.validation.shacl_validation_service",
+        "ShaclValidationService",
+    ),
+    "ValidationMode": (
+        "wordlift_sdk.validation.shacl_validation_service",
+        "ValidationMode",
+    ),
+    "ValidationOutcome": (
+        "wordlift_sdk.validation.shacl_validation_service",
+        "ValidationOutcome",
+    ),
 }
 
 
diff --git a/wordlift_sdk/validation/shacl_validation_service.py b/wordlift_sdk/validation/shacl_validation_service.py
index 60e01f7..7ad8499 100644
--- a/wordlift_sdk/validation/shacl_validation_service.py
+++ b/wordlift_sdk/validation/shacl_validation_service.py
@@ -10,11 +10,10 @@
 from enum import Enum
 from typing import Any
 
-from pyshacl import validate as pyshacl_validate
 from rdflib import Graph
 from rdflib.namespace import SH
 
-from wordlift_sdk.validation.shacl import load_shapes_graph, normalize_schema_org_uris
+from wordlift_sdk.validation.shacl import PreparedShaclValidator
 
 logger = logging.getLogger(__name__)
 
@@ -29,13 +28,12 @@ class ValidationMode(str, Enum):
 
 # Module-level worker state — one copy per subprocess, initialised by _init_worker.
 # Must be module-level for picklability by ProcessPoolExecutor.
-_worker_shapes_graph: Graph | None = None
-_worker_source_map: dict = {}
+_worker_validator: PreparedShaclValidator | None = None
 
 
 def _init_worker(shape_specs: list[str] | None) -> None:
-    global _worker_shapes_graph, _worker_source_map
-    _worker_shapes_graph, _worker_source_map = load_shapes_graph(shape_specs)
+    global _worker_validator
+    _worker_validator = PreparedShaclValidator.from_shape_specs(shape_specs)
 
 
 def _validate_in_worker(ntriples: str, submit_time: float) -> dict:
@@ -44,30 +42,23 @@ def _validate_in_worker(ntriples: str, submit_time: float) -> dict:
 
     data_graph = Graph()
     data_graph.parse(data=ntriples, format="nt")
-    data_graph = normalize_schema_org_uris(data_graph)
 
-    conforms, report_graph, _ = pyshacl_validate(
-        data_graph,
-        shacl_graph=_worker_shapes_graph,
-        inference="rdfs",
-        abort_on_first=False,
-        allow_infos=True,
-        allow_warnings=True,
-    )
+    result = _worker_validator.validate_graph(data_graph)
+    source_map = _worker_validator.prepared_shapes.shape_source_map
 
     warning_sources: dict[str, int] = {}
     error_sources: dict[str, int] = {}
-    for node in report_graph.subjects(SH.resultSeverity, SH.Warning):
-        shape = next(report_graph.objects(node, SH.sourceShape), None)
-        label = _worker_source_map.get(shape, "unknown")
+    for node in result.report_graph.subjects(SH.resultSeverity, SH.Warning):
+        shape = next(result.report_graph.objects(node, SH.sourceShape), None)
+        label = source_map.get(shape, "unknown")
         warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1
-    for node in report_graph.subjects(SH.resultSeverity, SH.Violation):
-        shape = next(report_graph.objects(node, SH.sourceShape), None)
-        label = _worker_source_map.get(shape, "unknown")
+    for node in result.report_graph.subjects(SH.resultSeverity, SH.Violation):
+        shape = next(result.report_graph.objects(node, SH.sourceShape), None)
+        label = source_map.get(shape, "unknown")
         error_sources[str(label)] = error_sources.get(str(label), 0) + 1
 
     return {
-        "passed": bool(conforms),
+        "passed": bool(result.conforms),
         "warning_sources": dict(sorted(warning_sources.items())),
         "error_sources": dict(sorted(error_sources.items())),
         "queue_wait_ms": queue_wait_ms,

From d012f3da36b1f8e67bd6e59879f1c3aa6a3dae4e Mon Sep 17 00:00:00 2001
From: Rubens Panfili <rubens.panfili@gmail.com>
Date: Fri, 20 Mar 2026 17:59:40 +0100
Subject: [PATCH 63/63] chore: bump to v8.0.0

---
 CHANGELOG.md   | 35 +++++++++++++++++++++++++++++++++++
 pyproject.toml |  2 +-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 11327be..7a83011 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,40 @@
 # Changelog
 
+## 8.0.0 - 2026-03-20
+
+### Breaking
+
+- `kg_build` postprocessor subprocess entry points renamed:
+  - `runner.py` → `oneshot.py`
+  - `worker.py` → `persistent.py`
+- `SubprocessPostprocessor` split into `OneshotPostprocessor` and `PersistentPostprocessor`; any host code referencing the old class name must be updated.
+- `PostprocessorService` is now profile-agnostic; profile resolution no longer happens inside the service.
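+- `utils.get_me` / `utils.reset_me` module files renamed to `_get_me.py` / `_reset_me.py`; `get_me` and `reset_me` are still exported from `wordlift_sdk.utils`, but direct submodule imports (not recommended; e.g. `from wordlift_sdk.utils.get_me import get_me`) must be updated.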
+
+### Added
+
+- `ShaclValidationService` — runs SHACL validation in a dedicated process pool via `PreparedShaclValidator`, wired into `ProfileImportProtocol`.
+- Separate pool-size settings for postprocessors and SHACL validation.
+- In-process postprocessor runtime (`inprocess`) for single-process execution.
+- SHACL process-pool queue-wait and execution-time tracking in timing logs.
+- `morph_kgc` subprocess pool for true RML-mapping parallelism, bypassing `pyparsing` lock contention and the GIL.
+  - Configurable pool size via `morph_kgc_pool_size` / `MORPH_KGC_POOL_SIZE`.
+  - Subprocess queue-wait tracked separately in timing logs.
+- `PostprocessorResult` dataclass — replaces implicit tuple return from postprocessing stage.
+- `ImportAnnotationPostprocessor` and `RootIdReconcilerPostprocessor` extracted as named processors.
+- `first_level_subjects` graph utility helper.
+- Slice verification tooling extended with `run_slice_smoke_imports.py` and `run_slice_tests.py`.
+
+### Changed
+
+- Postprocessors reorganised into `postprocessors/` subpackage (`processors/`, `PostprocessorService`, loader helpers).
+- `ProfileImportProtocol.__init__` decomposed into focused `_init_*` factory methods; class surface significantly reduced.
+- `morph_kgc` RML mapping stage runs in subprocess pool instead of a thread executor.
+- SHACL validation and postprocessors offloaded to dedicated thread/process pools; ingestion runs in an executor to avoid blocking the event loop.
+- Persistent `ApiClient` reused across requests instead of one per graph; `ApiClient` is closed on protocol shutdown.
+- Lazy-export guards remapped to modules with real third-party dependencies so `ModuleNotFoundError` is raised correctly when an extra is absent.
+- `python-liquid` added to the `workflow` extra (required by `graph.ttl_liquid`).
+
 ## 7.0.0 - 2026-03-15
 
 ### Breaking
diff --git a/pyproject.toml b/pyproject.toml
index 548349c..3d7ee87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "wordlift-sdk"
-version = "7.0.1"
+version = "8.0.0"
 description = "Python toolkit for orchestrating WordLift imports and structured data workflows."
 authors = ["David Riccitelli <david@wordlift.io>"]
 readme = "README.md"