From 69d8c409869bdfd7b003d0ad6069a2e6f4f945fa Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 5 Mar 2026 17:56:03 +0100 Subject: [PATCH 01/63] chore: vscode excluded by gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 38d49af..2d0c882 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ __pycache__/ dist/ .claude/ + +.vscode/ From 5e9499df2765940a0a7bb3d78586ca68f5fdc8cf Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Tue, 17 Mar 2026 16:01:25 +0100 Subject: [PATCH 02/63] fix: run ingestion in executor to avoid blocking event loop --- .../url_handler/ingestion_web_page_scrape_url_handler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wordlift_sdk/workflow/url_handler/ingestion_web_page_scrape_url_handler.py b/wordlift_sdk/workflow/url_handler/ingestion_web_page_scrape_url_handler.py index a228e26..6c7a719 100644 --- a/wordlift_sdk/workflow/url_handler/ingestion_web_page_scrape_url_handler.py +++ b/wordlift_sdk/workflow/url_handler/ingestion_web_page_scrape_url_handler.py @@ -1,5 +1,7 @@ from __future__ import annotations +import asyncio +import functools import json import logging import re @@ -43,7 +45,10 @@ def __init__( async def __call__(self, url: Url) -> None: settings = self._build_settings(url) - result = run_ingestion(settings) + loop = asyncio.get_event_loop() + result = await loop.run_in_executor( + None, functools.partial(run_ingestion, settings) + ) if not result.pages: failed = [ From 87d5a149f2a047c5baa2baa4adf8db1ebaf8e2a4 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Tue, 17 Mar 2026 16:05:45 +0100 Subject: [PATCH 03/63] fix: offload postprocessors and validation to executor to prevent blocking event loop --- wordlift_sdk/kg_build/protocol.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index fd8ddc0..6bf88b8 100644 --- 
a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import functools import hashlib import logging import os @@ -95,6 +96,7 @@ def __init__( self._mapping_cache: dict[Path, str] = {} self._static_templates_patched = False self._static_templates_lock = asyncio.Lock() + self._postprocessor_lock = asyncio.Lock() canonical_id_strategy = ( str( self.profile.settings.get( @@ -209,7 +211,18 @@ async def callback( if existing_web_page_id: self._reconcile_root_id(graph, existing_web_page_id) - graph = self._apply_postprocessors(graph, url, response, existing_web_page_id) + loop = asyncio.get_event_loop() + async with self._postprocessor_lock: + graph = await loop.run_in_executor( + None, + functools.partial( + self._apply_postprocessors, + graph, + url, + response, + existing_web_page_id, + ), + ) # Canonical IDs must run after custom postprocessors so any nodes minted # by local logic are normalized before graph sync patching. 
graph = self._core_ids.process_graph( @@ -225,7 +238,9 @@ async def callback( ) self._write_debug_graph(graph, url) - validation_payload = self._validate_graph_if_enabled(graph, url) + validation_payload = await loop.run_in_executor( + None, functools.partial(self._validate_graph_if_enabled, graph, url) + ) graph_metrics = self._kpi.graph_metrics(graph) self._emit_progress( { @@ -284,8 +299,14 @@ async def _patch_static_templates_once(self) -> None: self._ensure_templates_loaded() if self._template_graph and len(self._template_graph) > 0: - validation_payload = self._validate_graph_if_enabled( - self._template_graph, "static_templates" + _loop = asyncio.get_event_loop() + validation_payload = await _loop.run_in_executor( + None, + functools.partial( + self._validate_graph_if_enabled, + self._template_graph, + "static_templates", + ), ) self._emit_progress( { From 9514e4580c14e74cd1b93f7acf3e398fdf3eec25 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Tue, 17 Mar 2026 16:39:24 +0100 Subject: [PATCH 04/63] feat: use postprocessor pool for true concurrent processing --- wordlift_sdk/kg_build/protocol.py | 57 +++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 6bf88b8..9aaecb4 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -96,7 +96,6 @@ def __init__( self._mapping_cache: dict[Path, str] = {} self._static_templates_patched = False self._static_templates_lock = asyncio.Lock() - self._postprocessor_lock = asyncio.Lock() canonical_id_strategy = ( str( self.profile.settings.get( @@ -117,11 +116,25 @@ def __init__( self._postprocessor_runtime, self.profile.origins.get("postprocessor_runtime", "default"), ) - self._postprocessors = load_postprocessors_for_profile( - root_dir=self.root_dir, - profile_name=self.profile.name, - runtime=self._postprocessor_runtime, + _pool_size = int( + self.profile.settings.get( + 
"concurrency", self.profile.settings.get("CONCURRENCY", 4) + ) ) + logger.info( + "Postprocessor pool size for profile '%s': %d", + self.profile.name, + _pool_size, + ) + self._postprocessors_queue: asyncio.Queue = asyncio.Queue() + for _ in range(_pool_size): + self._postprocessors_queue.put_nowait( + load_postprocessors_for_profile( + root_dir=self.root_dir, + profile_name=self.profile.name, + runtime=self._postprocessor_runtime, + ) + ) self._shacl_mode = self._resolve_validation_mode( self.profile.settings.get( "shacl_validate_mode", @@ -212,17 +225,21 @@ async def callback( if existing_web_page_id: self._reconcile_root_id(graph, existing_web_page_id) loop = asyncio.get_event_loop() - async with self._postprocessor_lock: + _postprocessors = await self._postprocessors_queue.get() + try: graph = await loop.run_in_executor( None, functools.partial( - self._apply_postprocessors, + self._apply_postprocessors_with, graph, url, response, existing_web_page_id, + _postprocessors, ), ) + finally: + self._postprocessors_queue.put_nowait(_postprocessors) # Canonical IDs must run after custom postprocessors so any nodes minted # by local logic are normalized before graph sync patching. 
graph = self._core_ids.process_graph( @@ -262,7 +279,11 @@ async def callback( logger.info("Wrote %s triples for %s", len(graph), url) def close(self) -> None: - close_loaded_postprocessors(self._postprocessors) + while not self._postprocessors_queue.empty(): + try: + close_loaded_postprocessors(self._postprocessors_queue.get_nowait()) + except asyncio.QueueEmpty: + break def get_kpi_summary(self) -> dict[str, object]: return self._kpi.summary(self.profile.name) @@ -485,7 +506,23 @@ def _apply_postprocessors( response: WebPageScrapeResponse, existing_web_page_id: str | None, ) -> Graph: - if not self._postprocessors: + return self._apply_postprocessors_with( + graph, + url, + response, + existing_web_page_id, + list(self._postprocessors_queue._queue), # type: ignore[attr-defined] + ) + + def _apply_postprocessors_with( + self, + graph: Graph, + url: str, + response: WebPageScrapeResponse, + existing_web_page_id: str | None, + postprocessors: list, + ) -> Graph: + if not postprocessors: return graph pp_context = self._build_pp_context(url, response, existing_web_page_id) @@ -495,7 +532,7 @@ def _apply_postprocessors( "'api_key', WORDLIFT_KEY, or WORDLIFT_API_KEY." 
) - for processor in self._postprocessors: + for processor in postprocessors: graph = processor.run(graph, pp_context) logger.info("Applied postprocessor '%s' for %s", processor.name, url) return graph From 6706fcc018d689ea491b922306e3ec6ce3d5fefb Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Tue, 17 Mar 2026 17:12:06 +0100 Subject: [PATCH 05/63] fix: increase postprocessor startup timeout from 10s to 60s --- wordlift_sdk/kg_build/postprocessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py index b67de6b..a41af15 100644 --- a/wordlift_sdk/kg_build/postprocessors.py +++ b/wordlift_sdk/kg_build/postprocessors.py @@ -181,7 +181,7 @@ def _ensure_started(self) -> subprocess.Popen[str]: try: ready = self._read_message( - process, timeout_seconds=min(self._spec.timeout_seconds, 10) + process, timeout_seconds=min(self._spec.timeout_seconds, 60) ) except Exception: self._terminate(process) From 1ceeef78cdde90b9337c5b9ad6c792e8c9899e57 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Tue, 17 Mar 2026 17:41:51 +0100 Subject: [PATCH 06/63] debug: add timing instrumentation to mapping, postprocessor, and validation stages --- wordlift_sdk/kg_build/protocol.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 9aaecb4..4b63047 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -6,6 +6,7 @@ import logging import os import tempfile +import time from dataclasses import asdict from pathlib import Path from types import SimpleNamespace @@ -210,6 +211,7 @@ async def callback( mapping_response = self._mapping_response(response, existing_web_page_id) debug_output: dict[str, str] | None = {} if self.debug_dir else None + _t0 = time.perf_counter() graph = await self.rml_service.apply_mapping( html=response.web_page.html, 
url=url, @@ -218,6 +220,7 @@ async def callback( response=mapping_response, debug_output=debug_output, ) + _t_mapping = int((time.perf_counter() - _t0) * 1000) if not graph or len(graph) == 0: logger.warning("No triples produced for %s", url) return @@ -225,8 +228,11 @@ async def callback( if existing_web_page_id: self._reconcile_root_id(graph, existing_web_page_id) loop = asyncio.get_event_loop() + _t1 = time.perf_counter() _postprocessors = await self._postprocessors_queue.get() + _t_queue_wait = int((time.perf_counter() - _t1) * 1000) try: + _t2 = time.perf_counter() graph = await loop.run_in_executor( None, functools.partial( @@ -238,6 +244,7 @@ async def callback( _postprocessors, ), ) + _t_postprocessors = int((time.perf_counter() - _t2) * 1000) finally: self._postprocessors_queue.put_nowait(_postprocessors) # Canonical IDs must run after custom postprocessors so any nodes minted @@ -255,9 +262,11 @@ async def callback( ) self._write_debug_graph(graph, url) + _t3 = time.perf_counter() validation_payload = await loop.run_in_executor( None, functools.partial(self._validate_graph_if_enabled, graph, url) ) + _t_validation = int((time.perf_counter() - _t3) * 1000) graph_metrics = self._kpi.graph_metrics(graph) self._emit_progress( { @@ -276,7 +285,15 @@ async def callback( ): raise RuntimeError(f"SHACL validation failed for {url} in fail mode.") await self._write_graph(graph) - logger.info("Wrote %s triples for %s", len(graph), url) + logger.info( + "Wrote %s triples for %s [mapping=%dms queue_wait=%dms postprocessors=%dms validation=%dms]", + len(graph), + url, + _t_mapping, + _t_queue_wait, + _t_postprocessors, + _t_validation, + ) def close(self) -> None: while not self._postprocessors_queue.empty(): @@ -533,8 +550,14 @@ def _apply_postprocessors_with( ) for processor in postprocessors: + _tp = time.perf_counter() graph = processor.run(graph, pp_context) - logger.info("Applied postprocessor '%s' for %s", processor.name, url) + logger.info( + "Applied 
postprocessor '%s' for %s [%dms]", + processor.name, + url, + int((time.perf_counter() - _tp) * 1000), + ) return graph def _build_pp_context( From 6b99e6e7464fec7623b38f5c708a0815dac22c28 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 09:03:14 +0100 Subject: [PATCH 07/63] refactor: pre-load SHACL and validate in-memory to avoid I/O --- wordlift_sdk/kg_build/protocol.py | 60 ++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 4b63047..d6b7c65 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -5,7 +5,7 @@ import hashlib import logging import os -import tempfile + import time from dataclasses import asdict from pathlib import Path @@ -19,10 +19,13 @@ from wordlift_sdk.protocol.web_page_import_protocol import ( WebPageImportProtocolInterface, ) +from pyshacl import validate as pyshacl_validate +from rdflib.namespace import SH from wordlift_sdk.validation.shacl import ( ValidationResult, + _load_shapes_graph, + _normalize_schema_org_uris, resolve_shape_specs, - validate_file, ) from .config import ProfileDefinition @@ -164,6 +167,24 @@ def __init__( exclude_builtin_shapes=shacl_exclude_builtin_shapes or None, extra_shapes=shacl_extra_shapes or None, ) + _shacl_validate_mode_for_preload = self._resolve_validation_mode( + self.profile.settings.get( + "shacl_validate_mode", + self.profile.settings.get("SHACL_VALIDATE_MODE", "warn"), + ) + ) + if _shacl_validate_mode_for_preload != "off": + self._shacl_shapes_graph, self._shacl_source_map = _load_shapes_graph( + self._shacl_shape_specs if self._shacl_shape_specs else None + ) + logger.info( + "Pre-loaded %d SHACL shape triples for profile '%s'", + len(self._shacl_shapes_graph), + self.profile.name, + ) + else: + self._shacl_shapes_graph = None + self._shacl_source_map = {} self._import_hash_mode = self._resolve_import_hash_mode( 
self.profile.settings.get( "import_hash_mode", @@ -745,21 +766,26 @@ def _validate_graph_if_enabled( return summary def _validate_graph(self, graph: Graph) -> ValidationResult: - with tempfile.NamedTemporaryFile(mode="w", suffix=".ttl", delete=False) as f: - tmp = Path(f.name) - try: - graph.serialize(destination=tmp, format="turtle") - return validate_file( - str(tmp), - shape_specs=self._shacl_shape_specs - if self._shacl_shape_specs - else None, - ) - finally: - try: - tmp.unlink(missing_ok=True) - except Exception: - logger.debug("Failed to remove temporary SHACL graph file: %s", tmp) + data_graph = _normalize_schema_org_uris(graph) + conforms, report_graph, report_text = pyshacl_validate( + data_graph, + shacl_graph=self._shacl_shapes_graph, + inference="rdfs", + abort_on_first=False, + allow_infos=True, + allow_warnings=True, + ) + warning_count = sum( + 1 for _ in report_graph.subjects(SH.resultSeverity, SH.Warning) + ) + return ValidationResult( + conforms=conforms, + report_text=report_text, + report_graph=report_graph, + data_graph=data_graph, + shape_source_map=self._shacl_source_map, + warning_count=warning_count, + ) def _summarize_validation(self, result: ValidationResult) -> dict[str, Any]: sh = URIRef("http://www.w3.org/ns/shacl#") From 030fbaa559c6404ade4cd9b4f6c8bedc677f04c3 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 10:38:21 +0100 Subject: [PATCH 08/63] feat: run SHACL validation in a process pool to bypass GIL and parallelize across CPUs --- wordlift_sdk/kg_build/protocol.py | 119 ++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 24 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index d6b7c65..4149cc3 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -5,8 +5,8 @@ import hashlib import logging import os - import time +from concurrent.futures import ProcessPoolExecutor from dataclasses import asdict from pathlib import 
Path from types import SimpleNamespace @@ -50,6 +50,59 @@ def _path_contains_part(path: str, part: str) -> bool: return part in Path(path).parts +# Module-level state for SHACL worker processes (one copy per process) +_shacl_worker_shapes_graph: Graph | None = None +_shacl_worker_source_map: dict = {} + + +def _init_shacl_worker(shape_specs: list[str] | None) -> None: + global _shacl_worker_shapes_graph, _shacl_worker_source_map + _shacl_worker_shapes_graph, _shacl_worker_source_map = _load_shapes_graph( + shape_specs + ) + + +def _shacl_validate_in_worker(ntriples: str) -> dict: + data_graph = Graph() + data_graph.parse(data=ntriples, format="nt") + data_graph = _normalize_schema_org_uris(data_graph) + conforms, report_graph, _ = pyshacl_validate( + data_graph, + shacl_graph=_shacl_worker_shapes_graph, + inference="rdfs", + abort_on_first=False, + allow_infos=True, + allow_warnings=True, + ) + warning_sources: dict[str, int] = {} + error_sources: dict[str, int] = {} + warning_count = 0 + error_count = 0 + for node in report_graph.subjects(SH.resultSeverity, SH.Warning): + warning_count += 1 + shape = next(report_graph.objects(node, SH.sourceShape), None) + label = _shacl_worker_source_map.get(shape, "unknown") + warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1 + for node in report_graph.subjects(SH.resultSeverity, SH.Violation): + error_count += 1 + shape = next(report_graph.objects(node, SH.sourceShape), None) + label = _shacl_worker_source_map.get(shape, "unknown") + error_sources[str(label)] = error_sources.get(str(label), 0) + 1 + return { + "total": 1, + "pass": bool(conforms), + "fail": not bool(conforms), + "warnings": { + "count": warning_count, + "sources": dict(sorted(warning_sources.items())), + }, + "errors": { + "count": error_count, + "sources": dict(sorted(error_sources.items())), + }, + } + + def _resolve_postprocessor_runtime(settings: dict[str, Any]) -> str: value = settings.get("postprocessor_runtime") if value is None: @@ 
-167,24 +220,21 @@ def __init__( exclude_builtin_shapes=shacl_exclude_builtin_shapes or None, extra_shapes=shacl_extra_shapes or None, ) - _shacl_validate_mode_for_preload = self._resolve_validation_mode( - self.profile.settings.get( - "shacl_validate_mode", - self.profile.settings.get("SHACL_VALIDATE_MODE", "warn"), - ) - ) - if _shacl_validate_mode_for_preload != "off": - self._shacl_shapes_graph, self._shacl_source_map = _load_shapes_graph( - self._shacl_shape_specs if self._shacl_shape_specs else None + if self._shacl_mode != "off": + self._process_executor: ProcessPoolExecutor | None = ProcessPoolExecutor( + max_workers=_pool_size, + initializer=_init_shacl_worker, + initargs=( + self._shacl_shape_specs if self._shacl_shape_specs else None, + ), ) logger.info( - "Pre-loaded %d SHACL shape triples for profile '%s'", - len(self._shacl_shapes_graph), + "Created SHACL process pool with %d workers for profile '%s'", + _pool_size, self.profile.name, ) else: - self._shacl_shapes_graph = None - self._shacl_source_map = {} + self._process_executor = None self._import_hash_mode = self._resolve_import_hash_mode( self.profile.settings.get( "import_hash_mode", @@ -284,9 +334,7 @@ async def callback( self._write_debug_graph(graph, url) _t3 = time.perf_counter() - validation_payload = await loop.run_in_executor( - None, functools.partial(self._validate_graph_if_enabled, graph, url) - ) + validation_payload = await self._async_validate_if_enabled(loop, graph, url) _t_validation = int((time.perf_counter() - _t3) * 1000) graph_metrics = self._kpi.graph_metrics(graph) self._emit_progress( @@ -322,6 +370,8 @@ def close(self) -> None: close_loaded_postprocessors(self._postprocessors_queue.get_nowait()) except asyncio.QueueEmpty: break + if self._process_executor is not None: + self._process_executor.shutdown(wait=False) def get_kpi_summary(self) -> dict[str, object]: return self._kpi.summary(self.profile.name) @@ -359,13 +409,8 @@ async def _patch_static_templates_once(self) -> 
None: self._ensure_templates_loaded() if self._template_graph and len(self._template_graph) > 0: _loop = asyncio.get_event_loop() - validation_payload = await _loop.run_in_executor( - None, - functools.partial( - self._validate_graph_if_enabled, - self._template_graph, - "static_templates", - ), + validation_payload = await self._async_validate_if_enabled( + _loop, self._template_graph, "static_templates" ) self._emit_progress( { @@ -742,6 +787,32 @@ def _mapping_response( web_page=response.web_page, ) + async def _async_validate_if_enabled( + self, loop: Any, graph: Graph, url: str + ) -> dict[str, Any] | None: + if self._shacl_mode == "off": + return None + ntriples = graph.serialize(format="nt") + summary = await loop.run_in_executor( + self._process_executor, + functools.partial(_shacl_validate_in_worker, ntriples), + ) + self._kpi.record_validation( + passed=summary["pass"], + warning_count=summary["warnings"]["count"], + error_count=summary["errors"]["count"], + warning_sources=summary["warnings"]["sources"], + error_sources=summary["errors"]["sources"], + ) + logger.info( + "SHACL validation for %s: pass=%s warnings=%s errors=%s", + url, + summary["pass"], + summary["warnings"]["count"], + summary["errors"]["count"], + ) + return summary + def _validate_graph_if_enabled( self, graph: Graph, url: str ) -> dict[str, Any] | None: From d06ebc31765f03cf4c9b3b58a2a9de3fb8d2fb4c Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 10:59:56 +0100 Subject: [PATCH 09/63] feat: add separate pool size settings for postprocessors and SHACL validation --- wordlift_sdk/kg_build/protocol.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 4149cc3..a32238c 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -178,13 +178,20 @@ def __init__( "concurrency", self.profile.settings.get("CONCURRENCY", 4) ) ) + 
_pp_pool_size = int( + self.profile.settings.get( + "postprocessor_pool_size", + self.profile.settings.get("POSTPROCESSOR_POOL_SIZE", _pool_size), + ) + ) logger.info( - "Postprocessor pool size for profile '%s': %d", + "Postprocessor pool size for profile '%s': %d (concurrency=%d)", self.profile.name, + _pp_pool_size, _pool_size, ) self._postprocessors_queue: asyncio.Queue = asyncio.Queue() - for _ in range(_pool_size): + for _ in range(_pp_pool_size): self._postprocessors_queue.put_nowait( load_postprocessors_for_profile( root_dir=self.root_dir, @@ -221,8 +228,16 @@ def __init__( extra_shapes=shacl_extra_shapes or None, ) if self._shacl_mode != "off": + _shacl_pool_size = int( + self.profile.settings.get( + "shacl_pool_size", + self.profile.settings.get( + "SHACL_POOL_SIZE", max(2, _pool_size // 2) + ), + ) + ) self._process_executor: ProcessPoolExecutor | None = ProcessPoolExecutor( - max_workers=_pool_size, + max_workers=_shacl_pool_size, initializer=_init_shacl_worker, initargs=( self._shacl_shape_specs if self._shacl_shape_specs else None, @@ -230,7 +245,7 @@ def __init__( ) logger.info( "Created SHACL process pool with %d workers for profile '%s'", - _pool_size, + _shacl_pool_size, self.profile.name, ) else: From c6591e5891b6c7628394676a1fa89003d103519f Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 11:44:53 +0100 Subject: [PATCH 10/63] feat: track SHACL process pool queue wait and execution time separately in timing log --- wordlift_sdk/kg_build/protocol.py | 49 ++++++++++++++++++------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index a32238c..989d889 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -62,7 +62,9 @@ def _init_shacl_worker(shape_specs: list[str] | None) -> None: ) -def _shacl_validate_in_worker(ntriples: str) -> dict: +def _shacl_validate_in_worker(ntriples: str, submit_time: float) -> 
dict: + _queue_wait_ms = int((time.time() - submit_time) * 1000) + _t_start = time.perf_counter() data_graph = Graph() data_graph.parse(data=ntriples, format="nt") data_graph = _normalize_schema_org_uris(data_graph) @@ -100,6 +102,8 @@ def _shacl_validate_in_worker(ntriples: str) -> dict: "count": error_count, "sources": dict(sorted(error_sources.items())), }, + "_queue_wait_ms": _queue_wait_ms, + "_validation_ms": int((time.perf_counter() - _t_start) * 1000), } @@ -348,9 +352,11 @@ async def callback( ) self._write_debug_graph(graph, url) - _t3 = time.perf_counter() - validation_payload = await self._async_validate_if_enabled(loop, graph, url) - _t_validation = int((time.perf_counter() - _t3) * 1000) + ( + validation_payload, + _t_validation_wait, + _t_validation_actual, + ) = await self._async_validate_if_enabled(loop, graph, url) graph_metrics = self._kpi.graph_metrics(graph) self._emit_progress( { @@ -370,13 +376,14 @@ async def callback( raise RuntimeError(f"SHACL validation failed for {url} in fail mode.") await self._write_graph(graph) logger.info( - "Wrote %s triples for %s [mapping=%dms queue_wait=%dms postprocessors=%dms validation=%dms]", + "Wrote %s triples for %s [mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]", len(graph), url, _t_mapping, _t_queue_wait, _t_postprocessors, - _t_validation, + _t_validation_wait, + _t_validation_actual, ) def close(self) -> None: @@ -424,7 +431,7 @@ async def _patch_static_templates_once(self) -> None: self._ensure_templates_loaded() if self._template_graph and len(self._template_graph) > 0: _loop = asyncio.get_event_loop() - validation_payload = await self._async_validate_if_enabled( + validation_payload, _, _ = await self._async_validate_if_enabled( _loop, self._template_graph, "static_templates" ) self._emit_progress( @@ -804,29 +811,31 @@ def _mapping_response( async def _async_validate_if_enabled( self, loop: Any, graph: Graph, url: str - ) -> dict[str, Any] | None: + 
) -> tuple[dict[str, Any] | None, int, int]: if self._shacl_mode == "off": - return None + return None, 0, 0 ntriples = graph.serialize(format="nt") - summary = await loop.run_in_executor( + result = await loop.run_in_executor( self._process_executor, - functools.partial(_shacl_validate_in_worker, ntriples), + functools.partial(_shacl_validate_in_worker, ntriples, time.time()), ) + validation_queue_wait_ms = result.pop("_queue_wait_ms", 0) + validation_ms = result.pop("_validation_ms", 0) self._kpi.record_validation( - passed=summary["pass"], - warning_count=summary["warnings"]["count"], - error_count=summary["errors"]["count"], - warning_sources=summary["warnings"]["sources"], - error_sources=summary["errors"]["sources"], + passed=result["pass"], + warning_count=result["warnings"]["count"], + error_count=result["errors"]["count"], + warning_sources=result["warnings"]["sources"], + error_sources=result["errors"]["sources"], ) logger.info( "SHACL validation for %s: pass=%s warnings=%s errors=%s", url, - summary["pass"], - summary["warnings"]["count"], - summary["errors"]["count"], + result["pass"], + result["warnings"]["count"], + result["errors"]["count"], ) - return summary + return result, validation_queue_wait_ms, validation_ms def _validate_graph_if_enabled( self, graph: Graph, url: str From 5338b9c384712b01af83b48e4c18ace9a9feecc2 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 12:13:08 +0100 Subject: [PATCH 11/63] feat: add inprocess postprocessor runtime for running processors in the same process --- wordlift_sdk/kg_build/postprocessors.py | 65 +++++++++++++++++-------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py index a41af15..4e5a079 100644 --- a/wordlift_sdk/kg_build/postprocessors.py +++ b/wordlift_sdk/kg_build/postprocessors.py @@ -1,5 +1,8 @@ from __future__ import annotations +import asyncio +import importlib +import inspect import 
json import logging import select @@ -21,6 +24,7 @@ _RUNTIME_ONESHOT = "oneshot" _RUNTIME_PERSISTENT = "persistent" +_RUNTIME_INPROCESS = "inprocess" @dataclass(frozen=True) @@ -373,6 +377,23 @@ def _run_persistent( ) +@dataclass(frozen=True) +class InProcessPostprocessor: + class_path: str + + def process_graph( + self, graph: Graph, context: PostprocessorContext + ) -> Graph | None: + module_name, class_name = self.class_path.split(":", 1) + module = importlib.import_module(module_name) + klass = getattr(module, class_name) + processor = klass() + result = processor.process_graph(graph, context) + if inspect.isawaitable(result): + result = asyncio.run(result) + return result + + def _as_bool(value: Any, default: bool) -> bool: if value is None: return default @@ -399,8 +420,10 @@ def _as_positive_int(value: Any, default: int) -> int: def _normalize_runtime(value: str | None) -> str: runtime = (value or _RUNTIME_ONESHOT).strip().lower() - if runtime not in {_RUNTIME_ONESHOT, _RUNTIME_PERSISTENT}: - raise ValueError("POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent.") + if runtime not in {_RUNTIME_ONESHOT, _RUNTIME_PERSISTENT, _RUNTIME_INPROCESS}: + raise ValueError( + "POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent, inprocess." 
+ ) return runtime @@ -510,16 +533,17 @@ def load_postprocessors_for_profile( for spec in specs: if not spec.enabled: continue - loaded.append( - LoadedPostprocessor( - name=spec.class_path, - handler=SubprocessPostprocessor( - spec=spec, - root_dir=root_dir, - runtime=resolved_runtime, - ), + if resolved_runtime == _RUNTIME_INPROCESS: + handler: GraphPostprocessor = InProcessPostprocessor( + class_path=spec.class_path ) - ) + else: + handler = SubprocessPostprocessor( + spec=spec, + root_dir=root_dir, + runtime=resolved_runtime, + ) + loaded.append(LoadedPostprocessor(name=spec.class_path, handler=handler)) logger.info( "Loaded %s postprocessors for profile '%s' from manifest: %s (runtime=%s)", @@ -550,16 +574,17 @@ def load_postprocessors( for spec in specs: if not spec.enabled: continue - loaded.append( - LoadedPostprocessor( - name=spec.class_path, - handler=SubprocessPostprocessor( - spec=spec, - root_dir=root_dir, - runtime=resolved_runtime, - ), + if resolved_runtime == _RUNTIME_INPROCESS: + handler: GraphPostprocessor = InProcessPostprocessor( + class_path=spec.class_path ) - ) + else: + handler = SubprocessPostprocessor( + spec=spec, + root_dir=root_dir, + runtime=resolved_runtime, + ) + loaded.append(LoadedPostprocessor(name=spec.class_path, handler=handler)) return loaded From 1dc472a7a24d992f7f8b4dd4365fe2740b7502db Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 12:32:08 +0100 Subject: [PATCH 12/63] feat: run postprocessors on a dedicated thread pool instead of the default executor --- wordlift_sdk/kg_build/protocol.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 989d889..69e17e7 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -6,7 +6,7 @@ import logging import os import time -from concurrent.futures import ProcessPoolExecutor +from concurrent.futures import ProcessPoolExecutor, 
ThreadPoolExecutor from dataclasses import asdict from pathlib import Path from types import SimpleNamespace @@ -194,6 +194,9 @@ def __init__( _pp_pool_size, _pool_size, ) + self._pp_executor = ThreadPoolExecutor( + max_workers=_pp_pool_size, thread_name_prefix="worai_pp" + ) self._postprocessors_queue: asyncio.Queue = asyncio.Queue() for _ in range(_pp_pool_size): self._postprocessors_queue.put_nowait( @@ -324,7 +327,7 @@ async def callback( try: _t2 = time.perf_counter() graph = await loop.run_in_executor( - None, + self._pp_executor, functools.partial( self._apply_postprocessors_with, graph, @@ -392,6 +395,7 @@ def close(self) -> None: close_loaded_postprocessors(self._postprocessors_queue.get_nowait()) except asyncio.QueueEmpty: break + self._pp_executor.shutdown(wait=False) if self._process_executor is not None: self._process_executor.shutdown(wait=False) From 85dd88620cce244a128557cad0de80736218709d Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 12:54:15 +0100 Subject: [PATCH 13/63] fix: handle SHACL process pool timeout and broken executor errors gracefully --- wordlift_sdk/kg_build/protocol.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 69e17e7..a7fcaf6 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -6,6 +6,7 @@ import logging import os import time +import concurrent.futures from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import asdict from pathlib import Path @@ -819,10 +820,22 @@ async def _async_validate_if_enabled( if self._shacl_mode == "off": return None, 0, 0 ntriples = graph.serialize(format="nt") - result = await loop.run_in_executor( - self._process_executor, - functools.partial(_shacl_validate_in_worker, ntriples, time.time()), - ) + try: + result = await asyncio.wait_for( + loop.run_in_executor( + self._process_executor, + 
functools.partial(_shacl_validate_in_worker, ntriples, time.time()), + ), + timeout=120.0, + ) + except (asyncio.TimeoutError, concurrent.futures.BrokenExecutor) as exc: + logger.warning( + "SHACL validation skipped for %s: %s (%s)", + url, + type(exc).__name__, + exc, + ) + return None, 0, 0 validation_queue_wait_ms = result.pop("_queue_wait_ms", 0) validation_ms = result.pop("_validation_ms", 0) self._kpi.record_validation( From fb10f70a20ced6aa629b33917f957f59a2af51c8 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 13:51:58 +0100 Subject: [PATCH 14/63] fix: offload graph hashing to executor to avoid blocking the event loop --- wordlift_sdk/protocol/graph/graph_queue.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wordlift_sdk/protocol/graph/graph_queue.py b/wordlift_sdk/protocol/graph/graph_queue.py index 1ad8e33..f104777 100644 --- a/wordlift_sdk/protocol/graph/graph_queue.py +++ b/wordlift_sdk/protocol/graph/graph_queue.py @@ -39,7 +39,8 @@ def __init__(self, client_configuration: Configuration): reraise=True, ) async def put(self, graph: Graph) -> None: - hash = GraphQueue.hash_graph(graph) + loop = asyncio.get_event_loop() + hash = await loop.run_in_executor(None, GraphQueue.hash_graph, graph) if hash not in self.hashes: self.hashes.add(hash) From 1e6c540f28ab1dfdac2fc67525b3a7a4b770f198 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 13:52:47 +0100 Subject: [PATCH 15/63] fix: enable stop_after_attempt(5) retry limit on graph queue put --- wordlift_sdk/protocol/graph/graph_queue.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/wordlift_sdk/protocol/graph/graph_queue.py b/wordlift_sdk/protocol/graph/graph_queue.py index f104777..ff14016 100644 --- a/wordlift_sdk/protocol/graph/graph_queue.py +++ b/wordlift_sdk/protocol/graph/graph_queue.py @@ -8,7 +8,13 @@ from rdflib import Graph from rdflib.compare import to_isomorphic from wordlift_client import Configuration 
-from tenacity import retry, retry_if_exception_type, wait_fixed, after_log +from tenacity import ( + retry, + retry_if_exception_type, + wait_fixed, + after_log, + stop_after_attempt, +) logger = logging.getLogger(__name__) @@ -22,7 +28,7 @@ def __init__(self, client_configuration: Configuration): self.hashes = set() @retry( - # stop=stop_after_attempt(5), # Retry up to 5 times + stop=stop_after_attempt(5), retry=retry_if_exception_type( asyncio.TimeoutError | aiohttp.client_exceptions.ServerDisconnectedError From a8190ca7096660a8de12c68bf3c2581ff23861fb Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 14:47:54 +0100 Subject: [PATCH 16/63] fix: disable morph_kgc internal multiprocessing to prevent fork deadlocks in threaded context --- wordlift_sdk/structured_data/engine.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wordlift_sdk/structured_data/engine.py b/wordlift_sdk/structured_data/engine.py index 2fec73d..5107852 100644 --- a/wordlift_sdk/structured_data/engine.py +++ b/wordlift_sdk/structured_data/engine.py @@ -1351,6 +1351,10 @@ def _materialize_graph(mapping_path: Path) -> Graph: config = ( "[CONFIGURATION]\n" "output_format = N-TRIPLES\n" + # Disable morph_kgc internal multiprocessing: on Linux it uses fork() which + # deadlocks when the parent process already has threads running (asyncio pool, + # SHACL ProcessPoolExecutor). The outer pipeline handles concurrency. 
+ "number_of_processes = 1\n" "\n" "[DataSource1]\n" f"mappings = {mapping_path}\n" From 7905d7db8273d59ef3ec62194ea864e86932c5a0 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 15:01:39 +0100 Subject: [PATCH 17/63] fix: offload RML mapping to dedicated thread pool to prevent blocking the event loop --- wordlift_sdk/kg_build/protocol.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index a7fcaf6..04cb0e4 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -198,6 +198,11 @@ def __init__( self._pp_executor = ThreadPoolExecutor( max_workers=_pp_pool_size, thread_name_prefix="worai_pp" ) + # Dedicated executor for RML mapping (morph_kgc is CPU-bound and has no + # async I/O — running it directly on the event loop thread blocks everything). + self._mapping_executor = ThreadPoolExecutor( + max_workers=_pool_size, thread_name_prefix="worai_ml" + ) self._postprocessors_queue: asyncio.Queue = asyncio.Queue() for _ in range(_pp_pool_size): self._postprocessors_queue.put_nowait( @@ -306,13 +311,23 @@ async def callback( debug_output: dict[str, str] | None = {} if self.debug_dir else None _t0 = time.perf_counter() - graph = await self.rml_service.apply_mapping( - html=response.web_page.html, - url=url, - mapping_file_path=mapping_path, - mapping_content=rendered_mapping, - response=mapping_response, - debug_output=debug_output, + # apply_mapping is async def but contains no awaits — it runs morph_kgc + # (pure Python CPU work) synchronously. Running it directly on the event + # loop blocks all other coroutines for ~450ms per URL. Offload to a thread + # so the event loop stays free to schedule I/O for other concurrent URLs. 
+ _mapping_loop = asyncio.get_event_loop() + graph = await _mapping_loop.run_in_executor( + self._mapping_executor, + lambda: asyncio.run( + self.rml_service.apply_mapping( + html=response.web_page.html, + url=url, + mapping_file_path=mapping_path, + mapping_content=rendered_mapping, + response=mapping_response, + debug_output=debug_output, + ) + ), ) _t_mapping = int((time.perf_counter() - _t0) * 1000) if not graph or len(graph) == 0: @@ -397,6 +412,7 @@ def close(self) -> None: except asyncio.QueueEmpty: break self._pp_executor.shutdown(wait=False) + self._mapping_executor.shutdown(wait=False) if self._process_executor is not None: self._process_executor.shutdown(wait=False) From df23fe7701fad4bc1012dd44469c0373921fb93d Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 15:11:18 +0100 Subject: [PATCH 18/63] fix: serialize morph_kgc calls with a lock to prevent thread-safety issues in pyparsing --- wordlift_sdk/structured_data/engine.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wordlift_sdk/structured_data/engine.py b/wordlift_sdk/structured_data/engine.py index 5107852..cb59f15 100644 --- a/wordlift_sdk/structured_data/engine.py +++ b/wordlift_sdk/structured_data/engine.py @@ -7,6 +7,7 @@ import json import logging import re +import threading from dataclasses import dataclass from importlib import resources from pathlib import Path @@ -28,6 +29,9 @@ from wordlift_sdk.utils.ssl_ca_bundle import resolve_ssl_ca_cert from wordlift_sdk.validation.shacl import ValidationResult, validate_file +# morph_kgc uses rdflib's SPARQL parser (pyparsing) which has global state and +# is NOT thread-safe. Serialize all morph_kgc calls with a module-level lock. 
+_morph_kgc_lock = threading.Lock() _SCHEMA_BASE = "https://schema.org" _SCHEMA_HTTP = "http://schema.org/" @@ -1360,7 +1364,8 @@ def _materialize_graph(mapping_path: Path) -> Graph: f"mappings = {mapping_path}\n" ) try: - return morph_kgc.materialize(config) + with _morph_kgc_lock: + return morph_kgc.materialize(config) except RuntimeError: raise except Exception as exc: From bee7dd802cfa496a29189bef38f2761cbffb94e3 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 15:22:20 +0100 Subject: [PATCH 19/63] Revert "fix: offload RML mapping to dedicated thread pool to prevent blocking the event loop" This reverts commit 783c7eac16038aeb60e85e34829212cfb3e546b7. --- wordlift_sdk/kg_build/protocol.py | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 04cb0e4..a7fcaf6 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -198,11 +198,6 @@ def __init__( self._pp_executor = ThreadPoolExecutor( max_workers=_pp_pool_size, thread_name_prefix="worai_pp" ) - # Dedicated executor for RML mapping (morph_kgc is CPU-bound and has no - # async I/O — running it directly on the event loop thread blocks everything). - self._mapping_executor = ThreadPoolExecutor( - max_workers=_pool_size, thread_name_prefix="worai_ml" - ) self._postprocessors_queue: asyncio.Queue = asyncio.Queue() for _ in range(_pp_pool_size): self._postprocessors_queue.put_nowait( @@ -311,23 +306,13 @@ async def callback( debug_output: dict[str, str] | None = {} if self.debug_dir else None _t0 = time.perf_counter() - # apply_mapping is async def but contains no awaits — it runs morph_kgc - # (pure Python CPU work) synchronously. Running it directly on the event - # loop blocks all other coroutines for ~450ms per URL. Offload to a thread - # so the event loop stays free to schedule I/O for other concurrent URLs. 
- _mapping_loop = asyncio.get_event_loop() - graph = await _mapping_loop.run_in_executor( - self._mapping_executor, - lambda: asyncio.run( - self.rml_service.apply_mapping( - html=response.web_page.html, - url=url, - mapping_file_path=mapping_path, - mapping_content=rendered_mapping, - response=mapping_response, - debug_output=debug_output, - ) - ), + graph = await self.rml_service.apply_mapping( + html=response.web_page.html, + url=url, + mapping_file_path=mapping_path, + mapping_content=rendered_mapping, + response=mapping_response, + debug_output=debug_output, ) _t_mapping = int((time.perf_counter() - _t0) * 1000) if not graph or len(graph) == 0: @@ -412,7 +397,6 @@ def close(self) -> None: except asyncio.QueueEmpty: break self._pp_executor.shutdown(wait=False) - self._mapping_executor.shutdown(wait=False) if self._process_executor is not None: self._process_executor.shutdown(wait=False) From 7044deede01c2e59fbe4e4d5411e5ebe6baeb646 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 15:46:37 +0100 Subject: [PATCH 20/63] perf: run morph_kgc in a subprocess pool for true parallelism without pyparsing lock contention --- wordlift_sdk/kg_build/protocol.py | 30 +++++++++++---- wordlift_sdk/structured_data/engine.py | 51 +++++++++++++++++++------- 2 files changed, 61 insertions(+), 20 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index a7fcaf6..ba4205f 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -198,6 +198,12 @@ def __init__( self._pp_executor = ThreadPoolExecutor( max_workers=_pp_pool_size, thread_name_prefix="worai_pp" ) + # Wraps apply_mapping calls so they run in a thread rather than blocking + # the asyncio event loop. The thread itself blocks on the morph_kgc + # ProcessPoolExecutor slot, leaving the event loop free for I/O. 
+ self._mapping_executor = ThreadPoolExecutor( + max_workers=_pool_size, thread_name_prefix="worai_ml" + ) self._postprocessors_queue: asyncio.Queue = asyncio.Queue() for _ in range(_pp_pool_size): self._postprocessors_queue.put_nowait( @@ -306,13 +312,22 @@ async def callback( debug_output: dict[str, str] | None = {} if self.debug_dir else None _t0 = time.perf_counter() - graph = await self.rml_service.apply_mapping( - html=response.web_page.html, - url=url, - mapping_file_path=mapping_path, - mapping_content=rendered_mapping, - response=mapping_response, - debug_output=debug_output, + # apply_mapping has no awaits — all work is synchronous (morph_kgc). + # Run it in a thread so the event loop stays free for I/O while the + # thread waits for its morph_kgc subprocess slot to become available. + _loop = asyncio.get_event_loop() + graph = await _loop.run_in_executor( + self._mapping_executor, + lambda: asyncio.run( + self.rml_service.apply_mapping( + html=response.web_page.html, + url=url, + mapping_file_path=mapping_path, + mapping_content=rendered_mapping, + response=mapping_response, + debug_output=debug_output, + ) + ), ) _t_mapping = int((time.perf_counter() - _t0) * 1000) if not graph or len(graph) == 0: @@ -397,6 +412,7 @@ def close(self) -> None: except asyncio.QueueEmpty: break self._pp_executor.shutdown(wait=False) + self._mapping_executor.shutdown(wait=False) if self._process_executor is not None: self._process_executor.shutdown(wait=False) diff --git a/wordlift_sdk/structured_data/engine.py b/wordlift_sdk/structured_data/engine.py index cb59f15..9543330 100644 --- a/wordlift_sdk/structured_data/engine.py +++ b/wordlift_sdk/structured_data/engine.py @@ -6,8 +6,10 @@ import hashlib import json import logging +import multiprocessing +import os import re -import threading +from concurrent.futures import ProcessPoolExecutor from dataclasses import dataclass from importlib import resources from pathlib import Path @@ -29,9 +31,34 @@ from 
wordlift_sdk.utils.ssl_ca_bundle import resolve_ssl_ca_cert from wordlift_sdk.validation.shacl import ValidationResult, validate_file -# morph_kgc uses rdflib's SPARQL parser (pyparsing) which has global state and -# is NOT thread-safe. Serialize all morph_kgc calls with a module-level lock. -_morph_kgc_lock = threading.Lock() + +# Top-level worker — must be module-level to be picklable for ProcessPoolExecutor. +# Each subprocess has its own Python interpreter so pyparsing state is isolated; +# no lock needed and genuine parallelism is possible. +def _morph_kgc_worker(config: str) -> str: + import morph_kgc as _mkgc + + return _mkgc.materialize(config).serialize(format="nt") + + +# Lazy process pool — created on first use in the main process only. +# Worker subprocesses import this module but never call _get_morph_kgc_pool(), +# so they do NOT create their own pools (no recursive process explosion). +_morph_kgc_pool: ProcessPoolExecutor | None = None + + +def _get_morph_kgc_pool() -> ProcessPoolExecutor: + global _morph_kgc_pool + if _morph_kgc_pool is None: + # Use "spawn" context to start workers cleanly without inheriting any + # locks or file descriptors from the parent process. + ctx = multiprocessing.get_context("spawn") + _morph_kgc_pool = ProcessPoolExecutor( + max_workers=os.cpu_count() or 4, + mp_context=ctx, + ) + return _morph_kgc_pool + _SCHEMA_BASE = "https://schema.org" _SCHEMA_HTTP = "http://schema.org/" @@ -1345,13 +1372,6 @@ def _normalize_materialization_error(error: Exception) -> RuntimeError: def _materialize_graph(mapping_path: Path) -> Graph: - try: - import morph_kgc - except ImportError as exc: - raise RuntimeError( - "morph-kgc is required. 
Install with: pip install morph-kgc" - ) from exc - config = ( "[CONFIGURATION]\n" "output_format = N-TRIPLES\n" @@ -1364,8 +1384,13 @@ def _materialize_graph(mapping_path: Path) -> Graph: f"mappings = {mapping_path}\n" ) try: - with _morph_kgc_lock: - return morph_kgc.materialize(config) + # Submit to subprocess pool — each worker has isolated pyparsing state, + # so calls are genuinely parallel across CPU cores with no lock needed. + # .result() blocks the calling thread (not the asyncio event loop). + ntriples = _get_morph_kgc_pool().submit(_morph_kgc_worker, config).result() + graph = Graph() + graph.parse(data=ntriples, format="nt") + return graph except RuntimeError: raise except Exception as exc: From bec6f276a07ba85c1fee4a497e3faa516d52ba11 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 15:56:18 +0100 Subject: [PATCH 21/63] perf: expose morph_kgc pool size setting and track subprocess queue wait in timing log --- wordlift_sdk/kg_build/protocol.py | 17 +++++++++- wordlift_sdk/structured_data/engine.py | 43 +++++++++++++++++++++----- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index ba4205f..ba653fe 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -41,6 +41,7 @@ ) from .rml_mapping import RmlMappingService from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer +from wordlift_sdk.structured_data.engine import init_morph_kgc_pool, _morph_kgc_tls logger = logging.getLogger(__name__) SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source") @@ -198,6 +199,18 @@ def __init__( self._pp_executor = ThreadPoolExecutor( max_workers=_pp_pool_size, thread_name_prefix="worai_pp" ) + _mapping_pool_size = int( + self.profile.settings.get( + "mapping_pool_size", + self.profile.settings.get("MAPPING_POOL_SIZE", os.cpu_count() or 4), + ) + ) + logger.info( + "Mapping pool size for profile '%s': %d", + 
self.profile.name, + _mapping_pool_size, + ) + init_morph_kgc_pool(_mapping_pool_size) # Wraps apply_mapping calls so they run in a thread rather than blocking # the asyncio event loop. The thread itself blocks on the morph_kgc # ProcessPoolExecutor slot, leaving the event loop free for I/O. @@ -330,6 +343,7 @@ async def callback( ), ) _t_mapping = int((time.perf_counter() - _t0) * 1000) + _t_mapping_wait = getattr(_morph_kgc_tls, "mapping_wait_ms", 0) if not graph or len(graph) == 0: logger.warning("No triples produced for %s", url) return @@ -395,9 +409,10 @@ async def callback( raise RuntimeError(f"SHACL validation failed for {url} in fail mode.") await self._write_graph(graph) logger.info( - "Wrote %s triples for %s [mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]", + "Wrote %s triples for %s [mapping_wait=%dms mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]", len(graph), url, + _t_mapping_wait, _t_mapping, _t_queue_wait, _t_postprocessors, diff --git a/wordlift_sdk/structured_data/engine.py b/wordlift_sdk/structured_data/engine.py index 9543330..c62ad75 100644 --- a/wordlift_sdk/structured_data/engine.py +++ b/wordlift_sdk/structured_data/engine.py @@ -32,26 +32,48 @@ from wordlift_sdk.validation.shacl import ValidationResult, validate_file +import threading +import time as _time + + # Top-level worker — must be module-level to be picklable for ProcessPoolExecutor. -# Each subprocess has its own Python interpreter so pyparsing state is isolated; -# no lock needed and genuine parallelism is possible. -def _morph_kgc_worker(config: str) -> str: +# Accepts submit_time so it can measure queue wait (time spent waiting for a +# free subprocess slot). Returns (ntriples, queue_wait_ms). 
+def _morph_kgc_worker(config: str, submit_time: float) -> tuple[str, int]: import morph_kgc as _mkgc + import time as _t - return _mkgc.materialize(config).serialize(format="nt") + queue_wait_ms = int((_t.time() - submit_time) * 1000) + ntriples = _mkgc.materialize(config).serialize(format="nt") + return ntriples, queue_wait_ms +# Thread-local used to pass mapping_wait_ms back to the protocol layer without +# changing the return type of _materialize_graph / apply_mapping. +_morph_kgc_tls = threading.local() + # Lazy process pool — created on first use in the main process only. # Worker subprocesses import this module but never call _get_morph_kgc_pool(), # so they do NOT create their own pools (no recursive process explosion). _morph_kgc_pool: ProcessPoolExecutor | None = None +def init_morph_kgc_pool(max_workers: int) -> None: + """Pre-create the morph_kgc process pool with a specific worker count. + Call once from the protocol __init__ before any mapping work starts. + Subsequent calls are no-ops (pool is only created once). + """ + global _morph_kgc_pool + if _morph_kgc_pool is not None: + return + ctx = multiprocessing.get_context("spawn") + _morph_kgc_pool = ProcessPoolExecutor(max_workers=max_workers, mp_context=ctx) + + def _get_morph_kgc_pool() -> ProcessPoolExecutor: global _morph_kgc_pool if _morph_kgc_pool is None: - # Use "spawn" context to start workers cleanly without inheriting any - # locks or file descriptors from the parent process. + # Fallback if init_morph_kgc_pool was never called. ctx = multiprocessing.get_context("spawn") _morph_kgc_pool = ProcessPoolExecutor( max_workers=os.cpu_count() or 4, @@ -1387,7 +1409,14 @@ def _materialize_graph(mapping_path: Path) -> Graph: # Submit to subprocess pool — each worker has isolated pyparsing state, # so calls are genuinely parallel across CPU cores with no lock needed. # .result() blocks the calling thread (not the asyncio event loop). 
- ntriples = _get_morph_kgc_pool().submit(_morph_kgc_worker, config).result() + ntriples, queue_wait_ms = ( + _get_morph_kgc_pool() + .submit(_morph_kgc_worker, config, _time.time()) + .result() + ) + # Store wait time in thread-local so protocol.py can read it without + # changing the return type of this function. + _morph_kgc_tls.mapping_wait_ms = queue_wait_ms graph = Graph() graph.parse(data=ntriples, format="nt") return graph From c8ed0602f137eb36815b869bb568f5d3448b412d Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 16:46:42 +0100 Subject: [PATCH 22/63] fix: read morph_kgc queue wait from worker thread via closure to avoid thread-local race --- wordlift_sdk/kg_build/protocol.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index ba653fe..5d8bcfe 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -328,10 +328,13 @@ async def callback( # apply_mapping has no awaits — all work is synchronous (morph_kgc). # Run it in a thread so the event loop stays free for I/O while the # thread waits for its morph_kgc subprocess slot to become available. - _loop = asyncio.get_event_loop() - graph = await _loop.run_in_executor( - self._mapping_executor, - lambda: asyncio.run( + # _morph_kgc_tls is thread-local: capture it inside the worker thread + # and pass the value back via a closure dict. 
+ _timing: dict[str, int] = {} + + def _run_mapping() -> Graph | None: + _t_start = time.perf_counter() + result = asyncio.run( self.rml_service.apply_mapping( html=response.web_page.html, url=url, @@ -340,10 +343,18 @@ async def callback( response=mapping_response, debug_output=debug_output, ) - ), - ) - _t_mapping = int((time.perf_counter() - _t0) * 1000) - _t_mapping_wait = getattr(_morph_kgc_tls, "mapping_wait_ms", 0) + ) + mw = getattr(_morph_kgc_tls, "mapping_wait_ms", 0) + _timing["mapping_wait_ms"] = mw + # Subtract queue-wait so mapping= shows actual execution time only, + # consistent with how validation_wait/validation are reported. + _timing["mapping_ms"] = int((time.perf_counter() - _t_start) * 1000) - mw + return result + + _loop = asyncio.get_event_loop() + graph = await _loop.run_in_executor(self._mapping_executor, _run_mapping) + _t_mapping = _timing.get("mapping_ms", int((time.perf_counter() - _t0) * 1000)) + _t_mapping_wait = _timing.get("mapping_wait_ms", 0) if not graph or len(graph) == 0: logger.warning("No triples produced for %s", url) return From 8a0a34e6771ee2c6a11b282998e5531cfb8abb4a Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 17:10:13 +0100 Subject: [PATCH 23/63] perf: reuse a single persistent ApiClient across requests instead of creating one per graph --- wordlift_sdk/protocol/graph/graph_queue.py | 53 +++++++++++++++++----- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/wordlift_sdk/protocol/graph/graph_queue.py b/wordlift_sdk/protocol/graph/graph_queue.py index ff14016..053360e 100644 --- a/wordlift_sdk/protocol/graph/graph_queue.py +++ b/wordlift_sdk/protocol/graph/graph_queue.py @@ -26,6 +26,37 @@ class GraphQueue: def __init__(self, client_configuration: Configuration): self.client_configuration = client_configuration self.hashes = set() + self._api_client: wordlift_client.ApiClient | None = None + self._api_client_lock: asyncio.Lock | None = None + + async def _get_api_client(self) -> 
wordlift_client.ApiClient: + # Lazy-init the lock (must be created on the event loop). + if self._api_client_lock is None: + self._api_client_lock = asyncio.Lock() + if self._api_client is not None: + return self._api_client + async with self._api_client_lock: + if self._api_client is None: + # ApiClient.__init__ calls ssl.create_default_context() synchronously. + # Run it in a thread so the event loop isn't blocked during cert loading. + loop = asyncio.get_event_loop() + client = await loop.run_in_executor( + None, + lambda: wordlift_client.ApiClient( + configuration=self.client_configuration + ), + ) + await client.__aenter__() + self._api_client = client + return self._api_client + + async def close(self) -> None: + if self._api_client is not None: + try: + await self._api_client.__aexit__(None, None, None) + except Exception: + pass + self._api_client = None @retry( stop=stop_after_attempt(5), @@ -50,19 +81,17 @@ async def put(self, graph: Graph) -> None: if hash not in self.hashes: self.hashes.add(hash) - async with wordlift_client.ApiClient( - configuration=self.client_configuration - ) as api_client: - api_instance = wordlift_client.EntitiesApi(api_client) + api_client = await self._get_api_client() + api_instance = wordlift_client.EntitiesApi(api_client) - try: - await api_instance.create_or_update_entities( - graph.serialize(format="turtle"), - _content_type="text/turtle", - ) - except Exception as e: - logger.error(f"Failed to create entities: {e}", exc_info=e) - raise e + try: + await api_instance.create_or_update_entities( + graph.serialize(format="turtle"), + _content_type="text/turtle", + ) + except Exception as e: + logger.error(f"Failed to create entities: {e}", exc_info=e) + raise e @staticmethod def hash_graph(graph: Graph) -> str: From a05328e8ef1f3e2058a90d08a053eecd45142945 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Wed, 18 Mar 2026 17:19:42 +0100 Subject: [PATCH 24/63] fix: create ApiClient directly on the event loop thread instead of 
in an executor --- wordlift_sdk/protocol/graph/graph_queue.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/wordlift_sdk/protocol/graph/graph_queue.py b/wordlift_sdk/protocol/graph/graph_queue.py index 053360e..56e818a 100644 --- a/wordlift_sdk/protocol/graph/graph_queue.py +++ b/wordlift_sdk/protocol/graph/graph_queue.py @@ -37,14 +37,11 @@ async def _get_api_client(self) -> wordlift_client.ApiClient: return self._api_client async with self._api_client_lock: if self._api_client is None: - # ApiClient.__init__ calls ssl.create_default_context() synchronously. - # Run it in a thread so the event loop isn't blocked during cert loading. - loop = asyncio.get_event_loop() - client = await loop.run_in_executor( - None, - lambda: wordlift_client.ApiClient( - configuration=self.client_configuration - ), + # ApiClient.__init__ calls ssl.create_default_context() synchronously + # and must run on the event loop thread (it calls asyncio internals). + # Creating it once and caching avoids repeated SSL cert loading per put(). 
+ client = wordlift_client.ApiClient( + configuration=self.client_configuration ) await client.__aenter__() self._api_client = client From b358a10f387251286da5c108cf9545f4b6b66ebc Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 10:43:28 +0100 Subject: [PATCH 25/63] refactor: make load_shapes_graph and normalize_schema_org_uris public in shacl module --- wordlift_sdk/graph/audit/_entity_matrix.py | 4 ++-- wordlift_sdk/kg_build/protocol.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/wordlift_sdk/graph/audit/_entity_matrix.py b/wordlift_sdk/graph/audit/_entity_matrix.py index 1c8101e..23b3048 100644 --- a/wordlift_sdk/graph/audit/_entity_matrix.py +++ b/wordlift_sdk/graph/audit/_entity_matrix.py @@ -15,7 +15,7 @@ _find_webpage_urls, ) from wordlift_sdk.validation.shacl import ( - _normalize_schema_org_uris, # type: ignore[attr-defined] + normalize_schema_org_uris, ) _SCHEMA_ORG_PREFIXES = ("http://schema.org/", "https://schema.org/") @@ -120,7 +120,7 @@ def build_entity_matrix( excl: set[str] = set(exclude_types or []) load_result = load_graph(path) - normalized = _normalize_schema_org_uris(load_result.graph) + normalized = normalize_schema_org_uris(load_result.graph) webpage_urls = _find_webpage_urls(normalized) if not webpage_urls: diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 5d8bcfe..104e5af 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -24,8 +24,8 @@ from rdflib.namespace import SH from wordlift_sdk.validation.shacl import ( ValidationResult, - _load_shapes_graph, - _normalize_schema_org_uris, + load_shapes_graph, + normalize_schema_org_uris, resolve_shape_specs, ) @@ -59,7 +59,7 @@ def _path_contains_part(path: str, part: str) -> bool: def _init_shacl_worker(shape_specs: list[str] | None) -> None: global _shacl_worker_shapes_graph, _shacl_worker_source_map - _shacl_worker_shapes_graph, _shacl_worker_source_map = 
_load_shapes_graph( + _shacl_worker_shapes_graph, _shacl_worker_source_map = load_shapes_graph( shape_specs ) @@ -69,7 +69,7 @@ def _shacl_validate_in_worker(ntriples: str, submit_time: float) -> dict: _t_start = time.perf_counter() data_graph = Graph() data_graph.parse(data=ntriples, format="nt") - data_graph = _normalize_schema_org_uris(data_graph) + data_graph = normalize_schema_org_uris(data_graph) conforms, report_graph, _ = pyshacl_validate( data_graph, shacl_graph=_shacl_worker_shapes_graph, @@ -920,7 +920,7 @@ def _validate_graph_if_enabled( return summary def _validate_graph(self, graph: Graph) -> ValidationResult: - data_graph = _normalize_schema_org_uris(graph) + data_graph = normalize_schema_org_uris(graph) conforms, report_graph, report_text = pyshacl_validate( data_graph, shacl_graph=self._shacl_shapes_graph, From 61c0e367fc247602742145bb4ef3ff8efc818c6f Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 11:17:54 +0100 Subject: [PATCH 26/63] feat: add ShaclValidationService to validation package --- .../validation/shacl_validation_service.py | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 wordlift_sdk/validation/shacl_validation_service.py diff --git a/wordlift_sdk/validation/shacl_validation_service.py b/wordlift_sdk/validation/shacl_validation_service.py new file mode 100644 index 0000000..60e01f7 --- /dev/null +++ b/wordlift_sdk/validation/shacl_validation_service.py @@ -0,0 +1,166 @@ +from __future__ import annotations + +import asyncio +import concurrent.futures +import functools +import logging +import time +from concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass +from enum import Enum +from typing import Any + +from pyshacl import validate as pyshacl_validate +from rdflib import Graph +from rdflib.namespace import SH + +from wordlift_sdk.validation.shacl import load_shapes_graph, normalize_schema_org_uris + +logger = logging.getLogger(__name__) + 
+DEFAULT_VALIDATION_TIMEOUT_SECONDS = 120.0 + + +class ValidationMode(str, Enum): + OFF = "off" + WARN = "warn" + FAIL = "fail" + + +# Module-level worker state — one copy per subprocess, initialised by _init_worker. +# Must be module-level for picklability by ProcessPoolExecutor. +_worker_shapes_graph: Graph | None = None +_worker_source_map: dict = {} + + +def _init_worker(shape_specs: list[str] | None) -> None: + global _worker_shapes_graph, _worker_source_map + _worker_shapes_graph, _worker_source_map = load_shapes_graph(shape_specs) + + +def _validate_in_worker(ntriples: str, submit_time: float) -> dict: + queue_wait_ms = int((time.time() - submit_time) * 1000) + t_start = time.perf_counter() + + data_graph = Graph() + data_graph.parse(data=ntriples, format="nt") + data_graph = normalize_schema_org_uris(data_graph) + + conforms, report_graph, _ = pyshacl_validate( + data_graph, + shacl_graph=_worker_shapes_graph, + inference="rdfs", + abort_on_first=False, + allow_infos=True, + allow_warnings=True, + ) + + warning_sources: dict[str, int] = {} + error_sources: dict[str, int] = {} + for node in report_graph.subjects(SH.resultSeverity, SH.Warning): + shape = next(report_graph.objects(node, SH.sourceShape), None) + label = _worker_source_map.get(shape, "unknown") + warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1 + for node in report_graph.subjects(SH.resultSeverity, SH.Violation): + shape = next(report_graph.objects(node, SH.sourceShape), None) + label = _worker_source_map.get(shape, "unknown") + error_sources[str(label)] = error_sources.get(str(label), 0) + 1 + + return { + "passed": bool(conforms), + "warning_sources": dict(sorted(warning_sources.items())), + "error_sources": dict(sorted(error_sources.items())), + "queue_wait_ms": queue_wait_ms, + "validation_ms": int((time.perf_counter() - t_start) * 1000), + } + + +@dataclass +class ValidationOutcome: + passed: bool + warning_sources: dict[str, int] + error_sources: dict[str, int] + 
queue_wait_ms: int + validation_ms: int + + @property + def failed(self) -> bool: + return not self.passed + + @property + def warning_count(self) -> int: + return sum(self.warning_sources.values()) + + @property + def error_count(self) -> int: + return sum(self.error_sources.values()) + + def to_dict(self) -> dict[str, Any]: + return { + "pass": self.passed, + "fail": self.failed, + "warnings": {"count": self.warning_count, "sources": self.warning_sources}, + "errors": {"count": self.error_count, "sources": self.error_sources}, + } + + +class ShaclValidationService: + def __init__( + self, + shape_specs: list[str] | None, + mode: ValidationMode, + pool_size: int = 1, + timeout_seconds: float = DEFAULT_VALIDATION_TIMEOUT_SECONDS, + ) -> None: + self._mode = mode + self._timeout_seconds = timeout_seconds + self._executor: ProcessPoolExecutor | None = None + if mode != ValidationMode.OFF: + self._executor = ProcessPoolExecutor( + max_workers=pool_size, + initializer=_init_worker, + initargs=(shape_specs,), + ) + logger.info( + "Created SHACL process pool with %d workers (mode=%s)", + pool_size, + mode, + ) + + @property + def mode(self) -> ValidationMode: + return self._mode + + async def validate(self, graph: Graph) -> ValidationOutcome | None: + """Validate *graph* against the configured SHACL shapes. + + Returns ``None`` when validation is disabled (mode=off) or skipped due + to a timeout or broken executor. 
+ """ + if self._mode == ValidationMode.OFF or self._executor is None: + return None + ntriples = graph.serialize(format="nt") + loop = asyncio.get_event_loop() + try: + result = await asyncio.wait_for( + loop.run_in_executor( + self._executor, + functools.partial(_validate_in_worker, ntriples, time.time()), + ), + timeout=self._timeout_seconds, + ) + except (asyncio.TimeoutError, concurrent.futures.BrokenExecutor) as exc: + logger.warning("SHACL validation skipped: %s (%s)", type(exc).__name__, exc) + return None + return ValidationOutcome( + passed=result["passed"], + warning_sources=result["warning_sources"], + error_sources=result["error_sources"], + queue_wait_ms=result["queue_wait_ms"], + validation_ms=result["validation_ms"], + ) + + def close(self) -> None: + if self._executor is not None: + self._executor.shutdown(wait=False) + self._executor = None From 635dd51d70369a9a2b2b2b159d3cf8c0467b0579 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 11:34:04 +0100 Subject: [PATCH 27/63] refactor: wire ShaclValidationService into ProfileImportProtocol --- wordlift_sdk/kg_build/protocol.py | 319 +++++++----------------------- 1 file changed, 70 insertions(+), 249 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 104e5af..e36026c 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -6,8 +6,7 @@ import logging import os import time -import concurrent.futures -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor from dataclasses import asdict from pathlib import Path from types import SimpleNamespace @@ -20,13 +19,11 @@ from wordlift_sdk.protocol.web_page_import_protocol import ( WebPageImportProtocolInterface, ) -from pyshacl import validate as pyshacl_validate -from rdflib.namespace import SH -from wordlift_sdk.validation.shacl import ( - ValidationResult, - load_shapes_graph, - 
normalize_schema_org_uris, - resolve_shape_specs, +from wordlift_sdk.validation.shacl import resolve_shape_specs +from wordlift_sdk.validation.shacl_validation_service import ( + ShaclValidationService, + ValidationMode, + ValidationOutcome, ) from .config import ProfileDefinition @@ -52,63 +49,6 @@ def _path_contains_part(path: str, part: str) -> bool: return part in Path(path).parts -# Module-level state for SHACL worker processes (one copy per process) -_shacl_worker_shapes_graph: Graph | None = None -_shacl_worker_source_map: dict = {} - - -def _init_shacl_worker(shape_specs: list[str] | None) -> None: - global _shacl_worker_shapes_graph, _shacl_worker_source_map - _shacl_worker_shapes_graph, _shacl_worker_source_map = load_shapes_graph( - shape_specs - ) - - -def _shacl_validate_in_worker(ntriples: str, submit_time: float) -> dict: - _queue_wait_ms = int((time.time() - submit_time) * 1000) - _t_start = time.perf_counter() - data_graph = Graph() - data_graph.parse(data=ntriples, format="nt") - data_graph = normalize_schema_org_uris(data_graph) - conforms, report_graph, _ = pyshacl_validate( - data_graph, - shacl_graph=_shacl_worker_shapes_graph, - inference="rdfs", - abort_on_first=False, - allow_infos=True, - allow_warnings=True, - ) - warning_sources: dict[str, int] = {} - error_sources: dict[str, int] = {} - warning_count = 0 - error_count = 0 - for node in report_graph.subjects(SH.resultSeverity, SH.Warning): - warning_count += 1 - shape = next(report_graph.objects(node, SH.sourceShape), None) - label = _shacl_worker_source_map.get(shape, "unknown") - warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1 - for node in report_graph.subjects(SH.resultSeverity, SH.Violation): - error_count += 1 - shape = next(report_graph.objects(node, SH.sourceShape), None) - label = _shacl_worker_source_map.get(shape, "unknown") - error_sources[str(label)] = error_sources.get(str(label), 0) + 1 - return { - "total": 1, - "pass": bool(conforms), - "fail": not 
bool(conforms), - "warnings": { - "count": warning_count, - "sources": dict(sorted(warning_sources.items())), - }, - "errors": { - "count": error_count, - "sources": dict(sorted(error_sources.items())), - }, - "_queue_wait_ms": _queue_wait_ms, - "_validation_ms": int((time.perf_counter() - _t_start) * 1000), - } - - def _resolve_postprocessor_runtime(settings: dict[str, Any]) -> str: value = settings.get("postprocessor_runtime") if value is None: @@ -226,7 +166,7 @@ def __init__( runtime=self._postprocessor_runtime, ) ) - self._shacl_mode = self._resolve_validation_mode( + shacl_mode = self._resolve_validation_mode( self.profile.settings.get( "shacl_validate_mode", self.profile.settings.get("SHACL_VALIDATE_MODE", "warn"), @@ -254,29 +194,17 @@ def __init__( exclude_builtin_shapes=shacl_exclude_builtin_shapes or None, extra_shapes=shacl_extra_shapes or None, ) - if self._shacl_mode != "off": - _shacl_pool_size = int( - self.profile.settings.get( - "shacl_pool_size", - self.profile.settings.get( - "SHACL_POOL_SIZE", max(2, _pool_size // 2) - ), - ) - ) - self._process_executor: ProcessPoolExecutor | None = ProcessPoolExecutor( - max_workers=_shacl_pool_size, - initializer=_init_shacl_worker, - initargs=( - self._shacl_shape_specs if self._shacl_shape_specs else None, - ), - ) - logger.info( - "Created SHACL process pool with %d workers for profile '%s'", - _shacl_pool_size, - self.profile.name, + _shacl_pool_size = int( + self.profile.settings.get( + "shacl_pool_size", + self.profile.settings.get("SHACL_POOL_SIZE", max(2, _pool_size // 2)), ) - else: - self._process_executor = None + ) + self._shacl_validator = ShaclValidationService( + shape_specs=self._shacl_shape_specs or None, + mode=shacl_mode, + pool_size=_shacl_pool_size, + ) self._import_hash_mode = self._resolve_import_hash_mode( self.profile.settings.get( "import_hash_mode", @@ -285,7 +213,7 @@ def __init__( ) self._kpi = KgBuildKpiCollector( dataset_uri=getattr(self.context.account, "dataset_uri", None), - 
validation_enabled=self._shacl_mode != "off", + validation_enabled=self._shacl_validator.mode != ValidationMode.OFF, ) logger.debug( "Resolved mappings for profile '%s': effective_dir=%s (origin=%s), routes=%s (origin=%s), overlay_dirs=%s", @@ -396,11 +324,24 @@ def _run_mapping() -> Graph | None: ) self._write_debug_graph(graph, url) - ( - validation_payload, - _t_validation_wait, - _t_validation_actual, - ) = await self._async_validate_if_enabled(loop, graph, url) + outcome: ValidationOutcome | None = await self._shacl_validator.validate(graph) + if outcome is not None: + logger.info( + "SHACL validation for %s: pass=%s warnings=%d errors=%d", + url, + outcome.passed, + outcome.warning_count, + outcome.error_count, + ) + self._kpi.record_validation( + passed=outcome.passed, + warning_count=outcome.warning_count, + error_count=outcome.error_count, + warning_sources=outcome.warning_sources, + error_sources=outcome.error_sources, + ) + _t_validation_wait = outcome.queue_wait_ms if outcome else 0 + _t_validation_actual = outcome.validation_ms if outcome else 0 graph_metrics = self._kpi.graph_metrics(graph) self._emit_progress( { @@ -408,14 +349,14 @@ def _run_mapping() -> Graph | None: "profile": self.profile.name, "url": url, "graph": graph_metrics, - "validation": validation_payload, + "validation": outcome.to_dict() if outcome else None, } ) self._kpi.record_graph(graph) if ( - validation_payload is not None - and self._shacl_mode == "fail" - and not validation_payload["pass"] + outcome is not None + and self._shacl_validator.mode == ValidationMode.FAIL + and outcome.failed ): raise RuntimeError(f"SHACL validation failed for {url} in fail mode.") await self._write_graph(graph) @@ -439,8 +380,7 @@ def close(self) -> None: break self._pp_executor.shutdown(wait=False) self._mapping_executor.shutdown(wait=False) - if self._process_executor is not None: - self._process_executor.shutdown(wait=False) + self._shacl_validator.close() def get_kpi_summary(self) -> dict[str, 
object]: return self._kpi.summary(self.profile.name) @@ -477,23 +417,34 @@ async def _patch_static_templates_once(self) -> None: self._ensure_templates_loaded() if self._template_graph and len(self._template_graph) > 0: - _loop = asyncio.get_event_loop() - validation_payload, _, _ = await self._async_validate_if_enabled( - _loop, self._template_graph, "static_templates" - ) + outcome = await self._shacl_validator.validate(self._template_graph) + if outcome is not None: + logger.info( + "SHACL validation for static_templates: pass=%s warnings=%d errors=%d", + outcome.passed, + outcome.warning_count, + outcome.error_count, + ) + self._kpi.record_validation( + passed=outcome.passed, + warning_count=outcome.warning_count, + error_count=outcome.error_count, + warning_sources=outcome.warning_sources, + error_sources=outcome.error_sources, + ) self._emit_progress( { "kind": "static_templates", "profile": self.profile.name, "graph": self._kpi.graph_metrics(self._template_graph), - "validation": validation_payload, + "validation": outcome.to_dict() if outcome else None, } ) self._kpi.record_graph(self._template_graph) if ( - validation_payload is not None - and self._shacl_mode == "fail" - and not validation_payload["pass"] + outcome is not None + and self._shacl_validator.mode == ValidationMode.FAIL + and outcome.failed ): raise RuntimeError( "SHACL validation failed for static templates in fail mode." 
@@ -856,139 +807,6 @@ def _mapping_response( web_page=response.web_page, ) - async def _async_validate_if_enabled( - self, loop: Any, graph: Graph, url: str - ) -> tuple[dict[str, Any] | None, int, int]: - if self._shacl_mode == "off": - return None, 0, 0 - ntriples = graph.serialize(format="nt") - try: - result = await asyncio.wait_for( - loop.run_in_executor( - self._process_executor, - functools.partial(_shacl_validate_in_worker, ntriples, time.time()), - ), - timeout=120.0, - ) - except (asyncio.TimeoutError, concurrent.futures.BrokenExecutor) as exc: - logger.warning( - "SHACL validation skipped for %s: %s (%s)", - url, - type(exc).__name__, - exc, - ) - return None, 0, 0 - validation_queue_wait_ms = result.pop("_queue_wait_ms", 0) - validation_ms = result.pop("_validation_ms", 0) - self._kpi.record_validation( - passed=result["pass"], - warning_count=result["warnings"]["count"], - error_count=result["errors"]["count"], - warning_sources=result["warnings"]["sources"], - error_sources=result["errors"]["sources"], - ) - logger.info( - "SHACL validation for %s: pass=%s warnings=%s errors=%s", - url, - result["pass"], - result["warnings"]["count"], - result["errors"]["count"], - ) - return result, validation_queue_wait_ms, validation_ms - - def _validate_graph_if_enabled( - self, graph: Graph, url: str - ) -> dict[str, Any] | None: - if self._shacl_mode == "off": - return None - result = self._validate_graph(graph) - summary = self._summarize_validation(result) - self._kpi.record_validation( - passed=summary["pass"], - warning_count=summary["warnings"]["count"], - error_count=summary["errors"]["count"], - warning_sources=summary["warnings"]["sources"], - error_sources=summary["errors"]["sources"], - ) - logger.info( - "SHACL validation for %s: pass=%s warnings=%s errors=%s", - url, - summary["pass"], - summary["warnings"]["count"], - summary["errors"]["count"], - ) - return summary - - def _validate_graph(self, graph: Graph) -> ValidationResult: - data_graph = 
normalize_schema_org_uris(graph) - conforms, report_graph, report_text = pyshacl_validate( - data_graph, - shacl_graph=self._shacl_shapes_graph, - inference="rdfs", - abort_on_first=False, - allow_infos=True, - allow_warnings=True, - ) - warning_count = sum( - 1 for _ in report_graph.subjects(SH.resultSeverity, SH.Warning) - ) - return ValidationResult( - conforms=conforms, - report_text=report_text, - report_graph=report_graph, - data_graph=data_graph, - shape_source_map=self._shacl_source_map, - warning_count=warning_count, - ) - - def _summarize_validation(self, result: ValidationResult) -> dict[str, Any]: - sh = URIRef("http://www.w3.org/ns/shacl#") - sh_warning = URIRef(f"{sh}Warning") - sh_violation = URIRef(f"{sh}Violation") - sh_source_shape = URIRef(f"{sh}sourceShape") - - warning_sources: dict[str, int] = {} - error_sources: dict[str, int] = {} - warning_count = 0 - error_count = 0 - - for report_node in result.report_graph.subjects( - URIRef(f"{sh}resultSeverity"), sh_warning - ): - warning_count += 1 - shape = next( - result.report_graph.objects(report_node, sh_source_shape), None - ) - label = result.shape_source_map.get(shape, "unknown") - warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1 - - for report_node in result.report_graph.subjects( - URIRef(f"{sh}resultSeverity"), sh_violation - ): - error_count += 1 - shape = next( - result.report_graph.objects(report_node, sh_source_shape), None - ) - label = result.shape_source_map.get(shape, "unknown") - error_sources[str(label)] = error_sources.get(str(label), 0) + 1 - - return { - "total": 1, - "pass": bool(result.conforms), - "fail": not bool(result.conforms), - "warnings": { - "count": warning_count, - "sources": dict( - sorted(warning_sources.items(), key=lambda item: item[0]) - ), - }, - "errors": { - "count": error_count, - "sources": dict( - sorted(error_sources.items(), key=lambda item: item[0]) - ), - }, - } def _emit_progress(self, payload: dict[str, Any]) -> None: if not 
callable(self._on_progress): @@ -1012,19 +830,22 @@ def _resolve_list_setting(self, value: Any) -> list[str]: return specs return [str(value).strip()] if str(value).strip() else [] - def _resolve_validation_mode(self, value: Any) -> str: + def _resolve_validation_mode(self, value: Any) -> ValidationMode: if value is None: - return "warn" + return ValidationMode.WARN mode = str(value).strip().lower() if mode == "strict": logger.warning( "Deprecated SHACL validation mode 'strict' detected; using 'fail'." ) - return "fail" - if mode in {"off", "warn", "fail"}: - return mode - logger.warning("Unsupported SHACL validation mode '%s'; using 'warn'.", mode) - return "warn" + return ValidationMode.FAIL + try: + return ValidationMode(mode) + except ValueError: + logger.warning( + "Unsupported SHACL validation mode '%s'; using 'warn'.", mode + ) + return ValidationMode.WARN def _resolve_import_hash_mode(self, value: Any) -> str: if value is None: From e5afb05963b55c5a13eb19714086570255178338 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 12:09:36 +0100 Subject: [PATCH 28/63] refactor: delegate load_postprocessors_for_profile to load_postprocessors --- wordlift_sdk/kg_build/postprocessors.py | 82 +++++++++++-------------- wordlift_sdk/kg_build/protocol.py | 21 +++---- wordlift_sdk/kg_build/rml_mapping.py | 21 +++++-- wordlift_sdk/structured_data/engine.py | 6 +- 4 files changed, 63 insertions(+), 67 deletions(-) diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py index 4e5a079..f9b7116 100644 --- a/wordlift_sdk/kg_build/postprocessors.py +++ b/wordlift_sdk/kg_build/postprocessors.py @@ -418,6 +418,14 @@ def _as_positive_int(value: Any, default: int) -> int: return value +def _build_handler( + spec: PostprocessorSpec, root_dir: Path, runtime: str +) -> GraphPostprocessor: + if runtime == _RUNTIME_INPROCESS: + return InProcessPostprocessor(class_path=spec.class_path) + return SubprocessPostprocessor(spec=spec, 
root_dir=root_dir, runtime=runtime) + + def _normalize_runtime(value: str | None) -> str: runtime = (value or _RUNTIME_ONESHOT).strip().lower() if runtime not in {_RUNTIME_ONESHOT, _RUNTIME_PERSISTENT, _RUNTIME_INPROCESS}: @@ -509,6 +517,21 @@ def _build_runner_payload(context: PostprocessorContext) -> dict[str, Any]: } +def _load_from_specs( + specs: list[PostprocessorSpec], + root_dir: Path, + runtime: str, +) -> list[LoadedPostprocessor]: + return [ + LoadedPostprocessor( + name=spec.class_path, + handler=_build_handler(spec, root_dir, runtime), + ) + for spec in specs + if spec.enabled + ] + + def load_postprocessors_for_profile( *, root_dir: Path, @@ -518,73 +541,38 @@ def load_postprocessors_for_profile( base_manifest = root_dir / "profiles" / "_base" / "postprocessors.toml" profile_manifest = root_dir / "profiles" / profile_name / "postprocessors.toml" - selected_manifest: Path | None if profile_manifest.exists(): - selected_manifest = profile_manifest + selected_manifest: Path | None = profile_manifest elif base_manifest.exists(): selected_manifest = base_manifest else: selected_manifest = None - specs = _load_manifest_specs(selected_manifest) if selected_manifest else [] - - resolved_runtime = _normalize_runtime(runtime) - loaded: list[LoadedPostprocessor] = [] - for spec in specs: - if not spec.enabled: - continue - if resolved_runtime == _RUNTIME_INPROCESS: - handler: GraphPostprocessor = InProcessPostprocessor( - class_path=spec.class_path - ) - else: - handler = SubprocessPostprocessor( - spec=spec, - root_dir=root_dir, - runtime=resolved_runtime, - ) - loaded.append(LoadedPostprocessor(name=spec.class_path, handler=handler)) - - logger.info( - "Loaded %s postprocessors for profile '%s' from manifest: %s (runtime=%s)", - len(loaded), - profile_name, - selected_manifest or "none", - resolved_runtime, - ) logger.debug( - "Postprocessor manifest precedence for profile '%s': selected=%s base=%s chosen=%s", + "Postprocessor manifest precedence for profile 
'%s': profile=%s base=%s chosen=%s", profile_name, profile_manifest, base_manifest, selected_manifest or "none", ) - return loaded + return load_postprocessors(selected_manifest, root_dir=root_dir, runtime=runtime) def load_postprocessors( - manifest_path: Path, + manifest_path: Path | None, *, root_dir: Path, runtime: str | None = None, ) -> list[LoadedPostprocessor]: - specs = _load_manifest_specs(manifest_path) + specs = _load_manifest_specs(manifest_path) if manifest_path else [] resolved_runtime = _normalize_runtime(runtime) - loaded: list[LoadedPostprocessor] = [] - for spec in specs: - if not spec.enabled: - continue - if resolved_runtime == _RUNTIME_INPROCESS: - handler: GraphPostprocessor = InProcessPostprocessor( - class_path=spec.class_path - ) - else: - handler = SubprocessPostprocessor( - spec=spec, - root_dir=root_dir, - runtime=resolved_runtime, - ) - loaded.append(LoadedPostprocessor(name=spec.class_path, handler=handler)) + loaded = _load_from_specs(specs, root_dir, resolved_runtime) + logger.info( + "Loaded %s postprocessors from manifest: %s (runtime=%s)", + len(loaded), + manifest_path or "none", + resolved_runtime, + ) return loaded diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index e36026c..47065f8 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -36,9 +36,9 @@ close_loaded_postprocessors, load_postprocessors_for_profile, ) -from .rml_mapping import RmlMappingService +from .rml_mapping import MappingResult, RmlMappingService from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer -from wordlift_sdk.structured_data.engine import init_morph_kgc_pool, _morph_kgc_tls +from wordlift_sdk.structured_data.engine import init_morph_kgc_pool logger = logging.getLogger(__name__) SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source") @@ -252,17 +252,13 @@ async def callback( mapping_response = self._mapping_response(response, existing_web_page_id) debug_output: 
dict[str, str] | None = {} if self.debug_dir else None - _t0 = time.perf_counter() # apply_mapping has no awaits — all work is synchronous (morph_kgc). # Run it in a thread so the event loop stays free for I/O while the # thread waits for its morph_kgc subprocess slot to become available. - # _morph_kgc_tls is thread-local: capture it inside the worker thread - # and pass the value back via a closure dict. _timing: dict[str, int] = {} def _run_mapping() -> Graph | None: - _t_start = time.perf_counter() - result = asyncio.run( + mapping: MappingResult = asyncio.run( self.rml_service.apply_mapping( html=response.web_page.html, url=url, @@ -272,16 +268,13 @@ def _run_mapping() -> Graph | None: debug_output=debug_output, ) ) - mw = getattr(_morph_kgc_tls, "mapping_wait_ms", 0) - _timing["mapping_wait_ms"] = mw - # Subtract queue-wait so mapping= shows actual execution time only, - # consistent with how validation_wait/validation are reported. - _timing["mapping_ms"] = int((time.perf_counter() - _t_start) * 1000) - mw - return result + _timing["mapping_wait_ms"] = mapping.queue_wait_ms + _timing["mapping_ms"] = mapping.mapping_ms + return mapping.graph _loop = asyncio.get_event_loop() graph = await _loop.run_in_executor(self._mapping_executor, _run_mapping) - _t_mapping = _timing.get("mapping_ms", int((time.perf_counter() - _t0) * 1000)) + _t_mapping = _timing.get("mapping_ms", 0) _t_mapping_wait = _timing.get("mapping_wait_ms", 0) if not graph or len(graph) == 0: logger.warning("No triples produced for %s", url) diff --git a/wordlift_sdk/kg_build/rml_mapping.py b/wordlift_sdk/kg_build/rml_mapping.py index 5b40a91..b666ab6 100644 --- a/wordlift_sdk/kg_build/rml_mapping.py +++ b/wordlift_sdk/kg_build/rml_mapping.py @@ -4,17 +4,27 @@ import logging import os import tempfile +import time +from dataclasses import dataclass from pathlib import Path from typing import Any from rdflib import Graph from wordlift_sdk.protocol import Context +from 
wordlift_sdk.structured_data.engine import _morph_kgc_tls from wordlift_sdk.structured_data.materialization import MaterializationPipeline from wordlift_sdk.utils.html_converter import HtmlConverter logger = logging.getLogger(__name__) +@dataclass(frozen=True) +class MappingResult: + graph: Graph | None + queue_wait_ms: int + mapping_ms: int + + class RmlMappingService: def __init__(self, context: Context) -> None: self._context = context @@ -32,7 +42,9 @@ async def apply_mapping( mapping_content: str | None = None, response: object | None = None, debug_output: dict[str, str] | None = None, - ) -> Graph | None: + ) -> MappingResult: + queue_wait_ms = 0 + _t_start = time.perf_counter() try: xhtml_str = xhtml or self.to_xhtml(html) if debug_output is not None: @@ -50,7 +62,7 @@ async def apply_mapping( resolved_mapping_content = f.read() except FileNotFoundError: logger.error("Mapping file not found: %s", mapping_file_path) - return None + return MappingResult(graph=None, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000)) dataset_uri = getattr(self._context.account, "dataset_uri", None) if not dataset_uri: @@ -70,6 +82,7 @@ async def apply_mapping( url=url, response=response, ) + queue_wait_ms = getattr(_morph_kgc_tls, "queue_wait_ms", 0) jsonld_data = pipeline.postprocess( jsonld_raw, mappings, @@ -93,7 +106,7 @@ async def apply_mapping( "No triples generated from mapping %s.", mapping_file_path ) - return graph + return MappingResult(graph=graph, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000) - queue_wait_ms) except Exception as exc: logger.error( @@ -102,7 +115,7 @@ async def apply_mapping( exc, exc_info=True, ) - return None + return MappingResult(graph=None, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000)) def _normalize_schema_uris(self, payload: Any): if isinstance(payload, dict): diff --git a/wordlift_sdk/structured_data/engine.py 
b/wordlift_sdk/structured_data/engine.py index c62ad75..3d27952 100644 --- a/wordlift_sdk/structured_data/engine.py +++ b/wordlift_sdk/structured_data/engine.py @@ -48,8 +48,10 @@ def _morph_kgc_worker(config: str, submit_time: float) -> tuple[str, int]: return ntriples, queue_wait_ms -# Thread-local used to pass mapping_wait_ms back to the protocol layer without -# changing the return type of _materialize_graph / apply_mapping. +# Thread-local used to pass mapping_wait_ms out of _materialize_graph without +# changing the return type of the public materialization API. +# Consumed by rml_mapping.RmlMappingService.apply_mapping — callers above that +# layer receive the timing as a regular return value. _morph_kgc_tls = threading.local() # Lazy process pool — created on first use in the main process only. From bb1c123a369d6ea8ee3ad1f5a0ac0254af24c6bf Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 12:13:37 +0100 Subject: [PATCH 29/63] refactor: replace runtime string constants with PostprocessorRuntime enum --- wordlift_sdk/kg_build/postprocessors.py | 28 ++++++++++++++----------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py index f9b7116..c73574a 100644 --- a/wordlift_sdk/kg_build/postprocessors.py +++ b/wordlift_sdk/kg_build/postprocessors.py @@ -10,6 +10,7 @@ import subprocess import tempfile from dataclasses import dataclass, field +from enum import Enum from pathlib import Path from typing import Any, Protocol, runtime_checkable @@ -22,9 +23,11 @@ except ModuleNotFoundError: # pragma: no cover import tomli as tomllib -_RUNTIME_ONESHOT = "oneshot" -_RUNTIME_PERSISTENT = "persistent" -_RUNTIME_INPROCESS = "inprocess" + +class PostprocessorRuntime(str, Enum): + ONESHOT = "oneshot" + PERSISTENT = "persistent" + INPROCESS = "inprocess" @dataclass(frozen=True) @@ -256,7 +259,7 @@ def _terminate(self, process: subprocess.Popen[str]) -> None: class 
SubprocessPostprocessor: spec: PostprocessorSpec root_dir: Path - runtime: str = _RUNTIME_ONESHOT + runtime: PostprocessorRuntime = PostprocessorRuntime.ONESHOT _persistent_client: PersistentPostprocessorClient | None = field( init=False, default=None, @@ -285,7 +288,7 @@ def process_graph( encoding="utf-8", ) - if self.runtime == _RUNTIME_PERSISTENT: + if self.runtime == PostprocessorRuntime.PERSISTENT: self._run_persistent( input_graph_path=input_graph_path, output_graph_path=output_graph_path, @@ -419,20 +422,21 @@ def _as_positive_int(value: Any, default: int) -> int: def _build_handler( - spec: PostprocessorSpec, root_dir: Path, runtime: str + spec: PostprocessorSpec, root_dir: Path, runtime: PostprocessorRuntime ) -> GraphPostprocessor: - if runtime == _RUNTIME_INPROCESS: + if runtime == PostprocessorRuntime.INPROCESS: return InProcessPostprocessor(class_path=spec.class_path) return SubprocessPostprocessor(spec=spec, root_dir=root_dir, runtime=runtime) -def _normalize_runtime(value: str | None) -> str: - runtime = (value or _RUNTIME_ONESHOT).strip().lower() - if runtime not in {_RUNTIME_ONESHOT, _RUNTIME_PERSISTENT, _RUNTIME_INPROCESS}: +def _normalize_runtime(value: str | None) -> PostprocessorRuntime: + raw = (value or PostprocessorRuntime.ONESHOT.value).strip().lower() + try: + return PostprocessorRuntime(raw) + except ValueError: raise ValueError( "POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent, inprocess." 
) - return runtime def _load_manifest_specs(manifest_path: Path) -> list[PostprocessorSpec]: @@ -520,7 +524,7 @@ def _build_runner_payload(context: PostprocessorContext) -> dict[str, Any]: def _load_from_specs( specs: list[PostprocessorSpec], root_dir: Path, - runtime: str, + runtime: PostprocessorRuntime, ) -> list[LoadedPostprocessor]: return [ LoadedPostprocessor( From d2494001bb00dc18d477f6e6c5194f9c772eaf10 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 12:15:08 +0100 Subject: [PATCH 30/63] fix: remove to_xhtml from public contract of RmlMappingService --- wordlift_sdk/kg_build/rml_mapping.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/wordlift_sdk/kg_build/rml_mapping.py b/wordlift_sdk/kg_build/rml_mapping.py index b666ab6..316ff11 100644 --- a/wordlift_sdk/kg_build/rml_mapping.py +++ b/wordlift_sdk/kg_build/rml_mapping.py @@ -30,7 +30,7 @@ def __init__(self, context: Context) -> None: self._context = context self._html_converter = HtmlConverter() - def to_xhtml(self, html: str) -> str: + def _to_xhtml(self, html: str) -> str: return self._html_converter.convert(html) async def apply_mapping( @@ -46,7 +46,7 @@ async def apply_mapping( queue_wait_ms = 0 _t_start = time.perf_counter() try: - xhtml_str = xhtml or self.to_xhtml(html) + xhtml_str = xhtml or self._to_xhtml(html) if debug_output is not None: debug_output["xhtml"] = xhtml_str @@ -62,7 +62,11 @@ async def apply_mapping( resolved_mapping_content = f.read() except FileNotFoundError: logger.error("Mapping file not found: %s", mapping_file_path) - return MappingResult(graph=None, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000)) + return MappingResult( + graph=None, + queue_wait_ms=queue_wait_ms, + mapping_ms=int((time.perf_counter() - _t_start) * 1000), + ) dataset_uri = getattr(self._context.account, "dataset_uri", None) if not dataset_uri: @@ -106,7 +110,12 @@ async def apply_mapping( "No triples 
generated from mapping %s.", mapping_file_path ) - return MappingResult(graph=graph, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000) - queue_wait_ms) + return MappingResult( + graph=graph, + queue_wait_ms=queue_wait_ms, + mapping_ms=int((time.perf_counter() - _t_start) * 1000) + - queue_wait_ms, + ) except Exception as exc: logger.error( @@ -115,7 +124,11 @@ async def apply_mapping( exc, exc_info=True, ) - return MappingResult(graph=None, queue_wait_ms=queue_wait_ms, mapping_ms=int((time.perf_counter() - _t_start) * 1000)) + return MappingResult( + graph=None, + queue_wait_ms=queue_wait_ms, + mapping_ms=int((time.perf_counter() - _t_start) * 1000), + ) def _normalize_schema_uris(self, payload: Any): if isinstance(payload, dict): From 092050ebadcc8a50ce639695b870bae9dfaac65c Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 12:20:46 +0100 Subject: [PATCH 31/63] refactor: extract Closeable protocol and use isinstance check in close_loaded_postprocessors --- wordlift_sdk/kg_build/postprocessors.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py index c73574a..5843143 100644 --- a/wordlift_sdk/kg_build/postprocessors.py +++ b/wordlift_sdk/kg_build/postprocessors.py @@ -43,6 +43,11 @@ class PostprocessorContext: ids: Any | None = None +@runtime_checkable +class Closeable(Protocol): + def close(self) -> None: ... 
+ + @runtime_checkable class GraphPostprocessor(Protocol): def process_graph( @@ -582,9 +587,8 @@ def load_postprocessors( def close_loaded_postprocessors(postprocessors: list[LoadedPostprocessor]) -> None: for processor in postprocessors: - close = getattr(processor.handler, "close", None) - if callable(close): - close() + if isinstance(processor.handler, Closeable): + processor.handler.close() def _write_graph_nquads(graph: Graph, path: Path) -> None: From 6a4df850a8f49a5fcf97e96f8c0e4edf14594734 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 12:31:42 +0100 Subject: [PATCH 32/63] refactor: split SubprocessPostprocessor into Oneshot and Persistent variants --- wordlift_sdk/kg_build/postprocessors.py | 171 ++++++++++++++---------- 1 file changed, 101 insertions(+), 70 deletions(-) diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py index 5843143..7e16618 100644 --- a/wordlift_sdk/kg_build/postprocessors.py +++ b/wordlift_sdk/kg_build/postprocessors.py @@ -43,6 +43,17 @@ class PostprocessorContext: ids: Any | None = None +class _SubprocessRunner(Protocol): + def __call__( + self, + *, + input_graph_path: Path, + output_graph_path: Path, + context_path: Path, + context_payload: dict[str, Any], + ) -> None: ... + + @runtime_checkable class Closeable(Protocol): def close(self) -> None: ... @@ -260,83 +271,79 @@ def _terminate(self, process: subprocess.Popen[str]) -> None: pass -@dataclass -class SubprocessPostprocessor: +def _run_subprocess( + spec: PostprocessorSpec, + root_dir: Path, + graph: Graph, + payload: dict[str, Any], + runner: _SubprocessRunner, +) -> Graph | None: + """Shared scaffolding for subprocess-based postprocessors. + + Handles temp-dir lifecycle, graph serialization, output verification, + and debug-copy on failure. *runner* is called with the prepared paths + and is responsible only for the actual subprocess execution step. 
+ """ + temp_dir_path = Path(tempfile.mkdtemp(prefix="worai_pp_")) + failed = False + try: + input_graph_path = temp_dir_path / "input_graph.nq" + output_graph_path = temp_dir_path / "output_graph.nq" + context_path = temp_dir_path / "context.json" + + _write_graph_nquads(graph, input_graph_path) + context_path.write_text( + json.dumps(payload, ensure_ascii=True, default=str), + encoding="utf-8", + ) + + runner( + input_graph_path=input_graph_path, + output_graph_path=output_graph_path, + context_path=context_path, + context_payload=payload, + ) + + if not output_graph_path.exists(): + failed = True + raise RuntimeError( + f"Postprocessor did not produce output graph: {spec.class_path}" + ) + + return _read_graph_nquads(output_graph_path) + except Exception: + failed = True + raise + finally: + if failed and spec.keep_temp_on_error: + debug_dir = root_dir / "output" / "postprocessor_debug" + debug_dir.mkdir(parents=True, exist_ok=True) + target = debug_dir / (spec.class_path.replace(":", "_").replace(".", "_")) + if target.exists(): + shutil.rmtree(target) + shutil.copytree(temp_dir_path, target) + _redact_debug_context(target / "context.json") + if temp_dir_path.exists(): + shutil.rmtree(temp_dir_path, ignore_errors=True) + + +@dataclass(frozen=True) +class OneshotSubprocessPostprocessor: spec: PostprocessorSpec root_dir: Path - runtime: PostprocessorRuntime = PostprocessorRuntime.ONESHOT - _persistent_client: PersistentPostprocessorClient | None = field( - init=False, - default=None, - repr=False, - ) - - def close(self) -> None: - if self._persistent_client is not None: - self._persistent_client.close() - self._persistent_client = None def process_graph( self, graph: Graph, context: PostprocessorContext ) -> Graph | None: - payload = _build_runner_payload(context) - temp_dir_path = Path(tempfile.mkdtemp(prefix="worai_pp_")) - failed = False - try: - input_graph_path = temp_dir_path / "input_graph.nq" - output_graph_path = temp_dir_path / "output_graph.nq" - 
context_path = temp_dir_path / "context.json" - - _write_graph_nquads(graph, input_graph_path) - context_path.write_text( - json.dumps(payload, ensure_ascii=True, default=str), - encoding="utf-8", - ) + return _run_subprocess(self.spec, self.root_dir, graph, _build_runner_payload(context), self._run) - if self.runtime == PostprocessorRuntime.PERSISTENT: - self._run_persistent( - input_graph_path=input_graph_path, - output_graph_path=output_graph_path, - context_payload=payload, - ) - else: - self._run_oneshot( - input_graph_path=input_graph_path, - output_graph_path=output_graph_path, - context_path=context_path, - ) - - if not output_graph_path.exists(): - failed = True - raise RuntimeError( - "Postprocessor did not produce output graph: " - f"{self.spec.class_path}" - ) - - return _read_graph_nquads(output_graph_path) - except Exception: - failed = True - raise - finally: - if failed and self.spec.keep_temp_on_error: - debug_dir = self.root_dir / "output" / "postprocessor_debug" - debug_dir.mkdir(parents=True, exist_ok=True) - target = debug_dir / ( - self.spec.class_path.replace(":", "_").replace(".", "_") - ) - if target.exists(): - shutil.rmtree(target) - shutil.copytree(temp_dir_path, target) - _redact_debug_context(target / "context.json") - if temp_dir_path.exists(): - shutil.rmtree(temp_dir_path, ignore_errors=True) - - def _run_oneshot( + def _run( self, *, input_graph_path: Path, output_graph_path: Path, context_path: Path, + **_: Any, ) -> None: cmd = [ self.spec.python, @@ -366,19 +373,41 @@ def _run_oneshot( f"(exit={completed.returncode})" + (f"\n{stderr}" if stderr else "") ) - def _run_persistent( + +@dataclass +class PersistentSubprocessPostprocessor: + spec: PostprocessorSpec + root_dir: Path + _client: PersistentPostprocessorClient | None = field( + init=False, + default=None, + repr=False, + ) + + def close(self) -> None: + if self._client is not None: + self._client.close() + self._client = None + + def process_graph( + self, graph: Graph, 
context: PostprocessorContext + ) -> Graph | None: + return _run_subprocess(self.spec, self.root_dir, graph, _build_runner_payload(context), self._run) + + def _run( self, *, input_graph_path: Path, output_graph_path: Path, context_payload: dict[str, Any], + **_: Any, ) -> None: - if self._persistent_client is None: - self._persistent_client = PersistentPostprocessorClient( + if self._client is None: + self._client = PersistentPostprocessorClient( spec=self.spec, root_dir=self.root_dir, ) - self._persistent_client.process_graph( + self._client.process_graph( input_graph_path=input_graph_path, output_graph_path=output_graph_path, context_payload=context_payload, @@ -431,7 +460,9 @@ def _build_handler( ) -> GraphPostprocessor: if runtime == PostprocessorRuntime.INPROCESS: return InProcessPostprocessor(class_path=spec.class_path) - return SubprocessPostprocessor(spec=spec, root_dir=root_dir, runtime=runtime) + if runtime == PostprocessorRuntime.PERSISTENT: + return PersistentSubprocessPostprocessor(spec=spec, root_dir=root_dir) + return OneshotSubprocessPostprocessor(spec=spec, root_dir=root_dir) def _normalize_runtime(value: str | None) -> PostprocessorRuntime: From 5ed1ba7d6ad092e393578c6577c46983180968e5 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 12:38:25 +0100 Subject: [PATCH 33/63] refactor: introduce PostprocessorResult and remove dead _apply_postprocessors --- wordlift_sdk/kg_build/postprocessors.py | 15 +++++++-- wordlift_sdk/kg_build/protocol.py | 43 +++++++++++-------------- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py index 7e16618..d29b8f5 100644 --- a/wordlift_sdk/kg_build/postprocessors.py +++ b/wordlift_sdk/kg_build/postprocessors.py @@ -66,6 +66,13 @@ def process_graph( ) -> Graph | None: ... 
+@dataclass(frozen=True) +class PostprocessorResult: + graph: Graph + queue_wait_ms: int + postprocessors_ms: int + + @dataclass(frozen=True) class LoadedPostprocessor: name: str @@ -335,7 +342,9 @@ class OneshotSubprocessPostprocessor: def process_graph( self, graph: Graph, context: PostprocessorContext ) -> Graph | None: - return _run_subprocess(self.spec, self.root_dir, graph, _build_runner_payload(context), self._run) + return _run_subprocess( + self.spec, self.root_dir, graph, _build_runner_payload(context), self._run + ) def _run( self, @@ -392,7 +401,9 @@ def close(self) -> None: def process_graph( self, graph: Graph, context: PostprocessorContext ) -> Graph | None: - return _run_subprocess(self.spec, self.root_dir, graph, _build_runner_payload(context), self._run) + return _run_subprocess( + self.spec, self.root_dir, graph, _build_runner_payload(context), self._run + ) def _run( self, diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 47065f8..85fe5c2 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -33,6 +33,7 @@ from .kpi import KgBuildKpiCollector from .postprocessors import ( PostprocessorContext, + PostprocessorResult, close_loaded_postprocessors, load_postprocessors_for_profile, ) @@ -285,10 +286,9 @@ def _run_mapping() -> Graph | None: loop = asyncio.get_event_loop() _t1 = time.perf_counter() _postprocessors = await self._postprocessors_queue.get() - _t_queue_wait = int((time.perf_counter() - _t1) * 1000) + _queue_wait_ms = int((time.perf_counter() - _t1) * 1000) try: - _t2 = time.perf_counter() - graph = await loop.run_in_executor( + pp_result: PostprocessorResult = await loop.run_in_executor( self._pp_executor, functools.partial( self._apply_postprocessors_with, @@ -297,11 +297,12 @@ def _run_mapping() -> Graph | None: response, existing_web_page_id, _postprocessors, + _queue_wait_ms, ), ) - _t_postprocessors = int((time.perf_counter() - _t2) * 1000) finally: 
self._postprocessors_queue.put_nowait(_postprocessors) + graph = pp_result.graph # Canonical IDs must run after custom postprocessors so any nodes minted # by local logic are normalized before graph sync patching. graph = self._core_ids.process_graph( @@ -359,8 +360,8 @@ def _run_mapping() -> Graph | None: url, _t_mapping_wait, _t_mapping, - _t_queue_wait, - _t_postprocessors, + pp_result.queue_wait_ms, + pp_result.postprocessors_ms, _t_validation_wait, _t_validation_actual, ) @@ -595,21 +596,6 @@ def _prepare_graph_for_put(self, graph: Graph) -> bool: and existing_hash == import_hash ) - def _apply_postprocessors( - self, - graph: Graph, - url: str, - response: WebPageScrapeResponse, - existing_web_page_id: str | None, - ) -> Graph: - return self._apply_postprocessors_with( - graph, - url, - response, - existing_web_page_id, - list(self._postprocessors_queue._queue), # type: ignore[attr-defined] - ) - def _apply_postprocessors_with( self, graph: Graph, @@ -617,9 +603,13 @@ def _apply_postprocessors_with( response: WebPageScrapeResponse, existing_web_page_id: str | None, postprocessors: list, - ) -> Graph: + queue_wait_ms: int, + ) -> PostprocessorResult: + _t_start = time.perf_counter() if not postprocessors: - return graph + return PostprocessorResult( + graph=graph, queue_wait_ms=queue_wait_ms, postprocessors_ms=0 + ) pp_context = self._build_pp_context(url, response, existing_web_page_id) if not pp_context.account_key: @@ -637,7 +627,11 @@ def _apply_postprocessors_with( url, int((time.perf_counter() - _tp) * 1000), ) - return graph + return PostprocessorResult( + graph=graph, + queue_wait_ms=queue_wait_ms, + postprocessors_ms=int((time.perf_counter() - _t_start) * 1000), + ) def _build_pp_context( self, @@ -800,7 +794,6 @@ def _mapping_response( web_page=response.web_page, ) - def _emit_progress(self, payload: dict[str, Any]) -> None: if not callable(self._on_progress): return From 310e5dcb09a5e4e4bcfc2df46b27cc75a0105a70 Mon Sep 17 00:00:00 2001 From: Rubens 
Panfili Date: Thu, 19 Mar 2026 12:46:32 +0100 Subject: [PATCH 34/63] refactor: extract PostprocessorService from ProfileImportProtocol --- .../kg_build/postprocessor_service.py | 195 ++++++++++++++++++ wordlift_sdk/kg_build/protocol.py | 164 ++------------- 2 files changed, 212 insertions(+), 147 deletions(-) create mode 100644 wordlift_sdk/kg_build/postprocessor_service.py diff --git a/wordlift_sdk/kg_build/postprocessor_service.py b/wordlift_sdk/kg_build/postprocessor_service.py new file mode 100644 index 0000000..cd8c21d --- /dev/null +++ b/wordlift_sdk/kg_build/postprocessor_service.py @@ -0,0 +1,195 @@ +from __future__ import annotations + +import asyncio +import functools +import logging +import os +import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import asdict +from pathlib import Path +from typing import Any + +from rdflib import Graph +from wordlift_client.models.web_page_scrape_response import WebPageScrapeResponse +from wordlift_sdk.protocol import Context + +from .config import ProfileDefinition +from .id_allocator import IdAllocator +from .postprocessors import ( + PostprocessorContext, + PostprocessorResult, + close_loaded_postprocessors, + load_postprocessors_for_profile, +) + +logger = logging.getLogger(__name__) + + +def _clean_key(value: Any) -> str | None: + if value is None: + return None + key = str(value).strip() + return key or None + + +class PostprocessorService: + def __init__( + self, + *, + root_dir: Path, + profile: ProfileDefinition, + context: Context, + pool_size: int, + runtime: str, + ) -> None: + self._profile = profile + self._context = context + self._executor = ThreadPoolExecutor( + max_workers=pool_size, thread_name_prefix="worai_pp" + ) + self._queue: asyncio.Queue = asyncio.Queue() + for _ in range(pool_size): + self._queue.put_nowait( + load_postprocessors_for_profile( + root_dir=root_dir, + profile_name=profile.name, + runtime=runtime, + ) + ) + logger.info( + "Created postprocessor pool for 
profile '%s' (pool_size=%d runtime=%s)", + profile.name, + pool_size, + runtime, + ) + + async def apply( + self, + graph: Graph, + url: str, + response: WebPageScrapeResponse, + existing_web_page_id: str | None, + exports: dict[str, Any], + ) -> PostprocessorResult: + _t1 = time.perf_counter() + postprocessors = await self._queue.get() + queue_wait_ms = int((time.perf_counter() - _t1) * 1000) + loop = asyncio.get_event_loop() + try: + return await loop.run_in_executor( + self._executor, + functools.partial( + self._run, + graph, + url, + response, + existing_web_page_id, + postprocessors, + queue_wait_ms, + exports, + ), + ) + finally: + self._queue.put_nowait(postprocessors) + + def build_context( + self, + url: str, + response: WebPageScrapeResponse, + existing_web_page_id: str | None, + exports: dict[str, Any], + ) -> PostprocessorContext: + dataset_uri = str(getattr(self._context.account, "dataset_uri", "")).rstrip("/") + ids = IdAllocator(dataset_uri) if dataset_uri else None + profile_payload = asdict(self._profile) + profile_settings = dict(profile_payload.get("settings", {}) or {}) + profile_settings.setdefault("api_url", "https://api.wordlift.io") + profile_payload["settings"] = profile_settings + return PostprocessorContext( + profile_name=self._profile.name, + profile=profile_payload, + url=url, + account=self._context.account, + account_key=self._resolve_account_key(), + exports=exports, + response=response, + existing_web_page_id=existing_web_page_id, + ids=ids, + ) + + def close(self) -> None: + while not self._queue.empty(): + try: + close_loaded_postprocessors(self._queue.get_nowait()) + except asyncio.QueueEmpty: + break + self._executor.shutdown(wait=False) + + def _run( + self, + graph: Graph, + url: str, + response: WebPageScrapeResponse, + existing_web_page_id: str | None, + postprocessors: list, + queue_wait_ms: int, + exports: dict[str, Any], + ) -> PostprocessorResult: + _t_start = time.perf_counter() + if not postprocessors: + return 
PostprocessorResult( + graph=graph, queue_wait_ms=queue_wait_ms, postprocessors_ms=0 + ) + + pp_context = self.build_context(url, response, existing_web_page_id, exports) + if not pp_context.account_key: + raise RuntimeError( + "Postprocessor runtime requires an API key. Configure one via profile " + "'api_key', WORDLIFT_KEY, or WORDLIFT_API_KEY." + ) + + for processor in postprocessors: + _tp = time.perf_counter() + graph = processor.run(graph, pp_context) + logger.info( + "Applied postprocessor '%s' for %s [%dms]", + processor.name, + url, + int((time.perf_counter() - _tp) * 1000), + ) + return PostprocessorResult( + graph=graph, + queue_wait_ms=queue_wait_ms, + postprocessors_ms=int((time.perf_counter() - _t_start) * 1000), + ) + + def _resolve_account_key(self) -> str | None: + profile_key = _clean_key(self._profile.api_key) + if profile_key: + return profile_key + + client_config = getattr(self._context, "client_configuration", None) + if client_config is not None: + api_key_map = getattr(client_config, "api_key", None) + if isinstance(api_key_map, dict): + runtime_key = _clean_key(api_key_map.get("ApiKey")) + if runtime_key: + return runtime_key + + provider = getattr(self._context, "configuration_provider", None) + if provider is not None: + for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"): + try: + key = _clean_key(provider.get_value(name)) + except Exception: + key = None + if key: + return key + + for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"): + key = _clean_key(os.getenv(name)) + if key: + return key + + return None diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 85fe5c2..b998811 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -1,13 +1,10 @@ from __future__ import annotations import asyncio -import functools import hashlib import logging import os -import time from concurrent.futures import ThreadPoolExecutor -from dataclasses import asdict from pathlib import Path from 
types import SimpleNamespace from typing import Any @@ -28,15 +25,9 @@ from .config import ProfileDefinition from .entity_patcher import EntityPatcher -from .id_allocator import IdAllocator from .id_postprocessor import CanonicalIdsPostprocessor from .kpi import KgBuildKpiCollector -from .postprocessors import ( - PostprocessorContext, - PostprocessorResult, - close_loaded_postprocessors, - load_postprocessors_for_profile, -) +from .postprocessor_service import PostprocessorService from .rml_mapping import MappingResult, RmlMappingService from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer from wordlift_sdk.structured_data.engine import init_morph_kgc_pool @@ -111,13 +102,13 @@ def __init__( .lower() ) self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy) - self._postprocessor_runtime = _resolve_postprocessor_runtime( + _postprocessor_runtime = _resolve_postprocessor_runtime( dict(self.profile.settings) ) logger.info( "Resolved postprocessor runtime for profile '%s': %s (origin=%s)", self.profile.name, - self._postprocessor_runtime, + _postprocessor_runtime, self.profile.origins.get("postprocessor_runtime", "default"), ) _pool_size = int( @@ -137,8 +128,12 @@ def __init__( _pp_pool_size, _pool_size, ) - self._pp_executor = ThreadPoolExecutor( - max_workers=_pp_pool_size, thread_name_prefix="worai_pp" + self._postprocessor_service = PostprocessorService( + root_dir=self.root_dir, + profile=self.profile, + context=context, + pool_size=_pp_pool_size, + runtime=_postprocessor_runtime, ) _mapping_pool_size = int( self.profile.settings.get( @@ -158,15 +153,6 @@ def __init__( self._mapping_executor = ThreadPoolExecutor( max_workers=_pool_size, thread_name_prefix="worai_ml" ) - self._postprocessors_queue: asyncio.Queue = asyncio.Queue() - for _ in range(_pp_pool_size): - self._postprocessors_queue.put_nowait( - load_postprocessors_for_profile( - root_dir=self.root_dir, - profile_name=self.profile.name, - 
runtime=self._postprocessor_runtime, - ) - ) shacl_mode = self._resolve_validation_mode( self.profile.settings.get( "shacl_validate_mode", @@ -283,30 +269,17 @@ def _run_mapping() -> Graph | None: if existing_web_page_id: self._reconcile_root_id(graph, existing_web_page_id) - loop = asyncio.get_event_loop() - _t1 = time.perf_counter() - _postprocessors = await self._postprocessors_queue.get() - _queue_wait_ms = int((time.perf_counter() - _t1) * 1000) - try: - pp_result: PostprocessorResult = await loop.run_in_executor( - self._pp_executor, - functools.partial( - self._apply_postprocessors_with, - graph, - url, - response, - existing_web_page_id, - _postprocessors, - _queue_wait_ms, - ), - ) - finally: - self._postprocessors_queue.put_nowait(_postprocessors) + pp_result = await self._postprocessor_service.apply( + graph, url, response, existing_web_page_id, self._template_exports or {} + ) graph = pp_result.graph # Canonical IDs must run after custom postprocessors so any nodes minted # by local logic are normalized before graph sync patching. 
graph = self._core_ids.process_graph( - graph, self._build_pp_context(url, response, existing_web_page_id) + graph, + self._postprocessor_service.build_context( + url, response, existing_web_page_id, self._template_exports or {} + ), ) self._set_source(graph, existing_web_page_id) self._set_existing_import_hash(graph, existing_import_hash) @@ -367,12 +340,7 @@ def _run_mapping() -> Graph | None: ) def close(self) -> None: - while not self._postprocessors_queue.empty(): - try: - close_loaded_postprocessors(self._postprocessors_queue.get_nowait()) - except asyncio.QueueEmpty: - break - self._pp_executor.shutdown(wait=False) + self._postprocessor_service.close() self._mapping_executor.shutdown(wait=False) self._shacl_validator.close() @@ -596,104 +564,6 @@ def _prepare_graph_for_put(self, graph: Graph) -> bool: and existing_hash == import_hash ) - def _apply_postprocessors_with( - self, - graph: Graph, - url: str, - response: WebPageScrapeResponse, - existing_web_page_id: str | None, - postprocessors: list, - queue_wait_ms: int, - ) -> PostprocessorResult: - _t_start = time.perf_counter() - if not postprocessors: - return PostprocessorResult( - graph=graph, queue_wait_ms=queue_wait_ms, postprocessors_ms=0 - ) - - pp_context = self._build_pp_context(url, response, existing_web_page_id) - if not pp_context.account_key: - raise RuntimeError( - "Postprocessor runtime requires an API key. Configure one via profile " - "'api_key', WORDLIFT_KEY, or WORDLIFT_API_KEY." 
- ) - - for processor in postprocessors: - _tp = time.perf_counter() - graph = processor.run(graph, pp_context) - logger.info( - "Applied postprocessor '%s' for %s [%dms]", - processor.name, - url, - int((time.perf_counter() - _tp) * 1000), - ) - return PostprocessorResult( - graph=graph, - queue_wait_ms=queue_wait_ms, - postprocessors_ms=int((time.perf_counter() - _t_start) * 1000), - ) - - def _build_pp_context( - self, - url: str, - response: WebPageScrapeResponse, - existing_web_page_id: str | None, - ) -> PostprocessorContext: - dataset_uri = str(getattr(self.context.account, "dataset_uri", "")).rstrip("/") - ids = IdAllocator(dataset_uri) if dataset_uri else None - profile_payload = asdict(self.profile) - profile_settings = dict(profile_payload.get("settings", {}) or {}) - profile_settings.setdefault("api_url", "https://api.wordlift.io") - profile_payload["settings"] = profile_settings - return PostprocessorContext( - profile_name=self.profile.name, - profile=profile_payload, - url=url, - account=self.context.account, - account_key=self._resolve_postprocessor_account_key(), - exports=self._template_exports or {}, - response=response, - existing_web_page_id=existing_web_page_id, - ids=ids, - ) - - def _resolve_postprocessor_account_key(self) -> str | None: - profile_key = self._clean_key(self.profile.api_key) - if profile_key: - return profile_key - - client_config = getattr(self.context, "client_configuration", None) - if client_config is not None: - api_key_map = getattr(client_config, "api_key", None) - if isinstance(api_key_map, dict): - runtime_key = self._clean_key(api_key_map.get("ApiKey")) - if runtime_key: - return runtime_key - - provider = getattr(self.context, "configuration_provider", None) - if provider is not None: - for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"): - try: - key = self._clean_key(provider.get_value(name)) - except Exception: - key = None - if key: - return key - - for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"): - key = 
self._clean_key(os.getenv(name)) - if key: - return key - - return None - - @staticmethod - def _clean_key(value: Any) -> str | None: - if value is None: - return None - key = str(value).strip() - return key or None - def _write_debug_graph(self, graph: Graph, url: str) -> None: assert self.debug_dir is not None self.debug_dir.mkdir(parents=True, exist_ok=True) From e422d008b25a79ad4c505fffea84dcb6c2ee391e Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 12:49:47 +0100 Subject: [PATCH 35/63] fix+refactor: inject MaterializationPipeline/HtmlConverter and fix TLS attribute name --- wordlift_sdk/kg_build/rml_mapping.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/wordlift_sdk/kg_build/rml_mapping.py b/wordlift_sdk/kg_build/rml_mapping.py index 316ff11..6d9ef39 100644 --- a/wordlift_sdk/kg_build/rml_mapping.py +++ b/wordlift_sdk/kg_build/rml_mapping.py @@ -26,9 +26,15 @@ class MappingResult: class RmlMappingService: - def __init__(self, context: Context) -> None: + def __init__( + self, + context: Context, + pipeline: MaterializationPipeline | None = None, + html_converter: HtmlConverter | None = None, + ) -> None: self._context = context - self._html_converter = HtmlConverter() + self._pipeline = pipeline or MaterializationPipeline() + self._html_converter = html_converter or HtmlConverter() def _to_xhtml(self, html: str) -> str: return self._html_converter.convert(html) @@ -72,22 +78,21 @@ async def apply_mapping( if not dataset_uri: raise RuntimeError("Dataset URI not available on context.account.") - pipeline = MaterializationPipeline() - normalized_yarrrml, mappings = pipeline.normalize( + normalized_yarrrml, mappings = self._pipeline.normalize( resolved_mapping_content, url, Path(data_path), response=response, ) - jsonld_raw = pipeline.materialize( + jsonld_raw = self._pipeline.materialize( normalized_yarrrml, Path(data_path), Path(temp_dir), url=url, response=response, ) - queue_wait_ms = 
getattr(_morph_kgc_tls, "queue_wait_ms", 0) - jsonld_data = pipeline.postprocess( + queue_wait_ms = getattr(_morph_kgc_tls, "mapping_wait_ms", 0) + jsonld_data = self._pipeline.postprocess( jsonld_raw, mappings, xhtml_str, From 23a1a1478059d74e0e0bb4e6d69fcb434d81f046 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 14:01:08 +0100 Subject: [PATCH 36/63] refactor: wire _setting helper into __init__ to eliminate nested .get() chains --- wordlift_sdk/kg_build/protocol.py | 75 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index b998811..9b117d8 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -41,11 +41,20 @@ def _path_contains_part(path: str, part: str) -> bool: return part in Path(path).parts +def _setting(settings: dict, name: str, fallback: str, default: Any) -> Any: + """Read a profile setting by snake_case name, falling back to UPPER_CASE, then default.""" + v = settings.get(name) + if v is None: + v = settings.get(fallback) + return default if v is None else v + + def _resolve_postprocessor_runtime(settings: dict[str, Any]) -> str: - value = settings.get("postprocessor_runtime") - if value is None: - value = settings.get("POSTPROCESSOR_RUNTIME") - return str(value or "persistent") + return str( + _setting( + settings, "postprocessor_runtime", "POSTPROCESSOR_RUNTIME", "persistent" + ) + ) class ProfileImportProtocol(WebPageImportProtocolInterface): @@ -91,35 +100,32 @@ def __init__( self._mapping_cache: dict[Path, str] = {} self._static_templates_patched = False self._static_templates_lock = asyncio.Lock() + + settings = dict(self.profile.settings) canonical_id_strategy = ( str( - self.profile.settings.get( - "canonical_id_strategy", - self.profile.settings.get("CANONICAL_ID_STRATEGY", "legacy"), + _setting( + settings, "canonical_id_strategy", "CANONICAL_ID_STRATEGY", "legacy" ) ) 
.strip() .lower() ) self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy) - _postprocessor_runtime = _resolve_postprocessor_runtime( - dict(self.profile.settings) - ) + _postprocessor_runtime = _resolve_postprocessor_runtime(settings) logger.info( "Resolved postprocessor runtime for profile '%s': %s (origin=%s)", self.profile.name, _postprocessor_runtime, self.profile.origins.get("postprocessor_runtime", "default"), ) - _pool_size = int( - self.profile.settings.get( - "concurrency", self.profile.settings.get("CONCURRENCY", 4) - ) - ) + _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4)) _pp_pool_size = int( - self.profile.settings.get( + _setting( + settings, "postprocessor_pool_size", - self.profile.settings.get("POSTPROCESSOR_POOL_SIZE", _pool_size), + "POSTPROCESSOR_POOL_SIZE", + _pool_size, ) ) logger.info( @@ -136,9 +142,8 @@ def __init__( runtime=_postprocessor_runtime, ) _mapping_pool_size = int( - self.profile.settings.get( - "mapping_pool_size", - self.profile.settings.get("MAPPING_POOL_SIZE", os.cpu_count() or 4), + _setting( + settings, "mapping_pool_size", "MAPPING_POOL_SIZE", os.cpu_count() or 4 ) ) logger.info( @@ -154,27 +159,21 @@ def __init__( max_workers=_pool_size, thread_name_prefix="worai_ml" ) shacl_mode = self._resolve_validation_mode( - self.profile.settings.get( - "shacl_validate_mode", - self.profile.settings.get("SHACL_VALIDATE_MODE", "warn"), - ) + _setting(settings, "shacl_validate_mode", "SHACL_VALIDATE_MODE", "warn") ) shacl_builtin_shapes = self._resolve_list_setting( - self.profile.settings.get( - "shacl_builtin_shapes", - self.profile.settings.get("SHACL_BUILTIN_SHAPES"), - ) + _setting(settings, "shacl_builtin_shapes", "SHACL_BUILTIN_SHAPES", None) ) shacl_exclude_builtin_shapes = self._resolve_list_setting( - self.profile.settings.get( + _setting( + settings, "shacl_exclude_builtin_shapes", - self.profile.settings.get("SHACL_EXCLUDE_BUILTIN_SHAPES"), + "SHACL_EXCLUDE_BUILTIN_SHAPES", + None, 
) ) shacl_extra_shapes = self._resolve_list_setting( - self.profile.settings.get( - "shacl_extra_shapes", self.profile.settings.get("SHACL_EXTRA_SHAPES") - ) + _setting(settings, "shacl_extra_shapes", "SHACL_EXTRA_SHAPES", None) ) self._shacl_shape_specs = resolve_shape_specs( builtin_shapes=shacl_builtin_shapes or None, @@ -182,9 +181,8 @@ def __init__( extra_shapes=shacl_extra_shapes or None, ) _shacl_pool_size = int( - self.profile.settings.get( - "shacl_pool_size", - self.profile.settings.get("SHACL_POOL_SIZE", max(2, _pool_size // 2)), + _setting( + settings, "shacl_pool_size", "SHACL_POOL_SIZE", max(2, _pool_size // 2) ) ) self._shacl_validator = ShaclValidationService( @@ -193,10 +191,7 @@ def __init__( pool_size=_shacl_pool_size, ) self._import_hash_mode = self._resolve_import_hash_mode( - self.profile.settings.get( - "import_hash_mode", - self.profile.settings.get("IMPORT_HASH_MODE", "on"), - ) + _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on") ) self._kpi = KgBuildKpiCollector( dataset_uri=getattr(self.context.account, "dataset_uri", None), From d68f883fb307040434c601f626c278a4f8433b13 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 14:03:44 +0100 Subject: [PATCH 37/63] refactor: extract _init_postprocessor_service/_mapping_executor/_shacl_validator from __init__ --- wordlift_sdk/kg_build/protocol.py | 91 +++++++++++++++++-------------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 9b117d8..6f3e1c9 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -112,36 +112,61 @@ def __init__( .lower() ) self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy) - _postprocessor_runtime = _resolve_postprocessor_runtime(settings) + _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4)) + self._init_postprocessor_service(settings, context, _pool_size) + 
self._init_mapping_executor(settings, _pool_size) + self._init_shacl_validator(settings, _pool_size) + self._import_hash_mode = self._resolve_import_hash_mode( + _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on") + ) + self._kpi = KgBuildKpiCollector( + dataset_uri=getattr(self.context.account, "dataset_uri", None), + validation_enabled=self._shacl_validator.mode != ValidationMode.OFF, + ) + logger.debug( + "Resolved mappings for profile '%s': effective_dir=%s (origin=%s), routes=%s (origin=%s), overlay_dirs=%s", + self.profile.name, + self.mappings_dir, + self.profile.origins.get("mappings_dir", "default"), + len(self.profile.routes), + self.profile.origins.get("routes", "default"), + [str(p) for p in self._mapping_dirs], + ) + + def _init_postprocessor_service( + self, settings: dict, context: Context, pool_size: int + ) -> None: + runtime = _resolve_postprocessor_runtime(settings) logger.info( "Resolved postprocessor runtime for profile '%s': %s (origin=%s)", self.profile.name, - _postprocessor_runtime, + runtime, self.profile.origins.get("postprocessor_runtime", "default"), ) - _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4)) - _pp_pool_size = int( + pp_pool_size = int( _setting( settings, "postprocessor_pool_size", "POSTPROCESSOR_POOL_SIZE", - _pool_size, + pool_size, ) ) logger.info( "Postprocessor pool size for profile '%s': %d (concurrency=%d)", self.profile.name, - _pp_pool_size, - _pool_size, + pp_pool_size, + pool_size, ) self._postprocessor_service = PostprocessorService( root_dir=self.root_dir, profile=self.profile, context=context, - pool_size=_pp_pool_size, - runtime=_postprocessor_runtime, + pool_size=pp_pool_size, + runtime=runtime, ) - _mapping_pool_size = int( + + def _init_mapping_executor(self, settings: dict, pool_size: int) -> None: + mapping_pool_size = int( _setting( settings, "mapping_pool_size", "MAPPING_POOL_SIZE", os.cpu_count() or 4 ) @@ -149,22 +174,24 @@ def __init__( logger.info( "Mapping pool size 
for profile '%s': %d", self.profile.name, - _mapping_pool_size, + mapping_pool_size, ) - init_morph_kgc_pool(_mapping_pool_size) + init_morph_kgc_pool(mapping_pool_size) # Wraps apply_mapping calls so they run in a thread rather than blocking # the asyncio event loop. The thread itself blocks on the morph_kgc # ProcessPoolExecutor slot, leaving the event loop free for I/O. self._mapping_executor = ThreadPoolExecutor( - max_workers=_pool_size, thread_name_prefix="worai_ml" + max_workers=pool_size, thread_name_prefix="worai_ml" ) - shacl_mode = self._resolve_validation_mode( + + def _init_shacl_validator(self, settings: dict, pool_size: int) -> None: + mode = self._resolve_validation_mode( _setting(settings, "shacl_validate_mode", "SHACL_VALIDATE_MODE", "warn") ) - shacl_builtin_shapes = self._resolve_list_setting( + builtin_shapes = self._resolve_list_setting( _setting(settings, "shacl_builtin_shapes", "SHACL_BUILTIN_SHAPES", None) ) - shacl_exclude_builtin_shapes = self._resolve_list_setting( + exclude_builtin_shapes = self._resolve_list_setting( _setting( settings, "shacl_exclude_builtin_shapes", @@ -172,39 +199,23 @@ def __init__( None, ) ) - shacl_extra_shapes = self._resolve_list_setting( + extra_shapes = self._resolve_list_setting( _setting(settings, "shacl_extra_shapes", "SHACL_EXTRA_SHAPES", None) ) self._shacl_shape_specs = resolve_shape_specs( - builtin_shapes=shacl_builtin_shapes or None, - exclude_builtin_shapes=shacl_exclude_builtin_shapes or None, - extra_shapes=shacl_extra_shapes or None, + builtin_shapes=builtin_shapes or None, + exclude_builtin_shapes=exclude_builtin_shapes or None, + extra_shapes=extra_shapes or None, ) - _shacl_pool_size = int( + shacl_pool_size = int( _setting( - settings, "shacl_pool_size", "SHACL_POOL_SIZE", max(2, _pool_size // 2) + settings, "shacl_pool_size", "SHACL_POOL_SIZE", max(2, pool_size // 2) ) ) self._shacl_validator = ShaclValidationService( shape_specs=self._shacl_shape_specs or None, - mode=shacl_mode, - 
pool_size=_shacl_pool_size, - ) - self._import_hash_mode = self._resolve_import_hash_mode( - _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on") - ) - self._kpi = KgBuildKpiCollector( - dataset_uri=getattr(self.context.account, "dataset_uri", None), - validation_enabled=self._shacl_validator.mode != ValidationMode.OFF, - ) - logger.debug( - "Resolved mappings for profile '%s': effective_dir=%s (origin=%s), routes=%s (origin=%s), overlay_dirs=%s", - self.profile.name, - self.mappings_dir, - self.profile.origins.get("mappings_dir", "default"), - len(self.profile.routes), - self.profile.origins.get("routes", "default"), - [str(p) for p in self._mapping_dirs], + mode=mode, + pool_size=shacl_pool_size, ) async def callback( From ec977d20fa9ca7e26bc96296cf4729889f6a7ff1 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 14:08:40 +0100 Subject: [PATCH 38/63] refactor: split callback into _run_mapping_stage and _run_postprocessing_stage --- wordlift_sdk/kg_build/protocol.py | 124 +++++++++++++++++------------- 1 file changed, 69 insertions(+), 55 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 6f3e1c9..8c03701 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -229,66 +229,26 @@ async def callback( if hasattr(response, "web_page") and response.web_page else "Unknown URL" ) - if hasattr(response, "errors") and response.errors: logger.error("Cloud callback error for %s: %s", url, response.errors) return - if not response.web_page or not response.web_page.html: logger.warning("No HTML content for %s, skipping mapping", url) return await self._patch_static_templates_once() - mapping_path = self._resolve_mapping_path(url) - rendered_mapping = self._get_mapping_content(mapping_path) - mapping_response = self._mapping_response(response, existing_web_page_id) debug_output: dict[str, str] | None = {} if self.debug_dir else None - - # apply_mapping has no 
awaits — all work is synchronous (morph_kgc). - # Run it in a thread so the event loop stays free for I/O while the - # thread waits for its morph_kgc subprocess slot to become available. - _timing: dict[str, int] = {} - - def _run_mapping() -> Graph | None: - mapping: MappingResult = asyncio.run( - self.rml_service.apply_mapping( - html=response.web_page.html, - url=url, - mapping_file_path=mapping_path, - mapping_content=rendered_mapping, - response=mapping_response, - debug_output=debug_output, - ) - ) - _timing["mapping_wait_ms"] = mapping.queue_wait_ms - _timing["mapping_ms"] = mapping.mapping_ms - return mapping.graph - - _loop = asyncio.get_event_loop() - graph = await _loop.run_in_executor(self._mapping_executor, _run_mapping) - _t_mapping = _timing.get("mapping_ms", 0) - _t_mapping_wait = _timing.get("mapping_wait_ms", 0) - if not graph or len(graph) == 0: + mapping = await self._run_mapping_stage( + response, url, existing_web_page_id, debug_output + ) + if not mapping.graph or len(mapping.graph) == 0: logger.warning("No triples produced for %s", url) return - if existing_web_page_id: - self._reconcile_root_id(graph, existing_web_page_id) - pp_result = await self._postprocessor_service.apply( - graph, url, response, existing_web_page_id, self._template_exports or {} + graph, pp_result = await self._run_postprocessing_stage( + mapping.graph, url, response, existing_web_page_id, existing_import_hash ) - graph = pp_result.graph - # Canonical IDs must run after custom postprocessors so any nodes minted - # by local logic are normalized before graph sync patching. 
- graph = self._core_ids.process_graph( - graph, - self._postprocessor_service.build_context( - url, response, existing_web_page_id, self._template_exports or {} - ), - ) - self._set_source(graph, existing_web_page_id) - self._set_existing_import_hash(graph, existing_import_hash) if self.debug_dir: xhtml = (debug_output or {}).get("xhtml") @@ -313,19 +273,16 @@ def _run_mapping() -> Graph | None: warning_sources=outcome.warning_sources, error_sources=outcome.error_sources, ) - _t_validation_wait = outcome.queue_wait_ms if outcome else 0 - _t_validation_actual = outcome.validation_ms if outcome else 0 - graph_metrics = self._kpi.graph_metrics(graph) + self._kpi.record_graph(graph) self._emit_progress( { "kind": "graph", "profile": self.profile.name, "url": url, - "graph": graph_metrics, + "graph": self._kpi.graph_metrics(graph), "validation": outcome.to_dict() if outcome else None, } ) - self._kpi.record_graph(graph) if ( outcome is not None and self._shacl_validator.mode == ValidationMode.FAIL @@ -337,12 +294,12 @@ def _run_mapping() -> Graph | None: "Wrote %s triples for %s [mapping_wait=%dms mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]", len(graph), url, - _t_mapping_wait, - _t_mapping, + mapping.queue_wait_ms, + mapping.mapping_ms, pp_result.queue_wait_ms, pp_result.postprocessors_ms, - _t_validation_wait, - _t_validation_actual, + outcome.queue_wait_ms if outcome else 0, + outcome.validation_ms if outcome else 0, ) def close(self) -> None: @@ -353,6 +310,63 @@ def close(self) -> None: def get_kpi_summary(self) -> dict[str, object]: return self._kpi.summary(self.profile.name) + async def _run_mapping_stage( + self, + response: WebPageScrapeResponse, + url: str, + existing_web_page_id: str | None, + debug_output: dict[str, str] | None, + ) -> MappingResult: + mapping_path = self._resolve_mapping_path(url) + rendered_mapping = self._get_mapping_content(mapping_path) + mapping_response = 
self._mapping_response(response, existing_web_page_id) + + def _run() -> MappingResult: + # apply_mapping has no awaits — all work is synchronous (morph_kgc). + # Run in a thread so the event loop stays free for I/O while the + # thread waits for its morph_kgc subprocess slot. + return asyncio.run( + self.rml_service.apply_mapping( + html=response.web_page.html, + url=url, + mapping_file_path=mapping_path, + mapping_content=rendered_mapping, + response=mapping_response, + debug_output=debug_output, + ) + ) + + return await asyncio.get_event_loop().run_in_executor( + self._mapping_executor, _run + ) + + async def _run_postprocessing_stage( + self, + graph: Graph, + url: str, + response: WebPageScrapeResponse, + existing_web_page_id: str | None, + existing_import_hash: str | None, + ) -> tuple[Graph, Any]: + if existing_web_page_id: + self._reconcile_root_id(graph, existing_web_page_id) + exports = self._template_exports or {} + pp_result = await self._postprocessor_service.apply( + graph, url, response, existing_web_page_id, exports + ) + graph = pp_result.graph + # Canonical IDs must run after custom postprocessors so any nodes minted + # by local logic are normalised before graph sync patching. 
+ graph = self._core_ids.process_graph( + graph, + self._postprocessor_service.build_context( + url, response, existing_web_page_id, exports + ), + ) + self._set_source(graph, existing_web_page_id) + self._set_existing_import_hash(graph, existing_import_hash) + return graph, pp_result + def _resolve_path(self, raw_path: str) -> Path: path = Path(raw_path) if path.is_absolute(): From 2f141e96a0dd8117f60d8fae8b1f8d06055efdc1 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 14:12:46 +0100 Subject: [PATCH 39/63] refactor: move canonical_id_strategy and _core_ids into _init_postprocessor_service --- wordlift_sdk/kg_build/protocol.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 8c03701..87bbc72 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -102,16 +102,6 @@ def __init__( self._static_templates_lock = asyncio.Lock() settings = dict(self.profile.settings) - canonical_id_strategy = ( - str( - _setting( - settings, "canonical_id_strategy", "CANONICAL_ID_STRATEGY", "legacy" - ) - ) - .strip() - .lower() - ) - self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy) _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4)) self._init_postprocessor_service(settings, context, _pool_size) self._init_mapping_executor(settings, _pool_size) @@ -136,6 +126,16 @@ def __init__( def _init_postprocessor_service( self, settings: dict, context: Context, pool_size: int ) -> None: + canonical_id_strategy = ( + str( + _setting( + settings, "canonical_id_strategy", "CANONICAL_ID_STRATEGY", "legacy" + ) + ) + .strip() + .lower() + ) + self._core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy) runtime = _resolve_postprocessor_runtime(settings) logger.info( "Resolved postprocessor runtime for profile '%s': %s (origin=%s)", From a7dddad1bb7b4a032d992cbf1156823bef79d80f 
Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 14:15:46 +0100 Subject: [PATCH 40/63] refactor: move RmlMappingService construction into _init_mapping_service --- wordlift_sdk/kg_build/protocol.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 87bbc72..9b1e943 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -90,7 +90,6 @@ def __init__( self.profile.mapping_overlay_dirs or (self.profile.mappings_dir,) ) - self.rml_service = RmlMappingService(context) self.patcher = EntityPatcher(context) self.template_reifier = JinjaRdfTemplateReifier(self._template_dirs) self.text_renderer = TemplateTextRenderer() @@ -104,7 +103,7 @@ def __init__( settings = dict(self.profile.settings) _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4)) self._init_postprocessor_service(settings, context, _pool_size) - self._init_mapping_executor(settings, _pool_size) + self._init_mapping_service(settings, context, _pool_size) self._init_shacl_validator(settings, _pool_size) self._import_hash_mode = self._resolve_import_hash_mode( _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on") @@ -165,7 +164,10 @@ def _init_postprocessor_service( runtime=runtime, ) - def _init_mapping_executor(self, settings: dict, pool_size: int) -> None: + def _init_mapping_service( + self, settings: dict, context: Context, pool_size: int + ) -> None: + self.rml_service = RmlMappingService(context) mapping_pool_size = int( _setting( settings, "mapping_pool_size", "MAPPING_POOL_SIZE", os.cpu_count() or 4 From 1fd0273be3fb5714e3b9db4a846200dc1bfdbb81 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 14:17:45 +0100 Subject: [PATCH 41/63] refactor: extract _init_graph_writer, moving patcher and import_hash_mode together --- wordlift_sdk/kg_build/protocol.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 
deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 9b1e943..1ee81c1 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -90,7 +90,6 @@ def __init__( self.profile.mapping_overlay_dirs or (self.profile.mappings_dir,) ) - self.patcher = EntityPatcher(context) self.template_reifier = JinjaRdfTemplateReifier(self._template_dirs) self.text_renderer = TemplateTextRenderer() @@ -105,9 +104,7 @@ def __init__( self._init_postprocessor_service(settings, context, _pool_size) self._init_mapping_service(settings, context, _pool_size) self._init_shacl_validator(settings, _pool_size) - self._import_hash_mode = self._resolve_import_hash_mode( - _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on") - ) + self._init_graph_writer(settings, context) self._kpi = KgBuildKpiCollector( dataset_uri=getattr(self.context.account, "dataset_uri", None), validation_enabled=self._shacl_validator.mode != ValidationMode.OFF, @@ -220,6 +217,12 @@ def _init_shacl_validator(self, settings: dict, pool_size: int) -> None: pool_size=shacl_pool_size, ) + def _init_graph_writer(self, settings: dict, context: Context) -> None: + self.patcher = EntityPatcher(context) + self._import_hash_mode = self._resolve_import_hash_mode( + _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on") + ) + async def callback( self, response: WebPageScrapeResponse, From 31437f1088f5d93e29981ac45679a835ee56dfc4 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 14:29:09 +0100 Subject: [PATCH 42/63] refactor(kg_build): decompose ProfileImportProtocol.__init__ and reduce class surface --- wordlift_sdk/kg_build/protocol.py | 169 +++++++++++++++--------------- 1 file changed, 82 insertions(+), 87 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 1ee81c1..f642b7a 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -27,7 
+27,7 @@ from .entity_patcher import EntityPatcher from .id_postprocessor import CanonicalIdsPostprocessor from .kpi import KgBuildKpiCollector -from .postprocessor_service import PostprocessorService +from .postprocessor_service import PostprocessorService, PostprocessorResult from .rml_mapping import MappingResult, RmlMappingService from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer from wordlift_sdk.structured_data.engine import init_morph_kgc_pool @@ -41,6 +41,59 @@ def _path_contains_part(path: str, part: str) -> bool: return part in Path(path).parts +def _find_web_page_iri(graph: Graph) -> URIRef | None: + for subject in graph.subjects(RDF.type, URIRef("http://schema.org/WebPage")): + return subject + for subject in graph.subjects(RDF.type, URIRef("https://schema.org/WebPage")): + return subject + return None + + +def _swap_iris(graph: Graph, old_iri: URIRef, new_iri: URIRef) -> None: + for subject, predicate, obj in list(graph.triples((old_iri, None, None))): + graph.remove((subject, predicate, obj)) + graph.add((new_iri, predicate, obj)) + for subject, predicate, obj in list(graph.triples((None, None, old_iri))): + graph.remove((subject, predicate, obj)) + graph.add((subject, predicate, new_iri)) + + +def _resolve_list_setting(value: Any) -> list[str]: + if value is None: + return [] + if isinstance(value, str): + return [part.strip() for part in value.split(",") if part.strip()] + if isinstance(value, (list, tuple)): + return [text for item in value if (text := str(item).strip())] + return [str(value).strip()] if str(value).strip() else [] + + +def _resolve_validation_mode(value: Any) -> ValidationMode: + if value is None: + return ValidationMode.WARN + mode = str(value).strip().lower() + if mode == "strict": + logger.warning( + "Deprecated SHACL validation mode 'strict' detected; using 'fail'." 
+ ) + return ValidationMode.FAIL + try: + return ValidationMode(mode) + except ValueError: + logger.warning("Unsupported SHACL validation mode '%s'; using 'warn'.", mode) + return ValidationMode.WARN + + +def _resolve_import_hash_mode(value: Any) -> str: + if value is None: + return "on" + mode = str(value).strip().lower() + if mode in {"on", "write", "off"}: + return mode + logger.warning("Unsupported import hash mode '%s'; using 'on'.", mode) + return "on" + + def _setting(settings: dict, name: str, fallback: str, default: Any) -> Any: """Read a profile setting by snake_case name, falling back to UPPER_CASE, then default.""" v = settings.get(name) @@ -81,23 +134,6 @@ def __init__( self._graph_write_strategy = graph_write_strategy self.profile_dir = self.root_dir / "profiles" / self.profile.name - self.templates_dir = self._resolve_path(self.profile.templates_dir) - self.mappings_dir = self._resolve_path(self.profile.mappings_dir) - self._template_dirs = self._resolve_overlay_paths( - self.profile.template_overlay_dirs or (self.profile.templates_dir,) - ) - self._mapping_dirs = self._resolve_overlay_paths( - self.profile.mapping_overlay_dirs or (self.profile.mappings_dir,) - ) - - self.template_reifier = JinjaRdfTemplateReifier(self._template_dirs) - self.text_renderer = TemplateTextRenderer() - - self._template_graph: Graph | None = None - self._template_exports: dict[str, Any] | None = None - self._mapping_cache: dict[Path, str] = {} - self._static_templates_patched = False - self._static_templates_lock = asyncio.Lock() settings = dict(self.profile.settings) _pool_size = int(_setting(settings, "concurrency", "CONCURRENCY", 4)) @@ -164,6 +200,21 @@ def _init_postprocessor_service( def _init_mapping_service( self, settings: dict, context: Context, pool_size: int ) -> None: + self.templates_dir = self._resolve_path(self.profile.templates_dir) + self.mappings_dir = self._resolve_path(self.profile.mappings_dir) + self._template_dirs = self._resolve_overlay_paths( + 
self.profile.template_overlay_dirs or (self.profile.templates_dir,) + ) + self._mapping_dirs = self._resolve_overlay_paths( + self.profile.mapping_overlay_dirs or (self.profile.mappings_dir,) + ) + self.template_reifier = JinjaRdfTemplateReifier(self._template_dirs) + self.text_renderer = TemplateTextRenderer() + self._template_graph: Graph | None = None + self._template_exports: dict[str, Any] | None = None + self._mapping_cache: dict[Path, str] = {} + self._static_templates_patched = False + self._static_templates_lock = asyncio.Lock() self.rml_service = RmlMappingService(context) mapping_pool_size = int( _setting( @@ -184,13 +235,13 @@ def _init_mapping_service( ) def _init_shacl_validator(self, settings: dict, pool_size: int) -> None: - mode = self._resolve_validation_mode( + mode = _resolve_validation_mode( _setting(settings, "shacl_validate_mode", "SHACL_VALIDATE_MODE", "warn") ) - builtin_shapes = self._resolve_list_setting( + builtin_shapes = _resolve_list_setting( _setting(settings, "shacl_builtin_shapes", "SHACL_BUILTIN_SHAPES", None) ) - exclude_builtin_shapes = self._resolve_list_setting( + exclude_builtin_shapes = _resolve_list_setting( _setting( settings, "shacl_exclude_builtin_shapes", @@ -198,10 +249,10 @@ def _init_shacl_validator(self, settings: dict, pool_size: int) -> None: None, ) ) - extra_shapes = self._resolve_list_setting( + extra_shapes = _resolve_list_setting( _setting(settings, "shacl_extra_shapes", "SHACL_EXTRA_SHAPES", None) ) - self._shacl_shape_specs = resolve_shape_specs( + shape_specs = resolve_shape_specs( builtin_shapes=builtin_shapes or None, exclude_builtin_shapes=exclude_builtin_shapes or None, extra_shapes=extra_shapes or None, @@ -212,14 +263,14 @@ def _init_shacl_validator(self, settings: dict, pool_size: int) -> None: ) ) self._shacl_validator = ShaclValidationService( - shape_specs=self._shacl_shape_specs or None, + shape_specs=shape_specs or None, mode=mode, pool_size=shacl_pool_size, ) def _init_graph_writer(self, 
settings: dict, context: Context) -> None: self.patcher = EntityPatcher(context) - self._import_hash_mode = self._resolve_import_hash_mode( + self._import_hash_mode = _resolve_import_hash_mode( _setting(settings, "import_hash_mode", "IMPORT_HASH_MODE", "on") ) @@ -352,7 +403,7 @@ async def _run_postprocessing_stage( response: WebPageScrapeResponse, existing_web_page_id: str | None, existing_import_hash: str | None, - ) -> tuple[Graph, Any]: + ) -> tuple[Graph, PostprocessorResult]: if existing_web_page_id: self._reconcile_root_id(graph, existing_web_page_id) exports = self._template_exports or {} @@ -368,7 +419,7 @@ async def _run_postprocessing_stage( url, response, existing_web_page_id, exports ), ) - self._set_source(graph, existing_web_page_id) + self._set_source(graph) self._set_existing_import_hash(graph, existing_import_hash) return graph, pp_result @@ -609,27 +660,11 @@ def _write_debug_source_documents( xhtml_file.write_text(xhtml, encoding="utf-8") def _reconcile_root_id(self, graph: Graph, root_id: str) -> None: - old_iri = self._find_web_page_iri(graph) + old_iri = _find_web_page_iri(graph) if old_iri and str(old_iri) != root_id: - self._swap_iris(graph, old_iri, URIRef(root_id)) - - def _find_web_page_iri(self, graph: Graph) -> URIRef | None: - for subject in graph.subjects(RDF.type, URIRef("http://schema.org/WebPage")): - return subject - for subject in graph.subjects(RDF.type, URIRef("https://schema.org/WebPage")): - return subject - return None - - def _swap_iris(self, graph: Graph, old_iri: URIRef, new_iri: URIRef) -> None: - for subject, predicate, obj in list(graph.triples((old_iri, None, None))): - graph.remove((subject, predicate, obj)) - graph.add((new_iri, predicate, obj)) - for subject, predicate, obj in list(graph.triples((None, None, old_iri))): - graph.remove((subject, predicate, obj)) - graph.add((subject, predicate, new_iri)) - - def _set_source(self, graph: Graph, existing_web_page_id: str | None) -> None: - del existing_web_page_id + 
_swap_iris(graph, old_iri, URIRef(root_id)) + + def _set_source(self, graph: Graph) -> None: for subject in self._first_level_subjects(graph): graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import"))) @@ -696,43 +731,3 @@ def _emit_progress(self, payload: dict[str, Any]) -> None: self._on_progress(payload) except Exception: logger.warning("Failed to emit kg_build progress payload.", exc_info=True) - - def _resolve_list_setting(self, value: Any) -> list[str]: - if value is None: - return [] - if isinstance(value, str): - return [part.strip() for part in value.split(",") if part.strip()] - if isinstance(value, (list, tuple)): - specs: list[str] = [] - for item in value: - text = str(item).strip() - if text: - specs.append(text) - return specs - return [str(value).strip()] if str(value).strip() else [] - - def _resolve_validation_mode(self, value: Any) -> ValidationMode: - if value is None: - return ValidationMode.WARN - mode = str(value).strip().lower() - if mode == "strict": - logger.warning( - "Deprecated SHACL validation mode 'strict' detected; using 'fail'." 
- ) - return ValidationMode.FAIL - try: - return ValidationMode(mode) - except ValueError: - logger.warning( - "Unsupported SHACL validation mode '%s'; using 'warn'.", mode - ) - return ValidationMode.WARN - - def _resolve_import_hash_mode(self, value: Any) -> str: - if value is None: - return "on" - mode = str(value).strip().lower() - if mode in {"on", "write", "off"}: - return mode - logger.warning("Unsupported import hash mode '%s'; using 'on'.", mode) - return "on" From f72e457204ccb68df33365aa46a8bdafbcb8c5a6 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 15:00:34 +0100 Subject: [PATCH 43/63] refactor: extract RootIdReconcilerPostprocessor from protocol._reconcile_root_id --- wordlift_sdk/kg_build/id_postprocessor.py | 38 ++++++++++++++++++++++- wordlift_sdk/kg_build/protocol.py | 31 +++--------------- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/wordlift_sdk/kg_build/id_postprocessor.py b/wordlift_sdk/kg_build/id_postprocessor.py index 7660a13..ae4326e 100644 --- a/wordlift_sdk/kg_build/id_postprocessor.py +++ b/wordlift_sdk/kg_build/id_postprocessor.py @@ -1,11 +1,47 @@ from __future__ import annotations -from rdflib import Graph +from rdflib import Graph, RDF, URIRef from .id_generator import CanonicalIdGenerator from .iri_lookup import IriLookup +def _find_web_page_iri(graph: Graph) -> URIRef | None: + for subject in graph.subjects(RDF.type, URIRef("http://schema.org/WebPage")): + return subject + for subject in graph.subjects(RDF.type, URIRef("https://schema.org/WebPage")): + return subject + return None + + +def _swap_iris(graph: Graph, old_iri: URIRef, new_iri: URIRef) -> None: + for subject, predicate, obj in list(graph.triples((old_iri, None, None))): + graph.remove((subject, predicate, obj)) + graph.add((new_iri, predicate, obj)) + for subject, predicate, obj in list(graph.triples((None, None, old_iri))): + graph.remove((subject, predicate, obj)) + graph.add((subject, predicate, new_iri)) + + +class 
RootIdReconcilerPostprocessor: + """Rewrites the WebPage node IRI to match the existing web page ID. + + When a page has been imported before, the mapping may generate a different + IRI than the one already stored. This postprocessor swaps all triples + referencing the old IRI to use the canonical one from the system. + Runs before custom postprocessors so they always see the correct subject. + """ + + def process_graph(self, graph: Graph, context) -> Graph: + root_id = getattr(context, "existing_web_page_id", None) + if not root_id: + return graph + old_iri = _find_web_page_iri(graph) + if old_iri and str(old_iri) != root_id: + _swap_iris(graph, old_iri, URIRef(root_id)) + return graph + + class CanonicalIdsPostprocessor: """Postprocessor adapter that applies canonical ID generation to a graph.""" diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index f642b7a..02e4e54 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -10,7 +10,7 @@ from typing import Any from jinja2 import UndefinedError -from rdflib import Graph, Literal, RDF, URIRef +from rdflib import Graph, Literal, URIRef from wordlift_client.models.web_page_scrape_response import WebPageScrapeResponse from wordlift_sdk.protocol import Context from wordlift_sdk.protocol.web_page_import_protocol import ( @@ -25,7 +25,7 @@ from .config import ProfileDefinition from .entity_patcher import EntityPatcher -from .id_postprocessor import CanonicalIdsPostprocessor +from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor from .kpi import KgBuildKpiCollector from .postprocessor_service import PostprocessorService, PostprocessorResult from .rml_mapping import MappingResult, RmlMappingService @@ -41,23 +41,6 @@ def _path_contains_part(path: str, part: str) -> bool: return part in Path(path).parts -def _find_web_page_iri(graph: Graph) -> URIRef | None: - for subject in graph.subjects(RDF.type, 
URIRef("http://schema.org/WebPage")): - return subject - for subject in graph.subjects(RDF.type, URIRef("https://schema.org/WebPage")): - return subject - return None - - -def _swap_iris(graph: Graph, old_iri: URIRef, new_iri: URIRef) -> None: - for subject, predicate, obj in list(graph.triples((old_iri, None, None))): - graph.remove((subject, predicate, obj)) - graph.add((new_iri, predicate, obj)) - for subject, predicate, obj in list(graph.triples((None, None, old_iri))): - graph.remove((subject, predicate, obj)) - graph.add((subject, predicate, new_iri)) - - def _resolve_list_setting(value: Any) -> list[str]: if value is None: return [] @@ -404,8 +387,9 @@ async def _run_postprocessing_stage( existing_web_page_id: str | None, existing_import_hash: str | None, ) -> tuple[Graph, PostprocessorResult]: - if existing_web_page_id: - self._reconcile_root_id(graph, existing_web_page_id) + graph = RootIdReconcilerPostprocessor().process_graph( + graph, SimpleNamespace(existing_web_page_id=existing_web_page_id) + ) exports = self._template_exports or {} pp_result = await self._postprocessor_service.apply( graph, url, response, existing_web_page_id, exports @@ -659,11 +643,6 @@ def _write_debug_source_documents( xhtml_file = self.debug_dir / f"{safe_name}.xhtml" xhtml_file.write_text(xhtml, encoding="utf-8") - def _reconcile_root_id(self, graph: Graph, root_id: str) -> None: - old_iri = _find_web_page_iri(graph) - if old_iri and str(old_iri) != root_id: - _swap_iris(graph, old_iri, URIRef(root_id)) - def _set_source(self, graph: Graph) -> None: for subject in self._first_level_subjects(graph): graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import"))) From 9be8fd57297aa1070413cc3b42794952f59f0e73 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 15:04:04 +0100 Subject: [PATCH 44/63] refactor: extract ImportAnnotationPostprocessor --- wordlift_sdk/kg_build/graph_annotation.py | 60 ++++++++++++++++++++++ wordlift_sdk/kg_build/protocol.py | 62 
++++------------------- 2 files changed, 70 insertions(+), 52 deletions(-) create mode 100644 wordlift_sdk/kg_build/graph_annotation.py diff --git a/wordlift_sdk/kg_build/graph_annotation.py b/wordlift_sdk/kg_build/graph_annotation.py new file mode 100644 index 0000000..731d57c --- /dev/null +++ b/wordlift_sdk/kg_build/graph_annotation.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from rdflib import Graph, Literal, URIRef + +SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source") +SEOVOC_IMPORT_HASH = URIRef("https://w3id.org/seovoc/importHash") + + +def _first_level_subjects(graph: Graph, dataset_uri: str) -> set[URIRef]: + subjects = {s for s in graph.subjects() if isinstance(s, URIRef)} + if dataset_uri: + first_level_by_id = { + s + for s in subjects + if str(s).startswith(f"{dataset_uri}/") + and len([p for p in str(s)[len(dataset_uri) + 1 :].split("/") if p]) == 2 + } + if first_level_by_id: + return first_level_by_id + + referenced = { + obj + for _, _, obj in graph.triples((None, None, None)) + if isinstance(obj, URIRef) and obj in subjects + } + first_level = subjects - referenced + return first_level or subjects + + +class ImportAnnotationPostprocessor: + """Stamps first-level graph subjects with web-page-import provenance metadata. + + Sets seovoc:source to 'web-page-import' on every first-level subject, and + optionally propagates the existing import hash to all URIRef subjects when + import_hash_mode is not 'off'. Both are needed before graph persistence so + the KG can track provenance and skip unchanged imports. 
+ + Reads from context: + - account.dataset_uri — for first-level subject resolution + - existing_import_hash — hash from a prior import of the same page + - import_hash_mode — 'on' | 'write' | 'off' + """ + + def process_graph(self, graph: Graph, context) -> Graph: + dataset_uri = str( + getattr(getattr(context, "account", None), "dataset_uri", "") or "" + ).rstrip("/") + for subject in _first_level_subjects(graph, dataset_uri): + graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import"))) + + import_hash_mode = getattr(context, "import_hash_mode", "on") + if import_hash_mode == "off": + return graph + existing_import_hash = getattr(context, "existing_import_hash", None) + if not existing_import_hash: + return graph + for subject in (s for s in graph.subjects() if isinstance(s, URIRef)): + graph.set((subject, SEOVOC_IMPORT_HASH, Literal(existing_import_hash))) + + return graph diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 02e4e54..b06844d 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -10,7 +10,7 @@ from typing import Any from jinja2 import UndefinedError -from rdflib import Graph, Literal, URIRef +from rdflib import Graph, URIRef from wordlift_client.models.web_page_scrape_response import WebPageScrapeResponse from wordlift_sdk.protocol import Context from wordlift_sdk.protocol.web_page_import_protocol import ( @@ -25,6 +25,7 @@ from .config import ProfileDefinition from .entity_patcher import EntityPatcher +from .graph_annotation import ImportAnnotationPostprocessor from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor from .kpi import KgBuildKpiCollector from .postprocessor_service import PostprocessorService, PostprocessorResult @@ -33,8 +34,6 @@ from wordlift_sdk.structured_data.engine import init_morph_kgc_pool logger = logging.getLogger(__name__) -SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source") -SEOVOC_IMPORT_HASH = 
URIRef("https://w3id.org/seovoc/importHash") def _path_contains_part(path: str, part: str) -> bool: @@ -403,8 +402,14 @@ async def _run_postprocessing_stage( url, response, existing_web_page_id, exports ), ) - self._set_source(graph) - self._set_existing_import_hash(graph, existing_import_hash) + graph = ImportAnnotationPostprocessor().process_graph( + graph, + SimpleNamespace( + account=self.context.account, + existing_import_hash=existing_import_hash, + import_hash_mode=self._import_hash_mode, + ), + ) return graph, pp_result def _resolve_path(self, raw_path: str) -> Path: @@ -643,53 +648,6 @@ def _write_debug_source_documents( xhtml_file = self.debug_dir / f"{safe_name}.xhtml" xhtml_file.write_text(xhtml, encoding="utf-8") - def _set_source(self, graph: Graph) -> None: - for subject in self._first_level_subjects(graph): - graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import"))) - - def _set_existing_import_hash(self, graph: Graph, import_hash: str | None) -> None: - if self._import_hash_mode == "off": - return - if not import_hash: - return - subjects = { - subject for subject in graph.subjects() if isinstance(subject, URIRef) - } - for subject in subjects: - graph.set((subject, SEOVOC_IMPORT_HASH, Literal(import_hash))) - - def _first_level_subjects(self, graph: Graph) -> set[URIRef]: - subjects = { - subject for subject in graph.subjects() if isinstance(subject, URIRef) - } - dataset_uri = str( - getattr(self.context.account, "dataset_uri", "") or "" - ).rstrip("/") - if dataset_uri: - first_level_by_id = { - subject - for subject in subjects - if str(subject).startswith(f"{dataset_uri}/") - and len( - [ - part - for part in str(subject)[len(dataset_uri) + 1 :].split("/") - if part - ] - ) - == 2 - } - if first_level_by_id: - return first_level_by_id - - referenced = { - obj - for _, _, obj in graph.triples((None, None, None)) - if isinstance(obj, URIRef) and obj in subjects - } - first_level = subjects - referenced - return first_level or subjects - 
def _mapping_response( self, response: WebPageScrapeResponse, From bb020adcda7d380ba21f8d74fc8342cd19823585 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 15:21:47 +0100 Subject: [PATCH 45/63] refactor: make PostprocessorService profile-agnostic, unify postprocessor pipeline --- .../kg_build/postprocessor_service.py | 137 +++--------------- wordlift_sdk/kg_build/postprocessors.py | 2 + wordlift_sdk/kg_build/protocol.py | 128 ++++++++++++---- 3 files changed, 116 insertions(+), 151 deletions(-) diff --git a/wordlift_sdk/kg_build/postprocessor_service.py b/wordlift_sdk/kg_build/postprocessor_service.py index cd8c21d..c9d4d82 100644 --- a/wordlift_sdk/kg_build/postprocessor_service.py +++ b/wordlift_sdk/kg_build/postprocessor_service.py @@ -3,74 +3,48 @@ import asyncio import functools import logging -import os import time from concurrent.futures import ThreadPoolExecutor -from dataclasses import asdict -from pathlib import Path -from typing import Any +from collections.abc import Iterable +from typing import Callable from rdflib import Graph -from wordlift_client.models.web_page_scrape_response import WebPageScrapeResponse -from wordlift_sdk.protocol import Context -from .config import ProfileDefinition -from .id_allocator import IdAllocator from .postprocessors import ( + LoadedPostprocessor, PostprocessorContext, PostprocessorResult, close_loaded_postprocessors, - load_postprocessors_for_profile, ) logger = logging.getLogger(__name__) -def _clean_key(value: Any) -> str | None: - if value is None: - return None - key = str(value).strip() - return key or None +class PostprocessorService: + """Executes an ordered list of postprocessors against a graph. + Completely agnostic to profiles and pipeline composition — callers are + responsible for assembling the postprocessor list and building the context. 
+ """ -class PostprocessorService: def __init__( self, *, - root_dir: Path, - profile: ProfileDefinition, - context: Context, + postprocessors_factory: Callable[[], Iterable[LoadedPostprocessor]], pool_size: int, - runtime: str, ) -> None: - self._profile = profile - self._context = context self._executor = ThreadPoolExecutor( max_workers=pool_size, thread_name_prefix="worai_pp" ) self._queue: asyncio.Queue = asyncio.Queue() for _ in range(pool_size): - self._queue.put_nowait( - load_postprocessors_for_profile( - root_dir=root_dir, - profile_name=profile.name, - runtime=runtime, - ) - ) - logger.info( - "Created postprocessor pool for profile '%s' (pool_size=%d runtime=%s)", - profile.name, - pool_size, - runtime, - ) + self._queue.put_nowait(postprocessors_factory()) + logger.info("Created postprocessor pool (pool_size=%d)", pool_size) async def apply( self, graph: Graph, - url: str, - response: WebPageScrapeResponse, - existing_web_page_id: str | None, - exports: dict[str, Any], + context: PostprocessorContext, ) -> PostprocessorResult: _t1 = time.perf_counter() postprocessors = await self._queue.get() @@ -80,44 +54,12 @@ async def apply( return await loop.run_in_executor( self._executor, functools.partial( - self._run, - graph, - url, - response, - existing_web_page_id, - postprocessors, - queue_wait_ms, - exports, + self._run, graph, context, postprocessors, queue_wait_ms ), ) finally: self._queue.put_nowait(postprocessors) - def build_context( - self, - url: str, - response: WebPageScrapeResponse, - existing_web_page_id: str | None, - exports: dict[str, Any], - ) -> PostprocessorContext: - dataset_uri = str(getattr(self._context.account, "dataset_uri", "")).rstrip("/") - ids = IdAllocator(dataset_uri) if dataset_uri else None - profile_payload = asdict(self._profile) - profile_settings = dict(profile_payload.get("settings", {}) or {}) - profile_settings.setdefault("api_url", "https://api.wordlift.io") - profile_payload["settings"] = profile_settings - return 
PostprocessorContext( - profile_name=self._profile.name, - profile=profile_payload, - url=url, - account=self._context.account, - account_key=self._resolve_account_key(), - exports=exports, - response=response, - existing_web_page_id=existing_web_page_id, - ids=ids, - ) - def close(self) -> None: while not self._queue.empty(): try: @@ -129,33 +71,18 @@ def close(self) -> None: def _run( self, graph: Graph, - url: str, - response: WebPageScrapeResponse, - existing_web_page_id: str | None, - postprocessors: list, + context: PostprocessorContext, + postprocessors: Iterable[LoadedPostprocessor], queue_wait_ms: int, - exports: dict[str, Any], ) -> PostprocessorResult: _t_start = time.perf_counter() - if not postprocessors: - return PostprocessorResult( - graph=graph, queue_wait_ms=queue_wait_ms, postprocessors_ms=0 - ) - - pp_context = self.build_context(url, response, existing_web_page_id, exports) - if not pp_context.account_key: - raise RuntimeError( - "Postprocessor runtime requires an API key. Configure one via profile " - "'api_key', WORDLIFT_KEY, or WORDLIFT_API_KEY." 
- ) - for processor in postprocessors: _tp = time.perf_counter() - graph = processor.run(graph, pp_context) + graph = processor.run(graph, context) logger.info( "Applied postprocessor '%s' for %s [%dms]", processor.name, - url, + context.url, int((time.perf_counter() - _tp) * 1000), ) return PostprocessorResult( @@ -163,33 +90,3 @@ def _run( queue_wait_ms=queue_wait_ms, postprocessors_ms=int((time.perf_counter() - _t_start) * 1000), ) - - def _resolve_account_key(self) -> str | None: - profile_key = _clean_key(self._profile.api_key) - if profile_key: - return profile_key - - client_config = getattr(self._context, "client_configuration", None) - if client_config is not None: - api_key_map = getattr(client_config, "api_key", None) - if isinstance(api_key_map, dict): - runtime_key = _clean_key(api_key_map.get("ApiKey")) - if runtime_key: - return runtime_key - - provider = getattr(self._context, "configuration_provider", None) - if provider is not None: - for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"): - try: - key = _clean_key(provider.get_value(name)) - except Exception: - key = None - if key: - return key - - for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"): - key = _clean_key(os.getenv(name)) - if key: - return key - - return None diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors.py index d29b8f5..0acd84a 100644 --- a/wordlift_sdk/kg_build/postprocessors.py +++ b/wordlift_sdk/kg_build/postprocessors.py @@ -40,6 +40,8 @@ class PostprocessorContext: exports: dict[str, Any] response: Any existing_web_page_id: str | None + existing_import_hash: str | None = None + import_hash_mode: str = "on" ids: Any | None = None diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index b06844d..0c91671 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -5,6 +5,7 @@ import logging import os from concurrent.futures import ThreadPoolExecutor +from dataclasses import asdict 
from pathlib import Path from types import SimpleNamespace from typing import Any @@ -26,9 +27,16 @@ from .config import ProfileDefinition from .entity_patcher import EntityPatcher from .graph_annotation import ImportAnnotationPostprocessor +from .id_allocator import IdAllocator from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor from .kpi import KgBuildKpiCollector -from .postprocessor_service import PostprocessorService, PostprocessorResult +from .postprocessor_service import PostprocessorService +from .postprocessors import ( + LoadedPostprocessor, + PostprocessorContext, + PostprocessorResult, + load_postprocessors_for_profile, +) from .rml_mapping import MappingResult, RmlMappingService from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer from wordlift_sdk.structured_data.engine import init_morph_kgc_pool @@ -40,6 +48,34 @@ def _path_contains_part(path: str, part: str) -> bool: return part in Path(path).parts +def _clean_key(value: Any) -> str | None: + key = str(value).strip() if value is not None else "" + return key or None + + +def _resolve_account_key(profile: Any, context: Any) -> str | None: + if key := _clean_key(getattr(profile, "api_key", None)): + return key + api_key_map = getattr( + getattr(context, "client_configuration", None), "api_key", None + ) + if isinstance(api_key_map, dict): + if key := _clean_key(api_key_map.get("ApiKey")): + return key + provider = getattr(context, "configuration_provider", None) + if provider is not None: + for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"): + try: + if key := _clean_key(provider.get_value(name)): + return key + except Exception: + pass + for name in ("WORDLIFT_KEY", "WORDLIFT_API_KEY"): + if key := _clean_key(os.getenv(name)): + return key + return None + + def _resolve_list_setting(value: Any) -> list[str]: if value is None: return [] @@ -149,7 +185,7 @@ def _init_postprocessor_service( .strip() .lower() ) - self._core_ids = 
CanonicalIdsPostprocessor(strategy=canonical_id_strategy) + core_ids = CanonicalIdsPostprocessor(strategy=canonical_id_strategy) runtime = _resolve_postprocessor_runtime(settings) logger.info( "Resolved postprocessor runtime for profile '%s': %s (origin=%s)", @@ -171,12 +207,35 @@ def _init_postprocessor_service( pp_pool_size, pool_size, ) + account_key = _resolve_account_key(self.profile, context) + root_dir = self.root_dir + profile = self.profile + + def _postprocessors_factory() -> list[LoadedPostprocessor]: + leading = [ + LoadedPostprocessor( + name="root_id_reconciler", + handler=RootIdReconcilerPostprocessor(), + ) + ] + custom = load_postprocessors_for_profile( + root_dir=root_dir, + profile_name=profile.name, + runtime=runtime, + ) + trailing = [ + LoadedPostprocessor(name="canonical_ids", handler=core_ids), + LoadedPostprocessor( + name="import_annotation", + handler=ImportAnnotationPostprocessor(), + ), + ] + return leading + custom + trailing + + self._account_key = account_key self._postprocessor_service = PostprocessorService( - root_dir=self.root_dir, - profile=self.profile, - context=context, + postprocessors_factory=_postprocessors_factory, pool_size=pp_pool_size, - runtime=runtime, ) def _init_mapping_service( @@ -386,31 +445,38 @@ async def _run_postprocessing_stage( existing_web_page_id: str | None, existing_import_hash: str | None, ) -> tuple[Graph, PostprocessorResult]: - graph = RootIdReconcilerPostprocessor().process_graph( - graph, SimpleNamespace(existing_web_page_id=existing_web_page_id) - ) - exports = self._template_exports or {} - pp_result = await self._postprocessor_service.apply( - graph, url, response, existing_web_page_id, exports - ) - graph = pp_result.graph - # Canonical IDs must run after custom postprocessors so any nodes minted - # by local logic are normalised before graph sync patching. 
- graph = self._core_ids.process_graph( - graph, - self._postprocessor_service.build_context( - url, response, existing_web_page_id, exports - ), - ) - graph = ImportAnnotationPostprocessor().process_graph( - graph, - SimpleNamespace( - account=self.context.account, - existing_import_hash=existing_import_hash, - import_hash_mode=self._import_hash_mode, - ), - ) - return graph, pp_result + context = self._build_pp_context( + url, response, existing_web_page_id, existing_import_hash + ) + pp_result = await self._postprocessor_service.apply(graph, context) + return pp_result.graph, pp_result + + def _build_pp_context( + self, + url: str, + response: WebPageScrapeResponse, + existing_web_page_id: str | None, + existing_import_hash: str | None, + ) -> PostprocessorContext: + dataset_uri = str(getattr(self.context.account, "dataset_uri", "")).rstrip("/") + ids = IdAllocator(dataset_uri) if dataset_uri else None + profile_payload = asdict(self.profile) + profile_settings = dict(profile_payload.get("settings", {}) or {}) + profile_settings.setdefault("api_url", "https://api.wordlift.io") + profile_payload["settings"] = profile_settings + return PostprocessorContext( + profile_name=self.profile.name, + profile=profile_payload, + url=url, + account=self.context.account, + account_key=self._account_key, + exports=self._template_exports or {}, + response=response, + existing_web_page_id=existing_web_page_id, + existing_import_hash=existing_import_hash, + import_hash_mode=self._import_hash_mode, + ids=ids, + ) def _resolve_path(self, raw_path: str) -> Path: path = Path(raw_path) From d60a654e665fe0fa9649a2c6a79c482cd4b90f56 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 15:34:44 +0100 Subject: [PATCH 46/63] refactor: extract first_level_subjects into graph_utils helper --- wordlift_sdk/kg_build/graph_annotation.py | 25 +++--------------- wordlift_sdk/kg_build/graph_utils.py | 31 +++++++++++++++++++++++ wordlift_sdk/kg_build/protocol.py | 11 ++++---- 3 
files changed, 40 insertions(+), 27 deletions(-) create mode 100644 wordlift_sdk/kg_build/graph_utils.py diff --git a/wordlift_sdk/kg_build/graph_annotation.py b/wordlift_sdk/kg_build/graph_annotation.py index 731d57c..281cee0 100644 --- a/wordlift_sdk/kg_build/graph_annotation.py +++ b/wordlift_sdk/kg_build/graph_annotation.py @@ -2,31 +2,12 @@ from rdflib import Graph, Literal, URIRef +from .graph_utils import first_level_subjects + SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source") SEOVOC_IMPORT_HASH = URIRef("https://w3id.org/seovoc/importHash") -def _first_level_subjects(graph: Graph, dataset_uri: str) -> set[URIRef]: - subjects = {s for s in graph.subjects() if isinstance(s, URIRef)} - if dataset_uri: - first_level_by_id = { - s - for s in subjects - if str(s).startswith(f"{dataset_uri}/") - and len([p for p in str(s)[len(dataset_uri) + 1 :].split("/") if p]) == 2 - } - if first_level_by_id: - return first_level_by_id - - referenced = { - obj - for _, _, obj in graph.triples((None, None, None)) - if isinstance(obj, URIRef) and obj in subjects - } - first_level = subjects - referenced - return first_level or subjects - - class ImportAnnotationPostprocessor: """Stamps first-level graph subjects with web-page-import provenance metadata. 
@@ -45,7 +26,7 @@ def process_graph(self, graph: Graph, context) -> Graph: dataset_uri = str( getattr(getattr(context, "account", None), "dataset_uri", "") or "" ).rstrip("/") - for subject in _first_level_subjects(graph, dataset_uri): + for subject in first_level_subjects(graph, dataset_uri): graph.set((subject, SEOVOC_SOURCE, Literal("web-page-import"))) import_hash_mode = getattr(context, "import_hash_mode", "on") diff --git a/wordlift_sdk/kg_build/graph_utils.py b/wordlift_sdk/kg_build/graph_utils.py new file mode 100644 index 0000000..df35268 --- /dev/null +++ b/wordlift_sdk/kg_build/graph_utils.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from rdflib import Graph, URIRef + + +def first_level_subjects(graph: Graph, dataset_uri: str) -> set[URIRef]: + """Return the first-level URIRef subjects of *graph*. + + When *dataset_uri* is set, first-level subjects are those whose IRI matches + ``//`` (exactly two non-empty path segments after the + base URI). Falls back to subjects that are not referenced as objects by any + other triple; if every subject is referenced, returns all subjects. 
+ """ + subjects = {s for s in graph.subjects() if isinstance(s, URIRef)} + if dataset_uri: + first_level_by_id = { + s + for s in subjects + if str(s).startswith(f"{dataset_uri}/") + and len([p for p in str(s)[len(dataset_uri) + 1 :].split("/") if p]) == 2 + } + if first_level_by_id: + return first_level_by_id + + referenced = { + obj + for _, _, obj in graph.triples((None, None, None)) + if isinstance(obj, URIRef) and obj in subjects + } + first_level = subjects - referenced + return first_level or subjects diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 0c91671..7f49aad 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -27,6 +27,7 @@ from .config import ProfileDefinition from .entity_patcher import EntityPatcher from .graph_annotation import ImportAnnotationPostprocessor +from .graph_utils import first_level_subjects from .id_allocator import IdAllocator from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor from .kpi import KgBuildKpiCollector @@ -670,23 +671,23 @@ def _prepare_graph_for_put(self, graph: Graph) -> bool: if not subjects: return False - first_level_subjects = { + page_subjects = { subject - for subject in self._first_level_subjects(graph) + for subject in first_level_subjects(graph, dataset_uri) if subject in subjects } - if not first_level_subjects: + if not page_subjects: return False if self._import_hash_mode == "off": return True - representative = next(iter(first_level_subjects)) + representative = next(iter(page_subjects)) existing_hash = self.patcher._existing_import_hash(representative, graph) import_hash = self.patcher._compute_import_hash( representative, graph, dataset_uri ) - for subject in first_level_subjects: + for subject in page_subjects: self.patcher._set_import_hash(subject, graph, import_hash) return not ( From 37d928b5e3d0dff8082818718401cc5ee82d49fd Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 
2026 15:36:48 +0100 Subject: [PATCH 47/63] refactor: drop redundant tuple from _run_postprocessing_stage --- wordlift_sdk/kg_build/protocol.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 7f49aad..015a45d 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -344,7 +344,7 @@ async def callback( logger.warning("No triples produced for %s", url) return - graph, pp_result = await self._run_postprocessing_stage( + pp_result = await self._run_postprocessing_stage( mapping.graph, url, response, existing_web_page_id, existing_import_hash ) @@ -353,9 +353,11 @@ async def callback( self._write_debug_source_documents( url=url, html=response.web_page.html, xhtml=xhtml ) - self._write_debug_graph(graph, url) + self._write_debug_graph(pp_result.graph, url) - outcome: ValidationOutcome | None = await self._shacl_validator.validate(graph) + outcome: ValidationOutcome | None = await self._shacl_validator.validate( + pp_result.graph + ) if outcome is not None: logger.info( "SHACL validation for %s: pass=%s warnings=%d errors=%d", @@ -371,13 +373,13 @@ async def callback( warning_sources=outcome.warning_sources, error_sources=outcome.error_sources, ) - self._kpi.record_graph(graph) + self._kpi.record_graph(pp_result.graph) self._emit_progress( { "kind": "graph", "profile": self.profile.name, "url": url, - "graph": self._kpi.graph_metrics(graph), + "graph": self._kpi.graph_metrics(pp_result.graph), "validation": outcome.to_dict() if outcome else None, } ) @@ -387,10 +389,10 @@ async def callback( and outcome.failed ): raise RuntimeError(f"SHACL validation failed for {url} in fail mode.") - await self._write_graph(graph) + await self._write_graph(pp_result.graph) logger.info( "Wrote %s triples for %s [mapping_wait=%dms mapping=%dms postprocessor_wait=%dms postprocessors=%dms validation_wait=%dms validation=%dms]", - len(graph), + 
len(pp_result.graph), url, mapping.queue_wait_ms, mapping.mapping_ms, @@ -445,12 +447,11 @@ async def _run_postprocessing_stage( response: WebPageScrapeResponse, existing_web_page_id: str | None, existing_import_hash: str | None, - ) -> tuple[Graph, PostprocessorResult]: + ) -> PostprocessorResult: context = self._build_pp_context( url, response, existing_web_page_id, existing_import_hash ) - pp_result = await self._postprocessor_service.apply(graph, context) - return pp_result.graph, pp_result + return await self._postprocessor_service.apply(graph, context) def _build_pp_context( self, From bb3826d57251af9e9a85e14406b08ae01a9b3b75 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 15:38:50 +0100 Subject: [PATCH 48/63] refactor: extract _dataset_uri property and _url_hash helper --- wordlift_sdk/kg_build/protocol.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 015a45d..44f93ff 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -410,6 +410,14 @@ def close(self) -> None: def get_kpi_summary(self) -> dict[str, object]: return self._kpi.summary(self.profile.name) + @property + def _dataset_uri(self) -> str: + return str(getattr(self.context.account, "dataset_uri", "") or "").rstrip("/") + + @staticmethod + def _url_hash(url: str) -> str: + return hashlib.sha256(url.encode("utf-8")).hexdigest() + async def _run_mapping_stage( self, response: WebPageScrapeResponse, @@ -460,7 +468,7 @@ def _build_pp_context( existing_web_page_id: str | None, existing_import_hash: str | None, ) -> PostprocessorContext: - dataset_uri = str(getattr(self.context.account, "dataset_uri", "")).rstrip("/") + dataset_uri = self._dataset_uri ids = IdAllocator(dataset_uri) if dataset_uri else None profile_payload = asdict(self.profile) profile_settings = dict(profile_payload.get("settings", {}) or {}) @@ -561,13 +569,13 
@@ def _ensure_templates_loaded(self) -> None: if self._template_graph is not None and self._template_exports is not None: return - dataset_uri = getattr(self.context.account, "dataset_uri", None) + dataset_uri = self._dataset_uri if not dataset_uri: raise RuntimeError("Dataset URI not available on context.account.") base_context = { "account": self.context.account, - "dataset_uri": str(dataset_uri).rstrip("/"), + "dataset_uri": dataset_uri, } exports, exports_summary = self.text_renderer.load_exports_with_summary( self._template_dirs, base_context @@ -633,7 +641,7 @@ def _get_mapping_content(self, mapping_path: Path) -> str: if cached is not None: return cached - dataset_uri = getattr(self.context.account, "dataset_uri", None) + dataset_uri = self._dataset_uri if not dataset_uri: raise RuntimeError("Dataset URI not available on context.account.") @@ -641,7 +649,7 @@ def _get_mapping_content(self, mapping_path: Path) -> str: context = { "account": self.context.account, - "dataset_uri": str(dataset_uri).rstrip("/"), + "dataset_uri": dataset_uri, "exports": self._template_exports or {}, } template_path = self.text_renderer.resolve_mapping_template(mapping_path) @@ -658,9 +666,7 @@ async def _write_graph(self, graph: Graph) -> None: await self.patcher.patch_all(graph, import_hash_mode=self._import_hash_mode) def _prepare_graph_for_put(self, graph: Graph) -> bool: - dataset_uri = str( - getattr(self.context.account, "dataset_uri", "") or "" - ).rstrip("/") + dataset_uri = self._dataset_uri if not dataset_uri: return False @@ -700,7 +706,7 @@ def _prepare_graph_for_put(self, graph: Graph) -> bool: def _write_debug_graph(self, graph: Graph, url: str) -> None: assert self.debug_dir is not None self.debug_dir.mkdir(parents=True, exist_ok=True) - safe_name = hashlib.sha256(url.encode("utf-8")).hexdigest() + safe_name = self._url_hash(url) debug_file = self.debug_dir / f"{safe_name}.ttl" graph.serialize(destination=debug_file, format="turtle") @@ -709,7 +715,7 @@ def 
_write_debug_source_documents( ) -> None: assert self.debug_dir is not None self.debug_dir.mkdir(parents=True, exist_ok=True) - safe_name = hashlib.sha256(url.encode("utf-8")).hexdigest() + safe_name = self._url_hash(url) html_file = self.debug_dir / f"{safe_name}.html" html_file.write_text(html, encoding="utf-8") if xhtml: From 5f62d496dbef2c553002ee321b37f583c0f57aa5 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 15:43:37 +0100 Subject: [PATCH 49/63] refactor: simplify record_validation to accept ValidationOutcome directly --- wordlift_sdk/kg_build/kpi.py | 26 ++++++++++---------------- wordlift_sdk/kg_build/protocol.py | 16 ++-------------- 2 files changed, 12 insertions(+), 30 deletions(-) diff --git a/wordlift_sdk/kg_build/kpi.py b/wordlift_sdk/kg_build/kpi.py index 5edea07..f6c0822 100644 --- a/wordlift_sdk/kg_build/kpi.py +++ b/wordlift_sdk/kg_build/kpi.py @@ -6,6 +6,8 @@ from rdflib import Graph, RDF, URIRef +from wordlift_sdk.validation.shacl_validation_service import ValidationOutcome + @dataclass class KgBuildKpiCollector: @@ -98,26 +100,18 @@ def record_graph(self, graph: Graph) -> None: self._property_assertions_total += 1 self._properties_by_predicate[str(predicate)] += 1 - def record_validation( - self, - *, - passed: bool, - warning_count: int, - error_count: int, - warning_sources: dict[str, int] | Counter[str] | None = None, - error_sources: dict[str, int] | Counter[str] | None = None, - ) -> None: + def record_validation(self, outcome: ValidationOutcome) -> None: self._validation_total += 1 - if passed: + if outcome.passed: self._validation_pass += 1 else: self._validation_fail += 1 - self._warning_count += warning_count - self._error_count += error_count - if warning_sources: - self._warning_sources.update(warning_sources) - if error_sources: - self._error_sources.update(error_sources) + self._warning_count += outcome.warning_count + self._error_count += outcome.error_count + if outcome.warning_sources: + 
self._warning_sources.update(outcome.warning_sources) + if outcome.error_sources: + self._error_sources.update(outcome.error_sources) def summary(self, profile_name: str) -> dict[str, object]: entities_by_type = { diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 44f93ff..7cb025e 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -366,13 +366,7 @@ async def callback( outcome.warning_count, outcome.error_count, ) - self._kpi.record_validation( - passed=outcome.passed, - warning_count=outcome.warning_count, - error_count=outcome.error_count, - warning_sources=outcome.warning_sources, - error_sources=outcome.error_sources, - ) + self._kpi.record_validation(outcome) self._kpi.record_graph(pp_result.graph) self._emit_progress( { @@ -528,13 +522,7 @@ async def _patch_static_templates_once(self) -> None: outcome.warning_count, outcome.error_count, ) - self._kpi.record_validation( - passed=outcome.passed, - warning_count=outcome.warning_count, - error_count=outcome.error_count, - warning_sources=outcome.warning_sources, - error_sources=outcome.error_sources, - ) + self._kpi.record_validation(outcome) self._emit_progress( { "kind": "static_templates", From a5be4d9196f10ba7889f8c75531d189e9fe2e5e5 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 16:00:04 +0100 Subject: [PATCH 50/63] refactor: reorganise into postprocessors/ subpackage --- tests/kg_build/test_id_allocator.py | 7 +++++-- tests/kg_build/test_id_postprocessor.py | 4 +++- tests/kg_build/test_kg_build_id_generator.py | 4 +++- tests/kg_build/test_postprocessors.py | 4 ++-- .../__init__.py} | 4 ++-- .../kg_build/postprocessors/processors/__init__.py | 0 .../processors}/graph_annotation.py | 2 +- .../{ => postprocessors/processors}/id_allocator.py | 2 +- .../{ => postprocessors/processors}/id_generator.py | 4 ++-- .../processors}/id_postprocessor.py | 2 +- .../runner.py} | 6 +++--- .../service.py} | 2 +- .../worker.py} | 
2 +- wordlift_sdk/kg_build/protocol.py | 13 ++++++++----- 14 files changed, 33 insertions(+), 23 deletions(-) rename wordlift_sdk/kg_build/{postprocessors.py => postprocessors/__init__.py} (99%) create mode 100644 wordlift_sdk/kg_build/postprocessors/processors/__init__.py rename wordlift_sdk/kg_build/{ => postprocessors/processors}/graph_annotation.py (97%) rename wordlift_sdk/kg_build/{ => postprocessors/processors}/id_allocator.py (99%) rename wordlift_sdk/kg_build/{ => postprocessors/processors}/id_generator.py (99%) rename wordlift_sdk/kg_build/{ => postprocessors/processors}/id_postprocessor.py (98%) rename wordlift_sdk/kg_build/{postprocessor_runner.py => postprocessors/runner.py} (96%) rename wordlift_sdk/kg_build/{postprocessor_service.py => postprocessors/service.py} (98%) rename wordlift_sdk/kg_build/{postprocessor_worker.py => postprocessors/worker.py} (98%) diff --git a/tests/kg_build/test_id_allocator.py b/tests/kg_build/test_id_allocator.py index bbd77a6..c420626 100644 --- a/tests/kg_build/test_id_allocator.py +++ b/tests/kg_build/test_id_allocator.py @@ -2,8 +2,11 @@ from rdflib import Graph, Literal, RDF, URIRef -import wordlift_sdk.kg_build.id_allocator as id_allocator_module -from wordlift_sdk.kg_build.id_allocator import IdAllocator, normalize_slug +import wordlift_sdk.kg_build.postprocessors.processors.id_allocator as id_allocator_module +from wordlift_sdk.kg_build.postprocessors.processors.id_allocator import ( + IdAllocator, + normalize_slug, +) def _graph(subject: URIRef) -> Graph: diff --git a/tests/kg_build/test_id_postprocessor.py b/tests/kg_build/test_id_postprocessor.py index 9f2a0c5..d5a94ee 100644 --- a/tests/kg_build/test_id_postprocessor.py +++ b/tests/kg_build/test_id_postprocessor.py @@ -4,7 +4,9 @@ from rdflib import Graph, Literal, RDF, URIRef -from wordlift_sdk.kg_build.id_postprocessor import CanonicalIdsPostprocessor +from wordlift_sdk.kg_build.postprocessors.processors.id_postprocessor import ( + CanonicalIdsPostprocessor, 
+) def test_id_postprocessor_no_dataset_uri_returns_original_graph() -> None: diff --git a/tests/kg_build/test_kg_build_id_generator.py b/tests/kg_build/test_kg_build_id_generator.py index d59d394..b46acfa 100644 --- a/tests/kg_build/test_kg_build_id_generator.py +++ b/tests/kg_build/test_kg_build_id_generator.py @@ -3,7 +3,9 @@ from rdflib import Graph, Literal, RDF, URIRef from rdflib.namespace import XSD -from wordlift_sdk.kg_build.id_generator import CanonicalIdGenerator +from wordlift_sdk.kg_build.postprocessors.processors.id_generator import ( + CanonicalIdGenerator, +) from wordlift_sdk.kg_build.iri_lookup import IriLookup from wordlift_sdk.kg_build.id_policy import DEFAULT_ID_POLICY, IdPolicy diff --git a/tests/kg_build/test_postprocessors.py b/tests/kg_build/test_postprocessors.py index 92afadc..a6f5aac 100644 --- a/tests/kg_build/test_postprocessors.py +++ b/tests/kg_build/test_postprocessors.py @@ -12,7 +12,7 @@ import pytest from rdflib import Dataset, Graph, Literal, URIRef -from wordlift_sdk.kg_build.postprocessor_runner import ( +from wordlift_sdk.kg_build.postprocessors.runner import ( _build_context, _read_graph_nquads, ) @@ -471,7 +471,7 @@ def process_graph(self, graph, context): [ sys.executable, "-m", - "wordlift_sdk.kg_build.postprocessor_runner", + "wordlift_sdk.kg_build.postprocessors.runner", "--class", "test_pp:AddRunnerTriple", "--input-graph", diff --git a/wordlift_sdk/kg_build/postprocessors.py b/wordlift_sdk/kg_build/postprocessors/__init__.py similarity index 99% rename from wordlift_sdk/kg_build/postprocessors.py rename to wordlift_sdk/kg_build/postprocessors/__init__.py index 0acd84a..8dfee63 100644 --- a/wordlift_sdk/kg_build/postprocessors.py +++ b/wordlift_sdk/kg_build/postprocessors/__init__.py @@ -197,7 +197,7 @@ def _ensure_started(self) -> subprocess.Popen[str]: cmd = [ self._spec.python, "-m", - "wordlift_sdk.kg_build.postprocessor_worker", + "wordlift_sdk.kg_build.postprocessors.worker", "--class", self._spec.class_path, ] 
@@ -359,7 +359,7 @@ def _run( cmd = [ self.spec.python, "-m", - "wordlift_sdk.kg_build.postprocessor_runner", + "wordlift_sdk.kg_build.postprocessors.runner", "--class", self.spec.class_path, "--input-graph", diff --git a/wordlift_sdk/kg_build/postprocessors/processors/__init__.py b/wordlift_sdk/kg_build/postprocessors/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wordlift_sdk/kg_build/graph_annotation.py b/wordlift_sdk/kg_build/postprocessors/processors/graph_annotation.py similarity index 97% rename from wordlift_sdk/kg_build/graph_annotation.py rename to wordlift_sdk/kg_build/postprocessors/processors/graph_annotation.py index 281cee0..5769615 100644 --- a/wordlift_sdk/kg_build/graph_annotation.py +++ b/wordlift_sdk/kg_build/postprocessors/processors/graph_annotation.py @@ -2,7 +2,7 @@ from rdflib import Graph, Literal, URIRef -from .graph_utils import first_level_subjects +from ...graph_utils import first_level_subjects SEOVOC_SOURCE = URIRef("https://w3id.org/seovoc/source") SEOVOC_IMPORT_HASH = URIRef("https://w3id.org/seovoc/importHash") diff --git a/wordlift_sdk/kg_build/id_allocator.py b/wordlift_sdk/kg_build/postprocessors/processors/id_allocator.py similarity index 99% rename from wordlift_sdk/kg_build/id_allocator.py rename to wordlift_sdk/kg_build/postprocessors/processors/id_allocator.py index d597272..de4e6ce 100644 --- a/wordlift_sdk/kg_build/id_allocator.py +++ b/wordlift_sdk/kg_build/postprocessors/processors/id_allocator.py @@ -6,7 +6,7 @@ from rdflib import Graph, Literal, RDF, URIRef -from .id_policy import DEFAULT_ID_POLICY, IdPolicy +from ...id_policy import DEFAULT_ID_POLICY, IdPolicy SCHEMA = "http://schema.org/" diff --git a/wordlift_sdk/kg_build/id_generator.py b/wordlift_sdk/kg_build/postprocessors/processors/id_generator.py similarity index 99% rename from wordlift_sdk/kg_build/id_generator.py rename to wordlift_sdk/kg_build/postprocessors/processors/id_generator.py index 3741c6f..d063f59 100644 --- 
a/wordlift_sdk/kg_build/id_generator.py +++ b/wordlift_sdk/kg_build/postprocessors/processors/id_generator.py @@ -7,8 +7,8 @@ from rdflib import Graph, Literal, RDF, URIRef -from .id_policy import DEFAULT_ID_POLICY, IdPolicy -from .iri_lookup import IriLookup +from ...id_policy import DEFAULT_ID_POLICY, IdPolicy +from ...iri_lookup import IriLookup SCHEMA = "http://schema.org/" diff --git a/wordlift_sdk/kg_build/id_postprocessor.py b/wordlift_sdk/kg_build/postprocessors/processors/id_postprocessor.py similarity index 98% rename from wordlift_sdk/kg_build/id_postprocessor.py rename to wordlift_sdk/kg_build/postprocessors/processors/id_postprocessor.py index ae4326e..ae51a92 100644 --- a/wordlift_sdk/kg_build/id_postprocessor.py +++ b/wordlift_sdk/kg_build/postprocessors/processors/id_postprocessor.py @@ -3,7 +3,7 @@ from rdflib import Graph, RDF, URIRef from .id_generator import CanonicalIdGenerator -from .iri_lookup import IriLookup +from ...iri_lookup import IriLookup def _find_web_page_iri(graph: Graph) -> URIRef | None: diff --git a/wordlift_sdk/kg_build/postprocessor_runner.py b/wordlift_sdk/kg_build/postprocessors/runner.py similarity index 96% rename from wordlift_sdk/kg_build/postprocessor_runner.py rename to wordlift_sdk/kg_build/postprocessors/runner.py index f85fce6..7601a5d 100644 --- a/wordlift_sdk/kg_build/postprocessor_runner.py +++ b/wordlift_sdk/kg_build/postprocessors/runner.py @@ -10,8 +10,8 @@ from rdflib import Dataset, Graph -from .id_allocator import IdAllocator -from .postprocessors import PostprocessorContext +from . 
import PostprocessorContext +from .processors.id_allocator import IdAllocator def _build_context(payload: dict[str, Any]) -> PostprocessorContext: @@ -90,7 +90,7 @@ def main() -> None: output_graph = graph if result is None else result _write_graph_nquads(output_graph, Path(args.output_graph)) except Exception as exc: # pragma: no cover - process boundary - print(f"[postprocessor_runner] {exc}", file=sys.stderr) + print(f"[postprocessors.runner] {exc}", file=sys.stderr) raise SystemExit(1) from exc diff --git a/wordlift_sdk/kg_build/postprocessor_service.py b/wordlift_sdk/kg_build/postprocessors/service.py similarity index 98% rename from wordlift_sdk/kg_build/postprocessor_service.py rename to wordlift_sdk/kg_build/postprocessors/service.py index c9d4d82..a2b266a 100644 --- a/wordlift_sdk/kg_build/postprocessor_service.py +++ b/wordlift_sdk/kg_build/postprocessors/service.py @@ -10,7 +10,7 @@ from rdflib import Graph -from .postprocessors import ( +from . import ( LoadedPostprocessor, PostprocessorContext, PostprocessorResult, diff --git a/wordlift_sdk/kg_build/postprocessor_worker.py b/wordlift_sdk/kg_build/postprocessors/worker.py similarity index 98% rename from wordlift_sdk/kg_build/postprocessor_worker.py rename to wordlift_sdk/kg_build/postprocessors/worker.py index 3a62fbb..a1dd25c 100644 --- a/wordlift_sdk/kg_build/postprocessor_worker.py +++ b/wordlift_sdk/kg_build/postprocessors/worker.py @@ -12,7 +12,7 @@ from rdflib import Dataset, Graph -from .postprocessor_runner import _build_context +from .runner import _build_context def _load_class(class_path: str): diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 7cb025e..d6cb99b 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -26,18 +26,21 @@ from .config import ProfileDefinition from .entity_patcher import EntityPatcher -from .graph_annotation import ImportAnnotationPostprocessor -from .graph_utils import first_level_subjects 
-from .id_allocator import IdAllocator -from .id_postprocessor import CanonicalIdsPostprocessor, RootIdReconcilerPostprocessor from .kpi import KgBuildKpiCollector -from .postprocessor_service import PostprocessorService from .postprocessors import ( LoadedPostprocessor, PostprocessorContext, PostprocessorResult, load_postprocessors_for_profile, ) +from .postprocessors.processors.graph_annotation import ImportAnnotationPostprocessor +from .graph_utils import first_level_subjects +from .postprocessors.processors.id_allocator import IdAllocator +from .postprocessors.processors.id_postprocessor import ( + CanonicalIdsPostprocessor, + RootIdReconcilerPostprocessor, +) +from .postprocessors.service import PostprocessorService from .rml_mapping import MappingResult, RmlMappingService from .templates import JinjaRdfTemplateReifier, TemplateTextRenderer from wordlift_sdk.structured_data.engine import init_morph_kgc_pool From 9abfdbd909d1ed4158d33239f32ee1e72761def8 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 16:10:16 +0100 Subject: [PATCH 51/63] =?UTF-8?q?rename:=20runner.py=20=E2=86=92=20oneshot?= =?UTF-8?q?.py,=20worker.py=20=E2=86=92=20persistent.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/kg_build/test_postprocessors.py | 2 +- .../kg_build/postprocessors/__init__.py | 591 ++---------------- .../kg_build/postprocessors/graph_io.py | 101 +++ .../postprocessors/{runner.py => oneshot.py} | 2 +- .../{worker.py => persistent.py} | 2 +- .../kg_build/postprocessors/service.py | 8 +- .../kg_build/postprocessors/subprocess.py | 396 ++++++++++++ wordlift_sdk/kg_build/postprocessors/types.py | 86 +++ 8 files changed, 625 insertions(+), 563 deletions(-) create mode 100644 wordlift_sdk/kg_build/postprocessors/graph_io.py rename wordlift_sdk/kg_build/postprocessors/{runner.py => oneshot.py} (98%) rename wordlift_sdk/kg_build/postprocessors/{worker.py => persistent.py} (99%) create mode 100644 
wordlift_sdk/kg_build/postprocessors/subprocess.py create mode 100644 wordlift_sdk/kg_build/postprocessors/types.py diff --git a/tests/kg_build/test_postprocessors.py b/tests/kg_build/test_postprocessors.py index a6f5aac..6ce1284 100644 --- a/tests/kg_build/test_postprocessors.py +++ b/tests/kg_build/test_postprocessors.py @@ -12,7 +12,7 @@ import pytest from rdflib import Dataset, Graph, Literal, URIRef -from wordlift_sdk.kg_build.postprocessors.runner import ( +from wordlift_sdk.kg_build.postprocessors.oneshot import ( _build_context, _read_graph_nquads, ) diff --git a/wordlift_sdk/kg_build/postprocessors/__init__.py b/wordlift_sdk/kg_build/postprocessors/__init__.py index 8dfee63..b05f6f4 100644 --- a/wordlift_sdk/kg_build/postprocessors/__init__.py +++ b/wordlift_sdk/kg_build/postprocessors/__init__.py @@ -1,20 +1,24 @@ from __future__ import annotations -import asyncio -import importlib -import inspect -import json import logging -import select -import shutil -import subprocess -import tempfile -from dataclasses import dataclass, field -from enum import Enum from pathlib import Path -from typing import Any, Protocol, runtime_checkable -from rdflib import Dataset, Graph +from .graph_io import close_loaded_postprocessors +from .subprocess import ( + _build_handler, + _normalize_runtime, +) +from .types import ( + Closeable, + GraphPostprocessor, + LoadedPostprocessor, + PostprocessorContext, + PostprocessorResult, + PostprocessorRuntime, + PostprocessorSpec, + PersistentWorkerJobError, + PersistentWorkerTransportError, +) logger = logging.getLogger(__name__) @@ -24,427 +28,7 @@ import tomli as tomllib -class PostprocessorRuntime(str, Enum): - ONESHOT = "oneshot" - PERSISTENT = "persistent" - INPROCESS = "inprocess" - - -@dataclass(frozen=True) -class PostprocessorContext: - profile_name: str - profile: dict[str, Any] - url: str - account: Any - account_key: str | None - exports: dict[str, Any] - response: Any - existing_web_page_id: str | None - 
existing_import_hash: str | None = None - import_hash_mode: str = "on" - ids: Any | None = None - - -class _SubprocessRunner(Protocol): - def __call__( - self, - *, - input_graph_path: Path, - output_graph_path: Path, - context_path: Path, - context_payload: dict[str, Any], - ) -> None: ... - - -@runtime_checkable -class Closeable(Protocol): - def close(self) -> None: ... - - -@runtime_checkable -class GraphPostprocessor(Protocol): - def process_graph( - self, graph: Graph, context: PostprocessorContext - ) -> Graph | None: ... - - -@dataclass(frozen=True) -class PostprocessorResult: - graph: Graph - queue_wait_ms: int - postprocessors_ms: int - - -@dataclass(frozen=True) -class LoadedPostprocessor: - name: str - handler: GraphPostprocessor - - def run(self, graph: Graph, context: PostprocessorContext) -> Graph: - result = self.handler.process_graph(graph, context) - return graph if result is None else result - - -@dataclass(frozen=True) -class PostprocessorSpec: - class_path: str - python: str - timeout_seconds: int - enabled: bool - keep_temp_on_error: bool - - -class PersistentWorkerTransportError(RuntimeError): - pass - - -class PersistentWorkerJobError(RuntimeError): - pass - - -class PersistentPostprocessorClient: - def __init__(self, *, spec: PostprocessorSpec, root_dir: Path) -> None: - self._spec = spec - self._root_dir = root_dir - self._process: subprocess.Popen[str] | None = None - self._next_job_id = 0 - - def close(self) -> None: - process = self._process - self._process = None - if process is None: - return - - try: - if process.poll() is None and process.stdin is not None: - process.stdin.write(json.dumps({"op": "shutdown"}) + "\n") - process.stdin.flush() - except Exception: - pass - - self._terminate(process) - - def process_graph( - self, - *, - input_graph_path: Path, - output_graph_path: Path, - context_payload: dict[str, Any], - ) -> None: - for attempt in range(2): - try: - self._process_graph_once( - input_graph_path=input_graph_path, - 
output_graph_path=output_graph_path, - context_payload=context_payload, - ) - return - except PersistentWorkerTransportError: - self.close() - if attempt == 1: - raise - - def _process_graph_once( - self, - *, - input_graph_path: Path, - output_graph_path: Path, - context_payload: dict[str, Any], - ) -> None: - process = self._ensure_started() - self._next_job_id += 1 - job_id = self._next_job_id - - payload = { - "op": "process", - "id": job_id, - "input_graph": str(input_graph_path), - "output_graph": str(output_graph_path), - "context": context_payload, - } - - try: - assert process.stdin is not None - process.stdin.write( - json.dumps(payload, ensure_ascii=True, default=str) + "\n" - ) - process.stdin.flush() - except Exception as exc: - raise PersistentWorkerTransportError( - f"Postprocessor worker stdin failed: {self._spec.class_path}" - ) from exc - - message = self._read_message( - process, timeout_seconds=self._spec.timeout_seconds - ) - if message.get("id") != job_id: - raise PersistentWorkerTransportError( - f"Postprocessor worker returned invalid response id for {self._spec.class_path}." 
- ) - if message.get("ok") is True: - return - - error = str(message.get("error") or "unknown worker error") - raise PersistentWorkerJobError( - f"Postprocessor failed: {self._spec.class_path}\n{error}".strip() - ) - - def _ensure_started(self) -> subprocess.Popen[str]: - process = self._process - if process is not None and process.poll() is None: - return process - - cmd = [ - self._spec.python, - "-m", - "wordlift_sdk.kg_build.postprocessors.worker", - "--class", - self._spec.class_path, - ] - process = subprocess.Popen( - cmd, - text=True, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=str(self._root_dir), - bufsize=1, - ) - - try: - ready = self._read_message( - process, timeout_seconds=min(self._spec.timeout_seconds, 60) - ) - except Exception: - self._terminate(process) - raise - - if ready.get("op") != "ready" or ready.get("ok") is not True: - stderr = self._read_stderr(process) - self._terminate(process) - raise PersistentWorkerTransportError( - f"Postprocessor worker failed to start: {self._spec.class_path}" - + (f"\n{stderr}" if stderr else "") - ) - - self._process = process - return process - - def _read_message( - self, - process: subprocess.Popen[str], - *, - timeout_seconds: int, - ) -> dict[str, Any]: - if process.stdout is None: - raise PersistentWorkerTransportError("Worker stdout is unavailable.") - - ready, _, _ = select.select([process.stdout], [], [], timeout_seconds) - if not ready: - self._terminate(process) - cmd = ( - process.args if isinstance(process.args, list) else [str(process.args)] - ) - raise subprocess.TimeoutExpired(cmd=cmd, timeout=timeout_seconds) - - line = process.stdout.readline() - if not line: - stderr = self._read_stderr(process) - self._terminate(process) - raise PersistentWorkerTransportError( - f"Postprocessor worker exited unexpectedly: {self._spec.class_path}" - + (f"\n{stderr}" if stderr else "") - ) - - try: - return json.loads(line) - except json.JSONDecodeError as exc: - raise 
PersistentWorkerTransportError( - "Postprocessor worker returned invalid JSON response." - ) from exc - - def _read_stderr(self, process: subprocess.Popen[str]) -> str: - if process.stderr is None: - return "" - try: - return (process.stderr.read() or "").strip() - except Exception: - return "" - - def _terminate(self, process: subprocess.Popen[str]) -> None: - if process.poll() is None: - process.kill() - try: - process.wait(timeout=5) - except subprocess.TimeoutExpired: - pass - - -def _run_subprocess( - spec: PostprocessorSpec, - root_dir: Path, - graph: Graph, - payload: dict[str, Any], - runner: _SubprocessRunner, -) -> Graph | None: - """Shared scaffolding for subprocess-based postprocessors. - - Handles temp-dir lifecycle, graph serialization, output verification, - and debug-copy on failure. *runner* is called with the prepared paths - and is responsible only for the actual subprocess execution step. - """ - temp_dir_path = Path(tempfile.mkdtemp(prefix="worai_pp_")) - failed = False - try: - input_graph_path = temp_dir_path / "input_graph.nq" - output_graph_path = temp_dir_path / "output_graph.nq" - context_path = temp_dir_path / "context.json" - - _write_graph_nquads(graph, input_graph_path) - context_path.write_text( - json.dumps(payload, ensure_ascii=True, default=str), - encoding="utf-8", - ) - - runner( - input_graph_path=input_graph_path, - output_graph_path=output_graph_path, - context_path=context_path, - context_payload=payload, - ) - - if not output_graph_path.exists(): - failed = True - raise RuntimeError( - f"Postprocessor did not produce output graph: {spec.class_path}" - ) - - return _read_graph_nquads(output_graph_path) - except Exception: - failed = True - raise - finally: - if failed and spec.keep_temp_on_error: - debug_dir = root_dir / "output" / "postprocessor_debug" - debug_dir.mkdir(parents=True, exist_ok=True) - target = debug_dir / (spec.class_path.replace(":", "_").replace(".", "_")) - if target.exists(): - shutil.rmtree(target) - 
shutil.copytree(temp_dir_path, target) - _redact_debug_context(target / "context.json") - if temp_dir_path.exists(): - shutil.rmtree(temp_dir_path, ignore_errors=True) - - -@dataclass(frozen=True) -class OneshotSubprocessPostprocessor: - spec: PostprocessorSpec - root_dir: Path - - def process_graph( - self, graph: Graph, context: PostprocessorContext - ) -> Graph | None: - return _run_subprocess( - self.spec, self.root_dir, graph, _build_runner_payload(context), self._run - ) - - def _run( - self, - *, - input_graph_path: Path, - output_graph_path: Path, - context_path: Path, - **_: Any, - ) -> None: - cmd = [ - self.spec.python, - "-m", - "wordlift_sdk.kg_build.postprocessors.runner", - "--class", - self.spec.class_path, - "--input-graph", - str(input_graph_path), - "--output-graph", - str(output_graph_path), - "--context", - str(context_path), - ] - completed = subprocess.run( - cmd, - text=True, - capture_output=True, - cwd=str(self.root_dir), - timeout=self.spec.timeout_seconds, - check=False, - ) - if completed.returncode != 0: - stderr = (completed.stderr or "").strip() - raise RuntimeError( - f"Postprocessor failed: {self.spec.class_path} " - f"(exit={completed.returncode})" + (f"\n{stderr}" if stderr else "") - ) - - -@dataclass -class PersistentSubprocessPostprocessor: - spec: PostprocessorSpec - root_dir: Path - _client: PersistentPostprocessorClient | None = field( - init=False, - default=None, - repr=False, - ) - - def close(self) -> None: - if self._client is not None: - self._client.close() - self._client = None - - def process_graph( - self, graph: Graph, context: PostprocessorContext - ) -> Graph | None: - return _run_subprocess( - self.spec, self.root_dir, graph, _build_runner_payload(context), self._run - ) - - def _run( - self, - *, - input_graph_path: Path, - output_graph_path: Path, - context_payload: dict[str, Any], - **_: Any, - ) -> None: - if self._client is None: - self._client = PersistentPostprocessorClient( - spec=self.spec, - 
root_dir=self.root_dir, - ) - self._client.process_graph( - input_graph_path=input_graph_path, - output_graph_path=output_graph_path, - context_payload=context_payload, - ) - - -@dataclass(frozen=True) -class InProcessPostprocessor: - class_path: str - - def process_graph( - self, graph: Graph, context: PostprocessorContext - ) -> Graph | None: - module_name, class_name = self.class_path.split(":", 1) - module = importlib.import_module(module_name) - klass = getattr(module, class_name) - processor = klass() - result = processor.process_graph(graph, context) - if inspect.isawaitable(result): - result = asyncio.run(result) - return result - - -def _as_bool(value: Any, default: bool) -> bool: +def _as_bool(value, default: bool) -> bool: if value is None: return default if isinstance(value, bool): @@ -452,7 +36,7 @@ def _as_bool(value: Any, default: bool) -> bool: raise TypeError("Expected boolean value.") -def _as_str(value: Any, default: str) -> str: +def _as_str(value, default: str) -> str: if value is None: return default if not isinstance(value, str) or not value.strip(): @@ -460,7 +44,7 @@ def _as_str(value: Any, default: str) -> str: return value -def _as_positive_int(value: Any, default: int) -> int: +def _as_positive_int(value, default: int) -> int: if value is None: return default if not isinstance(value, int) or value <= 0: @@ -468,26 +52,6 @@ def _as_positive_int(value: Any, default: int) -> int: return value -def _build_handler( - spec: PostprocessorSpec, root_dir: Path, runtime: PostprocessorRuntime -) -> GraphPostprocessor: - if runtime == PostprocessorRuntime.INPROCESS: - return InProcessPostprocessor(class_path=spec.class_path) - if runtime == PostprocessorRuntime.PERSISTENT: - return PersistentSubprocessPostprocessor(spec=spec, root_dir=root_dir) - return OneshotSubprocessPostprocessor(spec=spec, root_dir=root_dir) - - -def _normalize_runtime(value: str | None) -> PostprocessorRuntime: - raw = (value or 
PostprocessorRuntime.ONESHOT.value).strip().lower() - try: - return PostprocessorRuntime(raw) - except ValueError: - raise ValueError( - "POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent, inprocess." - ) - - def _load_manifest_specs(manifest_path: Path) -> list[PostprocessorSpec]: if not manifest_path.exists(): return [] @@ -519,57 +83,16 @@ def _load_manifest_specs(manifest_path: Path) -> list[PostprocessorSpec]: f"{manifest_path}: postprocessors[{index}].class must be " "'package.module:ClassName'." ) - spec = PostprocessorSpec( + specs.append(PostprocessorSpec( class_path=class_path.strip(), python=_as_str(row.get("python"), default_python), - timeout_seconds=_as_positive_int( - row.get("timeout_seconds"), default_timeout - ), + timeout_seconds=_as_positive_int(row.get("timeout_seconds"), default_timeout), enabled=_as_bool(row.get("enabled"), default_enabled), - keep_temp_on_error=_as_bool( - row.get("keep_temp_on_error"), default_keep_temp - ), - ) - specs.append(spec) + keep_temp_on_error=_as_bool(row.get("keep_temp_on_error"), default_keep_temp), + )) return specs -def _build_runner_payload(context: PostprocessorContext) -> dict[str, Any]: - account = getattr(context, "account", None) - dataset_uri = str(getattr(account, "dataset_uri", "")).rstrip("/") - country_code = str(getattr(account, "country_code", "")).strip().lower() - account_key = ( - str(context.account_key).strip() - if getattr(context, "account_key", None) is not None - else "" - ) - profile = dict(getattr(context, "profile", {}) or {}) - if "settings" not in profile or not isinstance(profile.get("settings"), dict): - profile["settings"] = {} - profile_settings = dict(profile.get("settings", {}) or {}) - profile_settings.setdefault("api_url", "https://api.wordlift.io") - profile["settings"] = profile_settings - response = getattr(context, "response", None) - web_page = getattr(response, "web_page", None) if response else None - return { - "profile_name": context.profile_name, - 
"profile": profile, - "url": context.url, - "dataset_uri": dataset_uri, - "country_code": country_code, - "account_key": account_key or None, - "exports": context.exports, - "existing_web_page_id": context.existing_web_page_id, - "response": { - "id": getattr(response, "id", None) or context.existing_web_page_id, - "web_page": { - "url": getattr(web_page, "url", None), - "html": getattr(web_page, "html", None), - }, - }, - } - - def _load_from_specs( specs: list[PostprocessorSpec], root_dir: Path, @@ -629,57 +152,17 @@ def load_postprocessors( return loaded -def close_loaded_postprocessors(postprocessors: list[LoadedPostprocessor]) -> None: - for processor in postprocessors: - if isinstance(processor.handler, Closeable): - processor.handler.close() - - -def _write_graph_nquads(graph: Graph, path: Path) -> None: - dataset = Dataset() - for triple in graph: - dataset.add(triple) - dataset.serialize(destination=path, format="nquads") - - -def _read_graph_nquads(path: Path) -> Graph: - dataset = Dataset() - dataset.parse(path, format="nquads") - graph = Graph() - for triple in dataset.triples((None, None, None)): - graph.add(triple) - return graph - - -def _redact_debug_context(path: Path) -> None: - if not path.exists(): - return - try: - payload = json.loads(path.read_text(encoding="utf-8")) - except Exception: - return - if not isinstance(payload, dict): - return - if payload.get("account_key"): - payload["account_key"] = "***REDACTED***" - profile = payload.get("profile") - if isinstance(profile, dict) and profile.get("api_key"): - profile["api_key"] = "***REDACTED***" - settings = ( - profile.get("settings") - if isinstance(profile, dict) and isinstance(profile.get("settings"), dict) - else None - ) - if settings and settings.get("api_key"): - settings["api_key"] = "***REDACTED***" - if settings and settings.get("wordlift_key"): - settings["wordlift_key"] = "***REDACTED***" - if settings and settings.get("WORDLIFT_KEY"): - settings["WORDLIFT_KEY"] = 
"***REDACTED***" - if settings and settings.get("WORDLIFT_API_KEY"): - settings["WORDLIFT_API_KEY"] = "***REDACTED***" - payload["profile"] = profile - path.write_text( - json.dumps(payload, ensure_ascii=True, default=str), - encoding="utf-8", - ) +__all__ = [ + "Closeable", + "GraphPostprocessor", + "LoadedPostprocessor", + "PostprocessorContext", + "PostprocessorResult", + "PostprocessorRuntime", + "PostprocessorSpec", + "PersistentWorkerJobError", + "PersistentWorkerTransportError", + "close_loaded_postprocessors", + "load_postprocessors", + "load_postprocessors_for_profile", +] diff --git a/wordlift_sdk/kg_build/postprocessors/graph_io.py b/wordlift_sdk/kg_build/postprocessors/graph_io.py new file mode 100644 index 0000000..866189c --- /dev/null +++ b/wordlift_sdk/kg_build/postprocessors/graph_io.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +from rdflib import Dataset, Graph + +from .types import Closeable, LoadedPostprocessor, PostprocessorContext + + +def _build_runner_payload(context: PostprocessorContext) -> dict[str, Any]: + account = getattr(context, "account", None) + dataset_uri = str(getattr(account, "dataset_uri", "")).rstrip("/") + country_code = str(getattr(account, "country_code", "")).strip().lower() + account_key = ( + str(context.account_key).strip() + if getattr(context, "account_key", None) is not None + else "" + ) + profile = dict(getattr(context, "profile", {}) or {}) + if "settings" not in profile or not isinstance(profile.get("settings"), dict): + profile["settings"] = {} + profile_settings = dict(profile.get("settings", {}) or {}) + profile_settings.setdefault("api_url", "https://api.wordlift.io") + profile["settings"] = profile_settings + response = getattr(context, "response", None) + web_page = getattr(response, "web_page", None) if response else None + return { + "profile_name": context.profile_name, + "profile": profile, + "url": context.url, + 
"dataset_uri": dataset_uri, + "country_code": country_code, + "account_key": account_key or None, + "exports": context.exports, + "existing_web_page_id": context.existing_web_page_id, + "response": { + "id": getattr(response, "id", None) or context.existing_web_page_id, + "web_page": { + "url": getattr(web_page, "url", None), + "html": getattr(web_page, "html", None), + }, + }, + } + + +def close_loaded_postprocessors(postprocessors: list[LoadedPostprocessor]) -> None: + for processor in postprocessors: + if isinstance(processor.handler, Closeable): + processor.handler.close() + + +def _write_graph_nquads(graph: Graph, path: Path) -> None: + dataset = Dataset() + for triple in graph: + dataset.add(triple) + dataset.serialize(destination=path, format="nquads") + + +def _read_graph_nquads(path: Path) -> Graph: + dataset = Dataset() + dataset.parse(path, format="nquads") + graph = Graph() + for triple in dataset.triples((None, None, None)): + graph.add(triple) + return graph + + +def _redact_debug_context(path: Path) -> None: + if not path.exists(): + return + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception: + return + if not isinstance(payload, dict): + return + if payload.get("account_key"): + payload["account_key"] = "***REDACTED***" + profile = payload.get("profile") + if isinstance(profile, dict) and profile.get("api_key"): + profile["api_key"] = "***REDACTED***" + settings = ( + profile.get("settings") + if isinstance(profile, dict) and isinstance(profile.get("settings"), dict) + else None + ) + if settings and settings.get("api_key"): + settings["api_key"] = "***REDACTED***" + if settings and settings.get("wordlift_key"): + settings["wordlift_key"] = "***REDACTED***" + if settings and settings.get("WORDLIFT_KEY"): + settings["WORDLIFT_KEY"] = "***REDACTED***" + if settings and settings.get("WORDLIFT_API_KEY"): + settings["WORDLIFT_API_KEY"] = "***REDACTED***" + payload["profile"] = profile + path.write_text( + 
json.dumps(payload, ensure_ascii=True, default=str), + encoding="utf-8", + ) diff --git a/wordlift_sdk/kg_build/postprocessors/runner.py b/wordlift_sdk/kg_build/postprocessors/oneshot.py similarity index 98% rename from wordlift_sdk/kg_build/postprocessors/runner.py rename to wordlift_sdk/kg_build/postprocessors/oneshot.py index 7601a5d..6b8ceda 100644 --- a/wordlift_sdk/kg_build/postprocessors/runner.py +++ b/wordlift_sdk/kg_build/postprocessors/oneshot.py @@ -10,7 +10,7 @@ from rdflib import Dataset, Graph -from . import PostprocessorContext +from .types import PostprocessorContext from .processors.id_allocator import IdAllocator diff --git a/wordlift_sdk/kg_build/postprocessors/worker.py b/wordlift_sdk/kg_build/postprocessors/persistent.py similarity index 99% rename from wordlift_sdk/kg_build/postprocessors/worker.py rename to wordlift_sdk/kg_build/postprocessors/persistent.py index a1dd25c..eb04efb 100644 --- a/wordlift_sdk/kg_build/postprocessors/worker.py +++ b/wordlift_sdk/kg_build/postprocessors/persistent.py @@ -12,7 +12,7 @@ from rdflib import Dataset, Graph -from .runner import _build_context +from .oneshot import _build_context def _load_class(class_path: str): diff --git a/wordlift_sdk/kg_build/postprocessors/service.py b/wordlift_sdk/kg_build/postprocessors/service.py index a2b266a..f508bb6 100644 --- a/wordlift_sdk/kg_build/postprocessors/service.py +++ b/wordlift_sdk/kg_build/postprocessors/service.py @@ -10,12 +10,8 @@ from rdflib import Graph -from . 
import ( - LoadedPostprocessor, - PostprocessorContext, - PostprocessorResult, - close_loaded_postprocessors, -) +from .graph_io import close_loaded_postprocessors +from .types import LoadedPostprocessor, PostprocessorContext, PostprocessorResult logger = logging.getLogger(__name__) diff --git a/wordlift_sdk/kg_build/postprocessors/subprocess.py b/wordlift_sdk/kg_build/postprocessors/subprocess.py new file mode 100644 index 0000000..52b1dfe --- /dev/null +++ b/wordlift_sdk/kg_build/postprocessors/subprocess.py @@ -0,0 +1,396 @@ +from __future__ import annotations + +import asyncio +import importlib +import inspect +import json +import logging +import select +import shutil +import subprocess +import tempfile +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from rdflib import Graph + +from .types import ( + Closeable, + GraphPostprocessor, + LoadedPostprocessor, + PostprocessorContext, + PostprocessorRuntime, + PostprocessorSpec, + PersistentWorkerJobError, + PersistentWorkerTransportError, + _SubprocessRunner, +) + +logger = logging.getLogger(__name__) + + +class PersistentPostprocessorClient: + def __init__(self, *, spec: PostprocessorSpec, root_dir: Path) -> None: + self._spec = spec + self._root_dir = root_dir + self._process: subprocess.Popen[str] | None = None + self._next_job_id = 0 + + def close(self) -> None: + process = self._process + self._process = None + if process is None: + return + + try: + if process.poll() is None and process.stdin is not None: + process.stdin.write(json.dumps({"op": "shutdown"}) + "\n") + process.stdin.flush() + except Exception: + pass + + self._terminate(process) + + def process_graph( + self, + *, + input_graph_path: Path, + output_graph_path: Path, + context_payload: dict[str, Any], + ) -> None: + for attempt in range(2): + try: + self._process_graph_once( + input_graph_path=input_graph_path, + output_graph_path=output_graph_path, + context_payload=context_payload, + ) + return + 
except PersistentWorkerTransportError: + self.close() + if attempt == 1: + raise + + def _process_graph_once( + self, + *, + input_graph_path: Path, + output_graph_path: Path, + context_payload: dict[str, Any], + ) -> None: + process = self._ensure_started() + self._next_job_id += 1 + job_id = self._next_job_id + + payload = { + "op": "process", + "id": job_id, + "input_graph": str(input_graph_path), + "output_graph": str(output_graph_path), + "context": context_payload, + } + + try: + assert process.stdin is not None + process.stdin.write( + json.dumps(payload, ensure_ascii=True, default=str) + "\n" + ) + process.stdin.flush() + except Exception as exc: + raise PersistentWorkerTransportError( + f"Postprocessor worker stdin failed: {self._spec.class_path}" + ) from exc + + message = self._read_message( + process, timeout_seconds=self._spec.timeout_seconds + ) + if message.get("id") != job_id: + raise PersistentWorkerTransportError( + f"Postprocessor worker returned invalid response id for {self._spec.class_path}." 
+ ) + if message.get("ok") is True: + return + + error = str(message.get("error") or "unknown worker error") + raise PersistentWorkerJobError( + f"Postprocessor failed: {self._spec.class_path}\n{error}".strip() + ) + + def _ensure_started(self) -> subprocess.Popen[str]: + process = self._process + if process is not None and process.poll() is None: + return process + + cmd = [ + self._spec.python, + "-m", + "wordlift_sdk.kg_build.postprocessors.persistent", + "--class", + self._spec.class_path, + ] + process = subprocess.Popen( + cmd, + text=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=str(self._root_dir), + bufsize=1, + ) + + try: + ready = self._read_message( + process, timeout_seconds=min(self._spec.timeout_seconds, 60) + ) + except Exception: + self._terminate(process) + raise + + if ready.get("op") != "ready" or ready.get("ok") is not True: + stderr = self._read_stderr(process) + self._terminate(process) + raise PersistentWorkerTransportError( + f"Postprocessor worker failed to start: {self._spec.class_path}" + + (f"\n{stderr}" if stderr else "") + ) + + self._process = process + return process + + def _read_message( + self, + process: subprocess.Popen[str], + *, + timeout_seconds: int, + ) -> dict[str, Any]: + if process.stdout is None: + raise PersistentWorkerTransportError("Worker stdout is unavailable.") + + ready, _, _ = select.select([process.stdout], [], [], timeout_seconds) + if not ready: + self._terminate(process) + cmd = ( + process.args if isinstance(process.args, list) else [str(process.args)] + ) + raise subprocess.TimeoutExpired(cmd=cmd, timeout=timeout_seconds) + + line = process.stdout.readline() + if not line: + stderr = self._read_stderr(process) + self._terminate(process) + raise PersistentWorkerTransportError( + f"Postprocessor worker exited unexpectedly: {self._spec.class_path}" + + (f"\n{stderr}" if stderr else "") + ) + + try: + return json.loads(line) + except json.JSONDecodeError as exc: + 
raise PersistentWorkerTransportError( + "Postprocessor worker returned invalid JSON response." + ) from exc + + def _read_stderr(self, process: subprocess.Popen[str]) -> str: + if process.stderr is None: + return "" + try: + return (process.stderr.read() or "").strip() + except Exception: + return "" + + def _terminate(self, process: subprocess.Popen[str]) -> None: + if process.poll() is None: + process.kill() + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + pass + + +def _run_subprocess( + spec: PostprocessorSpec, + root_dir: Path, + graph: Graph, + payload: dict[str, Any], + runner: _SubprocessRunner, +) -> Graph | None: + """Shared scaffolding for subprocess-based postprocessors. + + Handles temp-dir lifecycle, graph serialization, output verification, + and debug-copy on failure. *runner* is called with the prepared paths + and is responsible only for the actual subprocess execution step. + """ + from .graph_io import _redact_debug_context, _read_graph_nquads, _write_graph_nquads + + temp_dir_path = Path(tempfile.mkdtemp(prefix="worai_pp_")) + failed = False + try: + input_graph_path = temp_dir_path / "input_graph.nq" + output_graph_path = temp_dir_path / "output_graph.nq" + context_path = temp_dir_path / "context.json" + + _write_graph_nquads(graph, input_graph_path) + context_path.write_text( + json.dumps(payload, ensure_ascii=True, default=str), + encoding="utf-8", + ) + + runner( + input_graph_path=input_graph_path, + output_graph_path=output_graph_path, + context_path=context_path, + context_payload=payload, + ) + + if not output_graph_path.exists(): + failed = True + raise RuntimeError( + f"Postprocessor did not produce output graph: {spec.class_path}" + ) + + return _read_graph_nquads(output_graph_path) + except Exception: + failed = True + raise + finally: + if failed and spec.keep_temp_on_error: + debug_dir = root_dir / "output" / "postprocessor_debug" + debug_dir.mkdir(parents=True, exist_ok=True) + target = debug_dir / 
(spec.class_path.replace(":", "_").replace(".", "_")) + if target.exists(): + shutil.rmtree(target) + shutil.copytree(temp_dir_path, target) + _redact_debug_context(target / "context.json") + if temp_dir_path.exists(): + shutil.rmtree(temp_dir_path, ignore_errors=True) + + +@dataclass(frozen=True) +class OneshotSubprocessPostprocessor: + spec: PostprocessorSpec + root_dir: Path + + def process_graph( + self, graph: Graph, context: PostprocessorContext + ) -> Graph | None: + from .graph_io import _build_runner_payload + return _run_subprocess( + self.spec, self.root_dir, graph, _build_runner_payload(context), self._run + ) + + def _run( + self, + *, + input_graph_path: Path, + output_graph_path: Path, + context_path: Path, + **_: Any, + ) -> None: + cmd = [ + self.spec.python, + "-m", + "wordlift_sdk.kg_build.postprocessors.oneshot", + "--class", + self.spec.class_path, + "--input-graph", + str(input_graph_path), + "--output-graph", + str(output_graph_path), + "--context", + str(context_path), + ] + completed = subprocess.run( + cmd, + text=True, + capture_output=True, + cwd=str(self.root_dir), + timeout=self.spec.timeout_seconds, + check=False, + ) + if completed.returncode != 0: + stderr = (completed.stderr or "").strip() + raise RuntimeError( + f"Postprocessor failed: {self.spec.class_path} " + f"(exit={completed.returncode})" + (f"\n{stderr}" if stderr else "") + ) + + +@dataclass +class PersistentSubprocessPostprocessor: + spec: PostprocessorSpec + root_dir: Path + _client: PersistentPostprocessorClient | None = field( + init=False, + default=None, + repr=False, + ) + + def close(self) -> None: + if self._client is not None: + self._client.close() + self._client = None + + def process_graph( + self, graph: Graph, context: PostprocessorContext + ) -> Graph | None: + from .graph_io import _build_runner_payload + return _run_subprocess( + self.spec, self.root_dir, graph, _build_runner_payload(context), self._run + ) + + def _run( + self, + *, + input_graph_path: 
Path, + output_graph_path: Path, + context_payload: dict[str, Any], + **_: Any, + ) -> None: + if self._client is None: + self._client = PersistentPostprocessorClient( + spec=self.spec, + root_dir=self.root_dir, + ) + self._client.process_graph( + input_graph_path=input_graph_path, + output_graph_path=output_graph_path, + context_payload=context_payload, + ) + + +@dataclass(frozen=True) +class InProcessPostprocessor: + class_path: str + + def process_graph( + self, graph: Graph, context: PostprocessorContext + ) -> Graph | None: + module_name, class_name = self.class_path.split(":", 1) + module = importlib.import_module(module_name) + klass = getattr(module, class_name) + processor = klass() + result = processor.process_graph(graph, context) + if inspect.isawaitable(result): + result = asyncio.run(result) + return result + + +def _build_handler( + spec: PostprocessorSpec, root_dir: Path, runtime: PostprocessorRuntime +) -> GraphPostprocessor: + if runtime == PostprocessorRuntime.INPROCESS: + return InProcessPostprocessor(class_path=spec.class_path) + if runtime == PostprocessorRuntime.PERSISTENT: + return PersistentSubprocessPostprocessor(spec=spec, root_dir=root_dir) + return OneshotSubprocessPostprocessor(spec=spec, root_dir=root_dir) + + +def _normalize_runtime(value: str | None) -> PostprocessorRuntime: + raw = (value or PostprocessorRuntime.ONESHOT.value).strip().lower() + try: + return PostprocessorRuntime(raw) + except ValueError: + raise ValueError( + "POSTPROCESSOR_RUNTIME must be one of: oneshot, persistent, inprocess." 
+ ) diff --git a/wordlift_sdk/kg_build/postprocessors/types.py b/wordlift_sdk/kg_build/postprocessors/types.py new file mode 100644 index 0000000..a9323bd --- /dev/null +++ b/wordlift_sdk/kg_build/postprocessors/types.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any, Protocol, runtime_checkable + +from rdflib import Graph + + +class PostprocessorRuntime(str, Enum): + ONESHOT = "oneshot" + PERSISTENT = "persistent" + INPROCESS = "inprocess" + + +@dataclass(frozen=True) +class PostprocessorContext: + profile_name: str + profile: dict[str, Any] + url: str + account: Any + account_key: str | None + exports: dict[str, Any] + response: Any + existing_web_page_id: str | None + existing_import_hash: str | None = None + import_hash_mode: str = "on" + ids: Any | None = None + + +@dataclass(frozen=True) +class PostprocessorSpec: + class_path: str + python: str + timeout_seconds: int + enabled: bool + keep_temp_on_error: bool + + +class _SubprocessRunner(Protocol): + def __call__( + self, + *, + input_graph_path: Path, + output_graph_path: Path, + context_path: Path, + context_payload: dict[str, Any], + ) -> None: ... + + +@runtime_checkable +class Closeable(Protocol): + def close(self) -> None: ... + + +@runtime_checkable +class GraphPostprocessor(Protocol): + def process_graph( + self, graph: Graph, context: PostprocessorContext + ) -> Graph | None: ... 
+ + +@dataclass(frozen=True) +class PostprocessorResult: + graph: Graph + queue_wait_ms: int + postprocessors_ms: int + + +@dataclass(frozen=True) +class LoadedPostprocessor: + name: str + handler: GraphPostprocessor + + def run(self, graph: Graph, context: PostprocessorContext) -> Graph: + result = self.handler.process_graph(graph, context) + return graph if result is None else result + + +class PersistentWorkerTransportError(RuntimeError): + pass + + +class PersistentWorkerJobError(RuntimeError): + pass From 2af8809a4a315521124bac917f4d1709f94ef160 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 17:12:02 +0100 Subject: [PATCH 52/63] fix: close GraphQueue ApiClient on protocol shutdown --- wordlift_sdk/kg_build/protocol.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index d6cb99b..1295f64 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -399,10 +399,11 @@ async def callback( outcome.validation_ms if outcome else 0, ) - def close(self) -> None: + async def close(self) -> None: self._postprocessor_service.close() self._mapping_executor.shutdown(wait=False) self._shacl_validator.close() + await self.context.graph_queue.close() def get_kpi_summary(self) -> dict[str, object]: return self._kpi.summary(self.profile.name) From 1b7fa15a8dd5c543b6e5d9e635a9e9fd49f9f167 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Thu, 19 Mar 2026 19:13:14 +0100 Subject: [PATCH 53/63] feat(tests): align kg_build tests with refactored protocol and postprocessor API --- tests/kg_build/test_graph_utils.py | 100 ++++ tests/kg_build/test_kpi.py | 13 +- .../test_postprocessor_runner_helpers.py | 2 +- .../test_postprocessor_runner_main.py | 2 +- tests/kg_build/test_postprocessor_service.py | 164 ++++++ tests/kg_build/test_postprocessor_worker.py | 2 +- tests/kg_build/test_postprocessors.py | 54 +- 
tests/kg_build/test_profile_inheritance.py | 3 +- tests/kg_build/test_protocol.py | 521 +++++++++--------- tests/kg_build/test_rml_mapping.py | 31 +- .../kg_build/postprocessors/subprocess.py | 7 +- 11 files changed, 575 insertions(+), 324 deletions(-) create mode 100644 tests/kg_build/test_graph_utils.py create mode 100644 tests/kg_build/test_postprocessor_service.py diff --git a/tests/kg_build/test_graph_utils.py b/tests/kg_build/test_graph_utils.py new file mode 100644 index 0000000..4f1dec4 --- /dev/null +++ b/tests/kg_build/test_graph_utils.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +from rdflib import Graph, Literal, URIRef + +from wordlift_sdk.kg_build.graph_utils import first_level_subjects + +DATASET = "https://data.example.com" + + +def _uri(path: str) -> URIRef: + return URIRef(f"{DATASET}/{path}") + + +def _ext(path: str) -> URIRef: + return URIRef(f"https://external.example.com/{path}") + + +def test_empty_graph_returns_empty_set() -> None: + assert first_level_subjects(Graph(), DATASET) == set() + + +def test_dataset_uri_match_returns_two_segment_subjects() -> None: + g = Graph() + canonical = _uri("articles/my-article") # 2 segments → first-level + deep = _uri("articles/my-article/comments/1") # 4 segments → not first-level + g.add((canonical, URIRef("https://schema.org/name"), Literal("Article"))) + g.add((deep, URIRef("https://schema.org/name"), Literal("Comment"))) + + result = first_level_subjects(g, DATASET) + assert canonical in result + assert deep not in result + + +def test_dataset_uri_match_ignores_single_segment() -> None: + g = Graph() + one_seg = _uri("articles") # 1 segment → not first-level by-id + two_seg = _uri("articles/slug") # 2 segments → first-level + g.add((one_seg, URIRef("https://schema.org/name"), Literal("Collection"))) + g.add((two_seg, URIRef("https://schema.org/name"), Literal("Item"))) + + result = first_level_subjects(g, DATASET) + assert two_seg in result + assert one_seg not in result + + +def 
test_fallback_to_unreferenced_subjects_when_no_dataset_match() -> None: + g = Graph() + root = _ext("root") + child = _ext("child") + # child is referenced by root, so root is the unreferenced subject + g.add((root, URIRef("https://schema.org/hasPart"), child)) + g.add((child, URIRef("https://schema.org/name"), Literal("Child"))) + + # No dataset_uri prefix match; fall back to "not referenced" logic + result = first_level_subjects(g, "") + assert root in result + assert child not in result + + +def test_fallback_returns_all_when_everything_is_referenced() -> None: + g = Graph() + a = _ext("a") + b = _ext("b") + # mutual references: both are referenced + g.add((a, URIRef("https://schema.org/hasPart"), b)) + g.add((b, URIRef("https://schema.org/hasPart"), a)) + + result = first_level_subjects(g, "") + assert result == {a, b} + + +def test_blank_dataset_uri_uses_reference_fallback() -> None: + g = Graph() + page = _ext("page") + product = _ext("product") + g.add((page, URIRef("https://schema.org/mentions"), product)) + g.add((product, URIRef("https://schema.org/name"), Literal("Product"))) + + result = first_level_subjects(g, "") + assert page in result + assert product not in result + + +def test_dataset_uri_prefix_no_match_falls_back_gracefully() -> None: + g = Graph() + ext_subject = _ext("item") + g.add((ext_subject, URIRef("https://schema.org/name"), Literal("External"))) + + # dataset_uri set but no subject matches the prefix + result = first_level_subjects(g, DATASET) + assert ext_subject in result + + +def test_literal_objects_are_not_counted_as_subjects() -> None: + g = Graph() + s = _uri("things/item") + g.add((s, URIRef("https://schema.org/name"), Literal("Name"))) + + result = first_level_subjects(g, DATASET) + assert s in result diff --git a/tests/kg_build/test_kpi.py b/tests/kg_build/test_kpi.py index 905a54b..69ef8e2 100644 --- a/tests/kg_build/test_kpi.py +++ b/tests/kg_build/test_kpi.py @@ -1,6 +1,7 @@ from rdflib import Graph, Literal, RDF, URIRef 
from wordlift_sdk.kg_build.kpi import KgBuildKpiCollector +from wordlift_sdk.validation.shacl_validation_service import ValidationOutcome def test_kpi_collector_records_graph_and_validation() -> None: @@ -14,11 +15,13 @@ def test_kpi_collector_records_graph_and_validation() -> None: collector.record_graph(graph) collector.record_validation( - passed=False, - warning_count=2, - error_count=1, - warning_sources={"google-article": 2}, - error_sources={"google-product": 1}, + ValidationOutcome( + passed=False, + warning_sources={"google-article": 2}, + error_sources={"google-product": 1}, + queue_wait_ms=0, + validation_ms=0, + ) ) summary = collector.summary("demo") diff --git a/tests/kg_build/test_postprocessor_runner_helpers.py b/tests/kg_build/test_postprocessor_runner_helpers.py index 1083eaa..1034147 100644 --- a/tests/kg_build/test_postprocessor_runner_helpers.py +++ b/tests/kg_build/test_postprocessor_runner_helpers.py @@ -4,7 +4,7 @@ from rdflib import Graph, Literal, URIRef -from wordlift_sdk.kg_build import postprocessor_runner as runner +from wordlift_sdk.kg_build.postprocessors import oneshot as runner def test_load_class_variants(monkeypatch) -> None: diff --git a/tests/kg_build/test_postprocessor_runner_main.py b/tests/kg_build/test_postprocessor_runner_main.py index 0b3bc8e..224010f 100644 --- a/tests/kg_build/test_postprocessor_runner_main.py +++ b/tests/kg_build/test_postprocessor_runner_main.py @@ -6,7 +6,7 @@ from rdflib import Graph, Literal, URIRef -from wordlift_sdk.kg_build import postprocessor_runner as runner +from wordlift_sdk.kg_build.postprocessors import oneshot as runner def _graph() -> Graph: diff --git a/tests/kg_build/test_postprocessor_service.py b/tests/kg_build/test_postprocessor_service.py new file mode 100644 index 0000000..df0cf49 --- /dev/null +++ b/tests/kg_build/test_postprocessor_service.py @@ -0,0 +1,164 @@ +from __future__ import annotations + +import asyncio +from types import SimpleNamespace + +import pytest +from rdflib 
import Graph, Literal, URIRef + +from wordlift_sdk.kg_build.postprocessors.service import PostprocessorService +from wordlift_sdk.kg_build.postprocessors.types import ( + LoadedPostprocessor, + PostprocessorContext, +) + + +def _sample_graph() -> Graph: + g = Graph() + g.add( + ( + URIRef("https://example.com/s"), + URIRef("https://example.com/p"), + Literal("v"), + ) + ) + return g + + +def _sample_context() -> PostprocessorContext: + return PostprocessorContext( + profile_name="test", + profile={}, + url="https://example.com/page", + account=SimpleNamespace(dataset_uri="https://data.example.com"), + account_key=None, + exports={}, + response=SimpleNamespace( + id=None, web_page=SimpleNamespace(url=None, html=None) + ), + existing_web_page_id=None, + ) + + +def _make_service(pool_size: int = 1, processors=None) -> PostprocessorService: + if processors is None: + + class _Passthrough: + def process_graph(self, graph: Graph, context) -> Graph: + return graph + + processors = [LoadedPostprocessor(name="passthrough", handler=_Passthrough())] + + return PostprocessorService( + postprocessors_factory=lambda: processors, + pool_size=pool_size, + ) + + +def test_apply_returns_result_with_graph_and_timings() -> None: + service = _make_service() + result = asyncio.run(service.apply(_sample_graph(), _sample_context())) + service.close() + + assert isinstance(result.graph, Graph) + assert len(result.graph) == 1 + assert result.queue_wait_ms >= 0 + assert result.postprocessors_ms >= 0 + + +def test_apply_runs_processors_in_order() -> None: + additions: list[int] = [] + + class _Mark: + def __init__(self, n: int) -> None: + self._n = n + + def process_graph(self, graph: Graph, context) -> Graph: + additions.append(self._n) + graph.add( + ( + URIRef(f"https://example.com/s{self._n}"), + URIRef("https://example.com/p"), + Literal(self._n), + ) + ) + return graph + + processors = [ + LoadedPostprocessor(name="first", handler=_Mark(1)), + LoadedPostprocessor(name="second", 
handler=_Mark(2)), + ] + service = PostprocessorService( + postprocessors_factory=lambda: processors, + pool_size=1, + ) + result = asyncio.run(service.apply(_sample_graph(), _sample_context())) + service.close() + + assert additions == [1, 2] + assert len(result.graph) == 3 # original + 2 added + + +def test_close_calls_close_on_closeable_handlers() -> None: + class _Closeable: + def __init__(self) -> None: + self.closed = False + + def close(self) -> None: + self.closed = True + + def process_graph(self, graph: Graph, context) -> Graph: + return graph + + handler = _Closeable() + service = PostprocessorService( + postprocessors_factory=lambda: [LoadedPostprocessor(name="c", handler=handler)], + pool_size=1, + ) + service.close() + + assert handler.closed is True + + +def test_pool_isolates_slots() -> None: + """Each slot in the pool should be an independent list of processors.""" + slot_ids: list[int] = [] + + class _Recorder: + def __init__(self, slot_id: int) -> None: + self._slot_id = slot_id + + def process_graph(self, graph: Graph, context) -> Graph: + slot_ids.append(self._slot_id) + return graph + + slot_counter = [0] + + def factory() -> list[LoadedPostprocessor]: + slot_counter[0] += 1 + sid = slot_counter[0] + return [LoadedPostprocessor(name=f"slot-{sid}", handler=_Recorder(sid))] + + pool_size = 2 + service = PostprocessorService(postprocessors_factory=factory, pool_size=pool_size) + try: + # Run both slots sequentially + asyncio.run(service.apply(_sample_graph(), _sample_context())) + asyncio.run(service.apply(_sample_graph(), _sample_context())) + finally: + service.close() + + # Both slots should have been used (order may vary but both IDs present) + assert len(slot_ids) == 2 + assert set(slot_ids) == {1, 2} + + +@pytest.mark.asyncio +async def test_apply_async_returns_correct_graph() -> None: + service = _make_service() + graph = _sample_graph() + result = await service.apply(graph, _sample_context()) + service.close() + + assert 
isinstance(result.graph, Graph) + assert len(result.graph) == 1 diff --git a/tests/kg_build/test_postprocessor_worker.py b/tests/kg_build/test_postprocessor_worker.py index 7f359a3..0cfe3b7 100644 --- a/tests/kg_build/test_postprocessor_worker.py +++ b/tests/kg_build/test_postprocessor_worker.py @@ -7,7 +7,7 @@ from rdflib import Graph, Literal, URIRef -from wordlift_sdk.kg_build import postprocessor_worker as worker +from wordlift_sdk.kg_build.postprocessors import persistent as worker def _graph() -> Graph: diff --git a/tests/kg_build/test_postprocessors.py b/tests/kg_build/test_postprocessors.py index 6ce1284..ed812ab 100644 --- a/tests/kg_build/test_postprocessors.py +++ b/tests/kg_build/test_postprocessors.py @@ -20,11 +20,14 @@ LoadedPostprocessor, PostprocessorContext, PostprocessorSpec, - SubprocessPostprocessor, - _build_runner_payload, close_loaded_postprocessors, load_postprocessors_for_profile, ) +from wordlift_sdk.kg_build.postprocessors.graph_io import _build_runner_payload +from wordlift_sdk.kg_build.postprocessors.subprocess import ( + OneshotSubprocessPostprocessor, + PersistentSubprocessPostprocessor, +) PROJECT_ROOT = Path(__file__).resolve().parents[2] _current_pythonpath = os.environ.get("PYTHONPATH", "") @@ -162,8 +165,8 @@ class = "test_pp:ProfileTwo" first = loaded[0].handler second = loaded[1].handler - assert isinstance(second, SubprocessPostprocessor) - assert isinstance(first, SubprocessPostprocessor) + assert isinstance(second, OneshotSubprocessPostprocessor) + assert isinstance(first, OneshotSubprocessPostprocessor) assert first.spec.python == "/profile/python" assert first.spec.timeout_seconds == 17 assert first.spec.keep_temp_on_error is True @@ -190,7 +193,7 @@ class = "test_pp:BaseOne" assert [item.name for item in loaded] == ["test_pp:BaseOne"] first = loaded[0].handler - assert isinstance(first, SubprocessPostprocessor) + assert isinstance(first, OneshotSubprocessPostprocessor) assert first.spec.python == "/base/python" assert 
first.spec.timeout_seconds == 11 assert first.spec.keep_temp_on_error is False @@ -219,8 +222,7 @@ class = "test_pp:ProfileOne" runtime="persistent", ) assert len(loaded) == 1 - assert isinstance(loaded[0].handler, SubprocessPostprocessor) - assert loaded[0].handler.runtime == "persistent" + assert isinstance(loaded[0].handler, PersistentSubprocessPostprocessor) def test_subprocess_execution_and_nquads_exchange(tmp_path: Path) -> None: @@ -249,7 +251,7 @@ def process_graph(self, graph, context): enabled=True, keep_temp_on_error=False, ) - processor = SubprocessPostprocessor(spec=spec, root_dir=root) + processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root) output = processor.process_graph(_sample_graph(), _sample_context()) assert output is not None @@ -291,11 +293,7 @@ def process_graph(self, graph, context): enabled=True, keep_temp_on_error=False, ) - processor = SubprocessPostprocessor( - spec=spec, - root_dir=root, - runtime="persistent", - ) + processor = PersistentSubprocessPostprocessor(spec=spec, root_dir=root) first = processor.process_graph(_sample_graph(), _sample_context()) second = processor.process_graph(_sample_graph(), _sample_context()) @@ -351,7 +349,12 @@ def process_graph(self, graph, context): enabled=True, keep_temp_on_error=False, ) - processor = SubprocessPostprocessor(spec=spec, root_dir=root, runtime=runtime) + cls = ( + PersistentSubprocessPostprocessor + if runtime == "persistent" + else OneshotSubprocessPostprocessor + ) + processor = cls(spec=spec, root_dir=root) try: output = processor.process_graph( _sample_graph(), @@ -405,7 +408,12 @@ def process_graph(self, graph, context): enabled=True, keep_temp_on_error=False, ) - processor = SubprocessPostprocessor(spec=spec, root_dir=root, runtime=runtime) + cls = ( + PersistentSubprocessPostprocessor + if runtime == "persistent" + else OneshotSubprocessPostprocessor + ) + processor = cls(spec=spec, root_dir=root) try: output = processor.process_graph( _sample_graph(), @@ -471,7 
+479,7 @@ def process_graph(self, graph, context): [ sys.executable, "-m", - "wordlift_sdk.kg_build.postprocessors.runner", + "wordlift_sdk.kg_build.postprocessors.oneshot", "--class", "test_pp:AddRunnerTriple", "--input-graph", @@ -517,7 +525,7 @@ def process_graph(self, graph, context): enabled=True, keep_temp_on_error=False, ) - processor = SubprocessPostprocessor(spec=spec, root_dir=root) + processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root) with pytest.raises(subprocess.TimeoutExpired): processor.process_graph(_sample_graph(), _sample_context()) @@ -543,11 +551,7 @@ def process_graph(self, graph, context): enabled=True, keep_temp_on_error=False, ) - processor = SubprocessPostprocessor( - spec=spec, - root_dir=root, - runtime="persistent", - ) + processor = PersistentSubprocessPostprocessor(spec=spec, root_dir=root) with pytest.raises(subprocess.TimeoutExpired): processor.process_graph(_sample_graph(), _sample_context()) @@ -571,7 +575,7 @@ def process_graph(self, graph, context): enabled=True, keep_temp_on_error=True, ) - processor = SubprocessPostprocessor(spec=spec, root_dir=root) + processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root) with pytest.raises(RuntimeError): processor.process_graph(_sample_graph(), _sample_context()) @@ -607,7 +611,7 @@ def process_graph(self, graph, context): enabled=True, keep_temp_on_error=True, ) - processor = SubprocessPostprocessor(spec=spec, root_dir=root) + processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root) secret = "top-secret-key" with pytest.raises(RuntimeError): @@ -683,7 +687,7 @@ def test_subprocess_uses_inherited_environment_without_pythonpath_injection( enabled=True, keep_temp_on_error=False, ) - processor = SubprocessPostprocessor(spec=spec, root_dir=root) + processor = OneshotSubprocessPostprocessor(spec=spec, root_dir=root) captured: dict[str, object] = {} def fake_run(*args, **kwargs): diff --git a/tests/kg_build/test_profile_inheritance.py 
b/tests/kg_build/test_profile_inheritance.py index 8bb90e1..fbdea15 100644 --- a/tests/kg_build/test_profile_inheritance.py +++ b/tests/kg_build/test_profile_inheritance.py @@ -45,7 +45,8 @@ def test_runtime_inherits_from_base_when_selected_missing(tmp_path: Path) -> Non ) assert profile.settings["postprocessor_runtime"] == "persistent" - assert protocol._postprocessor_runtime == "persistent" + # Verify the protocol accepted the inherited runtime (service is initialised without error) + assert protocol._postprocessor_service is not None def test_validation_settings_parse_into_profile_settings(tmp_path: Path) -> None: diff --git a/tests/kg_build/test_protocol.py b/tests/kg_build/test_protocol.py index 3e46fed..43efe7e 100644 --- a/tests/kg_build/test_protocol.py +++ b/tests/kg_build/test_protocol.py @@ -7,7 +7,6 @@ from jinja2 import UndefinedError from rdflib import BNode, Graph, Literal, RDF, URIRef from wordlift_client import WebPage, WebPageScrapeResponse -from wordlift_sdk.validation.shacl import ValidationResult from wordlift_sdk.kg_build.config.loader import ProfileDefinition, ProfileMappingRoute import wordlift_sdk.kg_build.protocol as protocol_module @@ -16,6 +15,17 @@ _path_contains_part, _resolve_postprocessor_runtime, ) +from wordlift_sdk.kg_build.rml_mapping import MappingResult +from wordlift_sdk.kg_build.postprocessors.types import PostprocessorResult +from wordlift_sdk.kg_build.postprocessors.processors.graph_annotation import ( + ImportAnnotationPostprocessor, +) +from wordlift_sdk.kg_build.postprocessors.processors.id_postprocessor import ( + CanonicalIdsPostprocessor, + RootIdReconcilerPostprocessor, + _find_web_page_iri as _find_web_page_iri_impl, +) +from wordlift_sdk.validation.shacl_validation_service import ValidationOutcome def _make_profile() -> ProfileDefinition: @@ -60,7 +70,7 @@ def _make_context() -> SimpleNamespace: return SimpleNamespace( account=SimpleNamespace(dataset_uri="https://data.example.com/dataset"), 
client_configuration=SimpleNamespace(api_key={}), - graph_queue=SimpleNamespace(put=AsyncMock()), + graph_queue=SimpleNamespace(put=AsyncMock(), close=AsyncMock()), configuration_provider=SimpleNamespace( get_value=lambda *_args, **_kwargs: None ), @@ -71,13 +81,73 @@ def _make_context_without_dataset() -> SimpleNamespace: return SimpleNamespace( account=SimpleNamespace(dataset_uri=None), client_configuration=SimpleNamespace(api_key={}), - graph_queue=SimpleNamespace(put=AsyncMock()), + graph_queue=SimpleNamespace(put=AsyncMock(), close=AsyncMock()), configuration_provider=SimpleNamespace( get_value=lambda *_args, **_kwargs: None ), ) +def _make_mapping_result(graph: Graph) -> MappingResult: + return MappingResult(graph=graph, queue_wait_ms=0, mapping_ms=0) + + +def _make_validation_outcome( + *, + passed: bool, + warning_sources: dict | None = None, + error_sources: dict | None = None, +) -> ValidationOutcome: + return ValidationOutcome( + passed=passed, + warning_sources=warning_sources or {}, + error_sources=error_sources or {}, + queue_wait_ms=0, + validation_ms=0, + ) + + +def _passthrough_pp() -> AsyncMock: + return AsyncMock( + side_effect=lambda g, url, resp, ewi, eih: PostprocessorResult( + graph=g, queue_wait_ms=0, postprocessors_ms=0 + ) + ) + + +def _annotating_pp( + dataset_uri: str = "https://data.example.com/dataset", + import_hash_mode: str = "on", +) -> AsyncMock: + async def _stage(graph, url, resp, ewi, eih): + ctx = SimpleNamespace( + account=SimpleNamespace(dataset_uri=dataset_uri), + existing_import_hash=eih, + import_hash_mode=import_hash_mode, + ) + g = ImportAnnotationPostprocessor().process_graph(graph, ctx) + return PostprocessorResult(graph=g, queue_wait_ms=0, postprocessors_ms=0) + + return AsyncMock(side_effect=_stage) + + +def _reconciling_pp( + dataset_uri: str = "https://data.example.com/dataset", +) -> AsyncMock: + async def _stage(graph, url, resp, ewi, eih): + ctx = SimpleNamespace( + 
account=SimpleNamespace(dataset_uri=dataset_uri), + existing_import_hash=eih, + import_hash_mode="on", + existing_web_page_id=ewi, + ) + g = RootIdReconcilerPostprocessor().process_graph(graph, ctx) + g = ImportAnnotationPostprocessor().process_graph(g, ctx) + return PostprocessorResult(graph=g, queue_wait_ms=0, postprocessors_ms=0) + + return AsyncMock(side_effect=_stage) + + def _make_graph(subject: str) -> Graph: graph = Graph() s = URIRef(subject) @@ -130,12 +200,13 @@ async def test_profile_protocol_reconciles_to_existing_id_and_sets_source(): protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_graph("https://example.com/mapped-web-page") + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result( + _make_graph("https://example.com/mapped-web-page") + ) ) + protocol._run_postprocessing_stage = _reconciling_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -169,12 +240,11 @@ async def test_profile_protocol_put_strategy_writes_to_graph_queue() -> None: protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_dataset_scoped_graph() + 
protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_dataset_scoped_graph()) ) + protocol._run_postprocessing_stage = _passthrough_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -197,7 +267,7 @@ async def test_static_templates_use_graph_queue_when_put_strategy_enabled() -> N ) protocol._template_graph = _make_dataset_scoped_graph() protocol._template_exports = {} - protocol._validate_graph_if_enabled = MagicMock(return_value=None) + protocol._shacl_validator.validate = AsyncMock(return_value=None) protocol._emit_progress = MagicMock() protocol._kpi.record_graph = MagicMock() protocol.patcher.patch_all = AsyncMock() @@ -220,8 +290,7 @@ async def test_profile_protocol_put_strategy_honors_import_hash_write_mode() -> protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) + protocol._run_postprocessing_stage = _passthrough_pp() protocol.patcher.patch_all = AsyncMock() graph = _make_dataset_scoped_graph() child = URIRef("https://data.example.com/dataset/entities/article-1/faq/1") @@ -233,7 +302,7 @@ async def test_profile_protocol_put_strategy_honors_import_hash_write_mode() -> ) ) graph.add((child, RDF.type, URIRef("https://schema.org/Question"))) - protocol.rml_service.apply_mapping = AsyncMock(return_value=graph) + protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph)) response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -264,17 +333,22 @@ async def test_profile_protocol_put_strategy_skips_when_import_hash_matches() -> protocol._patch_static_templates_once = AsyncMock() 
protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() graph = _make_dataset_scoped_graph() - protocol._set_source(graph, existing_web_page_id=None) + # Pre-annotate so the expected hash matches what the pipeline will produce + ann_ctx = SimpleNamespace( + account=context.account, + existing_import_hash=None, + import_hash_mode="on", + ) + ImportAnnotationPostprocessor().process_graph(graph, ann_ctx) expected_hash = protocol.patcher._compute_import_hash( URIRef("https://data.example.com/dataset/web-pages/1"), graph, "https://data.example.com/dataset", ) - protocol.rml_service.apply_mapping = AsyncMock(return_value=graph) + protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph)) + protocol._run_postprocessing_stage = _annotating_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -297,12 +371,11 @@ async def test_profile_protocol_put_strategy_honors_import_hash_off_mode() -> No protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_dataset_scoped_graph() + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_dataset_scoped_graph()) ) + protocol._run_postprocessing_stage = _passthrough_pp() + protocol.patcher.patch_all = AsyncMock() response = 
WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -335,13 +408,12 @@ async def test_profile_protocol_sets_source_on_mapped_subject_when_existing_id_m protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() mapped_subject = "https://example.com/mapped-web-page" - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_graph(mapped_subject) + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_graph(mapped_subject)) ) + protocol._run_postprocessing_stage = _annotating_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -367,12 +439,11 @@ async def test_profile_protocol_sets_source_only_on_first_level_uri_subjects(): protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_multi_entity_graph() + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_multi_entity_graph()) ) + protocol._run_postprocessing_stage = _annotating_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -423,11 +494,8 @@ async def test_callback_runs_canonical_ids_after_postprocessors() -> 
None: Literal("https://translated.com/developers"), ) ) - protocol.rml_service.apply_mapping = AsyncMock(return_value=mapped_graph) - def _inject_service_product_and_fragment_offer( - graph: Graph, *_args, **_kwargs - ) -> Graph: + async def _pp_with_injection(graph, url, resp, ewi, eih): graph.add((root, RDF.type, URIRef("http://schema.org/Product"))) graph.add((root, RDF.type, URIRef("http://schema.org/Service"))) graph.add( @@ -444,11 +512,17 @@ def _inject_service_product_and_fragment_offer( URIRef(f"{root}#aggregate-offer-usd"), ) ) - return graph + ctx = SimpleNamespace( + account=SimpleNamespace(dataset_uri="https://data.example.com/dataset"), + extensions=None, + ) + g = CanonicalIdsPostprocessor().process_graph(graph, ctx) + return PostprocessorResult(graph=g, queue_wait_ms=0, postprocessors_ms=0) - protocol._apply_postprocessors = MagicMock( - side_effect=_inject_service_product_and_fragment_offer + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(mapped_graph) ) + protocol._run_postprocessing_stage = AsyncMock(side_effect=_pp_with_injection) response = WebPageScrapeResponse( web_page=WebPage(url="https://translated.com/developers", html="") @@ -484,12 +558,11 @@ async def test_profile_protocol_applies_existing_import_hash_to_all_uri_subjects protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_multi_entity_graph() + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_multi_entity_graph()) ) + protocol._run_postprocessing_stage = _annotating_pp() + protocol.patcher.patch_all = AsyncMock() 
response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -520,15 +593,14 @@ async def test_profile_protocol_sets_source_when_web_page_absent_but_uri_subject protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) + protocol._run_postprocessing_stage = _annotating_pp() protocol.patcher.patch_all = AsyncMock() graph = Graph() article = URIRef("https://example.com/entities/article-only") graph.add((article, RDF.type, URIRef("http://schema.org/Article"))) graph.add((article, URIRef("http://schema.org/headline"), Literal("Title"))) - protocol.rml_service.apply_mapping = AsyncMock(return_value=graph) + protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph)) response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -554,8 +626,7 @@ async def test_profile_protocol_sets_source_by_dataset_id_depth() -> None: protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) + protocol._run_postprocessing_stage = _annotating_pp() protocol.patcher.patch_all = AsyncMock() graph = Graph() @@ -567,7 +638,7 @@ async def test_profile_protocol_sets_source_by_dataset_id_depth() -> None: graph.add((entity, RDF.type, URIRef("https://schema.org/Article"))) graph.add((entity, URIRef("https://schema.org/hasPart"), child)) graph.add((child, RDF.type, URIRef("https://schema.org/Question"))) - protocol.rml_service.apply_mapping = 
AsyncMock(return_value=graph) + protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph)) response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -598,8 +669,7 @@ async def test_profile_protocol_does_not_set_source_on_blank_nodes(): protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) + protocol._run_postprocessing_stage = _annotating_pp() protocol.patcher.patch_all = AsyncMock() graph = Graph() @@ -608,7 +678,7 @@ async def test_profile_protocol_does_not_set_source_on_blank_nodes(): graph.add((article, RDF.type, URIRef("http://schema.org/Article"))) graph.add((blank, RDF.type, URIRef("http://schema.org/Thing"))) graph.add((article, URIRef("http://schema.org/mentions"), blank)) - protocol.rml_service.apply_mapping = AsyncMock(return_value=graph) + protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(graph)) response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -640,12 +710,11 @@ def fake_loader(*, root_dir, profile_name, runtime=None): return [] monkeypatch.setattr(protocol_module, "load_postprocessors_for_profile", fake_loader) - protocol = ProfileImportProtocol( + ProfileImportProtocol( context=_make_context(), profile=_make_profile_with_settings({"POSTPROCESSOR_RUNTIME": "persistent"}), root_dir=Path.cwd(), ) - assert protocol._postprocessor_runtime == "persistent" assert captured["runtime"] == "persistent" @@ -675,7 +744,10 @@ def test_build_pp_context_exposes_resolved_profile_and_account_key() -> None: ) context = protocol._build_pp_context( - "https://example.com/page", response, existing_web_page_id=None + "https://example.com/page", + response, 
+ existing_web_page_id=None, + existing_import_hash=None, ) assert context.account_key == "profile-secret" @@ -699,44 +771,37 @@ def test_build_pp_context_preserves_custom_profile_settings() -> None: ) context = protocol._build_pp_context( - "https://example.com/page", response, existing_web_page_id=None + "https://example.com/page", + response, + existing_web_page_id=None, + existing_import_hash=None, ) assert context.profile["settings"]["disable_article_markup"] is True -def test_apply_postprocessors_fails_fast_when_account_key_missing() -> None: +def test_account_key_resolved_from_profile_api_key() -> None: + profile = ProfileDefinition( + **{ + **_make_profile().__dict__, + "api_key": "profile-secret", + } + ) protocol = ProfileImportProtocol( context=_make_context(), - profile=_make_profile(), + profile=profile, root_dir=Path.cwd(), ) + assert protocol._account_key == "profile-secret" - class _NeverRun: - name = "never-run" - called = False - - def run(self, graph, context): - self.called = True - return graph - - handler = _NeverRun() - protocol._postprocessors = [handler] # type: ignore[assignment] - response = WebPageScrapeResponse( - web_page=WebPage(url="https://example.com/page", html="") +def test_account_key_is_none_when_no_key_configured() -> None: + protocol = ProfileImportProtocol( + context=_make_context(), + profile=_make_profile(), + root_dir=Path.cwd(), ) - graph = _make_graph("https://example.com/mapped-web-page") - - with pytest.raises(RuntimeError, match="Postprocessor runtime requires an API key"): - protocol._apply_postprocessors( - graph, - "https://example.com/page", - response, - existing_web_page_id=None, - ) - - assert handler.called is False + assert protocol._account_key is None def test_protocol_helpers_runtime_and_path_part() -> None: @@ -792,7 +857,7 @@ async def test_callback_returns_early_when_mapping_has_no_triples() -> None: protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = 
MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol.rml_service.apply_mapping = AsyncMock(return_value=Graph()) + protocol._run_mapping_stage = AsyncMock(return_value=_make_mapping_result(Graph())) protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( @@ -803,21 +868,16 @@ async def test_callback_returns_early_when_mapping_has_no_triples() -> None: protocol.patcher.patch_all.assert_not_called() -def test_close_invokes_postprocessor_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: - called: dict[str, object] = {} - - def fake_close(postprocessors): - called["value"] = postprocessors - - monkeypatch.setattr(protocol_module, "close_loaded_postprocessors", fake_close) +def test_close_invokes_postprocessor_service_close() -> None: protocol = ProfileImportProtocol( context=_make_context(), profile=_make_profile(), root_dir=Path.cwd(), ) - protocol._postprocessors = ["x"] # type: ignore[assignment] - protocol.close() - assert called["value"] == ["x"] + mock_close = MagicMock() + protocol._postprocessor_service.close = mock_close + asyncio.run(protocol.close()) + mock_close.assert_called_once() def test_resolve_path_and_overlay_paths(tmp_path: Path) -> None: @@ -1018,88 +1078,69 @@ def test_get_mapping_content_uses_cache_and_requires_dataset() -> None: protocol2._get_mapping_content(path) -def test_apply_postprocessors_runs_all_processors() -> None: +def test_postprocessor_factory_builds_required_processors( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Verify the factory used by PostprocessorService includes the standard processors.""" + + def fake_loader(*, root_dir, profile_name, runtime=None): + return [] + + monkeypatch.setattr(protocol_module, "load_postprocessors_for_profile", fake_loader) protocol = ProfileImportProtocol( context=_make_context(), - profile=_make_profile_with_settings({"api_key": "x"}), + profile=_make_profile(), root_dir=Path.cwd(), ) - response = 
WebPageScrapeResponse( - web_page=WebPage(url="https://example.com/page", html="") - ) - graph = _make_graph("https://example.com/page") - - class _P1: - name = "p1" - - def run(self, g, _ctx): - g.add( - ( - URIRef("https://example.com/page"), - URIRef("https://schema.org/name"), - Literal("a"), - ) - ) - return g - - class _P2: - name = "p2" - - def run(self, g, _ctx): - return g - - protocol._postprocessors = [_P1(), _P2()] # type: ignore[assignment] - protocol._resolve_postprocessor_account_key = MagicMock(return_value="secret") - out = protocol._apply_postprocessors( - graph, "https://example.com/page", response, None - ) - assert len(out) >= len(graph) + # Get one slot from the pool to inspect the processors + processors = list(protocol._postprocessor_service._queue.get_nowait()) + names = [p.name for p in processors] + assert "root_id_reconciler" in names + assert "canonical_ids" in names + assert "import_annotation" in names -def test_resolve_postprocessor_account_key_priority( +def test_resolve_account_key_priority( monkeypatch: pytest.MonkeyPatch, ) -> None: - protocol = ProfileImportProtocol( - context=_make_context(), - profile=_make_profile(), - root_dir=Path.cwd(), + profile = _make_profile() + context = _make_context() + + profile_with_key = ProfileDefinition( + **{**profile.__dict__, "api_key": "profile-key"} ) - protocol.profile = ProfileDefinition( - **{**protocol.profile.__dict__, "api_key": "profile-key"} + assert ( + protocol_module._resolve_account_key(profile_with_key, context) == "profile-key" ) - assert protocol._resolve_postprocessor_account_key() == "profile-key" - protocol.profile = ProfileDefinition( - **{**protocol.profile.__dict__, "api_key": None} - ) - protocol.context.client_configuration.api_key = {"ApiKey": "runtime-key"} - assert protocol._resolve_postprocessor_account_key() == "runtime-key" + context.client_configuration.api_key = {"ApiKey": "runtime-key"} + assert protocol_module._resolve_account_key(profile, context) == 
"runtime-key" - protocol.context.client_configuration.api_key = {} - protocol.context.configuration_provider = SimpleNamespace( + context.client_configuration.api_key = {} + context.configuration_provider = SimpleNamespace( get_value=lambda name: "provider-key" if name == "WORDLIFT_KEY" else None ) - assert protocol._resolve_postprocessor_account_key() == "provider-key" + assert protocol_module._resolve_account_key(profile, context) == "provider-key" - protocol.context.configuration_provider = SimpleNamespace( + context.configuration_provider = SimpleNamespace( get_value=lambda _name: (_ for _ in ()).throw(RuntimeError("nope")) ) monkeypatch.setenv("WORDLIFT_API_KEY", "env-key") - assert protocol._resolve_postprocessor_account_key() == "env-key" + assert protocol_module._resolve_account_key(profile, context) == "env-key" monkeypatch.delenv("WORDLIFT_API_KEY", raising=False) def test_clean_key_write_debug_and_reconcile(tmp_path: Path) -> None: + assert protocol_module._clean_key(None) is None + assert protocol_module._clean_key(" ") is None + assert protocol_module._clean_key(" x ") == "x" + protocol = ProfileImportProtocol( context=_make_context(), profile=_make_profile(), root_dir=tmp_path, debug_dir=tmp_path / "debug", ) - assert protocol._clean_key(None) is None - assert protocol._clean_key(" ") is None - assert protocol._clean_key(" x ") == "x" - graph = _make_graph("https://example.com/old") protocol._write_debug_graph(graph, "https://example.com/page") protocol._write_debug_source_documents( @@ -1113,8 +1154,14 @@ def test_clean_key_write_debug_and_reconcile(tmp_path: Path) -> None: child = URIRef("https://example.com/child") https_graph.add((old, RDF.type, URIRef("https://schema.org/WebPage"))) https_graph.add((child, URIRef("https://schema.org/about"), old)) - assert protocol._find_web_page_iri(https_graph) == old - protocol._reconcile_root_id(https_graph, str(new)) + assert _find_web_page_iri_impl(https_graph) == old + ctx = SimpleNamespace( + 
existing_web_page_id=str(new), + account=SimpleNamespace(dataset_uri=""), + existing_import_hash=None, + import_hash_mode="on", + ) + RootIdReconcilerPostprocessor().process_graph(https_graph, ctx) assert (new, RDF.type, URIRef("https://schema.org/WebPage")) in https_graph assert (child, URIRef("https://schema.org/about"), new) in https_graph @@ -1132,17 +1179,15 @@ async def test_callback_writes_html_xhtml_and_ttl_debug_artifacts( protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - async def _apply_mapping(**kwargs): - debug_output = kwargs.get("debug_output") + async def _mapping_stage(response, url, ewi, debug_output): if isinstance(debug_output, dict): debug_output["xhtml"] = "Converted" - return _make_graph("https://example.com/mapped-web-page") + return _make_mapping_result(_make_graph("https://example.com/mapped-web-page")) - protocol.rml_service.apply_mapping = AsyncMock(side_effect=_apply_mapping) + protocol._run_mapping_stage = AsyncMock(side_effect=_mapping_stage) + protocol._run_postprocessing_stage = _passthrough_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="Raw") @@ -1190,14 +1235,10 @@ def test_protocol_setting_parsers_and_progress_error_logging( profile=profile, root_dir=Path.cwd(), ) - assert protocol._shacl_mode == "warn" - assert protocol._shacl_shape_specs == [ - "google-article.ttl", - "https://example.com/custom-shape.ttl", - ] + assert protocol._shacl_validator.mode.value == "warn" assert protocol._import_hash_mode == "write" - assert protocol._resolve_list_setting(["a", " ", "b"]) == ["a", "b"] - assert 
protocol._resolve_list_setting(123) == ["123"] + assert protocol_module._resolve_list_setting(["a", " ", "b"]) == ["a", "b"] + assert protocol_module._resolve_list_setting(123) == ["123"] protocol._on_progress = lambda _payload: (_ for _ in ()).throw(RuntimeError("boom")) with caplog.at_level("WARNING"): @@ -1265,8 +1306,8 @@ async def test_patch_static_templates_fail_validation_raises() -> None: protocol._template_graph = graph protocol._template_exports = {} protocol.patcher.patch_all = AsyncMock() - protocol._validate_graph = MagicMock( - return_value=_make_validation_result(conforms=False) + protocol._shacl_validator.validate = AsyncMock( + return_value=_make_validation_outcome(passed=False) ) with pytest.raises( @@ -1281,11 +1322,6 @@ async def test_patch_static_templates_fail_validation_raises() -> None: def test_find_web_page_iri_returns_none_when_missing() -> None: - protocol = ProfileImportProtocol( - context=_make_context(), - profile=_make_profile(), - root_dir=Path.cwd(), - ) graph = Graph() graph.add( ( @@ -1294,61 +1330,17 @@ def test_find_web_page_iri_returns_none_when_missing() -> None: URIRef("https://schema.org/Thing"), ) ) - assert protocol._find_web_page_iri(graph) is None + assert _find_web_page_iri_impl(graph) is None -def _make_validation_result( - *, - conforms: bool, - warning_shapes: list[URIRef] | None = None, - error_shapes: list[URIRef] | None = None, - shape_map: dict[URIRef, str] | None = None, -) -> ValidationResult: - warning_shapes = warning_shapes or [] - error_shapes = error_shapes or [] - shape_map = shape_map or {} - report = Graph() - sh_result_severity = URIRef("http://www.w3.org/ns/shacl#resultSeverity") - sh_warning = URIRef("http://www.w3.org/ns/shacl#Warning") - sh_violation = URIRef("http://www.w3.org/ns/shacl#Violation") - sh_source_shape = URIRef("http://www.w3.org/ns/shacl#sourceShape") - - for index, shape in enumerate(warning_shapes): - node = URIRef(f"https://example.com/report/w/{index}") - report.add((node, 
sh_result_severity, sh_warning)) - report.add((node, sh_source_shape, shape)) - for index, shape in enumerate(error_shapes): - node = URIRef(f"https://example.com/report/e/{index}") - report.add((node, sh_result_severity, sh_violation)) - report.add((node, sh_source_shape, shape)) - - return ValidationResult( - conforms=conforms, - report_text="report", - report_graph=report, - data_graph=Graph(), - shape_source_map=shape_map, - warning_count=len(warning_shapes), - ) - - -def test_summarize_validation_aggregates_sources() -> None: - protocol = ProfileImportProtocol( - context=_make_context(), - profile=_make_profile(), - root_dir=Path.cwd(), - ) - article_shape = URIRef("https://shape.example/article") - product_shape = URIRef("https://shape.example/product") - result = _make_validation_result( - conforms=False, - warning_shapes=[article_shape], - error_shapes=[article_shape, product_shape], - shape_map={article_shape: "google-article", product_shape: "google-product"}, +def test_validation_outcome_to_dict_aggregates_sources() -> None: + outcome = _make_validation_outcome( + passed=False, + warning_sources={"google-article": 1}, + error_sources={"google-article": 1, "google-product": 1}, ) - summary = protocol._summarize_validation(result) + summary = outcome.to_dict() assert summary == { - "total": 1, "pass": False, "fail": True, "warnings": {"count": 1, "sources": {"google-article": 1}}, @@ -1376,21 +1368,16 @@ async def test_profile_protocol_emits_progress_and_validation_in_warn_mode() -> protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_dataset_scoped_graph()) + ) + 
protocol._run_postprocessing_stage = _passthrough_pp() protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_dataset_scoped_graph() - ) - protocol._validate_graph = MagicMock( - return_value=_make_validation_result( - conforms=False, - warning_shapes=[URIRef("https://shape.example/w")], - error_shapes=[URIRef("https://shape.example/e")], - shape_map={ - URIRef("https://shape.example/w"): "google-article", - URIRef("https://shape.example/e"): "google-product", - }, + protocol._shacl_validator.validate = AsyncMock( + return_value=_make_validation_outcome( + passed=False, + warning_sources={"google-article": 1}, + error_sources={"google-product": 1}, ) ) @@ -1404,7 +1391,6 @@ async def test_profile_protocol_emits_progress_and_validation_in_warn_mode() -> assert payload["kind"] == "graph" assert payload["url"] == "https://example.com/page" assert payload["validation"] == { - "total": 1, "pass": False, "fail": True, "warnings": {"count": 1, "sources": {"google-article": 1}}, @@ -1440,14 +1426,13 @@ async def test_profile_protocol_validation_fail_mode_raises() -> None: protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_dataset_scoped_graph() + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_dataset_scoped_graph()) ) - protocol._validate_graph = MagicMock( - return_value=_make_validation_result(conforms=False) + protocol._run_postprocessing_stage = _passthrough_pp() + protocol.patcher.patch_all = AsyncMock() + protocol._shacl_validator.validate = AsyncMock( + 
return_value=_make_validation_outcome(passed=False) ) response = WebPageScrapeResponse( @@ -1475,12 +1460,11 @@ async def test_profile_protocol_emits_null_validation_when_disabled() -> None: protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_dataset_scoped_graph() + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_dataset_scoped_graph()) ) + protocol._run_postprocessing_stage = _passthrough_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -1504,12 +1488,11 @@ async def test_profile_protocol_passes_import_hash_mode_to_patcher() -> None: protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_dataset_scoped_graph() + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_dataset_scoped_graph()) ) + protocol._run_postprocessing_stage = _passthrough_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -1535,12 +1518,11 @@ async def test_profile_protocol_emits_graph_and_static_template_events() -> None 
protocol._template_exports = {} protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_dataset_scoped_graph() + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_dataset_scoped_graph()) ) + protocol._run_postprocessing_stage = _passthrough_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -1560,12 +1542,11 @@ async def test_profile_protocol_collects_run_level_kpis() -> None: protocol._patch_static_templates_once = AsyncMock() protocol._resolve_mapping_path = MagicMock(return_value=Path("mapping.yarrrml")) protocol._get_mapping_content = MagicMock(return_value="mapping") - protocol._core_ids.process_graph = MagicMock(side_effect=lambda g, _: g) - protocol._apply_postprocessors = MagicMock(side_effect=lambda g, *_: g) - protocol.patcher.patch_all = AsyncMock() - protocol.rml_service.apply_mapping = AsyncMock( - return_value=_make_dataset_scoped_graph() + protocol._run_mapping_stage = AsyncMock( + return_value=_make_mapping_result(_make_dataset_scoped_graph()) ) + protocol._run_postprocessing_stage = _annotating_pp() + protocol.patcher.patch_all = AsyncMock() response = WebPageScrapeResponse( web_page=WebPage(url="https://example.com/page", html="") @@ -1604,7 +1585,7 @@ def test_protocol_validation_mode_normalization_and_deprecation( ), root_dir=Path.cwd(), ) - assert strict_protocol._shacl_mode == "fail" + assert strict_protocol._shacl_validator.mode.value == "fail" assert "Deprecated SHACL validation mode 'strict' detected" in caplog.text with caplog.at_level("WARNING"): @@ -1615,7 +1596,7 @@ 
def test_protocol_validation_mode_normalization_and_deprecation( ), root_dir=Path.cwd(), ) - assert unknown_protocol._shacl_mode == "warn" + assert unknown_protocol._shacl_validator.mode.value == "warn" assert "Unsupported SHACL validation mode" in caplog.text with caplog.at_level("WARNING"): diff --git a/tests/kg_build/test_rml_mapping.py b/tests/kg_build/test_rml_mapping.py index 7d4d72e..810e31c 100644 --- a/tests/kg_build/test_rml_mapping.py +++ b/tests/kg_build/test_rml_mapping.py @@ -7,7 +7,6 @@ import pytest from rdflib import Graph -import wordlift_sdk.kg_build.rml_mapping as rml_module from wordlift_sdk.kg_build.rml_mapping import RmlMappingService @@ -34,50 +33,46 @@ def _context(dataset_uri: str | None): @pytest.mark.asyncio -async def test_apply_mapping_from_content_success( - monkeypatch: pytest.MonkeyPatch, -) -> None: - service = RmlMappingService(_context("https://data.example.com")) +async def test_apply_mapping_from_content_success() -> None: + service = RmlMappingService( + _context("https://data.example.com"), pipeline=_Pipeline() + ) service._html_converter.convert = MagicMock(return_value="") - monkeypatch.setattr(rml_module, "MaterializationPipeline", _Pipeline) debug_output: dict[str, str] = {} - graph = await service.apply_mapping( + result = await service.apply_mapping( html="", url="https://example.com/page", mapping_file_path="demo.yarrrml", mapping_content="m: 1", debug_output=debug_output, ) - assert isinstance(graph, Graph) - assert len(graph) > 0 + assert isinstance(result.graph, Graph) + assert len(result.graph) > 0 assert debug_output["xhtml"] == "" @pytest.mark.asyncio async def test_apply_mapping_file_not_found_returns_none() -> None: service = RmlMappingService(_context("https://data.example.com")) - out = await service.apply_mapping( + result = await service.apply_mapping( html="", url="https://example.com", mapping_file_path=Path("/no/such/file.yarrrml"), ) - assert out is None + assert result.graph is None 
@pytest.mark.asyncio -async def test_apply_mapping_missing_dataset_uri_returns_none( - monkeypatch: pytest.MonkeyPatch, -) -> None: - service = RmlMappingService(_context(None)) - monkeypatch.setattr(rml_module, "MaterializationPipeline", _Pipeline) - out = await service.apply_mapping( +async def test_apply_mapping_missing_dataset_uri_returns_none() -> None: + service = RmlMappingService(_context(None), pipeline=_Pipeline()) + result = await service.apply_mapping( html="", url="https://example.com", mapping_file_path="x", mapping_content="m: 1", ) - assert out is None + assert result.graph is None def test_normalize_schema_uris() -> None: diff --git a/wordlift_sdk/kg_build/postprocessors/subprocess.py b/wordlift_sdk/kg_build/postprocessors/subprocess.py index 52b1dfe..e5954af 100644 --- a/wordlift_sdk/kg_build/postprocessors/subprocess.py +++ b/wordlift_sdk/kg_build/postprocessors/subprocess.py @@ -16,9 +16,7 @@ from rdflib import Graph from .types import ( - Closeable, GraphPostprocessor, - LoadedPostprocessor, PostprocessorContext, PostprocessorRuntime, PostprocessorSpec, @@ -275,6 +273,7 @@ def process_graph( self, graph: Graph, context: PostprocessorContext ) -> Graph | None: from .graph_io import _build_runner_payload + return _run_subprocess( self.spec, self.root_dir, graph, _build_runner_payload(context), self._run ) @@ -315,6 +314,9 @@ def _run( f"(exit={completed.returncode})" + (f"\n{stderr}" if stderr else "") ) + def close(self) -> None: + pass # oneshot processors have no persistent resources to release + @dataclass class PersistentSubprocessPostprocessor: @@ -335,6 +337,7 @@ def process_graph( self, graph: Graph, context: PostprocessorContext ) -> Graph | None: from .graph_io import _build_runner_payload + return _run_subprocess( self.spec, self.root_dir, graph, _build_runner_payload(context), self._run ) From e193c5a1f7402d3f8d8d6f9bdf8536dd69b8a0d3 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Fri, 20 Mar 2026 09:31:37 +0100 Subject: [PATCH 
54/63] feat(tests): reorganise kg_build tests to mirror source package structure --- tests/kg_build/postprocessors/__init__.py | 0 tests/kg_build/postprocessors/processors/__init__.py | 0 .../kg_build/{ => postprocessors/processors}/test_id_allocator.py | 0 .../processors/test_id_generator.py} | 0 .../{ => postprocessors/processors}/test_id_postprocessor.py | 0 .../test_oneshot_helpers.py} | 0 .../test_oneshot_main.py} | 0 .../test_persistent.py} | 0 tests/kg_build/{ => postprocessors}/test_postprocessors.py | 0 .../test_service.py} | 0 tests/{kg_build => workflow}/test_ingestion_bridge_url_handler.py | 0 tests/{kg_build => workflow}/test_web_page_scrape_url_handler.py | 0 12 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/kg_build/postprocessors/__init__.py create mode 100644 tests/kg_build/postprocessors/processors/__init__.py rename tests/kg_build/{ => postprocessors/processors}/test_id_allocator.py (100%) rename tests/kg_build/{test_kg_build_id_generator.py => postprocessors/processors/test_id_generator.py} (100%) rename tests/kg_build/{ => postprocessors/processors}/test_id_postprocessor.py (100%) rename tests/kg_build/{test_postprocessor_runner_helpers.py => postprocessors/test_oneshot_helpers.py} (100%) rename tests/kg_build/{test_postprocessor_runner_main.py => postprocessors/test_oneshot_main.py} (100%) rename tests/kg_build/{test_postprocessor_worker.py => postprocessors/test_persistent.py} (100%) rename tests/kg_build/{ => postprocessors}/test_postprocessors.py (100%) rename tests/kg_build/{test_postprocessor_service.py => postprocessors/test_service.py} (100%) rename tests/{kg_build => workflow}/test_ingestion_bridge_url_handler.py (100%) rename tests/{kg_build => workflow}/test_web_page_scrape_url_handler.py (100%) diff --git a/tests/kg_build/postprocessors/__init__.py b/tests/kg_build/postprocessors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/kg_build/postprocessors/processors/__init__.py 
b/tests/kg_build/postprocessors/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/kg_build/test_id_allocator.py b/tests/kg_build/postprocessors/processors/test_id_allocator.py similarity index 100% rename from tests/kg_build/test_id_allocator.py rename to tests/kg_build/postprocessors/processors/test_id_allocator.py diff --git a/tests/kg_build/test_kg_build_id_generator.py b/tests/kg_build/postprocessors/processors/test_id_generator.py similarity index 100% rename from tests/kg_build/test_kg_build_id_generator.py rename to tests/kg_build/postprocessors/processors/test_id_generator.py diff --git a/tests/kg_build/test_id_postprocessor.py b/tests/kg_build/postprocessors/processors/test_id_postprocessor.py similarity index 100% rename from tests/kg_build/test_id_postprocessor.py rename to tests/kg_build/postprocessors/processors/test_id_postprocessor.py diff --git a/tests/kg_build/test_postprocessor_runner_helpers.py b/tests/kg_build/postprocessors/test_oneshot_helpers.py similarity index 100% rename from tests/kg_build/test_postprocessor_runner_helpers.py rename to tests/kg_build/postprocessors/test_oneshot_helpers.py diff --git a/tests/kg_build/test_postprocessor_runner_main.py b/tests/kg_build/postprocessors/test_oneshot_main.py similarity index 100% rename from tests/kg_build/test_postprocessor_runner_main.py rename to tests/kg_build/postprocessors/test_oneshot_main.py diff --git a/tests/kg_build/test_postprocessor_worker.py b/tests/kg_build/postprocessors/test_persistent.py similarity index 100% rename from tests/kg_build/test_postprocessor_worker.py rename to tests/kg_build/postprocessors/test_persistent.py diff --git a/tests/kg_build/test_postprocessors.py b/tests/kg_build/postprocessors/test_postprocessors.py similarity index 100% rename from tests/kg_build/test_postprocessors.py rename to tests/kg_build/postprocessors/test_postprocessors.py diff --git a/tests/kg_build/test_postprocessor_service.py 
b/tests/kg_build/postprocessors/test_service.py similarity index 100% rename from tests/kg_build/test_postprocessor_service.py rename to tests/kg_build/postprocessors/test_service.py diff --git a/tests/kg_build/test_ingestion_bridge_url_handler.py b/tests/workflow/test_ingestion_bridge_url_handler.py similarity index 100% rename from tests/kg_build/test_ingestion_bridge_url_handler.py rename to tests/workflow/test_ingestion_bridge_url_handler.py diff --git a/tests/kg_build/test_web_page_scrape_url_handler.py b/tests/workflow/test_web_page_scrape_url_handler.py similarity index 100% rename from tests/kg_build/test_web_page_scrape_url_handler.py rename to tests/workflow/test_web_page_scrape_url_handler.py From db3aaf625cf9a1908170c67e97c63b4c599ecfb2 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Fri, 20 Mar 2026 09:36:51 +0100 Subject: [PATCH 55/63] fix: update kg_build __init__ export paths after postprocessors reorganisation --- wordlift_sdk/kg_build/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/wordlift_sdk/kg_build/__init__.py b/wordlift_sdk/kg_build/__init__.py index 1692a5e..3539b6f 100644 --- a/wordlift_sdk/kg_build/__init__.py +++ b/wordlift_sdk/kg_build/__init__.py @@ -64,13 +64,16 @@ "wordlift_sdk.kg_build.container", "KgBuildApplicationContainer", ), - "IdAllocator": ("wordlift_sdk.kg_build.id_allocator", "IdAllocator"), + "IdAllocator": ( + "wordlift_sdk.kg_build.postprocessors.processors.id_allocator", + "IdAllocator", + ), "CanonicalIdGenerator": ( - "wordlift_sdk.kg_build.id_generator", + "wordlift_sdk.kg_build.postprocessors.processors.id_generator", "CanonicalIdGenerator", ), "CanonicalIdsPostprocessor": ( - "wordlift_sdk.kg_build.id_postprocessor", + "wordlift_sdk.kg_build.postprocessors.processors.id_postprocessor", "CanonicalIdsPostprocessor", ), "IriLookup": ("wordlift_sdk.kg_build.iri_lookup", "IriLookup"), From a546ea02250a15c7210ced214823da20fe6e3df8 Mon Sep 17 00:00:00 2001 From: Rubens Panfili 
Date: Fri, 20 Mar 2026 09:57:15 +0100 Subject: [PATCH 56/63] fix: update engine tests to mock at pool level after morph_kgc moved to subprocess --- ...tructured_data_engine_validation_helpers.py | 18 +----------------- ..._structured_data_materialization_generic.py | 17 +++++++++++------ 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/tests/test_structured_data_engine_validation_helpers.py b/tests/test_structured_data_engine_validation_helpers.py index 884b483..f3c7700 100644 --- a/tests/test_structured_data_engine_validation_helpers.py +++ b/tests/test_structured_data_engine_validation_helpers.py @@ -977,23 +977,7 @@ def test_normalize_agent_yarrrml_additional_parser_branches(monkeypatch): assert any(m["name"] == "main" for m in mappings) -def test_materialize_graph_and_xpath_first_text_branches(monkeypatch): - real_import = builtins.__import__ - - def _missing_morph(name, *args, **kwargs): - if name == "morph_kgc": - raise ImportError("missing") - return real_import(name, *args, **kwargs) - - monkeypatch.setattr(builtins, "__import__", _missing_morph) - try: - engine._materialize_graph(Path("mapping.yarrrml")) - assert False, "expected RuntimeError" - except RuntimeError as exc: - assert "morph-kgc is required" in str(exc) - finally: - monkeypatch.setattr(builtins, "__import__", real_import) - +def test_materialize_graph_and_xpath_first_text_branches(): class _Doc: def __init__(self): self.calls = 0 diff --git a/tests/test_structured_data_materialization_generic.py b/tests/test_structured_data_materialization_generic.py index 8d387f8..425f228 100644 --- a/tests/test_structured_data_materialization_generic.py +++ b/tests/test_structured_data_materialization_generic.py @@ -577,12 +577,17 @@ def test_unsupported_xpath_or_function_raises_actionable_error( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: - fake_morph = types.SimpleNamespace( - materialize=lambda _cfg: (_ for _ in ()).throw( - ValueError("XPathEvalError: Unsupported function 
local-namez()") - ) - ) - monkeypatch.setitem(sys.modules, "morph_kgc", fake_morph) + import wordlift_sdk.structured_data.engine as _engine + + class _FakeFuture: + def result(self): + raise ValueError("XPathEvalError: Unsupported function local-namez()") + + class _FakePool: + def submit(self, fn, *args, **kwargs): + return _FakeFuture() + + monkeypatch.setattr(_engine, "_get_morph_kgc_pool", lambda: _FakePool()) mapping = """ prefixes: From 6a32384c088bd8abe8d073eb7d41cff58d504dc0 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Fri, 20 Mar 2026 12:54:58 +0100 Subject: [PATCH 57/63] fix(slicing): remap stdlib-only lazy exports to modules with real deps so the guard fires correctly --- wordlift_sdk/kg_build/__init__.py | 2 +- wordlift_sdk/kg_build/protocol.py | 1 + wordlift_sdk/structured_data/__init__.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/wordlift_sdk/kg_build/__init__.py b/wordlift_sdk/kg_build/__init__.py index 3539b6f..49c260d 100644 --- a/wordlift_sdk/kg_build/__init__.py +++ b/wordlift_sdk/kg_build/__init__.py @@ -59,7 +59,7 @@ "wordlift_sdk.kg_build.cloud_flow", "get_debug_output_dir", ), - "run_cloud_workflow": ("wordlift_sdk.kg_build.cloud_flow", "run_cloud_workflow"), + "run_cloud_workflow": ("wordlift_sdk.kg_build.protocol", "run_cloud_workflow"), "KgBuildApplicationContainer": ( "wordlift_sdk.kg_build.container", "KgBuildApplicationContainer", diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 1295f64..d05b3e2 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -24,6 +24,7 @@ ValidationOutcome, ) +from .cloud_flow import run_cloud_workflow as run_cloud_workflow # noqa: F401 from .config import ProfileDefinition from .entity_patcher import EntityPatcher from .kpi import KgBuildKpiCollector diff --git a/wordlift_sdk/structured_data/__init__.py b/wordlift_sdk/structured_data/__init__.py index 1e4d977..c5181c9 100644 --- 
a/wordlift_sdk/structured_data/__init__.py +++ b/wordlift_sdk/structured_data/__init__.py @@ -21,12 +21,12 @@ _EXPORTS = { - "CreateRequest": ("wordlift_sdk.structured_data.models", "CreateRequest"), + "CreateRequest": ("wordlift_sdk.structured_data.orchestrator", "CreateRequest"), "CreateWorkflow": ( "wordlift_sdk.structured_data.orchestrator", "CreateWorkflow", ), - "GenerateRequest": ("wordlift_sdk.structured_data.models", "GenerateRequest"), + "GenerateRequest": ("wordlift_sdk.structured_data.orchestrator", "GenerateRequest"), "GenerateWorkflow": ( "wordlift_sdk.structured_data.orchestrator", "GenerateWorkflow", From ae1e7e694df82e7d2bac886ce695e202d492861c Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Fri, 20 Mar 2026 12:59:20 +0100 Subject: [PATCH 58/63] fix(tests): make lazy export tests slice-independent by stubbing heavy-dep modules --- tests/test_lazy_exports.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/test_lazy_exports.py b/tests/test_lazy_exports.py index 093c5c9..d7343ed 100644 --- a/tests/test_lazy_exports.py +++ b/tests/test_lazy_exports.py @@ -12,25 +12,51 @@ def _drop_modules(prefix: str) -> None: sys.modules.pop(name, None) -def test_root_package_import_is_lazy(): +def test_root_package_import_is_lazy(monkeypatch: pytest.MonkeyPatch): _drop_modules("wordlift_sdk") package = importlib.import_module("wordlift_sdk") assert "wordlift_sdk.main" not in sys.modules + import types + + stub_main = types.ModuleType("wordlift_sdk.main") + stub_main.run_kg_import_workflow = object() # type: ignore[attr-defined] + + def fake_import_module(name: str): + if name == "wordlift_sdk.main": + sys.modules["wordlift_sdk.main"] = stub_main + return stub_main + return importlib.import_module(name) + + monkeypatch.setattr("wordlift_sdk._lazy_exports.import_module", fake_import_module) + package.run_kg_import_workflow assert "wordlift_sdk.main" in sys.modules -def test_feature_package_import_is_lazy(): 
+def test_feature_package_import_is_lazy(monkeypatch: pytest.MonkeyPatch): _drop_modules("wordlift_sdk.render") package = importlib.import_module("wordlift_sdk.render") assert "wordlift_sdk.render.html_renderer" not in sys.modules + import types + + stub_renderer = types.ModuleType("wordlift_sdk.render.html_renderer") + stub_renderer.HtmlRenderer = object() # type: ignore[attr-defined] + + def fake_import_module(name: str): + if name == "wordlift_sdk.render.html_renderer": + sys.modules["wordlift_sdk.render.html_renderer"] = stub_renderer + return stub_renderer + return importlib.import_module(name) + + monkeypatch.setattr("wordlift_sdk._lazy_exports.import_module", fake_import_module) + package.HtmlRenderer assert "wordlift_sdk.render.html_renderer" in sys.modules From c95d391a152acdbe4ee7cd198c0997468b78f928 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Fri, 20 Mar 2026 13:05:59 +0100 Subject: [PATCH 59/63] fix(slicing): defer legacy import in create_google_search_console_data_import to avoid gql at collection time --- ..._google_search_console_data_import_helpers.py | 16 ++++++++++------ .../create_google_search_console_data_import.py | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/test_google_search_console_data_import_helpers.py b/tests/test_google_search_console_data_import_helpers.py index 025e148..e6601a1 100644 --- a/tests/test_google_search_console_data_import_helpers.py +++ b/tests/test_google_search_console_data_import_helpers.py @@ -2,6 +2,8 @@ import asyncio import importlib +import sys +import types from datetime import datetime, timedelta from types import SimpleNamespace @@ -16,6 +18,8 @@ raise_error_if_account_analytics_not_configured, ) +_ENTITIES_MOD = "wordlift_sdk.deprecated.create_entities_with_top_query_dataframe" + gsc_import_mod = importlib.import_module( "wordlift_sdk.google_search_console.create_google_search_console_data_import" ) @@ -43,9 +47,9 @@ async def 
test_create_google_search_console_data_import_only_imports_stale_rows( async def _fake_entities_df(key, url_list): return source_df - monkeypatch.setattr( - gsc_import_mod, "create_entities_with_top_query_dataframe", _fake_entities_df - ) + stub = types.ModuleType(_ENTITIES_MOD) + stub.create_entities_with_top_query_dataframe = _fake_entities_df # type: ignore[attr-defined] + monkeypatch.setitem(sys.modules, _ENTITIES_MOD, stub) called_urls: list[str] = [] @@ -100,9 +104,9 @@ async def test_create_google_search_console_data_import_skips_when_no_stale( async def _fake_entities_df(key, url_list): return source_df - monkeypatch.setattr( - gsc_import_mod, "create_entities_with_top_query_dataframe", _fake_entities_df - ) + stub = types.ModuleType(_ENTITIES_MOD) + stub.create_entities_with_top_query_dataframe = _fake_entities_df # type: ignore[attr-defined] + monkeypatch.setitem(sys.modules, _ENTITIES_MOD, stub) calls: dict[str, int] = {"gather": 0} diff --git a/wordlift_sdk/google_search_console/create_google_search_console_data_import.py b/wordlift_sdk/google_search_console/create_google_search_console_data_import.py index 00bd0fc..41f566d 100644 --- a/wordlift_sdk/google_search_console/create_google_search_console_data_import.py +++ b/wordlift_sdk/google_search_console/create_google_search_console_data_import.py @@ -9,7 +9,6 @@ from twisted.mail.scripts.mailmail import Configuration from wordlift_client import AnalyticsImportRequest -from ..deprecated import create_entities_with_top_query_dataframe from ..utils import create_delayed logger = logging.getLogger(__name__) @@ -19,6 +18,8 @@ async def create_google_search_console_data_import( configuration: Configuration, key: str, url_list: list[str] ) -> None: # Get the entities data with the top query. 
+ from ..deprecated import create_entities_with_top_query_dataframe + entities_with_top_query_df = await create_entities_with_top_query_dataframe( key=key, url_list=url_list ) From 21d2408d0b1d7b2611010fad567941a9cdb2f137 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Fri, 20 Mar 2026 13:08:50 +0100 Subject: [PATCH 60/63] =?UTF-8?q?fix(slicing):=20remove=20test=5Fingestion?= =?UTF-8?q?=5Fsource=5Fbridge=20from=20ingestion=20slice=20=E2=80=94=20nee?= =?UTF-8?q?ds=20legacy=20(gql)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/tools/run_slice_tests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tools/run_slice_tests.py b/tests/tools/run_slice_tests.py index 72e3111..d2ef093 100644 --- a/tests/tools/run_slice_tests.py +++ b/tests/tools/run_slice_tests.py @@ -41,7 +41,6 @@ "tests/ingestion", "tests/test_google_sheets_url_provider.py", "tests/test_list_url_provider.py", - "tests/test_ingestion_source_bridge.py", "tests/url_provider/test_sitemap_url_provider.py", ], "structured-data": [ From 5094647307d6a7e827bf47b8288cbea2b9fa6e6b Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Fri, 20 Mar 2026 14:00:07 +0100 Subject: [PATCH 61/63] fix(slicing): add python-liquid to workflow extra (needed by graph.ttl_liquid) --- poetry.lock | 22 ++++++++++++++++------ pyproject.toml | 1 + 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index f0bc229..b3ac36a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.1 and should not be changed by hand. 
[[package]] name = "advertools" @@ -288,7 +288,7 @@ description = "Internationalization utilities" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\" or extra == \"ingestion\"" +markers = "extra == \"workflow\" or extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\" or extra == \"ingestion\"" files = [ {file = "babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35"}, {file = "babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d"}, @@ -1702,8 +1702,11 @@ files = [ {file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"}, {file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"}, + {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"}, + {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"}, + {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188"}, {file = 
"lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"}, {file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"}, {file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"}, @@ -1788,7 +1791,7 @@ description = "Safely add untrusted strings to HTML/XML markup." optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\"" +markers = "extra == \"workflow\" or extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\"" files = [ {file = "markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559"}, {file = "markupsafe-3.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c1493fb6e50ab01d20a22826e57520f1284df32f2d8601fdd90b6304601419"}, @@ -3185,7 +3188,7 @@ description = "A Python engine for the Liquid template language." 
optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\"" +markers = "extra == \"workflow\" or extra == \"graph\" or extra == \"kg-build\" or extra == \"legacy\" or extra == \"all\"" files = [ {file = "python_liquid-2.1.0-py3-none-any.whl", hash = "sha256:d3bbcddff4e1a73287b59218df3471613598271e69ac3d17d97e000f4b984e3e"}, {file = "python_liquid-2.1.0.tar.gz", hash = "sha256:a4c2abb24ac40ded8c9ba844ebbfbe78a3e41c6fe10a7bbe94144582569b73d0"}, @@ -3249,6 +3252,13 @@ optional = false python-versions = ">=3.8" groups = ["dev"] files = [ + {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"}, + {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"}, + {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"}, + {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"}, {file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"}, {file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"}, {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"}, @@ -4345,9 +4355,9 @@ legacy = ["google-auth", "gql", "gspread", "lxml", "pandas", "playwright", "pyco render = ["lxml", "playwright"] structured-data = ["advertools", "lxml", "morph-kgc", "playwright", "pyshacl", "rdflib", "requests", "tqdm"] validation = ["pyshacl", "rdflib", "requests", "tqdm"] -workflow = ["advertools", "google-auth", "gql", "gspread", "lxml", "pandas", "playwright", "pydantic-core", "rdflib", "tqdm"] +workflow = ["advertools", "google-auth", "gql", "gspread", "lxml", "pandas", "playwright", "pydantic-core", "python-liquid", "rdflib", "tqdm"] [metadata] lock-version = "2.1" python-versions = ">=3.10, <3.15" -content-hash = "0810a8470047131214fc3655380b14044bb11660895b114d5f61fc0e0263d1bc" +content-hash = "a119ca316866d292b70b03bb5e509c4eded8fb9d581ed3c5e541961e6aee98a8" diff --git a/pyproject.toml b/pyproject.toml index 96799a1..548349c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ workflow = [ "pandas", "playwright", "pydantic-core", + "python-liquid", "rdflib", "tqdm", ] From 2c14ab6fcbadafad62f76610a373f25781679912 Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Fri, 20 Mar 2026 14:47:10 +0100 Subject: [PATCH 62/63] fix: v7 integration fixes --- tests/test_dataset_resolver.py | 15 ++++---- tests/test_lazy_exports.py | 14 ++++++++ ...chant_listing_defined_region_validation.py | 23 ++---------- tests/test_product_snippet_validation.py | 23 ++---------- tests/test_recommended_one_of_validation.py | 27 ++------------ tests/test_structured_data_engine_class.py | 15 ++++---- ...structured_data_materialization_generic.py | 15 ++++---- tests/test_structured_data_workflows.py | 15 ++++---- .../test_ingestion_bridge_url_handler.py | 22 ++++++++---- 
wordlift_sdk/graph/audit/_entity_matrix.py | 2 +- wordlift_sdk/kg_build/__init__.py | 20 +++++------ wordlift_sdk/utils/__init__.py | 4 +-- wordlift_sdk/utils/{get_me.py => _get_me.py} | 0 .../utils/{reset_me.py => _reset_me.py} | 0 wordlift_sdk/validation/__init__.py | 15 ++++++++ .../validation/shacl_validation_service.py | 35 +++++++------------ 16 files changed, 107 insertions(+), 138 deletions(-) rename wordlift_sdk/utils/{get_me.py => _get_me.py} (100%) rename wordlift_sdk/utils/{reset_me.py => _reset_me.py} (100%) diff --git a/tests/test_dataset_resolver.py b/tests/test_dataset_resolver.py index e42f838..03c945b 100644 --- a/tests/test_dataset_resolver.py +++ b/tests/test_dataset_resolver.py @@ -46,15 +46,16 @@ def __init__(self, *args, **kwargs) -> None: sys.modules.setdefault("wordlift_client.models", _models_module) sys.modules.setdefault("wordlift_client.models.ask_request", _ask_module) -_pyshacl = types.ModuleType("pyshacl") +try: + import pyshacl as _pyshacl_real # noqa: F401 +except ImportError: + _pyshacl = types.ModuleType("pyshacl") + def _stub_validate(*_args, **_kwargs): + return None, None, None -def _stub_validate(*_args, **_kwargs): - return None, None, None - - -_pyshacl.validate = _stub_validate -sys.modules.setdefault("pyshacl", _pyshacl) + _pyshacl.validate = _stub_validate + sys.modules["pyshacl"] = _pyshacl from wordlift_sdk.structured_data.dataset_resolver import DatasetResolver # noqa: E402 diff --git a/tests/test_lazy_exports.py b/tests/test_lazy_exports.py index d7343ed..e7f020f 100644 --- a/tests/test_lazy_exports.py +++ b/tests/test_lazy_exports.py @@ -6,8 +6,22 @@ import pytest +# Modules that own ProcessPoolExecutors must not be evicted — dropping them +# causes function-identity mismatches when the pool tries to pickle workers. 
+_PRESERVE_MODULES = frozenset( + [ + "wordlift_sdk.structured_data.engine", + "wordlift_sdk.validation.shacl_validation_service", + "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler", + "wordlift_sdk.workflow.url_handler.web_page_scrape_url_handler", + ] +) + + def _drop_modules(prefix: str) -> None: for name in list(sys.modules): + if name in _PRESERVE_MODULES: + continue if name == prefix or name.startswith(f"{prefix}."): sys.modules.pop(name, None) diff --git a/tests/test_merchant_listing_defined_region_validation.py b/tests/test_merchant_listing_defined_region_validation.py index d17c799..43d9474 100644 --- a/tests/test_merchant_listing_defined_region_validation.py +++ b/tests/test_merchant_listing_defined_region_validation.py @@ -1,30 +1,13 @@ -import importlib.util import json -import sys from pathlib import Path -import pytest - from wordlift_sdk.validation import shacl from wordlift_sdk.validation.shacl import extract_validation_issues -def _load_real_validate(monkeypatch: pytest.MonkeyPatch): - if "pyshacl" in sys.modules: - monkeypatch.delitem(sys.modules, "pyshacl", raising=False) - spec = importlib.util.find_spec("pyshacl") - if spec is None or spec.loader is None: - raise RuntimeError("pyshacl is required for this test.") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module.validate - - def test_merchant_listing_defined_region_address_country_only_is_warning_only( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch + tmp_path: Path, ) -> None: - monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch)) - payload = { "@context": {"@vocab": "http://schema.org/"}, "@id": "https://data.wordlift.io/wl1506344/merchant-return-policys/shipping-policy/offer-shipping-details/offer-shipping-details-1/defined-regions/defined-region", @@ -51,10 +34,8 @@ def test_merchant_listing_defined_region_address_country_only_is_warning_only( def 
test_defined_region_address_country_only_conforms_with_default_shapes( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch + tmp_path: Path, ) -> None: - monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch)) - payload = { "@context": {"@vocab": "http://schema.org/"}, "@id": "https://data.wordlift.io/wl1506344/merchant-return-policys/shipping-policy/offer-shipping-details/offer-shipping-details-1/defined-regions/defined-region", diff --git a/tests/test_product_snippet_validation.py b/tests/test_product_snippet_validation.py index 9054125..b202ba1 100644 --- a/tests/test_product_snippet_validation.py +++ b/tests/test_product_snippet_validation.py @@ -1,29 +1,12 @@ from pathlib import Path -import importlib.util import json -import sys - -import pytest from wordlift_sdk.validation import shacl -def _load_real_validate(monkeypatch: pytest.MonkeyPatch): - if "pyshacl" in sys.modules: - monkeypatch.delitem(sys.modules, "pyshacl", raising=False) - spec = importlib.util.find_spec("pyshacl") - if spec is None or spec.loader is None: - raise RuntimeError("pyshacl is required for this test.") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module.validate - - def test_product_snippet_offers_satisfies_one_of( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch + tmp_path: Path, ) -> None: - monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch)) - fixture = Path("tests/fixtures/product_snippet_offers.jsonld") data = json.loads(fixture.read_text(encoding="utf-8")) @@ -43,10 +26,8 @@ def test_product_snippet_offers_satisfies_one_of( def test_product_snippet_aggregate_offer_satisfies_one_of( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch + tmp_path: Path, ) -> None: - monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch)) - fixture = Path("tests/fixtures/product_snippet_aggregate_offer.jsonld") data = json.loads(fixture.read_text(encoding="utf-8")) diff --git 
a/tests/test_recommended_one_of_validation.py b/tests/test_recommended_one_of_validation.py index f4c89c2..29ce149 100644 --- a/tests/test_recommended_one_of_validation.py +++ b/tests/test_recommended_one_of_validation.py @@ -1,25 +1,10 @@ -import importlib.util import json -import sys from pathlib import Path -import pytest - from wordlift_sdk.validation import shacl from wordlift_sdk.validation.shacl import extract_validation_issues -def _load_real_validate(monkeypatch: pytest.MonkeyPatch): - if "pyshacl" in sys.modules: - monkeypatch.delitem(sys.modules, "pyshacl", raising=False) - spec = importlib.util.find_spec("pyshacl") - if spec is None or spec.loader is None: - raise RuntimeError("pyshacl is required for this test.") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module.validate - - def _write_jsonld(tmp_path: Path, name: str, payload: dict) -> Path: path = tmp_path / name path.write_text(json.dumps(payload), encoding="utf-8") @@ -31,10 +16,8 @@ def _messages_for(result) -> list[str]: def test_dataset_recommended_either_or_is_warning_only( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch + tmp_path: Path, ) -> None: - monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch)) - missing_payload = { "@context": {"@vocab": "http://schema.org/"}, "@type": "Dataset", @@ -74,10 +57,8 @@ def test_dataset_recommended_either_or_is_warning_only( def test_offer_shipping_details_recommended_either_or_is_warning_only( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch + tmp_path: Path, ) -> None: - monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch)) - missing_payload = { "@context": {"@vocab": "http://schema.org/"}, "@type": "OfferShippingDetails", @@ -116,10 +97,8 @@ def test_offer_shipping_details_recommended_either_or_is_warning_only( def test_product_offer_price_currency_recommended_either_or_is_warning_only( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch + tmp_path: Path, ) -> None: 
- monkeypatch.setattr(shacl, "validate", _load_real_validate(monkeypatch)) - missing_payload = { "@context": {"@vocab": "http://schema.org/"}, "@type": "Product", diff --git a/tests/test_structured_data_engine_class.py b/tests/test_structured_data_engine_class.py index f4a272e..b208a78 100644 --- a/tests/test_structured_data_engine_class.py +++ b/tests/test_structured_data_engine_class.py @@ -44,15 +44,16 @@ def __init__(self, *args, **kwargs) -> None: sys.modules.setdefault("wordlift_client.models", _models_module) sys.modules.setdefault("wordlift_client.models.ask_request", _ask_module) -_pyshacl = types.ModuleType("pyshacl") +try: + import pyshacl as _pyshacl_real # noqa: F401 +except ImportError: + _pyshacl = types.ModuleType("pyshacl") + def _stub_validate(*_args, **_kwargs): + return None, None, None -def _stub_validate(*_args, **_kwargs): - return None, None, None - - -_pyshacl.validate = _stub_validate -sys.modules.setdefault("pyshacl", _pyshacl) + _pyshacl.validate = _stub_validate + sys.modules["pyshacl"] = _pyshacl from wordlift_sdk.structured_data.structured_data_engine import ( # noqa: E402 StructuredDataEngine, diff --git a/tests/test_structured_data_materialization_generic.py b/tests/test_structured_data_materialization_generic.py index 425f228..f4f492b 100644 --- a/tests/test_structured_data_materialization_generic.py +++ b/tests/test_structured_data_materialization_generic.py @@ -50,15 +50,16 @@ def __init__(self, *args, **kwargs) -> None: sys.modules.setdefault("wordlift_client.models", _models_module) sys.modules.setdefault("wordlift_client.models.ask_request", _ask_module) -_pyshacl = types.ModuleType("pyshacl") +try: + import pyshacl as _pyshacl_real # noqa: F401 +except ImportError: + _pyshacl = types.ModuleType("pyshacl") + def _stub_validate(*_args, **_kwargs): + return None, None, None -def _stub_validate(*_args, **_kwargs): - return None, None, None - - -_pyshacl.validate = _stub_validate -sys.modules.setdefault("pyshacl", _pyshacl) + 
_pyshacl.validate = _stub_validate + sys.modules["pyshacl"] = _pyshacl from wordlift_sdk.structured_data.engine import ( # noqa: E402 materialize_yarrrml_jsonld, diff --git a/tests/test_structured_data_workflows.py b/tests/test_structured_data_workflows.py index 373802c..0b7ecad 100644 --- a/tests/test_structured_data_workflows.py +++ b/tests/test_structured_data_workflows.py @@ -52,15 +52,16 @@ def __init__(self, *args, **kwargs) -> None: sys.modules.setdefault("wordlift_client.models", _models_module) sys.modules.setdefault("wordlift_client.models.ask_request", _ask_module) -_pyshacl = types.ModuleType("pyshacl") +try: + import pyshacl as _pyshacl_real # noqa: F401 +except ImportError: + _pyshacl = types.ModuleType("pyshacl") + def _stub_validate(*_args, **_kwargs): + return None, None, None -def _stub_validate(*_args, **_kwargs): - return None, None, None - - -_pyshacl.validate = _stub_validate -sys.modules.setdefault("pyshacl", _pyshacl) + _pyshacl.validate = _stub_validate + sys.modules["pyshacl"] = _pyshacl from wordlift_sdk.structured_data import ( # noqa: E402 CreateRequest, diff --git a/tests/workflow/test_ingestion_bridge_url_handler.py b/tests/workflow/test_ingestion_bridge_url_handler.py index 60fe267..5ad4fca 100644 --- a/tests/workflow/test_ingestion_bridge_url_handler.py +++ b/tests/workflow/test_ingestion_bridge_url_handler.py @@ -6,6 +6,7 @@ import pytest +import wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler as _handler_mod from wordlift_sdk.ingestion.errors import LoaderRuntimeError from wordlift_sdk.ingestion.loaders import PlaywrightLoaderAdapter from wordlift_sdk.url_source import Url @@ -37,7 +38,8 @@ async def test_ingestion_bridge_handler_calls_callback( ) monkeypatch.setattr( - "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion", + _handler_mod, + "run_ingestion", lambda settings: SimpleNamespace( pages=[ SimpleNamespace( @@ -88,7 +90,8 @@ async def 
test_ingestion_bridge_handler_raises_on_failed_ingestion( ) monkeypatch.setattr( - "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion", + _handler_mod, + "run_ingestion", lambda settings: SimpleNamespace( pages=[], events=[ @@ -124,7 +127,8 @@ async def test_ingestion_bridge_handler_raises_and_skips_callback_on_http_404( ) monkeypatch.setattr( - "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion", + _handler_mod, + "run_ingestion", lambda settings: SimpleNamespace( pages=[ SimpleNamespace( @@ -164,7 +168,8 @@ async def test_ingestion_bridge_handler_raises_and_skips_callback_on_http_500( ) monkeypatch.setattr( - "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion", + _handler_mod, + "run_ingestion", lambda settings: SimpleNamespace( pages=[ SimpleNamespace( @@ -205,7 +210,8 @@ async def test_ingestion_bridge_handler_surfaces_failed_meta_diagnostics( caplog.set_level("ERROR") monkeypatch.setattr( - "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion", + _handler_mod, + "run_ingestion", lambda settings: SimpleNamespace( pages=[], events=[ @@ -265,7 +271,8 @@ async def test_ingestion_bridge_handler_meta_fallback_keeps_old_message( ) monkeypatch.setattr( - "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion", + _handler_mod, + "run_ingestion", lambda settings: SimpleNamespace( pages=[], events=[ @@ -306,7 +313,8 @@ async def test_ingestion_bridge_handler_truncates_diagnostics_payload( long_message = "token=abc123 " + ("x" * 10000) monkeypatch.setattr( - "wordlift_sdk.workflow.url_handler.ingestion_web_page_scrape_url_handler.run_ingestion", + _handler_mod, + "run_ingestion", lambda settings: SimpleNamespace( pages=[], events=[ diff --git a/wordlift_sdk/graph/audit/_entity_matrix.py b/wordlift_sdk/graph/audit/_entity_matrix.py index 23b3048..1dab968 100644 --- 
a/wordlift_sdk/graph/audit/_entity_matrix.py +++ b/wordlift_sdk/graph/audit/_entity_matrix.py @@ -15,7 +15,7 @@ _find_webpage_urls, ) from wordlift_sdk.validation.shacl import ( - normalize_schema_org_uris, + _normalize_schema_org_uris as normalize_schema_org_uris, # type: ignore[attr-defined] ) _SCHEMA_ORG_PREFIXES = ("http://schema.org/", "https://schema.org/") diff --git a/wordlift_sdk/kg_build/__init__.py b/wordlift_sdk/kg_build/__init__.py index 49c260d..17c9c4d 100644 --- a/wordlift_sdk/kg_build/__init__.py +++ b/wordlift_sdk/kg_build/__init__.py @@ -1,8 +1,6 @@ from __future__ import annotations -from importlib import import_module -from typing import Any - +from .._lazy_exports import resolve_attr __all__ = [ "ProfileConfig", @@ -142,12 +140,10 @@ } -def __getattr__(name: str) -> Any: - target = _EXPORTS.get(name) - if target is None: - raise AttributeError( - f"module 'wordlift_sdk.kg_build' has no attribute '{name}'" - ) - module_name, attr_name = target - module = import_module(module_name) - return getattr(module, attr_name) +def __getattr__(name: str): + return resolve_attr( + name=name, + module_name="wordlift_sdk.kg_build", + exports=_EXPORTS, + extra="kg-build", + ) diff --git a/wordlift_sdk/utils/__init__.py b/wordlift_sdk/utils/__init__.py index df6297e..18d34b3 100644 --- a/wordlift_sdk/utils/__init__.py +++ b/wordlift_sdk/utils/__init__.py @@ -36,8 +36,8 @@ "create_entity_patch_request", ), "create_delayed": ("wordlift_sdk.utils.delayed", "create_delayed"), - "get_me": ("wordlift_sdk.utils.get_me", "get_me"), - "reset_me": ("wordlift_sdk.utils.reset_me", "reset_me"), + "get_me": ("wordlift_sdk.utils._get_me", "get_me"), + "reset_me": ("wordlift_sdk.utils._reset_me", "reset_me"), "HtmlConverter": ("wordlift_sdk.utils.html_converter", "HtmlConverter"), "AutoConcurrencyController": ( "wordlift_sdk.utils.auto_concurrency", diff --git a/wordlift_sdk/utils/get_me.py b/wordlift_sdk/utils/_get_me.py similarity index 100% rename from 
wordlift_sdk/utils/get_me.py rename to wordlift_sdk/utils/_get_me.py diff --git a/wordlift_sdk/utils/reset_me.py b/wordlift_sdk/utils/_reset_me.py similarity index 100% rename from wordlift_sdk/utils/reset_me.py rename to wordlift_sdk/utils/_reset_me.py diff --git a/wordlift_sdk/validation/__init__.py b/wordlift_sdk/validation/__init__.py index bcc616f..40701c1 100644 --- a/wordlift_sdk/validation/__init__.py +++ b/wordlift_sdk/validation/__init__.py @@ -17,6 +17,9 @@ "prepare_shapes", "validate_file", "validate_jsonld_from_url", + "ShaclValidationService", + "ValidationMode", + "ValidationOutcome", ] @@ -51,6 +54,18 @@ "wordlift_sdk.validation.shacl", "validate_jsonld_from_url", ), + "ShaclValidationService": ( + "wordlift_sdk.validation.shacl_validation_service", + "ShaclValidationService", + ), + "ValidationMode": ( + "wordlift_sdk.validation.shacl_validation_service", + "ValidationMode", + ), + "ValidationOutcome": ( + "wordlift_sdk.validation.shacl_validation_service", + "ValidationOutcome", + ), } diff --git a/wordlift_sdk/validation/shacl_validation_service.py b/wordlift_sdk/validation/shacl_validation_service.py index 60e01f7..7ad8499 100644 --- a/wordlift_sdk/validation/shacl_validation_service.py +++ b/wordlift_sdk/validation/shacl_validation_service.py @@ -10,11 +10,10 @@ from enum import Enum from typing import Any -from pyshacl import validate as pyshacl_validate from rdflib import Graph from rdflib.namespace import SH -from wordlift_sdk.validation.shacl import load_shapes_graph, normalize_schema_org_uris +from wordlift_sdk.validation.shacl import PreparedShaclValidator logger = logging.getLogger(__name__) @@ -29,13 +28,12 @@ class ValidationMode(str, Enum): # Module-level worker state — one copy per subprocess, initialised by _init_worker. # Must be module-level for picklability by ProcessPoolExecutor. 
-_worker_shapes_graph: Graph | None = None -_worker_source_map: dict = {} +_worker_validator: PreparedShaclValidator | None = None def _init_worker(shape_specs: list[str] | None) -> None: - global _worker_shapes_graph, _worker_source_map - _worker_shapes_graph, _worker_source_map = load_shapes_graph(shape_specs) + global _worker_validator + _worker_validator = PreparedShaclValidator.from_shape_specs(shape_specs) def _validate_in_worker(ntriples: str, submit_time: float) -> dict: @@ -44,30 +42,23 @@ def _validate_in_worker(ntriples: str, submit_time: float) -> dict: data_graph = Graph() data_graph.parse(data=ntriples, format="nt") - data_graph = normalize_schema_org_uris(data_graph) - conforms, report_graph, _ = pyshacl_validate( - data_graph, - shacl_graph=_worker_shapes_graph, - inference="rdfs", - abort_on_first=False, - allow_infos=True, - allow_warnings=True, - ) + result = _worker_validator.validate_graph(data_graph) + source_map = _worker_validator.prepared_shapes.shape_source_map warning_sources: dict[str, int] = {} error_sources: dict[str, int] = {} - for node in report_graph.subjects(SH.resultSeverity, SH.Warning): - shape = next(report_graph.objects(node, SH.sourceShape), None) - label = _worker_source_map.get(shape, "unknown") + for node in result.report_graph.subjects(SH.resultSeverity, SH.Warning): + shape = next(result.report_graph.objects(node, SH.sourceShape), None) + label = source_map.get(shape, "unknown") warning_sources[str(label)] = warning_sources.get(str(label), 0) + 1 - for node in report_graph.subjects(SH.resultSeverity, SH.Violation): - shape = next(report_graph.objects(node, SH.sourceShape), None) - label = _worker_source_map.get(shape, "unknown") + for node in result.report_graph.subjects(SH.resultSeverity, SH.Violation): + shape = next(result.report_graph.objects(node, SH.sourceShape), None) + label = source_map.get(shape, "unknown") error_sources[str(label)] = error_sources.get(str(label), 0) + 1 return { - "passed": bool(conforms), + 
"passed": bool(result.conforms), "warning_sources": dict(sorted(warning_sources.items())), "error_sources": dict(sorted(error_sources.items())), "queue_wait_ms": queue_wait_ms, From d012f3da36b1f8e67bd6e59879f1c3aa6a3dae4e Mon Sep 17 00:00:00 2001 From: Rubens Panfili Date: Fri, 20 Mar 2026 17:59:40 +0100 Subject: [PATCH 63/63] chore: bump to v8.0.0 --- CHANGELOG.md | 35 +++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11327be..7a83011 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,40 @@ # Changelog +## 8.0.0 - 2026-03-20 + +### Breaking + +- `kg_build` postprocessor subprocess entry points renamed: + - `runner.py` → `oneshot.py` + - `worker.py` → `persistent.py` +- `SubprocessPostprocessor` split into `OneshotPostprocessor` and `PersistentPostprocessor`; any host code referencing the old class name must be updated. +- `PostprocessorService` is now profile-agnostic; profile resolution no longer happens inside the service. +- `utils.get_me` / `utils.reset_me` module files renamed to `_get_me.py` / `_reset_me.py`; direct submodule imports (not recommended) must be updated. + +### Added + +- `ShaclValidationService` — runs SHACL validation in a dedicated process pool via `PreparedShaclValidator`, wired into `ProfileImportProtocol`. +- Separate pool-size settings for postprocessors and SHACL validation. +- In-process postprocessor runtime (`inprocess`) for single-process execution. +- SHACL process-pool queue-wait and execution-time tracking in timing logs. +- `morph_kgc` subprocess pool for true RML-mapping parallelism, bypassing `pyparsing` lock contention and the GIL. + - Configurable pool size via `morph_kgc_pool_size` / `MORPH_KGC_POOL_SIZE`. + - Subprocess queue-wait tracked separately in timing logs. +- `PostprocessorResult` dataclass — replaces implicit tuple return from postprocessing stage. 
+- `ImportAnnotationPostprocessor` and `RootIdReconcilerPostprocessor` extracted as named processors. +- `first_level_subjects` graph utility helper. +- Slice verification tooling extended with `run_slice_smoke_imports.py` and `run_slice_tests.py`. + +### Changed + +- Postprocessors reorganised into a `postprocessors/` subpackage (`processors/`, `PostprocessorService`, loader helpers). +- `ProfileImportProtocol.__init__` decomposed into focused `_init_*` factory methods; class surface significantly reduced. +- `morph_kgc` RML mapping stage runs in a subprocess pool instead of a thread executor. +- SHACL validation and postprocessors offloaded to dedicated thread/process pools; ingestion runs in an executor to avoid blocking the event loop. +- Persistent `ApiClient` reused across requests instead of one per graph; `ApiClient` is closed on protocol shutdown. +- Lazy-export guards remapped to modules with real third-party dependencies so `ModuleNotFoundError` fires correctly when an extra is absent. +- `python-liquid` added to the `workflow` extra (required by `graph.ttl_liquid`). + ## 7.0.0 - 2026-03-15 ### Breaking diff --git a/pyproject.toml b/pyproject.toml index 548349c..3d7ee87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "wordlift-sdk" -version = "7.0.1" +version = "8.0.0" description = "Python toolkit for orchestrating WordLift imports and structured data workflows." authors = ["David Riccitelli "] readme = "README.md"