From dc692bc15177aa38641066863ac3d759188f3370 Mon Sep 17 00:00:00 2001 From: Javier Romero Castro Date: Mon, 23 Mar 2026 12:02:01 +0100 Subject: [PATCH] inspire: get exact match for accelerators and experiments * closes https://github.com/CERNDocumentServer/cds-rdm/issues/551 --- site/cds_rdm/config.py | 20 ++ .../transform/mappers/custom_fields.py | 40 ++-- site/cds_rdm/inspire_harvester/utils.py | 40 ++++ site/tests/conftest.py | 11 +- .../inspire_harvester/test_harvester_job.py | 8 + .../test_vocabulary_matching.py | 200 ++++++++++++++++++ 6 files changed, 292 insertions(+), 27 deletions(-) create mode 100644 site/tests/inspire_harvester/test_vocabulary_matching.py diff --git a/site/cds_rdm/config.py b/site/cds_rdm/config.py index f844f5f7..3fcd7b31 100644 --- a/site/cds_rdm/config.py +++ b/site/cds_rdm/config.py @@ -55,3 +55,23 @@ CDS_ILS_IMPORTER_API_KEY = "CHANGE_ME" """API key for the CLC importer. This is a placeholder and should be replaced with a real key.""" + +# ============================================================================= +# INSPIRE Harvester - Vocabulary Mappings +# ============================================================================= + +CDS_INSPIRE_ACCELERATOR_MAPPINGS = { + # Mappings from INSPIRE accelerator names to CDS vocabulary IDs + # Most INSPIRE accelerators match CDS exactly (e.g., "CERN LHC", "CERN SPS") + # Add mappings here only for special cases where names differ +} +"""INSPIRE to CDS accelerator vocabulary mappings.""" + +CDS_INSPIRE_EXPERIMENT_MAPPINGS = { + # Mappings from INSPIRE experiment names to CDS vocabulary IDs + "LHCb": "LHCB", + "AMS": "AMS-RE1", + "NA-62": "NA62", + "NA-062": "NA62", +} +"""INSPIRE to CDS experiment vocabulary mappings.""" diff --git a/site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py b/site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py index 9ae47273..c45147d3 100644 --- a/site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py +++ b/site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py @@ -12,7 +12,7 @@ from idutils.normalizers import normalize_isbn from cds_rdm.inspire_harvester.transform.mappers.mapper import MapperBase -from cds_rdm.inspire_harvester.utils import search_vocabulary +from cds_rdm.inspire_harvester.utils import get_vocabulary_exact @dataclass(frozen=True) @@ -65,41 +65,29 @@ def map_value(self, src_record, ctx, logger): acc_exp_list = src_metadata.get("accelerator_experiments", []) _accelerators = [] _experiments = [] + for item in acc_exp_list: accelerator = item.get("accelerator") experiment = item.get("experiment") institution = item.get("institution") if accelerator: - logger.debug( - f"Searching vocabulary 'accelerator' for term: '{accelerator}'" - ) if institution: - accelerator = f"{institution} {accelerator}" - else: - accelerator = f"{accelerator}" - result = search_vocabulary(accelerator, "accelerators", ctx, logger) - if result.total == 1: - logger.info(f"Found accelerator '{accelerator}'") - hit = list(result.hits)[0] - _accelerators.append({"id": hit["id"]}) + accelerator_term = f"{institution} {accelerator}" else: - logger.warning( - f"Accelerator '{accelerator}' not found." - ) + accelerator_term = accelerator + + vocab_id = get_vocabulary_exact( + accelerator_term, "accelerators", ctx, logger + ) + if vocab_id: + _accelerators.append({"id": vocab_id}) if experiment: - logger.debug( - f"Searching vocabulary 'experiments' for term: '{experiment}'" + vocab_id = get_vocabulary_exact( + experiment, "experiments", ctx, logger ) - result = search_vocabulary(experiment, "experiments", ctx, logger) - if result.total == 1: - logger.info(f"Found experiment '{experiment}'") - hit = list(result.hits)[0] - _experiments.append({"id": hit["id"]}) - else: - logger.warning( - f"Experiment '{accelerator}' not found." - ) + if vocab_id: + _experiments.append({"id": vocab_id}) return {"cern:accelerators": _accelerators, "cern:experiments": _experiments} diff --git a/site/cds_rdm/inspire_harvester/utils.py b/site/cds_rdm/inspire_harvester/utils.py index 92df274b..206e7e66 100644 --- a/site/cds_rdm/inspire_harvester/utils.py +++ b/site/cds_rdm/inspire_harvester/utils.py @@ -94,3 +94,43 @@ def search_vocabulary(term, vocab_type, ctx, logger): f"Vocabulary term ['{term}'] not found in '{vocab_type}'. INSPIRE#: {ctx.inspire_id}" ) raise e + + +def _search_vocabulary_id(service, term, vocab_type): + """Search vocabulary by exact ID match, returning the ID or None.""" + search_term = f'"{term}"' if "/" in term else term + result = service.search(system_identity, type=vocab_type, q=f'id:"{search_term}"') + if result.total == 1: + return list(result.hits)[0]["id"] + return None + + +def get_vocabulary_exact(term, vocab_type, ctx, logger): + """Get vocabulary ID by exact match, with fallback to normalized term.""" + if not term: + return None + + service = current_service_registry.get("vocabularies") + + try: + vocab_id = _search_vocabulary_id(service, term, vocab_type) + if vocab_id: + return vocab_id + + # Fallback: normalize (uppercase + strip hyphens) and search again + normalized = term.upper().replace("-", "") + if normalized != term: + vocab_id = _search_vocabulary_id(service, normalized, vocab_type) + if vocab_id: + return vocab_id + + logger.warning( + f"Vocabulary term '{term}' not found in '{vocab_type}'." + ) + return None + + except Exception as e: + logger.error( + f"Failed vocabulary search for '{term}' in '{vocab_type}'. Error: {e}." + ) + return None diff --git a/site/tests/conftest.py b/site/tests/conftest.py index 30374d90..77501f09 100644 --- a/site/tests/conftest.py +++ b/site/tests/conftest.py @@ -644,7 +644,7 @@ def experiments_type_v(app, experiments_type): }, ) - vocab = vocabulary_service.create( + vocabulary_service.create( system_identity, { "id": "ATLAS", @@ -654,6 +654,15 @@ def experiments_type_v(app, experiments_type): }, ) + vocab = vocabulary_service.create( + system_identity, + { + "id": "NA62", + "title": {"en": "NA62"}, + "type": "experiments", + }, + ) + Vocabulary.index.refresh() return vocab diff --git a/site/tests/inspire_harvester/test_harvester_job.py b/site/tests/inspire_harvester/test_harvester_job.py index 298770e9..0908c5c5 100644 --- a/site/tests/inspire_harvester/test_harvester_job.py +++ b/site/tests/inspire_harvester/test_harvester_job.py @@ -287,6 +287,14 @@ "description": "In the present study the possibility of measuring the lifetime of the positively charged Kaon , K+, is investigated , by using data and framework produced by the experiment NA62 of the European Organization for Nuclear Research (CERN).", }, "custom_fields": { + 'cern:experiments': [ + { + 'id': 'NA62', + 'title': { + 'en': 'NA62', + }, + }, + ], 'cern:programmes': { 'id': 'None', 'title': { diff --git a/site/tests/inspire_harvester/test_vocabulary_matching.py b/site/tests/inspire_harvester/test_vocabulary_matching.py new file mode 100644 index 00000000..834ed67f --- /dev/null +++ b/site/tests/inspire_harvester/test_vocabulary_matching.py @@ -0,0 +1,200 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2026 CERN. +# +# CDS-RDM is free software; you can redistribute it and/or modify it under +# the terms of the GPL-2.0 License; see LICENSE file for more details. + +"""Tests for vocabulary exact matching functionality.""" +from cds_rdm.inspire_harvester.logger import Logger +from cds_rdm.inspire_harvester.transform.context import MetadataSerializationContext +from cds_rdm.inspire_harvester.transform.mappers.custom_fields import CERNFieldsMapper +from cds_rdm.inspire_harvester.transform.resource_types import ResourceType +from cds_rdm.inspire_harvester.utils import get_vocabulary_exact + + +def test_get_vocabulary_exact_found(running_app): + """Test get_vocabulary_exact with a term that exists in vocabulary.""" + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + + result = get_vocabulary_exact("CERN LHC", "accelerators", ctx, logger) + + assert result == "CERN LHC" + assert len(ctx.errors) == 0 + + +def test_get_vocabulary_exact_not_found(running_app): + """Test get_vocabulary_exact with a term not in vocabulary.""" + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + + result = get_vocabulary_exact("UNKNOWN", "accelerators", ctx, logger) + + assert result is None + assert len(ctx.errors) == 0 + + +def test_get_vocabulary_exact_normalizes_case(running_app): + """Test get_vocabulary_exact normalizes case before lookup.""" + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + + result = get_vocabulary_exact("alice", "experiments", ctx, logger) + + assert result == "ALICE" + assert len(ctx.errors) == 0 + + +def test_get_vocabulary_exact_normalizes_hyphens(running_app): + """Test get_vocabulary_exact strips hyphens before lookup.""" + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + + # "NA-62" normalizes to "NA62" which exists in vocabulary + result = get_vocabulary_exact("NA-62", "experiments", ctx, logger) + + assert result == "NA62" + assert len(ctx.errors) == 0 + + +def test_get_vocabulary_exact_empty_term(running_app): + """Test get_vocabulary_exact with empty term.""" + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + + result = get_vocabulary_exact("", "accelerators", ctx, logger) + + assert result is None + assert len(ctx.errors) == 0 + + +def test_get_vocabulary_exact_none_term(running_app): + """Test get_vocabulary_exact with None term.""" + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + + result = get_vocabulary_exact(None, "accelerators", ctx, logger) + + assert result is None + assert len(ctx.errors) == 0 + + +def test_cern_fields_mapper_accelerator_found(running_app): + """Test CERNFieldsMapper with accelerator that exists in vocabulary.""" + src_metadata = { + "accelerator_experiments": [ + {"accelerator": "LHC", "institution": "CERN"}, + ] + } + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + mapper = CERNFieldsMapper() + src_record = {"metadata": src_metadata, "created": "2023-01-01"} + + result = mapper.map_value(src_record, ctx, logger) + + assert len(result["cern:accelerators"]) == 1 + assert result["cern:accelerators"][0]["id"] == "CERN LHC" + assert len(ctx.errors) == 0 + + +def test_cern_fields_mapper_accelerator_not_found(running_app): + """Test CERNFieldsMapper with accelerator not in vocabulary.""" + src_metadata = { + "accelerator_experiments": [ + {"accelerator": "UNKNOWN"}, + ] + } + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + mapper = CERNFieldsMapper() + src_record = {"metadata": src_metadata, "created": "2023-01-01"} + + result = mapper.map_value(src_record, ctx, logger) + + assert len(result["cern:accelerators"]) == 0 + assert len(ctx.errors) == 0 + + +def test_cern_fields_mapper_experiment_found(running_app): + """Test CERNFieldsMapper with experiment that exists in vocabulary.""" + src_metadata = { + "accelerator_experiments": [ + {"experiment": "ALICE"}, + ] + } + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + mapper = CERNFieldsMapper() + src_record = {"metadata": src_metadata, "created": "2023-01-01"} + + result = mapper.map_value(src_record, ctx, logger) + + assert len(result["cern:experiments"]) == 1 + assert result["cern:experiments"][0]["id"] == "ALICE" + assert len(ctx.errors) == 0 + + +def test_cern_fields_mapper_experiment_not_found(running_app): + """Test CERNFieldsMapper with experiment not in vocabulary.""" + src_metadata = { + "accelerator_experiments": [ + {"experiment": "UNKNOWN_EXP"}, + ] + } + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + mapper = CERNFieldsMapper() + src_record = {"metadata": src_metadata, "created": "2023-01-01"} + + result = mapper.map_value(src_record, ctx, logger) + + assert len(result["cern:experiments"]) == 0 + assert len(ctx.errors) == 0 + + +def test_cern_fields_mapper_mixed_results(running_app): + """Test CERNFieldsMapper with some found and some not found.""" + src_metadata = { + "accelerator_experiments": [ + {"accelerator": "LHC", "institution": "CERN", "experiment": "ALICE"}, + {"accelerator": "UNKNOWN", "experiment": "UNKNOWN_EXP"}, + ] + } + ctx = MetadataSerializationContext( + resource_type=ResourceType.OTHER, inspire_id="12345" + ) + logger = Logger(inspire_id="12345") + mapper = CERNFieldsMapper() + src_record = {"metadata": src_metadata, "created": "2023-01-01"} + + result = mapper.map_value(src_record, ctx, logger) + + assert len(result["cern:accelerators"]) == 1 + assert result["cern:accelerators"][0]["id"] == "CERN LHC" + + assert len(result["cern:experiments"]) == 1 + assert result["cern:experiments"][0]["id"] == "ALICE" + + assert len(ctx.errors) == 0