From dc692bc15177aa38641066863ac3d759188f3370 Mon Sep 17 00:00:00 2001
From: Javier Romero Castro <jrcastro9515@gmail.com>
Date: Mon, 23 Mar 2026 12:02:01 +0100
Subject: [PATCH] inspire: get exact match for accelerators and experiments

* closes https://github.com/CERNDocumentServer/cds-rdm/issues/551
---
 site/cds_rdm/config.py                        |  20 ++
 .../transform/mappers/custom_fields.py        |  40 ++--
 site/cds_rdm/inspire_harvester/utils.py       |  40 ++++
 site/tests/conftest.py                        |  11 +-
 .../inspire_harvester/test_harvester_job.py   |   8 +
 .../test_vocabulary_matching.py               | 200 ++++++++++++++++++
 6 files changed, 292 insertions(+), 27 deletions(-)
 create mode 100644 site/tests/inspire_harvester/test_vocabulary_matching.py

diff --git a/site/cds_rdm/config.py b/site/cds_rdm/config.py
index f844f5f7..3fcd7b31 100644
--- a/site/cds_rdm/config.py
+++ b/site/cds_rdm/config.py
@@ -55,3 +55,23 @@
 
 CDS_ILS_IMPORTER_API_KEY = "CHANGE_ME"
 """API key for the CLC importer. This is a placeholder and should be replaced with a real key."""
+
+# =============================================================================
+# INSPIRE Harvester - Vocabulary Mappings
+# =============================================================================
+
+CDS_INSPIRE_ACCELERATOR_MAPPINGS = {
+    # Maps INSPIRE accelerator names to CDS vocabulary IDs.
+    # Most INSPIRE accelerators match CDS exactly (e.g., "CERN LHC", "CERN SPS"),
+    # so add entries here only for special cases where the names differ.
+}
+"""INSPIRE to CDS accelerator vocabulary mappings."""
+
+CDS_INSPIRE_EXPERIMENT_MAPPINGS = {
+    # INSPIRE experiment name -> CDS vocabulary ID. NOTE(review): confirm where this is read.
+    "LHCb": "LHCB",
+    "AMS": "AMS-RE1",
+    "NA-62": "NA62",
+    "NA-062": "NA62",
+}
+"""INSPIRE to CDS experiment vocabulary mappings."""
diff --git a/site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py b/site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py
index 9ae47273..c45147d3 100644
--- a/site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py
+++ b/site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py
@@ -12,7 +12,7 @@
 from idutils.normalizers import normalize_isbn
 
 from cds_rdm.inspire_harvester.transform.mappers.mapper import MapperBase
-from cds_rdm.inspire_harvester.utils import search_vocabulary
+from cds_rdm.inspire_harvester.utils import get_vocabulary_exact
 
 
 @dataclass(frozen=True)
@@ -65,41 +65,29 @@ def map_value(self, src_record, ctx, logger):
         acc_exp_list = src_metadata.get("accelerator_experiments", [])
         _accelerators = []
         _experiments = []
+
         for item in acc_exp_list:
             accelerator = item.get("accelerator")
             experiment = item.get("experiment")
             institution = item.get("institution")
 
             if accelerator:
-                logger.debug(
-                    f"Searching vocabulary 'accelerator' for term: '{accelerator}'"
-                )
                 if institution:
-                    accelerator = f"{institution} {accelerator}"
-                else:
-                    accelerator = f"{accelerator}"
-                result = search_vocabulary(accelerator, "accelerators", ctx, logger)
-                if result.total == 1:
-                    logger.info(f"Found accelerator '{accelerator}'")
-                    hit = list(result.hits)[0]
-                    _accelerators.append({"id": hit["id"]})
+                    accelerator_term = f"{institution} {accelerator}"
                 else:
-                    logger.warning(
-                        f"Accelerator '{accelerator}' not found."
-                    )
+                    accelerator_term = accelerator
+
+                vocab_id = get_vocabulary_exact(
+                    accelerator_term, "accelerators", ctx, logger
+                )
+                if vocab_id:
+                    _accelerators.append({"id": vocab_id})
 
             if experiment:
-                logger.debug(
-                    f"Searching vocabulary 'experiments' for term: '{experiment}'"
+                vocab_id = get_vocabulary_exact(
+                    experiment, "experiments", ctx, logger
                 )
-                result = search_vocabulary(experiment, "experiments", ctx, logger)
-                if result.total == 1:
-                    logger.info(f"Found experiment '{experiment}'")
-                    hit = list(result.hits)[0]
-                    _experiments.append({"id": hit["id"]})
-                else:
-                    logger.warning(
-                        f"Experiment '{accelerator}' not found."
-                    )
+                if vocab_id:
+                    _experiments.append({"id": vocab_id})
 
         return {"cern:accelerators": _accelerators, "cern:experiments": _experiments}
diff --git a/site/cds_rdm/inspire_harvester/utils.py b/site/cds_rdm/inspire_harvester/utils.py
index 92df274b..206e7e66 100644
--- a/site/cds_rdm/inspire_harvester/utils.py
+++ b/site/cds_rdm/inspire_harvester/utils.py
@@ -94,3 +94,43 @@ def search_vocabulary(term, vocab_type, ctx, logger):
             f"Vocabulary term ['{term}'] not found in '{vocab_type}'. INSPIRE#: {ctx.inspire_id}"
         )
         raise e
+
+
+def _search_vocabulary_id(service, term, vocab_type):
+    """Search vocabulary by exact ID match, returning the ID or None."""
+    # Send exactly one quoted phrase: pre-quoting "/" terms produced id:""a/b"".
+    # Escape backslash and double quote so the term cannot end the phrase early.
+    phrase = term.replace("\\", "\\\\").replace('"', '\\"')
+    result = service.search(system_identity, type=vocab_type, q=f'id:"{phrase}"')
+    return list(result.hits)[0]["id"] if result.total == 1 else None
+
+
+def get_vocabulary_exact(term, vocab_type, ctx, logger):
+    """Get vocabulary ID by exact match, with fallback to normalized term.
+
+    Returns the ID, or None when nothing matches or the search fails (logged).
+    """
+    if not term:
+        return None
+
+    service = current_service_registry.get("vocabularies")
+    try:
+        vocab_id = _search_vocabulary_id(service, term, vocab_type)
+        if vocab_id:
+            return vocab_id
+        # Fallback: normalize (uppercase + strip hyphens) and search again
+        normalized = term.upper().replace("-", "")
+        if normalized != term:
+            vocab_id = _search_vocabulary_id(service, normalized, vocab_type)
+            if vocab_id:
+                return vocab_id
+        logger.warning(
+            f"Vocabulary term '{term}' not found in '{vocab_type}'. INSPIRE#: {ctx.inspire_id}"
+        )
+        return None
+    except Exception as e:
+        logger.error(
+            f"Failed vocabulary search for '{term}' in '{vocab_type}'. "
+            f"INSPIRE#: {ctx.inspire_id}. Error: {e}."
+        )
+        return None
diff --git a/site/tests/conftest.py b/site/tests/conftest.py
index 30374d90..77501f09 100644
--- a/site/tests/conftest.py
+++ b/site/tests/conftest.py
@@ -644,7 +644,7 @@ def experiments_type_v(app, experiments_type):
         },
     )
 
-    vocab = vocabulary_service.create(
+    vocabulary_service.create(
         system_identity,
         {
             "id": "ATLAS",
@@ -654,6 +654,15 @@ def experiments_type_v(app, experiments_type):
         },
     )
 
+    vocab = vocabulary_service.create(
+        system_identity,
+        {
+            "id": "NA62",
+            "title": {"en": "NA62"},
+            "type": "experiments",
+        },
+    )
+
     Vocabulary.index.refresh()
 
     return vocab
diff --git a/site/tests/inspire_harvester/test_harvester_job.py b/site/tests/inspire_harvester/test_harvester_job.py
index 298770e9..0908c5c5 100644
--- a/site/tests/inspire_harvester/test_harvester_job.py
+++ b/site/tests/inspire_harvester/test_harvester_job.py
@@ -287,6 +287,14 @@
         "description": "In the present study the possibility of measuring the lifetime of the positively charged Kaon , K+, is investigated , by using data and framework produced by the experiment NA62 of the European Organization for Nuclear Research (CERN).",
     },
     "custom_fields": {
+        'cern:experiments': [
+            {
+                'id': 'NA62',
+                'title': {
+                    'en': 'NA62',
+                },
+            },
+        ],
         'cern:programmes': {
             'id': 'None',
             'title': {
diff --git a/site/tests/inspire_harvester/test_vocabulary_matching.py b/site/tests/inspire_harvester/test_vocabulary_matching.py
new file mode 100644
index 00000000..834ed67f
--- /dev/null
+++ b/site/tests/inspire_harvester/test_vocabulary_matching.py
@@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2026 CERN.
+#
+# CDS-RDM is free software; you can redistribute it and/or modify it under
+# the terms of the GPL-2.0 License; see LICENSE file for more details.
+
+"""Tests for vocabulary exact matching functionality."""
+from cds_rdm.inspire_harvester.logger import Logger
+from cds_rdm.inspire_harvester.transform.context import MetadataSerializationContext
+from cds_rdm.inspire_harvester.transform.mappers.custom_fields import CERNFieldsMapper
+from cds_rdm.inspire_harvester.transform.resource_types import ResourceType
+from cds_rdm.inspire_harvester.utils import get_vocabulary_exact
+
+
+def test_get_vocabulary_exact_found(running_app):
+    """Look up a term that exists in the vocabulary and expect its ID back."""
+    inspire_id = "12345"
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id=inspire_id
+    )
+    log = Logger(inspire_id=inspire_id)
+
+    vocab_id = get_vocabulary_exact("CERN LHC", "accelerators", context, log)
+    assert vocab_id == "CERN LHC"
+    assert len(context.errors) == 0
+
+
+def test_get_vocabulary_exact_not_found(running_app):
+    """Look up a term missing from the vocabulary and expect None."""
+    inspire_id = "12345"
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id=inspire_id
+    )
+    log = Logger(inspire_id=inspire_id)
+
+    vocab_id = get_vocabulary_exact("UNKNOWN", "accelerators", context, log)
+    assert vocab_id is None
+    assert len(context.errors) == 0
+
+
+def test_get_vocabulary_exact_normalizes_case(running_app):
+    """A lowercase term is expected to resolve to the uppercase vocabulary ID."""
+    inspire_id = "12345"
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id=inspire_id
+    )
+    log = Logger(inspire_id=inspire_id)
+
+    vocab_id = get_vocabulary_exact("alice", "experiments", context, log)
+    assert vocab_id == "ALICE"
+    assert len(context.errors) == 0
+
+
+def test_get_vocabulary_exact_normalizes_hyphens(running_app):
+    """A hyphenated term is expected to resolve to the hyphen-free vocabulary ID."""
+    inspire_id = "12345"
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id=inspire_id
+    )
+    log = Logger(inspire_id=inspire_id)
+
+    # "NA-62" normalizes to "NA62" which exists in vocabulary
+    vocab_id = get_vocabulary_exact("NA-62", "experiments", context, log)
+    assert vocab_id == "NA62"
+    assert len(context.errors) == 0
+
+
+def test_get_vocabulary_exact_empty_term(running_app):
+    """An empty-string term yields None."""
+    inspire_id = "12345"
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id=inspire_id
+    )
+    log = Logger(inspire_id=inspire_id)
+
+    vocab_id = get_vocabulary_exact("", "accelerators", context, log)
+    assert vocab_id is None
+    assert len(context.errors) == 0
+
+
+def test_get_vocabulary_exact_none_term(running_app):
+    """A None term yields None."""
+    inspire_id = "12345"
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id=inspire_id
+    )
+    log = Logger(inspire_id=inspire_id)
+
+    vocab_id = get_vocabulary_exact(None, "accelerators", context, log)
+    assert vocab_id is None
+    assert len(context.errors) == 0
+
+
+def test_cern_fields_mapper_accelerator_found(running_app):
+    """Map a record whose institution + accelerator pair is expected to resolve."""
+    record = {
+        "metadata": {
+            "accelerator_experiments": [
+                {"accelerator": "LHC", "institution": "CERN"},
+            ]
+        },
+        "created": "2023-01-01",
+    }
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id="12345"
+    )
+    mapped = CERNFieldsMapper().map_value(record, context, Logger(inspire_id="12345"))
+    accelerators = mapped["cern:accelerators"]
+
+    assert len(accelerators) == 1
+    assert accelerators[0]["id"] == "CERN LHC"
+    assert len(context.errors) == 0
+
+
+def test_cern_fields_mapper_accelerator_not_found(running_app):
+    """Map a record whose accelerator is expected to have no vocabulary match."""
+    record = {
+        "metadata": {
+            "accelerator_experiments": [
+                {"accelerator": "UNKNOWN"},
+            ]
+        },
+        "created": "2023-01-01",
+    }
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id="12345"
+    )
+    mapped = CERNFieldsMapper().map_value(record, context, Logger(inspire_id="12345"))
+    accelerators = mapped["cern:accelerators"]
+
+    assert len(accelerators) == 0
+    assert len(context.errors) == 0
+
+
+def test_cern_fields_mapper_experiment_found(running_app):
+    """Map a record whose experiment is expected to resolve to a vocabulary ID."""
+    record = {
+        "metadata": {
+            "accelerator_experiments": [
+                {"experiment": "ALICE"},
+            ]
+        },
+        "created": "2023-01-01",
+    }
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id="12345"
+    )
+    mapped = CERNFieldsMapper().map_value(record, context, Logger(inspire_id="12345"))
+    experiments = mapped["cern:experiments"]
+
+    assert len(experiments) == 1
+    assert experiments[0]["id"] == "ALICE"
+    assert len(context.errors) == 0
+
+
+def test_cern_fields_mapper_experiment_not_found(running_app):
+    """Map a record whose experiment is expected to have no vocabulary match."""
+    record = {
+        "metadata": {
+            "accelerator_experiments": [
+                {"experiment": "UNKNOWN_EXP"},
+            ]
+        },
+        "created": "2023-01-01",
+    }
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id="12345"
+    )
+    mapped = CERNFieldsMapper().map_value(record, context, Logger(inspire_id="12345"))
+    experiments = mapped["cern:experiments"]
+
+    assert len(experiments) == 0
+    assert len(context.errors) == 0
+
+
+def test_cern_fields_mapper_mixed_results(running_app):
+    """Map two entries and expect one accelerator and one experiment to resolve."""
+    record = {
+        "metadata": {
+            "accelerator_experiments": [
+                {"accelerator": "LHC", "institution": "CERN", "experiment": "ALICE"},
+                {"accelerator": "UNKNOWN", "experiment": "UNKNOWN_EXP"},
+            ]
+        },
+        "created": "2023-01-01",
+    }
+    context = MetadataSerializationContext(
+        resource_type=ResourceType.OTHER, inspire_id="12345"
+    )
+    mapped = CERNFieldsMapper().map_value(record, context, Logger(inspire_id="12345"))
+    accelerators = mapped["cern:accelerators"]
+    experiments = mapped["cern:experiments"]
+
+    assert len(accelerators) == 1
+    assert accelerators[0]["id"] == "CERN LHC"
+
+    assert len(experiments) == 1
+    assert experiments[0]["id"] == "ALICE"
+    assert len(context.errors) == 0