Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions site/cds_rdm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,23 @@

CDS_ILS_IMPORTER_API_KEY = "CHANGE_ME"
"""API key for the CLC importer. This is a placeholder and should be replaced with a real key."""

# =============================================================================
# INSPIRE Harvester - Vocabulary Mappings
# =============================================================================

CDS_INSPIRE_ACCELERATOR_MAPPINGS = {
# Mappings from INSPIRE accelerator names to CDS vocabulary IDs
# Most INSPIRE accelerators match CDS exactly (e.g., "CERN LHC", "CERN SPS")
# Add mappings here only for special cases where names differ
}
"""INSPIRE to CDS accelerator vocabulary mappings."""

CDS_INSPIRE_EXPERIMENT_MAPPINGS = {
Copy link
Copy Markdown
Contributor

@kpsherva kpsherva Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we instead just normalise the term on the fly? what I see here is that we are uppercasing and removing hyphens... I find it troublesome to have to maintain this list when we can normalise the name before performing a search (same comment for the config above)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm, I understand that we want to do exact matches, and just added the list for the cases where we do not make the exact matches. To remove the mapping I guess we could first search the term as it is, or if it fails we search with the value normalized (so that it covers the more generic values), does this work?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the list, I don't think we can normalize AMS to AMS-RE1 or NA-062 to NA62

I was going to suggest to use the props field but it only supports 1 string value :(

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed IRL, agreed to keep it simple, search for the value and if not found normalize it, search again, if not found simply log the error message with the failed value, and remove the mapping. If the value is not found it means is wrong, it should be updated in the source.

# Mappings from INSPIRE experiment names to CDS vocabulary IDs
"LHCb": "LHCB",
"AMS": "AMS-RE1",
"NA-62": "NA62",
"NA-062": "NA62",
}
"""INSPIRE to CDS experiment vocabulary mappings."""
40 changes: 14 additions & 26 deletions site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from idutils.normalizers import normalize_isbn

from cds_rdm.inspire_harvester.transform.mappers.mapper import MapperBase
from cds_rdm.inspire_harvester.utils import search_vocabulary
from cds_rdm.inspire_harvester.utils import get_vocabulary_exact


@dataclass(frozen=True)
Expand Down Expand Up @@ -65,41 +65,29 @@ def map_value(self, src_record, ctx, logger):
acc_exp_list = src_metadata.get("accelerator_experiments", [])
_accelerators = []
_experiments = []

for item in acc_exp_list:
accelerator = item.get("accelerator")
experiment = item.get("experiment")
institution = item.get("institution")

if accelerator:
logger.debug(
f"Searching vocabulary 'accelerator' for term: '{accelerator}'"
)
if institution:
accelerator = f"{institution} {accelerator}"
else:
accelerator = f"{accelerator}"
result = search_vocabulary(accelerator, "accelerators", ctx, logger)
if result.total == 1:
logger.info(f"Found accelerator '{accelerator}'")
hit = list(result.hits)[0]
_accelerators.append({"id": hit["id"]})
accelerator_term = f"{institution} {accelerator}"
else:
logger.warning(
f"Accelerator '{accelerator}' not found."
)
accelerator_term = accelerator

vocab_id = get_vocabulary_exact(
accelerator_term, "accelerators", ctx, logger
)
if vocab_id:
_accelerators.append({"id": vocab_id})

if experiment:
logger.debug(
f"Searching vocabulary 'experiments' for term: '{experiment}'"
vocab_id = get_vocabulary_exact(
experiment, "experiments", ctx, logger
)
result = search_vocabulary(experiment, "experiments", ctx, logger)
if result.total == 1:
logger.info(f"Found experiment '{experiment}'")
hit = list(result.hits)[0]
_experiments.append({"id": hit["id"]})
else:
logger.warning(
f"Experiment '{accelerator}' not found."
)
if vocab_id:
_experiments.append({"id": vocab_id})

return {"cern:accelerators": _accelerators, "cern:experiments": _experiments}
40 changes: 40 additions & 0 deletions site/cds_rdm/inspire_harvester/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,43 @@ def search_vocabulary(term, vocab_type, ctx, logger):
f"Vocabulary term ['{term}'] not found in '{vocab_type}'. INSPIRE#: {ctx.inspire_id}"
)
raise e


def _search_vocabulary_id(service, term, vocab_type):
"""Search vocabulary by exact ID match, returning the ID or None."""
search_term = f'"{term}"' if "/" in term else term
result = service.search(system_identity, type=vocab_type, q=f'id:"{search_term}"')
if result.total == 1:
return list(result.hits)[0]["id"]
return None


def get_vocabulary_exact(term, vocab_type, ctx, logger):
"""Get vocabulary ID by exact match, with fallback to normalized term."""
if not term:
return None

service = current_service_registry.get("vocabularies")

try:
vocab_id = _search_vocabulary_id(service, term, vocab_type)
if vocab_id:
return vocab_id

# Fallback: normalize (uppercase + strip hyphens) and search again
normalized = term.upper().replace("-", "")
if normalized != term:
vocab_id = _search_vocabulary_id(service, normalized, vocab_type)
if vocab_id:
return vocab_id

logger.warning(
f"Vocabulary term '{term}' not found in '{vocab_type}'."
)
return None

except Exception as e:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor nit: do we really need to catch separately? 😅

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RIght, collapesed into a single one

logger.error(
f"Failed vocabulary search for '{term}' in '{vocab_type}'. Error: {e}."
)
return None
11 changes: 10 additions & 1 deletion site/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ def experiments_type_v(app, experiments_type):
},
)

vocab = vocabulary_service.create(
vocabulary_service.create(
system_identity,
{
"id": "ATLAS",
Expand All @@ -654,6 +654,15 @@ def experiments_type_v(app, experiments_type):
},
)

vocab = vocabulary_service.create(
system_identity,
{
"id": "NA62",
"title": {"en": "NA62"},
"type": "experiments",
},
)

Vocabulary.index.refresh()

return vocab
Expand Down
8 changes: 8 additions & 0 deletions site/tests/inspire_harvester/test_harvester_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,14 @@
"description": "In the present study the possibility of measuring the lifetime of the positively charged Kaon , K+, is investigated , by using data and framework produced by the experiment NA62 of the European Organization for Nuclear Research (CERN).",
},
"custom_fields": {
'cern:experiments': [
{
'id': 'NA62',
'title': {
'en': 'NA62',
},
},
],
'cern:programmes': {
'id': 'None',
'title': {
Expand Down
200 changes: 200 additions & 0 deletions site/tests/inspire_harvester/test_vocabulary_matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the GPL-2.0 License; see LICENSE file for more details.

"""Tests for vocabulary exact matching functionality."""
from cds_rdm.inspire_harvester.logger import Logger
from cds_rdm.inspire_harvester.transform.context import MetadataSerializationContext
from cds_rdm.inspire_harvester.transform.mappers.custom_fields import CERNFieldsMapper
from cds_rdm.inspire_harvester.transform.resource_types import ResourceType
from cds_rdm.inspire_harvester.utils import get_vocabulary_exact


def test_get_vocabulary_exact_found(running_app):
"""Test get_vocabulary_exact with a term that exists in vocabulary."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact("CERN LHC", "accelerators", ctx, logger)

assert result == "CERN LHC"
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_not_found(running_app):
"""Test get_vocabulary_exact with a term not in vocabulary."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact("UNKNOWN", "accelerators", ctx, logger)

assert result is None
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_normalizes_case(running_app):
"""Test get_vocabulary_exact normalizes case before lookup."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact("alice", "experiments", ctx, logger)

assert result == "ALICE"
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_normalizes_hyphens(running_app):
"""Test get_vocabulary_exact strips hyphens before lookup."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

# "NA-62" normalizes to "NA62" which exists in vocabulary
result = get_vocabulary_exact("NA-62", "experiments", ctx, logger)

assert result == "NA62"
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_empty_term(running_app):
"""Test get_vocabulary_exact with empty term."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact("", "accelerators", ctx, logger)

assert result is None
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_none_term(running_app):
"""Test get_vocabulary_exact with None term."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact(None, "accelerators", ctx, logger)

assert result is None
assert len(ctx.errors) == 0


def test_cern_fields_mapper_accelerator_found(running_app):
"""Test CERNFieldsMapper with accelerator that exists in vocabulary."""
src_metadata = {
"accelerator_experiments": [
{"accelerator": "LHC", "institution": "CERN"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:accelerators"]) == 1
assert result["cern:accelerators"][0]["id"] == "CERN LHC"
assert len(ctx.errors) == 0


def test_cern_fields_mapper_accelerator_not_found(running_app):
"""Test CERNFieldsMapper with accelerator not in vocabulary."""
src_metadata = {
"accelerator_experiments": [
{"accelerator": "UNKNOWN"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:accelerators"]) == 0
assert len(ctx.errors) == 0


def test_cern_fields_mapper_experiment_found(running_app):
"""Test CERNFieldsMapper with experiment that exists in vocabulary."""
src_metadata = {
"accelerator_experiments": [
{"experiment": "ALICE"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:experiments"]) == 1
assert result["cern:experiments"][0]["id"] == "ALICE"
assert len(ctx.errors) == 0


def test_cern_fields_mapper_experiment_not_found(running_app):
"""Test CERNFieldsMapper with experiment not in vocabulary."""
src_metadata = {
"accelerator_experiments": [
{"experiment": "UNKNOWN_EXP"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:experiments"]) == 0
assert len(ctx.errors) == 0


def test_cern_fields_mapper_mixed_results(running_app):
"""Test CERNFieldsMapper with some found and some not found."""
src_metadata = {
"accelerator_experiments": [
{"accelerator": "LHC", "institution": "CERN", "experiment": "ALICE"},
{"accelerator": "UNKNOWN", "experiment": "UNKNOWN_EXP"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:accelerators"]) == 1
assert result["cern:accelerators"][0]["id"] == "CERN LHC"

assert len(result["cern:experiments"]) == 1
assert result["cern:experiments"][0]["id"] == "ALICE"

assert len(ctx.errors) == 0
Loading