Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cds_migrator_kit/rdm/migration_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,11 @@ def _(x): # needed to avoid start time failure with lazy strings
CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/streams.yaml"

RDM_RECORDS_IDENTIFIERS_SCHEMES = {
"edms": {
"label": _("EDMS"),
"validator": always_valid,
"datacite": "EDMS",
},
"cdsrn": {
"label": _("CDS Reference"),
"validator": always_valid,
Expand Down
69 changes: 69 additions & 0 deletions cds_migrator_kit/rdm/records/transform/models/hse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM HSE model."""
from cds_migrator_kit.rdm.records.transform.models.base_publication_record import (
rdm_base_publication_model,
)
from cds_migrator_kit.transform.overdo import CdsOverdo


class HseModel(CdsOverdo):
    """Translation model for HSE records.

    ``__query__`` selects the legacy records handled by this model,
    ``__ignore_keys__`` lists the MARC fields that are deliberately not
    migrated, and ``_default_fields`` provides values used when the record
    itself does not supply them.
    """

    __query__ = "980__:RP_RESTRICTED OR 595__:CERN-HSE"

    # Keys are grouped by MARC tag, in ascending order.
    __ignore_keys__ = {
        "0247_9",
        "0248_a",
        "0248_p",
        "0248_q",
        # OAI harvest tags
        "035__d",
        "035__h",
        "035__m",
        # TODO: Can we ignore 100__j / 700__j? 2 records: 2808737, 2808721
        "100__j",
        "100__m",
        "300__a",  # number of pages
        "300__c",  # value only '9 p': 2712787, 2789695
        "6531_9",  # keyword scheme
        "700__j",
        "700__m",
        "720__a",  # Author's duplicate
        "7870_r",  # detailed description of record relation (2862345)
        # TODO: Physical Location https://cds.cern.ch/record/184322/export
        "852__c",
        "852__h",
        "8564_8",
        "8564_s",
        "8564_x",
        "8564_y",  # file description - done by files dump
        "8564_z",
        # Citation fields
        # TODO: can we ignore them? https://cds.cern.ch/record/202927/
        "913__c",
        "913__t",
        "913__v",
        "913__y",
        "916__y",  # year of publication, redundant
        "960__a",  # base number
        # TODO: CDS modification tags
        "961__c",
        "961__h",
        "961__l",
        "961__x",
        "981__a",  # duplicate record id
    }

    _default_fields = {
        "resource_type": {"id": "publication-other"},
        "custom_fields": {},
        # TODO: do we need this creator / what should be the value?
        "creators": [
            {"person_or_org": {"type": "organizational", "name": "HR"}},
        ],
    }


# Module-level model instance; setup.cfg exposes it as the ``hse`` entry of
# ``cds_migrator_kit.migrator.models``. It is built on top of the base
# publication model and points at the ``cds_migrator_kit.migrator.rules.hse``
# entry-point group (presumably where its rule modules are discovered --
# confirm against ``CdsOverdo``).
hse_model = HseModel(
    bases=(rdm_base_publication_model,),
    entry_point_group="cds_migrator_kit.migrator.rules.hse",
)
179 changes: 179 additions & 0 deletions cds_migrator_kit/rdm/records/transform/xml_processing/rules/hse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM HSE rules."""

from dojson.errors import IgnoreKey
from dojson.utils import filter_values, force_list

from cds_migrator_kit.errors import UnexpectedValue
from cds_migrator_kit.transform.xml_processing.quality.decorators import (
for_each_value,
)
from cds_migrator_kit.transform.xml_processing.quality.parsers import StringValue
from cds_migrator_kit.transform.xml_processing.rules.base import (
description as base_description,
)

from ...models.hse import hse_model as model
from .base import aleph_number as base_aleph_number
from .base import copyrights as base_copyright
from .base import identifiers as base_identifiers
from .base import licenses as base_licenses
from .base import note as base_note
from .base import report_number as base_report_number
from .base import title as base_title
from .hr import corpo_author
from .it import imprint

# Register rules implemented for other collections on the HSE model instead of
# duplicating them: ``corpo_author`` comes from the HR rules module and
# ``imprint`` from the IT rules module.
model.over("creators", "(^110__)")(corpo_author)
model.over("imprint_info", "(^250__)")(imprint) # Only one record: 156663


@model.over("collection", "^690C_", override=True)
@for_each_value
def collection(self, key, value):
    """Validate the ``690C_`` collection field; it is never migrated.

    :param key: MARC key that matched the rule.
    :param value: field content; subfield ``a`` holds the collection name.
    :raises UnexpectedValue: if subfield ``a`` is missing or, once trimmed
        and lower-cased, is not one of the known collection names.
    :raises IgnoreKey: always, when the value is acceptable, so that nothing
        is written to the output record.
    """
    # A missing subfield used to crash with AttributeError on ``None.strip()``;
    # normalise it to "" so it is reported as an unexpected value instead.
    collection_name = (value.get("a") or "").strip().lower()
    if collection_name not in ("cern", "preprint", "article", "report"):
        raise UnexpectedValue(subfield="a", value=value, field=key)
    raise IgnoreKey("collection")


@model.over("physical_copies", "^964__", override=True)
@for_each_value
def physical_copies(self, key, value):
    """Validate the ``964__`` physical copies field; it is never migrated.

    :param key: MARC key that matched the rule.
    :param value: field content; subfield ``a`` holds the number of copies.
    :raises UnexpectedValue: if subfield ``a`` is non-empty and not one of
        the known zero-padded counts.
    :raises IgnoreKey: always, when the value is empty, missing or known.
    """
    known_counts = ("0001", "0002", "0003", "0004", "0005")
    # The emptiness check below shows an absent value is acceptable, so a
    # missing subfield must not crash on ``None.strip()``.
    number_of_copies = (value.get("a") or "").strip()
    if number_of_copies and number_of_copies not in known_counts:
        raise UnexpectedValue(subfield="a", value=number_of_copies, field=key)
    raise IgnoreKey("physical_copies")


@model.over("physical_medium", "^340__", override=True)
@for_each_value
def physical_medium(self, key, value):
    """Validate the ``340__`` physical medium field; it is never migrated.

    :param key: MARC key that matched the rule.
    :param value: field content; subfield ``a`` holds the medium.
    :raises UnexpectedValue: if subfield ``a`` is non-empty and is not
        ``paper``.
    :raises IgnoreKey: always, when the value is empty, missing or ``paper``.
    """
    # The emptiness check below shows an absent value is acceptable, so a
    # missing subfield must not crash on ``None.strip()``.
    medium = (value.get("a") or "").strip()
    if medium and medium != "paper":
        raise UnexpectedValue(subfield="a", value=medium, field=key)
    raise IgnoreKey("physical_medium")


@model.over("access_note", "^506__", override=True)
@for_each_value
def access_note(self, key, value):
    """Validate the ``506__`` access note field; it is never migrated.

    :param key: MARC key that matched the rule.
    :param value: field content; subfield ``a`` holds the access note.
    :raises UnexpectedValue: if subfield ``a`` is non-empty and is neither
        ``free`` nor ``restricted`` (case-insensitive).
    :raises IgnoreKey: always, when the value is empty, missing or known.
    """
    # The emptiness check below shows an absent value is acceptable, so a
    # missing subfield must not crash on ``None.strip()``.
    note = (value.get("a") or "").strip()
    if note and note.lower() not in ("free", "restricted"):
        raise UnexpectedValue(subfield="a", value=note, field=key)
    raise IgnoreKey("access_note")


@model.over("internal_notes", "^595__")
@for_each_value
def note_hse(self, key, value):
    """Translate ``595__`` notes.

    The HSE Articles query is ``595__:CERN-HSE``: a note consisting solely of
    that marker is not kept as a note but switches the record's resource type
    to ``publication-article``. Every other note is delegated to the base
    note rule.
    """
    marker = ("CERN-HSE",)
    notes = force_list(value.get("a", ""))
    if notes != marker:
        return base_note(self, key, value)

    # Marker-only note: record the resource type and drop the note itself.
    self["resource_type"] = {"id": "publication-article"}
    raise IgnoreKey("internal_notes")


@model.over("resource_type", "^980__", override=True)
def resource_type(self, key, value):
    """Translate the ``980__`` collection tag into a resource type.

    :param key: MARC key that matched the rule.
    :param value: field content; subfield ``a`` holds the legacy type.
    :returns: the resource type dict (``{"id": ...}``) for the legacy type.
    :raises IgnoreKey: if the tag is ``rp_restricted``, which only marks the
        collection and carries no resource type.
    :raises UnexpectedValue: if the tag is missing, empty or not mapped.
    """
    # TODO: what if more than one resource type exists?
    # Built per call so callers always receive a fresh dict.
    resource_types = {
        "conferencepaper": {"id": "publication-conferencepaper"},
        "article": {"id": "publication-article"},
        "preprint": {"id": "publication-preprint"},
        "report": {"id": "publication-report"},
    }

    legacy_type = value.get("a")
    if legacy_type:
        legacy_type = legacy_type.strip().lower()
        if legacy_type == "rp_restricted":
            raise IgnoreKey("resource_type")

    translated = resource_types.get(legacy_type)
    if translated is None:
        raise UnexpectedValue(
            "Unknown resource type (HSE)", field=key, value=legacy_type
        )
    return translated


@model.over("identifiers", "(^035__)|(^037__)|(^088__)|(^970__)", override=True)
@for_each_value
def identifiers(self, key, value):
    """Translate identifiers from ``035__``, ``037__``, ``088__`` and ``970__``.

    ``035__`` entries whose scheme (subfield ``9``) is ``edms`` are emitted
    directly as EDMS identifiers; every other case is delegated to the
    matching base rule and the first identifier it produces is returned.

    :param key: MARC key that matched the rule.
    :param value: field content.
    :returns: a single identifier dict.
    :raises UnexpectedValue: if an EDMS entry has no identifier in
        subfield ``a``.
    :raises IgnoreKey: if the delegated rule produced no identifier.
    """
    # Stays None if no branch below produces a value, so the final check
    # cannot hit an unbound name.
    new_id = None

    if key == "035__":
        scheme = StringValue(value.get("9", "")).parse().lower()
        if scheme == "edms":
            # Subfield ``a`` is only read here, where it is needed, so a
            # missing ``a`` on other schemes is left to the base rule rather
            # than crashing on ``None.strip()``.
            identifier = (value.get("a") or "").strip()
            if not identifier:
                raise UnexpectedValue(subfield="a", value=value, field=key)
            # TODO: for edms do we need scheme type or we can use url?
            # TODO: check relation and resource type
            return {"identifier": identifier, "scheme": "edms"}
        new_id = base_identifiers(self, key, value)
    elif key in ("037__", "088__"):
        # TODO: two records have 'b' value: 2041604, 2040156
        new_id = base_report_number(self, key, value)
    elif key == "970__":
        new_id = base_aleph_number(self, key, value)

    if new_id:
        return new_id[0]
    raise IgnoreKey("identifiers")


@model.over("rights", "^540__", override=True)
@for_each_value
@filter_values
def licenses(self, key, value):
    """Translate ``540__`` into rights.

    Subfield ``3`` (the material the licence applies to) may only be
    ``publication`` (case-insensitive) when present; anything else is
    reported as unexpected. The translation itself is done by the base
    licence rule, of which the first result is returned.
    """
    applies_to = value.get("3")
    if applies_to and applies_to.lower() != "publication":
        raise UnexpectedValue(subfield="3", value=applies_to, field=key)
    return base_licenses(self, key, value)[0]


@model.over("copyright", "^542__", override=True)
def copyrights(self, key, value):
    """Translate ``542__`` into copyright.

    Subfield ``3`` (the material the statement applies to) may only be
    ``publication`` (case-insensitive) when present; the translation itself
    is delegated to the base copyright rule.
    """
    applies_to = value.get("3")
    is_acceptable = not applies_to or applies_to.lower() == "publication"
    if not is_acceptable:
        raise UnexpectedValue(subfield="3", value=applies_to, field=key)
    return base_copyright(self, key, value)


@model.over("description", "^520__", override=True)
def description(self, key, value):
    """Translate ``520__`` into the description.

    Subfield ``9`` may only be ``JACoW`` or ``submitter`` when present;
    any other value is reported as unexpected. The translation itself is
    delegated to the base description rule.
    """
    known_sources = ("JACoW", "submitter")
    source = value.get("9")
    if source and source not in known_sources:
        raise UnexpectedValue(subfield="9", value=source, field=key)
    return base_description(self, key, value)


@model.over("title", "^245__", override=True)
def title(self, key, value):
    """Translate ``245__`` into the title.

    Subfield ``9`` may only be ``submitter`` when present; any other value is
    reported as unexpected. The translation itself is delegated to the base
    title rule.
    """
    source = value.get("9")
    if source and source != "submitter":
        raise UnexpectedValue(subfield="9", value=source, field=key)
    return base_title(self, key, value)
11 changes: 11 additions & 0 deletions cds_migrator_kit/rdm/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,14 @@ records:
- "77617386-632c-4b86-8dd2-68de77ae0018"
load:
legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr_restricted/duplicated_pids.json
hse:
data_dir: cds_migrator_kit/rdm/data/hse
tmp_dir: cds_migrator_kit/rdm/tmp/hse
log_dir: cds_migrator_kit/rdm/log/hse
extract:
dirpath: cds_migrator_kit/rdm/data/hse/dump/
transform:
files_dump_dir: cds_migrator_kit/rdm/data/hse/files/
missing_users: cds_migrator_kit/rdm/data/users
communities_ids:
- "0523fcda-5ac3-4b22-b464-5be8bfa16000"
5 changes: 5 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ cds_migrator_kit.migrator.models =
it = cds_migrator_kit.rdm.records.transform.models.it:it_model
it_meetings = cds_migrator_kit.rdm.records.transform.models.it_meetings:it_meetings_model
hr = cds_migrator_kit.rdm.records.transform.models.hr:hr_model
hse = cds_migrator_kit.rdm.records.transform.models.hse:hse_model
sy = cds_migrator_kit.rdm.records.transform.models.sy:sy_model
te = cds_migrator_kit.rdm.records.transform.models.te:te_model
en = cds_migrator_kit.rdm.records.transform.models.en:en_model
Expand Down Expand Up @@ -139,6 +140,10 @@ cds_migrator_kit.migrator.rules.hr =
base = cds_migrator_kit.transform.xml_processing.rules.base
base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base
hr = cds_migrator_kit.rdm.records.transform.xml_processing.rules.hr
cds_migrator_kit.migrator.rules.hse =
base = cds_migrator_kit.transform.xml_processing.rules.base
base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base
hse = cds_migrator_kit.rdm.records.transform.xml_processing.rules.hse
cds_migrator_kit.migrator.rules.sy =
base = cds_migrator_kit.transform.xml_processing.rules.base
base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base
Expand Down
Loading