From 888e70b6aaeaf9a0b1c7dcbb2a554ad6e3ea2176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Thu, 19 Mar 2026 17:03:12 +0100 Subject: [PATCH] add(staff-association): new model and transformation rules --- .../transform/models/staff_association.py | 101 ++++++++++++++++++ .../transform/xml_processing/rules/base.py | 15 +++ .../xml_processing/rules/bulletin_issue.py | 2 +- .../xml_processing/rules/staff_association.py | 66 ++++++++++++ cds_migrator_kit/rdm/streams.yaml | 11 ++ setup.cfg | 5 + 6 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 cds_migrator_kit/rdm/records/transform/models/staff_association.py create mode 100644 cds_migrator_kit/rdm/records/transform/xml_processing/rules/staff_association.py diff --git a/cds_migrator_kit/rdm/records/transform/models/staff_association.py b/cds_migrator_kit/rdm/records/transform/models/staff_association.py new file mode 100644 index 00000000..8bab1ac5 --- /dev/null +++ b/cds_migrator_kit/rdm/records/transform/models/staff_association.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2026 CERN. +# +# CDS-RDM is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. 
+
+"""CDS-RDM Staff Association model."""
+from cds_migrator_kit.rdm.records.transform.models.base_record import (
+    rdm_base_record_model,
+)
+from cds_migrator_kit.transform.overdo import CdsOverdo
+
+
+class StaffAssociationModel(CdsOverdo):
+    """Translation model for Staff Association."""
+
+    __query__ = """980__:BULLETINSTAFF
+    -980__:CERN_BULLETIN_ARTICLE
+    -980__:CERN_BULLETIN_ISSUE
+    """
+
+    # Copied from the bulletin issue model; TODO review for staff association
+    __ignore_keys__ = {
+        "0248_a",
+        "0248_p",
+        "0248_q",
+        "100__m",  # email of contributor
+        "110__a",  # corporate author, always CERN, safe to ignore
+        "300__a",  # number of pages
+        "336__a",  # DM metadata
+        "506__m",  # 2120833, ignored with confirmation from IR-ECO-CO
+        "5831_2",  # DM tags 1054836
+        "5831_5",  # DM tags
+        "5831_a",  # DM tags
+        "5831_c",  # DM tags
+        "5831_f",  # DM tags
+        "5831_i",  # DM tags
+        "5831_k",  # DM tags
+        "5831_u",  # DM tags
+        "5831_3",  # DM tags
+        "5831_6",  # DM tags
+        "5831_n",  # DM tags
+        "5831_b",  # DM tags
+        "5831_o",  # DM tags
+        "583__a",  # DM tags
+        "583__c",  # DM tags
+        "583__z",  # DM tags
+        "590__b",  # staff association? value, redundant with language
+        "594__a",  # specifies if the related articles menu has a separator or not (display feature)
+        "650172",  # scheme of subjects
+        "6531_9",  # scheme of keywords
+        "691__a",  # draft/online values, redundant
+        "700__m",  # email of contributor
+        "773__p",  # title of the "CERN Bulletin" series
+        "773__t",  # CERN Bulletin value, redundant
+        "773__y",  # year, duplicate of 260
+        "773__u",  # broken links on record 44920
+        "787__i",  # one referenced record (video in 1755835, 1754359)
+        "859__a",  # empty value
+        "856__q",  # 619830 broken link
+        "8560_f",  # contact email
+        "8564_8",  # file id
+        "8564_s",  # bibdoc id
+        "8564_x",  # icon thumbnails sizes
+        "8564_y",  # file description - done by files dump
+        "8564_2",  # DM metadata
+        "8564_q",  # DM metadata
+        "8564_w",  # DM metadata
+        "8564_z",  # DM metadata
+        "8567_2",  # DM tags
+        "8567_q",  # DM tags
+        "8567_w",  # DM tags
+        "8567_d",  # DM tags
+        "906__m",  # edit rights, will be granted by the community
+        "937__c",  # last modified by
+        "937__s",  # last modification date
+        "960__a",  # base number
+        "961__a",  # CDS modification tag # TODO
+        "961__b",  # CDS modification tag # TODO
+        "961__c",  # CDS modification tag # TODO
+        "961__h",  # CDS modification tag # TODO
+        "961__l",  # CDS modification tag # TODO
+        "961__x",  # CDS modification tag # TODO
+        "981__a",  # duplicate record id
+        "980__b",  # NOTE(review): the ^980__ rule reads subfield b when a is absent
+        # "246_1a",  # not ignored: a ^246_[1_] rule is registered for this model
+        # "690C_a",  # not ignored: a ^690C_ rule is registered for this model
+    }
+
+    _default_fields = {
+        # TODO should we keep this custom field?
+        "custom_fields": {"journal:journal": {"title": "CERN Bulletin"}},
+        "creators": [{"person_or_org": {"type": "organizational", "name": "CERN"}}],
+    }
+
+
+staff_association_model = StaffAssociationModel(
+    bases=(rdm_base_record_model,),
+    entry_point_group="cds_migrator_kit.migrator.rules.staff_association",
+)
diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py
index 9a743263..af5ba292 100644
--- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py
+++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py
@@ -797,10 +797,25 @@ def related_identifiers_787(self, key, value):
             "relation_type": {"id": "references"},
             "resource_type": {"id": "publication-conferencepaper"},
         },
+        "corresponding video": {
+            "relation_type": {"id": "references"},
+            # TODO: do we need new resource type for video?
+            "resource_type": {"id": "audio"},
+        },
+        "manuscript": {
+            "relation_type": {"id": "isderivedfrom"},
+            "resource_type": {"id": "publication-preprint"},
+        },
     }
 
     if recid:
         if description:
+            if description not in relation_map.keys():
+                raise UnexpectedValue(
+                    f"Unexpected relation description {description}",
+                    field=key,
+                    value=value,
+                )
             new_id = {
                 "identifier": recid,
                 "scheme": "cds",
diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/bulletin_issue.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/bulletin_issue.py
index df373cb9..b405f654 100644
--- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/bulletin_issue.py
+++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/bulletin_issue.py
@@ -229,7 +229,7 @@ def urls_bulletin_bis(self, key, value):
 
 
 @model.over("custom_fields_journal", "(^916__)", override=True)
-def issue_number(self, key, value):
+def custom_fields_journal(self, key, value):
     _custom_fields = self.get("custom_fields", {})
 
     issue = value.get("z")
diff --git 
a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/staff_association.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/staff_association.py
new file mode 100644
index 00000000..fa31424c
--- /dev/null
+++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/staff_association.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2026 CERN.
+#
+# CDS-RDM is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+
+"""CDS-RDM Staff Association rules."""
+
+from cds_migrator_kit.errors import UnexpectedValue
+
+from ...models.staff_association import staff_association_model as model
+from .bulletin_issue import (
+    additional_descriptions,
+    additional_titles_bulletin,
+    bull_related_identifiers,
+    bulletin_report_number,
+    collection,
+    creators,
+    custom_fields_journal,  # registered below for ^916__
+    description,
+    imprint_info,
+    issue_number,  # registered below for ^925__
+    journal,
+    rel_identifiers,
+    subjects_bulletin,
+    translated_description,
+    urls_bulletin,
+    urls_bulletin_bis,
+)
+
+# Register the rules imported from bulletin_issue on staff_association_model
+model.over("creators", "^100__", override=True)(creators)
+model.over("additional_titles", "(^246_[1_])", override=True)(
+    additional_titles_bulletin
+)
+model.over("description", "^520__", override=True)(description)
+model.over("collection", "^690C_", override=True)(collection)
+model.over("publication_date", "(^260__)", override=True)(imprint_info)
+model.over("custom_fields", "(^773__)")(journal)
+model.over("additional_descriptions", "(^500__)")(additional_descriptions)
+model.over("additional_descriptions", "(^590__)")(translated_description)
+model.over("subjects", "(^650[12_][7_])|(^6531_)", override=True)(subjects_bulletin)
+model.over("url_identifiers", "^8564_", override=True)(urls_bulletin)
+model.over("urls_bulletin", "^856__")(urls_bulletin_bis)
+model.over("custom_fields_journal", "(^916__)", override=True)(custom_fields_journal)
+model.over("bulletin_report_number", "(^037__)|(^088__)", override=True)(
+    bulletin_report_number
+)
+model.over("custom_fields", "(^925__)")(issue_number)
+model.over("bull_related_identifiers_1", "(^941__)")(bull_related_identifiers)
+model.over("bull_related_identifiers_2", "(^962__)", override=True)(rel_identifiers)
+
+
+@model.over("resource_type", "^980__", override=True)
+def resource_type(self, key, value):
+    """Translate 980 to the resource type; raise UnexpectedValue if unknown."""
+    # Subfield "a" wins whenever it is present; "b" is only the fallback.
+    tag = value.get("a") if "a" in value else value.get("b")
+    normalized = tag.lower() if tag else tag
+    if normalized != "bulletinstaff":
+        raise UnexpectedValue(
+            "Unknown resource type (STAFF ASSOCIATION)", field=key, value=normalized
+        )
+    # TODO what is the resource type?
+    return {"id": "publication-periodicalarticle"}
diff --git a/cds_migrator_kit/rdm/streams.yaml b/cds_migrator_kit/rdm/streams.yaml
index 19c5d2ca..4b67b502 100644
--- a/cds_migrator_kit/rdm/streams.yaml
+++ b/cds_migrator_kit/rdm/streams.yaml
@@ -110,3 +110,14 @@ records:
         - "77617386-632c-4b86-8dd2-68de77ae0018"
     load:
       legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr_restricted/duplicated_pids.json
+  staff_association:
+    data_dir: cds_migrator_kit/rdm/data/staff_association
+    tmp_dir: cds_migrator_kit/rdm/tmp/staff_association
+    log_dir: cds_migrator_kit/rdm/log/staff_association
+    extract:
+      dirpath: cds_migrator_kit/rdm/data/staff_association/dump/
+    transform:
+      files_dump_dir: cds_migrator_kit/rdm/data/staff_association/files/
+      missing_users: cds_migrator_kit/rdm/data/users
+      communities_ids:
+        - "9ab1f6bd-b213-4bb7-9249-13b9665e453a"
diff --git a/setup.cfg b/setup.cfg
index 803a3173..dc030ff2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -81,6 +81,7 @@ cds_migrator_kit.migrator.models =
     te = cds_migrator_kit.rdm.records.transform.models.te:te_model
     en = cds_migrator_kit.rdm.records.transform.models.en:en_model
     annual_rep = cds_migrator_kit.rdm.records.transform.models.annual_report:annual_rep_model
+    staff_association = 
cds_migrator_kit.rdm.records.transform.models.staff_association:staff_association_model cds_migrator_kit.migrator.rules.base = base = cds_migrator_kit.transform.xml_processing.rules.base cds_migrator_kit.migrator.rdm.rules.base = @@ -158,6 +159,10 @@ cds_migrator_kit.migrator.rules.annual_rep = base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base publication = cds_migrator_kit.rdm.records.transform.xml_processing.rules.publications annual_rep = cds_migrator_kit.rdm.records.transform.xml_processing.rules.annual_rep +cds_migrator_kit.migrator.rules.staff_association = + base = cds_migrator_kit.transform.xml_processing.rules.base + base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base + staff_association = cds_migrator_kit.rdm.records.transform.xml_processing.rules.staff_association cds_migrator_kit.migrator.rules.people = people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people invenio_pidstore.minters =