Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions cds_migrator_kit/rdm/records/transform/models/mous.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2025 CERN.
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM CMS note model."""
"""CDS-RDM MoUs model."""
from cds_migrator_kit.rdm.records.transform.models.base_record import (
rdm_base_record_model,
)
Expand All @@ -17,11 +17,19 @@ class MOUSModel(CdsOverdo):

__query__ = "980__:MOUS"

__ignore_keys__ = {}
__ignore_keys__ = {
"8564_8",
"8564_s",
"8564_x",
"8564_y", # file description - done by files dump
"8564_z",
}

_default_fields = {
"resource_type": {"id": "publication-other"},
"custom_fields": {},
"dates": [],
"creators": [{"person_or_org": {"type": "organizational", "name": "CERN"}}],
}


Expand Down
189 changes: 189 additions & 0 deletions cds_migrator_kit/rdm/records/transform/xml_processing/rules/mous.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM MoUs rules."""

import re
from datetime import datetime

from dojson.errors import IgnoreKey
from dojson.utils import for_each_value

from cds_migrator_kit.errors import UnexpectedValue
from cds_migrator_kit.transform.xml_processing.quality.parsers import StringValue

from ...models.mous import mous_model as model
from .it import corporate_author

# Reuse the `corporate_author` rule imported from `.it`: register it on this
# model for fields matching "^110__", under the "creators" key.
model.over("creators", "^110__")(corporate_author)


@model.over("collection", "^690C_")
@for_each_value
def collection(self, key, value):
    """Validate the document type found in subfield ``a`` of field 690C_.

    The value is only checked against the known document types; nothing is
    written to the record.

    :param key: matched field tag, reported in raised errors.
    :param value: subfields of the field; only ``a`` is read.
    :raises UnexpectedValue: if subfield ``a`` is missing or is not one of
        the known document types.
    :raises IgnoreKey: always, once validation passes, so that no
        ``collection`` key is produced.
    """
    known_types = {
        "signatory",
        "addendum",
        "amendment",
        "initial",
        "letter",
        "tracking list",
        "signatory page",
        "select:",
        "amendment to mou",
        "initial mou",
        "mou information",
        "information",
    }
    # A missing subfield is normalised to "" so that it is reported as an
    # UnexpectedValue instead of failing with AttributeError on None.
    collection_type = (value.get("a") or "").strip().lower()
    # TODO: clarify what these values mean and whether they can be migrated.
    if collection_type not in known_types:
        raise UnexpectedValue(subfield="a", value=value, field=key)
    raise IgnoreKey("collection")


@model.over("dates", "^925__")
@for_each_value
def dates(self, key, value):
    """Translate field 925__ into entries of ``self["dates"]``.

    Subfield ``a`` is stored with type ``created`` and subfield ``b`` with
    type ``issued``. Each date is normalised to ISO ``YYYY-MM-DD``; values
    that are only ``YYYY`` or ``YYYY-MM`` are kept as they are.

    :param key: matched field tag, reported in raised errors.
    :param value: subfields of the field; ``a`` and ``b`` are read.
    :raises UnexpectedValue: if neither subfield holds a value, or if a
        value matches none of the supported date formats.
    :raises IgnoreKey: always, after ``self["dates"]`` has been updated.
    """

    def parse_date(date_str, subfield):
        """Normalise ``date_str``; report ``subfield`` when it is unsupported."""
        date_str = date_str.strip()
        try:
            return datetime.fromisoformat(date_str).date().isoformat()
        except ValueError:
            pass  # not accepted by fromisoformat; try the explicit formats

        for fmt in ("%d/%m/%Y", "%Y/%m/%d", "%d-%m-%Y", "%Y-%m-%d", "%d.%m.%Y"):
            try:
                return datetime.strptime(date_str, fmt).date().isoformat()
            except ValueError:
                continue

        # Reduced-precision dates (year, or year and month) pass through.
        if re.fullmatch(r"\d{4}(-\d{2})?", date_str):
            return date_str

        # Name the offending subfield so the error points at "a" or "b".
        raise UnexpectedValue(
            f"Unsupported date format: {date_str}",
            field=key,
            subfield=subfield,
            value=value,
        )

    # Subfield code -> date type id.
    # TODO: confirm which type should be used for the signature date ("b").
    date_types = (("a", "created"), ("b", "issued"))

    record_dates = self.get("dates", [])
    found_any = False
    for subfield, type_id in date_types:
        raw_date = value.get(subfield)
        if not raw_date:
            continue
        found_any = True
        entry = {"date": parse_date(raw_date, subfield), "type": {"id": type_id}}
        # Same duplicate check the identifier rules in this module apply.
        if entry not in record_dates:
            record_dates.append(entry)

    if not found_any:
        raise UnexpectedValue(subfield="a", value=value, field=key)
    self["dates"] = record_dates
    raise IgnoreKey("dates")


@model.over("parent_mou", "(^773__)")
def parent_mou(self, key, value):
    """Translate field 773__ into ``ispartof`` related identifiers.

    The parent record id (subfield ``w``) is mandatory and is stored with
    scheme ``cds``. The parent report number (subfield ``r``) is optional
    and, when present, is stored with scheme ``cdsrn``. Entries already in
    ``self["related_identifiers"]`` are not added twice.

    :param key: matched field tag, reported in raised errors.
    :param value: subfields of the field; ``w`` and ``r`` are read.
    :raises UnexpectedValue: if subfield ``w`` is missing or blank.
    :raises IgnoreKey: always, after ``self["related_identifiers"]`` has
        been updated.
    """
    parent_recid = value.get("w", "").strip()
    parent_report_number = value.get("r", "").strip()
    if not parent_recid:
        # The record id comes from subfield "w", so that is the one reported.
        raise UnexpectedValue(subfield="w", value=value, field=key)

    # TODO: confirm that related identifiers are the right place for these.
    # The report number goes first, then the record id.
    parents = []
    if parent_report_number:
        parents.append(("cdsrn", parent_report_number))
    parents.append(("cds", parent_recid))

    rel_ids = self.get("related_identifiers", [])
    for scheme, identifier in parents:
        new_id = {
            "scheme": scheme,
            "identifier": identifier,
            "relation_type": {"id": "ispartof"},
            "resource_type": {"id": "publication-other"},
        }
        if new_id not in rel_ids:
            rel_ids.append(new_id)
    self["related_identifiers"] = rel_ids
    raise IgnoreKey("parent_mou")


@model.over("funding", "(^536__)")
def funding(self, key, value):
    """Drop field 536__ (funding) without migrating anything from it.

    :param key: matched field tag (unused).
    :param value: subfields of the field (unused).
    :raises IgnoreKey: always; no ``funding`` key is produced.
    """
    # TODO: decide how to transform this field when only subfield "a"
    # (the programme) is available; until then it is ignored.
    raise IgnoreKey("funding")


@model.over("official_reference", "^036__")
@for_each_value
def official_reference(self, key, value):
    """Store the official reference of field 036__ as a ``cdsrn`` identifier.

    Subfield ``a`` holds the reference. Subfield ``n`` may be empty or
    "internal reference" (case-insensitive); anything else is rejected.
    A reference already in ``self["identifiers"]`` is not added twice.

    :param key: matched field tag, reported in raised errors.
    :param value: subfields of the field; ``a`` and ``n`` are read.
    :raises UnexpectedValue: for an unknown ``n`` value or a blank ``a``.
    :raises IgnoreKey: always, after ``self["identifiers"]`` has been updated.
    """
    # TODO: example record: https://cds.cern.ch/record/2774726/export/xm
    reference = value.get("a", "").strip()
    note = value.get("n", "").strip().lower()

    # Validation order matters: the note is checked before the reference.
    if note not in ("", "internal reference"):
        raise UnexpectedValue(field=key, value=value, subfield="n")
    if reference == "":
        raise UnexpectedValue(subfield="a", value=value, field=key)

    record_ids = self.get("identifiers", [])
    entry = {"scheme": "cdsrn", "identifier": reference}
    if entry not in record_ids:
        record_ids.append(entry)
    self["identifiers"] = record_ids
    raise IgnoreKey("identifiers")


@model.over("identifiers", "(^037__)|(^970__)", override=True)
@for_each_value
def identifiers(self, key, value):
    """Translate fields 037__ and 970__.

    037__: subfield ``a`` is stored in ``self["identifiers"]`` with scheme
    ``cdsrn`` unless it is already there. Subfield ``n`` may be empty,
    "internal reference" or "resources review boards"; subfield ``9`` must
    be empty.

    970__: subfield ``9`` must be "cern sharepoint" (case-insensitive). The
    field is validated only; nothing is stored for it yet.

    :param key: matched field tag, selects the branch and is reported in
        raised errors.
    :param value: subfields of the field.
    :raises UnexpectedValue: when any of the checks above fails.
    :raises IgnoreKey: always, once the field has been handled.
    """
    if key == "037__":
        identifier = value.get("a", "").strip()

        n_value = value.get("n", "").strip().lower()
        allowed_notes = ("internal reference", "resources review boards")
        if n_value and n_value not in allowed_notes:
            raise UnexpectedValue(field=key, value=value, subfield="n")

        # No scheme is expected on these report numbers.
        scheme = StringValue(value.get("9", "")).parse().lower()
        if scheme:
            raise UnexpectedValue(field=key, value=value, subfield="9")
        if not identifier:
            raise UnexpectedValue(subfield="a", value=value, field=key)

        # Compare whole identifier dicts: the stored entries are dicts, so
        # testing the bare string against the list could never match.
        existing_ids = self.get("identifiers", [])
        new_id = {"scheme": "cdsrn", "identifier": identifier}
        if new_id not in existing_ids:
            existing_ids.append(new_id)
        self["identifiers"] = existing_ids

    elif key == "970__":
        value_9 = value.get("9", "").strip().lower()
        if value_9 != "cern sharepoint":
            raise UnexpectedValue(field=key, value=value, subfield="9")
        # TODO: decide how to use subfield "a" (presumably a SharePoint id,
        # to be confirmed); it is not migrated for now.

    raise IgnoreKey("identifiers")
11 changes: 11 additions & 0 deletions cds_migrator_kit/rdm/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,14 @@ records:
- "77617386-632c-4b86-8dd2-68de77ae0018"
load:
legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr_restricted/duplicated_pids.json
mous:
data_dir: cds_migrator_kit/rdm/data/mous
tmp_dir: cds_migrator_kit/rdm/tmp/mous
log_dir: cds_migrator_kit/rdm/log/mous
extract:
dirpath: cds_migrator_kit/rdm/data/mous/dump/
transform:
files_dump_dir: cds_migrator_kit/rdm/data/mous/files/
missing_users: cds_migrator_kit/rdm/data/users
communities_ids:
- "0523fcda-5ac3-4b22-b464-5be8bfa16000"
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ cds_migrator_kit.migrator.rules.thesis =
cds_migrator_kit.migrator.rules.mous =
base = cds_migrator_kit.transform.xml_processing.rules.base
base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base
mous = cds_migrator_kit.rdm.records.transform.xml_processing.rules.mous
cds_migrator_kit.migrator.rules.bulletin_issue =
base = cds_migrator_kit.transform.xml_processing.rules.base
base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base
Expand Down
Loading