From 6728b458958995304e89e9593152accb14975f01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Wed, 18 Mar 2026 15:01:23 +0100 Subject: [PATCH] videos: add new curation fields for digitized records --- .../transform/models/video_lecture.py | 2 +- .../transform/transform.py | 29 +++- .../transform/xml_processing/rules/base.py | 43 ++++- .../xml_processing/rules/video_lecture.py | 159 +++++++++++++++++- setup.cfg | 3 +- 5 files changed, 214 insertions(+), 22 deletions(-) diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py index 2f8a8eda..96e9ae6a 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/video_lecture.py @@ -27,7 +27,7 @@ class VideoLecture(CdsOverdo): """Translation Index for CERN Video Lectures.""" - __query__ = "8567_.x:'Absolute master path' 8567_.d:/mnt/master_share* -980__.C:MIGRATED -980__.c:DELETED -5831_.a:digitized" + __query__ = "8567_.x:'Absolute master path' 8567_.d:/mnt/master_share* -980__.C:MIGRATED -980__.c:DELETED" __ignore_keys__ = base_model.__ignore_keys__ # IMPLEMENTED diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py index 6fe46173..52fc6c65 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py @@ -300,7 +300,7 @@ def guess_dates(json_data, key, subkey=None): ### Returns: - `set[str]`: A set of date strings. """ - items = json_data.get(key, []) + items = get_values_in_json(json_data, key, type=list) if subkey: return { item[subkey]["date"] @@ -345,7 +345,10 @@ def reformat_date(json_data): ] return None - + # Check imprint date in 260 (digitized records has it) + imprint_dates = get_values_in_json(json_data, "imprint_date") + if len(imprint_dates) == 1: + return list(imprint_dates) raise MissingRequiredField( f"No valid date found in record: {json_data.get('recid')}.", stage="transform", @@ -353,9 +356,19 @@ def reformat_date(json_data): def description(json_data): """Reformat the description for the cds-videos data model.""" - if not json_data.get("description"): + candidates = json_data.get("description") or [] + values = [v.strip() for v in candidates if v and v.strip()] + if len(values) > 1: + raise UnexpectedValue( + field="description", + value=values, + message=f"Multiple description values found: {values}" + ) + + if values: + return values[0] + else: return json_data.get("title").get("title") - return json_data.get("description") def format_contributors(json_data): """ @@ -394,7 +407,7 @@ def publication_date(json_data): def notes(json_data): """Get the notes.""" - notes = entry.get("notes") + notes = get_values_in_json(json_data, "notes", type=list) if notes: note_strings = [note.get("note") for note in notes] return "\n".join(note_strings) @@ -459,7 +472,7 @@ def get_report_number(json_data): def get_keywords(json_data): """Return keywords.""" keywords = json_data.get("keywords", []) - subject_categories = json_data.get("subject_categories", []) + subject_categories = get_values_in_json(json_data, "subject_categories", type=list) subject_indicators = json_data.get("subject_indicators", []) all_keywords = [ @@ -578,7 +591,7 @@ def get_curation(json_data): """Return _curation.""" _curation = json_data.get("_curation", {}) # Add volumes - additional_titles = json_data.get("additional_titles", []) + additional_titles = get_values_in_json(json_data, "additional_titles", type=list) volumes = [item["volume"] for item in additional_titles if "volume" in item] if volumes: _curation["volumes"] = volumes @@ -594,7 +607,7 @@ def get_curation(json_data): def get_additional_titles(json_data): """Return additional_titles.""" - tag_246 = json_data.get("additional_titles", {}) + tag_246 = get_values_in_json(json_data, "additional_titles", type=list) _titles = [item for item in tag_246 if "title" in item] additional_titles = [] for title_item in _titles: diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py index 57369455..6d971e05 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py @@ -26,6 +26,25 @@ from ...models.base import model from ..quality.contributors import get_contributor +from ..quality.curation import transform_subfields + + +def append_transformed_subfields(self, key, value, field_name, subfield_name=None): + """Helper to append transformed subfields to a curation field.""" + curation = self["_curation"] + transformed = transform_subfields(key, value) + + if subfield_name: + existing_values = curation.setdefault(field_name, {}) + legacy_field = existing_values.get(subfield_name, []) + legacy_field.extend(transformed) + if legacy_field: + curation[field_name][subfield_name] = legacy_field + else: + existing_values = curation.get(field_name, []) + existing_values.extend(transformed) + if existing_values: + curation[field_name] = existing_values @model.over("legacy_recid", "^001") @@ -46,10 +65,15 @@ def title(self, key, value): @model.over("description", "^520__") +@for_each_value def description(self, key, value): """Translates description.""" description_text = StringValue(value.get("a")).parse() - + provenance = value.get("9", "").strip() + curation_info = value.get("8", "").strip() + if curation_info or provenance: + append_transformed_subfields(self, key, value, "legacy_marc_fields", "520") + return None return description_text @@ -71,7 +95,16 @@ def languages(self, key, value): if not langs: raise MissingRequiredField(field=key, subfield="a", value=raw_lang) - + provenance = value.get("9", "").strip() + if provenance and provenance not in ["review", "CERN QA"]: + # checking if anything else stored in this field + raise UnexpectedValue(field=key, subfield="9", value=provenance) + # TODO: we need to add for_each_value to this rule but we need to keep the same behavior + # curation_field = value.get("8", "").strip() + # if curation_field: + # append_transformed_subfields(self, key, value, "legacy_marc_fields", "041") + + self["additional_languages"].extend(langs[1:]) return langs[0] @@ -145,10 +178,12 @@ def keywords(self, key, value): """Translates keywords from tag 6531.""" keyword = value.get("a", "").strip() provenance = value.get("9", "").strip() - if provenance and provenance not in ["CERN", "review"]: + if provenance and provenance not in ["CERN", "review", "review Mar2021", "CERN QA"]: # checking if anything else stored in this field raise UnexpectedValue(field=key, subfield="9", value=provenance) - + curation_field = value.get("8", "").strip() + if curation_field: + append_transformed_subfields(self, key, value, "legacy_marc_fields", "653") if keyword: return {"name": keyword} diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py index 34555fef..bdcf5dce 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py @@ -85,14 +85,16 @@ def imprint(self, key, value): ) if producer: self["contributors"].append({"name": producer, "role": "Producer"}) - + provenance = value.get("9", "").strip() + if provenance: + append_transformed_subfields(self, key, value, "legacy_marc_fields", "269") date_field = value.get("c") # 269 'c' subfield (e.g., '1993-08-09') parsed_date = parse_date(date_field) if parsed_date: # If parsing succeeds, return the formatted date return parsed_date -@model.over("_", "^260__") +@model.over("imprint_date", "^260__") @for_each_value def tag260(self, key, value): """Check tag 269.""" @@ -103,13 +105,20 @@ def validate_location(val, subfield): field=key, subfield=subfield, value=val, message="Unexpected tag 260" ) - date_field = value.get("c") # more detailed in 269__c, drop name = value.get("b") place = value.get("a") + curation_info = value.get("8", "").strip() + provenance = value.get("9", "").strip() + if provenance or curation_info: + append_transformed_subfields(self, key, value, "legacy_marc_fields", "260") + + date_field = value.get("c") + parsed_date = parse_date(date_field) + if parsed_date: # If parsing succeeds, return the formatted date + return parsed_date validate_location(place, "a") validate_location(name, "b") - IgnoreKey("_") @model.over("contributors", "^511__") @@ -117,6 +126,10 @@ def validate_location(val, subfield): @require(["a"]) def related_person(self, key, value): """Translates related person.""" + record = value.get("0") + if record: + append_transformed_subfields(self, key, value, "legacy_marc_fields", "511") + return None role = value.get("e", "").strip().lower() contributor_role = "" if role else "RelatedPerson" return get_contributor(key, value, contributor_role=contributor_role) @@ -157,6 +170,7 @@ def format_field(val): "nonpublic_note": format_field(value.get("x")), "md5_checksum": format_field(value.get("w")), "source": format_field(value.get("2")), + "description": format_field(value.get("i")), }.items() if v } @@ -206,6 +220,10 @@ def format_field(val): @require(["a"]) def notes(self, key, value): """Detects notes.""" + curation_field = value.get("8", "").strip() + if curation_field: + append_transformed_subfields(self, key, value, "legacy_marc_fields", "500") + return None note_str = value.get("a").strip() if value.get("9"): note_str = value.get("9").strip() + " : " + value.get("a").strip() @@ -408,15 +426,19 @@ def corporate_author(self, key, value): return None -@model.over("subject_indicators", "^690C_") +@model.over("subject_indicators", "(^690C_)|(^690c_)") @for_each_value def subject_indicators(self, key, value): """Translates subject_indicators as keywords from tag 690C.""" subject = value.get("a", "").strip() if subject: - if subject not in ["ACAD", "CERN", "TALK", "movingimages", "SSLP", "reviewed"]: + if subject not in ["ACAD", "CERN", "TALK", "movingimages", "SSLP", "reviewed", "quality-controlled"]: # checking if anything else stored in this field raise UnexpectedValue(field=key, subfield="a", value=subject) + curated_field = value.get("9", "").strip() + if curated_field and curated_field not in ["review", "CERN QA"]: + # checking if anything else stored in this field + raise UnexpectedValue(field=key, subfield="9", value=curated_field) return {"name": subject} @@ -429,7 +451,14 @@ def subject_categories(self, key, value): if source and source != "SzGeCERN": # checking if anything else stored in this field raise UnexpectedValue(field=key, subfield="9", value=source) - + provenance = value.get("9", "").strip() + if provenance and provenance not in ["review", "review Mar2021", "CERN QA"]: + # checking if anything else stored in this field + raise UnexpectedValue(field=key, subfield="9", value=provenance) + curation_field = value.get("8", "").strip() + if curation_field: + append_transformed_subfields(self, key, value, "legacy_marc_fields", "650") + return None if keyword: return {"name": keyword} @@ -454,7 +483,8 @@ def additional_titles(self, key, value): additional_title["lang"] = "fr" # Transform as AlternativeTitle elif lang not in ["Previous title", "Also quoted as"]: - raise UnexpectedValue(field=key, subfield="i", value=lang) + append_transformed_subfields(self, key, value, "legacy_marc_fields", "246") + return None if volume: formatted_volume = f"{part} : {volume}" if part else volume @@ -469,6 +499,11 @@ def additional_titles(self, key, value): def additional_descriptions(self, key, value): """Translates additional_descriptions.""" description = value.get("a", "").strip() + provenance = value.get("9", "").strip() + curation_information = value.get("8", "").strip() + if provenance or curation_information: + append_transformed_subfields(self, key, value, "legacy_marc_fields", "590") + return None if description: return {"description": description, "type": "Other", "lang": "fr"} return None @@ -524,6 +559,10 @@ def copyright(self, key, value): @for_each_value def presented_at(self, key, value): """Translates related identifiers.""" + k_value = value.get("k", "").strip() + if k_value: + append_transformed_subfields(self, key, value, "legacy_marc_fields", "962") + return None recid = value.get("b") material = value.get("n", "").lower().strip() # drop if recid exists rel_ids = self.get("related_identifiers", []) @@ -686,6 +725,110 @@ def internal_note(self, key, value): append_transformed_subfields(self, key, value, "internal_note") +@model.over("action_note", "^5831_") +@for_each_value +def action_note(self, key, value): + """Translates action note (digitized information).""" + append_transformed_subfields(self, key, value, "action_note") + + +@model.over("5421", "^5421_") +@for_each_value +def curation_copyright(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "5421") + + +@model.over("597", "^597__") +@for_each_value +def curation_local_note(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "597") + + +@model.over("514", "^514__") +@for_each_value +def data_quality_note(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "514") + + +@model.over("594", "^594__") +@for_each_value +def digitized_local_note(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "594") + + +@model.over("5111", "5111_") +@for_each_value +def performer_note(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "511") + +@model.over("963", "^963__") +@for_each_value +def digitized_owner(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "963") + + +@model.over("993", "^993__") +@for_each_value +def digitized_993(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "993") + + +@model.over("344", "^344__") +@for_each_value +def digitized_344(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "344") + + +@model.over("508", "(^5081_)|(^508__)") +@for_each_value +def digitized_508(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "508") + + +@model.over("020", "^020__") +@for_each_value +def book_number(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "020") + + +@model.over("519", "^519__") +@for_each_value +def digitized_519(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "519") + + +@model.over("856", "^856_2") +@for_each_value +def digitized_856_2(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "856_2") + + +@model.over("775", "^775__") +@for_each_value +def digitized_775(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "775") + + +@model.over("300", "^300__") +@for_each_value +def pyhsical_description(self, key, value): + """Translates tag 300.""" + append_transformed_subfields(self, key, value, "legacy_marc_fields", "300") + + @model.over("964", "^964__") @for_each_value def physical_location(self, key, value): diff --git a/setup.cfg b/setup.cfg index 803a3173..bb6044ee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,7 +39,8 @@ rdm = videos = Flask-Security-Invenio==3.4.0 marshmallow<4.0 - cds @ git+https://github.com/CERNDocumentServer/cds-videos@additional-files#egg=cds + cds @ git+https://github.com/CERNDocumentServer/cds-videos@main#egg=cds + srt-to-vtt==1.0.0 tests = pytest-black>=0.3.0