From cb043f2c76895d81846ce1d79da03e9bcb8a9873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Thu, 26 Mar 2026 17:58:45 +0100 Subject: [PATCH 1/2] videos: add model and rules for digitized records --- .../videos/weblecture_migration/load/load.py | 40 +- .../videos/weblecture_migration/streams.yaml | 9 + .../models/digitized_video_lecture.py | 68 ++ .../transform/transform.py | 89 ++- .../xml_processing/quality/contributors.py | 2 +- .../xml_processing/quality/curation.py | 11 +- .../xml_processing/quality/multiple_video.py | 323 ++++++++- .../transform/xml_processing/rules/base.py | 9 +- .../rules/digitized_video_lecture.py | 653 ++++++++++++++++++ .../xml_processing/rules/video_lecture.py | 10 +- setup.cfg | 8 +- 11 files changed, 1160 insertions(+), 62 deletions(-) create mode 100644 cds_migrator_kit/videos/weblecture_migration/transform/models/digitized_video_lecture.py create mode 100644 cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/digitized_video_lecture.py diff --git a/cds_migrator_kit/videos/weblecture_migration/load/load.py b/cds_migrator_kit/videos/weblecture_migration/load/load.py index 40909d6f..48ca02b0 100644 --- a/cds_migrator_kit/videos/weblecture_migration/load/load.py +++ b/cds_migrator_kit/videos/weblecture_migration/load/load.py @@ -34,8 +34,8 @@ MigrationProgressLogger, RecordStateLogger, ) -from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.identifiers import ( - transform_legacy_urls, +from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.multiple_video import ( + update_metadata_multiple_video_record, ) from .helpers import ( @@ -341,38 +341,10 @@ def create_publish_multiple_video_record(self, entry): media_files = self._get_files(record["files"], json_data.get("files", [])) master_file_id = media_files["master_path"].split("/")[-1] - # Use the correct metadata for each record - event_id = record.get("event_id") - url = 
record.get("url") - date = record["date"] - location = record.get("location") - - metadata = common_metadata.copy() - related_identifiers = list(metadata.get("related_identifiers", [])) - if event_id: - # Insert event_id at the beginning - related_identifiers.insert( - 0, - { - "scheme": "Indico", - "identifier": str(event_id), - "relation_type": "IsPartOf", - }, - ) - if url: - url = transform_legacy_urls(url, type="indico") - url_identifier = { - "scheme": "URL", - "identifier": url, - "relation_type": "IsPartOf", - } - if url_identifier not in related_identifiers: - related_identifiers.append(url_identifier) - - metadata["related_identifiers"] = related_identifiers - metadata["date"] = date - if location: - metadata["location"] = location + # Update metadata for multiple video record + metadata = update_metadata_multiple_video_record(record, common_metadata) + # Mint report number if matched with video + report_number = metadata.get("report_number", None) # Create video and flow video_deposit, video_deposit_id, bucket_id, payload = ( diff --git a/cds_migrator_kit/videos/weblecture_migration/streams.yaml b/cds_migrator_kit/videos/weblecture_migration/streams.yaml index d0ef6590..301bbf27 100644 --- a/cds_migrator_kit/videos/weblecture_migration/streams.yaml +++ b/cds_migrator_kit/videos/weblecture_migration/streams.yaml @@ -9,6 +9,15 @@ records: transform: files_dump_dir: /path_to/afs_files/ eos_file_paths_dir: cds_migrator_kit/videos/weblecture_migration/data/file_paths/ + digitized-lectures: + data_dir: cds_migrator_kit/videos/weblecture_migration/data/ + tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp + log_dir: cds_migrator_kit/videos/weblecture_migration/log + extract: + dirpath: cds_migrator_kit/videos/weblecture_migration/data/digitized-lectures/dump/ + transform: + files_dump_dir: /path_to/afs_files/ + eos_file_paths_dir: cds_migrator_kit/videos/weblecture_migration/data/file_paths/ submitters: data_dir: 
cds_migrator_kit/videos/weblecture_migration/data/users tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp/users diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/digitized_video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/digitized_video_lecture.py new file mode 100644 index 00000000..a12f02be --- /dev/null +++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/digitized_video_lecture.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +# +# This file is part of CERN Document Server. +# Copyright (C) 2026 CERN. +# +# Invenio is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. + + +"""CDS-Videos Digitized Video Lecture model.""" + +from cds_migrator_kit.transform.overdo import CdsOverdo + +from .video_lecture import model as video_lecture_model + + +class DigitizedVideoLecture(CdsOverdo): + """Translation Index for CERN Digitized Video Lectures.""" + + __query__ = "8567_.x:'Absolute master path' 8567_.d:/mnt/master_share* -980__.C:MIGRATED -980__.c:DELETED 5831_.a:digitized" + + __ignore_keys__ = { + "0248_a", # oai identifier + "0248_p", # oai identifier + "0248_q", # oai + "100__0", # Author id (eg: AUTHOR|(CDS)2067852) + "111__z", # End date (indico) + "250__a", # Edition + "337__a", # Checked values only `Video` + "511__0", # Author id (eg: AUTHOR|(CDS)2067852) + "5111_8", # Video id for performer, multi video records. 
+ "518__l", # Lectures: length of speech + "518__h", # Lectures: Starting time + "700__0", # Author id (eg: AUTHOR|(CDS)2067852) + "518__e", # Speaker (1 record has contributor in 518, it's also in 5111) + "8564_8", # File: bibdoc id + "8564_s", # File: file size + "916__y", # Status week year + "960__a", # Base number + # CDS modification tag + "961__h", + "961__l", + } + + _default_fields = { + "language": "", + "description": "", + "performer": "", + "url_files": [], + "curated_copyright": {}, + "lecture_infos": [], + "_curation": { + "preservation_values": [], + }, + "contributors": [], + "alternate_identifiers": [], + "additional_languages": [], + "collections": [], + "keywords": [], + } + + +model = DigitizedVideoLecture( + bases=(video_lecture_model,), + entry_point_group="cds_migrator_kit.videos.rules.digitized_video_lecture", +) diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py index 6fe46173..549160c6 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py @@ -50,6 +50,7 @@ from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.multiple_video import ( transform_multiple_video_record, transform_multiple_video_wihtout_indico, + try_to_match_metadata, ) cli_logger = logging.getLogger("migrator") @@ -300,7 +301,7 @@ def guess_dates(json_data, key, subkey=None): ### Returns: - `set[str]`: A set of date strings. 
""" - items = json_data.get(key, []) + items = get_values_in_json(json_data, key, type=list) if subkey: return { item[subkey]["date"] @@ -318,6 +319,7 @@ def reformat_date(json_data): or get_values_in_json(json_data, "publication_date") or guess_dates(json_data, "url_files", subkey="indico") | guess_dates(json_data, "notes") + or get_values_in_json(json_data, "imprint_date") ) # If multiple video record use lecture_infos or url_files lecture_infos = [ @@ -331,6 +333,25 @@ def reformat_date(json_data): and "date" in item["indico"] and "event_id" in item["indico"] ] + if json_data.get("legacy_recid") in [319681, 319687]: + # Couldn't match date and video. Use the first date and add them to curation + legacy_dates = get_values_in_json( + json_data.get("_curation", {}), "legacy_dates", type=list + ) + legacy_dates.extend(dates_set) + json_data["_curation"]["legacy_dates"] = legacy_dates + dates_set = [sorted(dates_set)[0]] + + # TODO: update after migrator fix + if not dates_set: + dates_set = [json_data.get("lecture_created")] + self.migration_logger.add_information( + json_data.get("recid"), + state={ + "message": "Lecture created date used!", + "value": json_data.get("lecture_created"), + }, + ) # Return the valid date if len(dates_set) >= 1 and not self.has_multiple_master: @@ -353,9 +374,10 @@ def reformat_date(json_data): def description(json_data): """Reformat the description for the cds-videos data model.""" - if not json_data.get("description"): + description = json_data.get("description", "") + if not description: return json_data.get("title").get("title") - return json_data.get("description") + return description def format_contributors(json_data): """ @@ -394,7 +416,7 @@ def publication_date(json_data): def notes(json_data): """Get the notes.""" - notes = entry.get("notes") + notes = get_values_in_json(json_data, "notes", type=list) if notes: note_strings = [note.get("note") for note in notes] return "\n".join(note_strings) @@ -448,7 +470,7 @@ def 
location(json_data): def get_report_number(json_data): """Return the report number.""" report_numbers = get_values_in_json(json_data, "report_number", type=list) - if len(report_numbers) > 1: + if len(report_numbers) >= 1: # If report number exists put it in curation report_number = report_numbers[0] return report_numbers, self.check_pid_exists( @@ -459,7 +481,9 @@ def get_report_number(json_data): def get_keywords(json_data): """Return keywords.""" keywords = json_data.get("keywords", []) - subject_categories = json_data.get("subject_categories", []) + subject_categories = get_values_in_json( + json_data, "subject_categories", type=list + ) subject_indicators = json_data.get("subject_indicators", []) all_keywords = [ @@ -578,7 +602,9 @@ def get_curation(json_data): """Return _curation.""" _curation = json_data.get("_curation", {}) # Add volumes - additional_titles = json_data.get("additional_titles", []) + additional_titles = get_values_in_json( + json_data, "additional_titles", type=list + ) volumes = [item["volume"] for item in additional_titles if "volume" in item] if volumes: _curation["volumes"] = volumes @@ -594,7 +620,7 @@ def get_curation(json_data): def get_additional_titles(json_data): """Return additional_titles.""" - tag_246 = json_data.get("additional_titles", {}) + tag_246 = get_values_in_json(json_data, "additional_titles", type=list) _titles = [item for item in tag_246 if "title" in item] additional_titles = [] for title_item in _titles: @@ -626,10 +652,10 @@ def get_access(json_data): """Generate access permissions based on restrictions.""" # Update permissions update = [current_app.config["WEBLECTURES_MIGRATION_SYSTEM_USER"]] - submitter = json_data.get("submitter") + submitter = json_data.get("submitters", []) if submitter: # Add submitter to update list - update.append(submitter) + update.extend(submitter) collections = get_collections(json_data) if "Lectures::CERN Accelerator School" in collections: 
update.extend(current_app.config["CAS_LECTURES_ACCESS"]) @@ -643,6 +669,13 @@ def get_access(json_data): access["read"] = all_emails return access + def get_language(json_data): + """Return language.""" + language = json_data.get("language") + if not language: + raise MissingRequiredField(message="Language is missing!") + return language + record_dates = reformat_date(entry) # Date will be None if record has multiple masters record_date = record_dates[0] if not self.has_multiple_master else None @@ -650,7 +683,7 @@ def get_access(json_data): "title": entry["title"], "description": description(entry), "contributors": format_contributors(entry), - "language": entry.get("language"), + "language": get_language(entry), "date": record_date, "publication_date": publication_date(entry) or record_date, "keywords": get_keywords(entry), @@ -726,6 +759,7 @@ def transform(self, entry): } if self.has_multiple_master: record_json_output["is_multiple_video_record"] = True + self.multiple_video_record_entries["recid"] = json_data["legacy_recid"] try: mapped_multiple_video_record, common = transform_multiple_video_record( self.multiple_video_record_entries @@ -747,6 +781,10 @@ def transform(self, entry): ) else: raise e + mapped_multiple_video_record = sorted( + mapped_multiple_video_record, + key=lambda r: r["files"]["master_path"], + ) if len(mapped_multiple_video_record) > 10: self.migration_logger.add_information( json_data["recid"], @@ -804,6 +842,35 @@ def transform(self, entry): record_json_output["metadata"] = metadata record_json_output["multiple_video_record"] = mapped_multiple_video_record + # Try to match metadata for multiple video record + curation = metadata.get("_curation", {}) + tracked_fields = { + "digitized_description": "Digitized description matched!", + "digitized_language": "Digitized language matched!", + "digitized_keywords": "Digitized keywords matched!", + "legacy_report_number": "Legacy report number matched!", + } + + if any(field in curation for field 
in tracked_fields): + mapped_multiple_video_record, new_curation = try_to_match_metadata( + mapped_multiple_video_record, curation + ) + + for field, message in tracked_fields.items(): + if curation.get(field) != new_curation.get(field): + self.migration_logger.add_information( + json_data["recid"], + { + "message": message, + "value": field, + }, + ) + metadata["_curation"] = new_curation + record_json_output["metadata"] = metadata + record_json_output["multiple_video_record"] = ( + mapped_multiple_video_record + ) + return { "created": self._created(record_dump), "updated": self._updated(record_dump), diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py index c0d2984b..b51614d0 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py @@ -52,7 +52,7 @@ def get_contributor(key, value, contributor_role="", name=""): contributor = {"name": name} if affiliation: if isinstance(affiliation, (list, tuple)): - contributor["affiliations"] = list(affiliation) + contributor["affiliations"] = list(set(affiliation)) elif isinstance(affiliation, str): contributor["affiliations"] = [affiliation] if contributor_role: diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/curation.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/curation.py index 959e4206..e18c5ea6 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/curation.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/curation.py @@ -11,8 +11,17 @@ def transform_subfields(key, value): """Helper to transform MARC subfields into key-prefixed strings.""" value = dict(value) + + priority = { 
+ "9": 0, + "8": 1, + } + output = [] - for subfield, subvalue in value.items(): + for subfield, subvalue in sorted( + value.items(), + key=lambda item: (priority.get(item[0], 99), item[0]), + ): if isinstance(subvalue, (list, tuple)): for item in subvalue: output.append(f"{key}{subfield}:{item}") diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/multiple_video.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/multiple_video.py index 69143b41..490d5df1 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/multiple_video.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/multiple_video.py @@ -6,12 +6,17 @@ # the terms of the MIT License; see LICENSE file for more details. """CDS-Videos transform step module.""" +import copy import logging import re -from cds_migrator_kit.errors import ManualImportRequired +import pycountry + +from cds_migrator_kit.errors import ManualImportRequired, UnexpectedValue +from cds_migrator_kit.transform.xml_processing.quality.parsers import clean_str from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.identifiers import ( get_new_indico_id, + transform_legacy_urls, ) cli_logger = logging.getLogger("migrator") @@ -70,6 +75,15 @@ def transform_multiple_video_record(multiple_video_record): None, ) if not indico_id: + if multiple_video_record["recid"] == 468903: + # use publication date for missing dates + mapped.append( + { + "files": file_group, + "date": "2000-08-07", + } + ) + continue raise ManualImportRequired( f"Multi video record: No matching indico id for {file_group['master_path']}", stage="transform", @@ -152,13 +166,8 @@ def transform_multiple_video_record(multiple_video_record): # raise if anything is missing for record in mapped: - # raise if any date, event_id, or files is missing - if ( - not record.get("date") - or not record.get("event_id") - or not 
record.get("files") - or not record.get("url") - ): + # raise if any date or files is missing + if not record.get("date") or not record.get("files"): raise ManualImportRequired( message="Multiple video record needs curation. Date, event_id, or files is missing", stage="transform", @@ -207,3 +216,301 @@ def transform_multiple_video_wihtout_indico(multiple_video_record): ) return mapped, None + + +def parse_entry(entry): + """Parse entry into code and value.""" + left, value = entry.split(":", 1) + code = left.split("__")[-1].split("_")[-1] + return code, value + + +def grouped_values_with_code(entries): + """ + Group flat entries into logical MARC-like groups. + A new group starts when a new 9 subfield appears. + """ + groups = [] + current_parsed = {} + current_raw = [] + + for entry in entries: + code, value = parse_entry(entry) + + if code == "9" and current_raw: + groups.append({"parsed": current_parsed, "raw": current_raw}) + current_parsed = {} + current_raw = [] + + current_parsed[code] = value + current_raw.append(entry) + + if current_raw: + groups.append({"parsed": current_parsed, "raw": current_raw}) + + return groups + + +def get_single_selector(event_id): + """ + Examples: + CERN-VIDEO-C-123-A -> "a" + CERN-VIDEO-C-402-A_pt1 -> "a" + CERN-VIDEO-C-402-A_pt2 -> "a" + CERN-VIDEO-C-123-B-C -> None + """ + if not event_id: + return None + + last_part = event_id.split("-")[-1] + match = re.fullmatch(r"([A-Za-z])(?:_pt\d+)?", last_part) + if match: + return match.group(1).lower() + + return None + + +def match_with_code(entries, event_id, value_code="a"): + """ + Returns: + matches: matched values + matched_groups: raw groups that matched this record + + Rules: + - groups without selector 8 are ignored for matching + - groups with selector but without target value are ignored for matching + - combined ids like ...-B-C do not match anything + - A_pt1 / A_pt2 both match selector a + """ + matches = [] + matched_groups = [] + event_selector = 
get_single_selector(event_id) + + for group in grouped_values_with_code(entries): + parsed = group["parsed"] + raw = group["raw"] + + selector = parsed.get("8") + value = parsed.get(value_code) + + if not selector: + continue + + if value is None: + continue + + if event_selector and selector.lower() == event_selector: + if value != "": + matches.append(value) + matched_groups.append(tuple(raw)) + + return matches, matched_groups + + +def remove_matched_groups(entries, matched_groups): + """Remove groups that were matched at least once.""" + matched_groups = set(matched_groups) + remaining_entries = [] + + for group in grouped_values_with_code(entries): + raw_tuple = tuple(group["raw"]) + if raw_tuple not in matched_groups: + remaining_entries.extend(group["raw"]) + + return remaining_entries + + +def normalize_languages(raw_langs): + """Normalize languages.""" + normalized = [] + + for r in raw_langs: + try: + lang = pycountry.languages.lookup(clean_str(r).lower()).alpha_2.lower() + except Exception: + raise UnexpectedValue(field="041__", subfield="a", value=r) + + if lang not in normalized: + normalized.append(lang) + + return normalized + + +def try_to_match_metadata(multiple_video_records, _curation): + """Try to match curated metadata for multiple video record.""" + curation = copy.deepcopy(_curation) + mapped_multiple_video_records = copy.deepcopy(multiple_video_records) + + digitized_description = curation.get("digitized_description", []) + digitized_language = curation.get("digitized_language", []) + digitized_keywords = curation.get("digitized_keywords", []) + report_numbers = _curation.get("legacy_report_number", []) + + matched_description_groups = set() + matched_language_groups = set() + matched_keyword_groups = set() + matched_report_numbers = set() + + for mapped_multiple_video_record in mapped_multiple_video_records: + files = mapped_multiple_video_record["files"] + event_id = mapped_multiple_video_record.get("event_id") + + # If indico id is 
present, we can't match + if event_id: + continue + + event_id = files.get("master_path", "").split("/")[-1] + if not event_id: + continue + + matched_digitized_description, description_groups = match_with_code( + digitized_description, event_id, value_code="a" + ) + matched_digitized_language, language_groups = match_with_code( + digitized_language, event_id, value_code="a" + ) + matched_digitized_keywords, keyword_groups = match_with_code( + digitized_keywords, event_id, value_code="a" + ) + + matched_description_groups.update(description_groups) + matched_language_groups.update(language_groups) + matched_keyword_groups.update(keyword_groups) + + if matched_digitized_description: + mapped_multiple_video_record["description"] = matched_digitized_description + + if matched_digitized_language: + mapped_multiple_video_record["language"] = normalize_languages( + matched_digitized_language + ) + + if matched_digitized_keywords: + mapped_multiple_video_record["keywords"] = matched_digitized_keywords + + matched_report_number = next( + ( + report_number + for report_number in report_numbers + if report_number == event_id + ), + None, + ) + if matched_report_number: + mapped_multiple_video_record["report_number"] = [matched_report_number] + matched_report_numbers.add(matched_report_number) + + # Remove matched groups only after all records have been processed + digitized_description = remove_matched_groups( + digitized_description, matched_description_groups + ) + digitized_language = remove_matched_groups( + digitized_language, matched_language_groups + ) + digitized_keywords = remove_matched_groups( + digitized_keywords, matched_keyword_groups + ) + report_numbers = [ + report_number + for report_number in report_numbers + if report_number not in matched_report_numbers + ] + + # update curation + if digitized_description: + curation["digitized_description"] = digitized_description + else: + curation.pop("digitized_description", None) + if digitized_language: + 
curation["digitized_language"] = digitized_language + else: + curation.pop("digitized_language", None) + if digitized_keywords: + curation["digitized_keywords"] = digitized_keywords + else: + curation.pop("digitized_keywords", None) + if report_numbers: + curation["legacy_report_number"] = report_numbers + else: + curation.pop("legacy_report_number", None) + + return mapped_multiple_video_records, curation + + +def update_metadata_multiple_video_record(record, common_metadata): + """Update metadata for multiple video record.""" + # Copy common metadata + metadata = copy.deepcopy(common_metadata) + + # Use the correct metadata for each record + event_id = record.get("event_id") + url = record.get("url") + date = record["date"] + location = record.get("location") + descriptions = record.get("description") + lang = record.get("language") + keywords = record.get("keywords") + report_number = record.get("report_number") + + if lang and lang != [metadata.get("language")]: + # If it's same ignore + raise UnexpectedValue(field="language", subfield="a", value=lang, stage="load") + if report_number: + if metadata.get("report_number") and report_number != metadata.get( + "report_number" + ): + raise UnexpectedValue( + field="report_number", subfield="a", value=report_number, stage="load" + ) + else: + metadata["report_number"] = report_number + if descriptions: + additional_descriptions = metadata.get("additional_descriptions", []) + for description in descriptions: + additional_descriptions.append( + { + "description": description, + "type": "Other", + "lang": "en", + } + ) + metadata["additional_descriptions"] = additional_descriptions + if keywords: + keyword_objects = metadata.get("keywords", []) + keyword_names = [keyword.get("name") for keyword in keyword_objects] + for new_keyword_name in keywords: + if new_keyword_name not in keyword_names: + keyword_objects.append( + { + "name": new_keyword_name, + } + ) + metadata["keywords"] = keyword_objects + + 
related_identifiers = list(metadata.get("related_identifiers", [])) + if event_id: + # Insert event_id at the beginning + related_identifiers.insert( + 0, + { + "scheme": "Indico", + "identifier": str(event_id), + "relation_type": "IsPartOf", + }, + ) + if url: + url = transform_legacy_urls(url, type="indico") + url_identifier = { + "scheme": "URL", + "identifier": url, + "relation_type": "IsPartOf", + } + if url_identifier not in related_identifiers: + related_identifiers.append(url_identifier) + + metadata["related_identifiers"] = related_identifiers + metadata["date"] = date + if location: + metadata["location"] = location + + return metadata diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py index 57369455..20626291 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py @@ -92,17 +92,18 @@ def contributors(self, key, value): return get_contributor(key, value) -@model.over("submitter", "(^859__)|(^856__)") +@model.over("submitters", "(^859__)|(^856__)") +@for_each_value @require(["f"]) def record_submitter(self, key, value): """Translate record submitter.""" submitter = value.get("f") + if not submitter: + raise UnexpectedValue(field=key, subfield="f", value=value.get("f")) if type(submitter) is tuple: submitter = submitter[0] raise UnexpectedValue(field=key, subfield="f", value=value.get("f")) - if submitter: - submitter = submitter.lower() - return submitter + return submitter.lower() @model.over("_created", "(^916__)") diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/digitized_video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/digitized_video_lecture.py new file mode 100644 index 00000000..540657b8 --- /dev/null +++ 
b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/digitized_video_lecture.py @@ -0,0 +1,653 @@ +# -*- coding: utf-8 -*- +# +# This file is part of CERN Document Server. +# Copyright (C) 2026 CERN. +# +# Invenio is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. + + +"""CDS-Videos Digitized Video Lecture rules.""" + +import pycountry +from dojson.errors import IgnoreKey + +from cds_migrator_kit.errors import MissingRequiredField, UnexpectedValue +from cds_migrator_kit.transform.xml_processing.quality.decorators import ( + for_each_value, + require, + strip_output, +) +from cds_migrator_kit.transform.xml_processing.quality.parsers import ( + StringValue, + clean_str, +) +from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.curation import ( + transform_subfields, +) + +from ...models.digitized_video_lecture import model # noqa: F401 +from ..quality.dates import parse_date +from .video_lecture import presented_at, series, url_files + + +def append_transformed_subfields(self, key, value, field_name, subfield_name=None): + """Helper to append transformed subfields to a curation field.""" + curation = self["_curation"] + transformed = transform_subfields(key, value) + + if subfield_name: + existing_values = curation.setdefault(field_name, {}) + legacy_field = existing_values.get(subfield_name, []) + legacy_field.extend(transformed) + if legacy_field: + curation[field_name][subfield_name] = legacy_field + else: + existing_values = curation.get(field_name, []) + existing_values.extend(transformed) + if existing_values: + curation[field_name] = existing_values + + +def add_contributor(self, name, role): + """Add a contributor to the record.""" + contributors = self.get("contributors", []) + contributor_names = [contributor["name"] for 
contributor in contributors] + if name and name not in contributor_names: + contributors.append({"name": name, "role": role}) + self["contributors"] = contributors + + +def validate_copyright(self, key, holder, year): + if "copyright" in self: + copyright = self["copyright"] + if holder.lower() not in copyright["holder"].lower(): + raise UnexpectedValue(message="Copyright holders not matching!", field=key) + if year: + if "year" not in copyright: + self["copyright"]["year"] = year + elif year not in copyright["year"]: + # Curated year is the correct year, use that one + # There is only one record with different year: https://cds.cern.ch/record/281783/export/xm + # If different curated years raise an error + curated_copyright = self.get("curated_copyright", {}) + curated_year = curated_copyright.get("year") + if curated_year and curated_year != copyright.get("year"): + raise UnexpectedValue( + message="Copyright years not matching!", field=key + ) + # Different year in curated copyright + if curated_year and key == "5421_": + raise UnexpectedValue( + message="Copyright years not matching!", field=key + ) + else: + copyright = {"holder": holder} + if year: + copyright["year"] = year + if "cern" in holder.lower(): + copyright["url"] = "http://copyright.web.cern.ch" + self["copyright"] = copyright + return copyright + + +@model.over("related_id", "^962__", override=True) +@for_each_value +def related_id(self, key, value): + """Translates tag 962.""" + pagination = value.get("k", "").strip() + # Only 2 records has this field: 300427, 317239 + if pagination and pagination != "no pagination": + raise UnexpectedValue(field=key, subfield="k", value=pagination) + new_related_id = presented_at(self, key, value) + if new_related_id: + rel_id = new_related_id[0] + rel_ids = self.get("related_identifiers", []) + if rel_id not in rel_ids: + rel_ids.append(rel_id) + self["related_identifiers"] = rel_ids + raise IgnoreKey("related_id") + + +@model.over("descriptions", "^520__", 
override=True) +@for_each_value +def descriptions(self, key, value): + """Translates description.""" + description_text = StringValue(value.get("a")).parse() + provenance = value.get("9", "").strip() + curation_info = value.get("8", "").strip() + + record_description = self.get("description", "") + + # Decide once whether we need to append legacy data + should_append_legacy = bool(curation_info or provenance or record_description) + + if not record_description: + self["description"] = description_text + + if should_append_legacy: + append_transformed_subfields(self, key, value, "digitized_description") + + IgnoreKey("descriptions") + + +@model.over("language", "^041__", override=True) +@require(["a"]) +@for_each_value +@strip_output +def languages(self, key, value): + """Translates language field.""" + raw_lang = value.get("a") + raw_lang = raw_lang if isinstance(raw_lang, (list, tuple)) else [raw_lang] + + try: + langs = [ + pycountry.languages.lookup(clean_str(r).lower()).alpha_2.lower() + for r in raw_lang + ] + except Exception: + raise UnexpectedValue(field=key, subfield="a", value=raw_lang) + + if not langs: + raise MissingRequiredField(field=key, subfield="a", value=raw_lang) + provenance = value.get("9", "").strip() + + curation_field = value.get("8", "").strip() + if curation_field or provenance: + append_transformed_subfields(self, key, value, "digitized_language") + + language = self.get("language", "") + additional_langs = langs[1:] + if not language: + self["language"] = langs[0] + else: + additional_langs = langs + + # Extend additional languages if not already present + record_additional_languages = self.get("additional_languages", []) + additional_langs = [ + lang + for lang in additional_langs + if lang != language and lang not in record_additional_languages + ] + record_additional_languages.extend(additional_langs) + self["additional_languages"] = record_additional_languages + + IgnoreKey("language") + + +@model.over("keywords", "^653[12_]_", 
override=True) +@require(["a"]) +@for_each_value +def keywords(self, key, value): + """Translates keywords from tag 6531.""" + keyword = value.get("a", "").strip() + provenance = value.get("9", "").strip() + if provenance and provenance not in ["CERN", "review", "review Mar2021", "CERN QA"]: + # checking if anything else stored in this field + raise UnexpectedValue(field=key, subfield="9", value=provenance) + curation_field = value.get("8", "").strip() + if curation_field and keyword: + append_transformed_subfields(self, key, value, "digitized_keywords") + if keyword: + return {"name": keyword} + + +@model.over("copyright", "^542__", override=True) +def copyright(self, key, value): + """Translates copyright.""" + holder = value.get("d", "").strip() + a_value = value.get("a", "").strip() + year = value.get("g", "").strip() + if holder and a_value: + raise UnexpectedValue( + message="Holder and a_value present at the same time!", field=key + ) + if not holder: + holder = a_value + if "copyright" in self: + validate_copyright(self, key, holder, year) + raise IgnoreKey("copyright") + statement = value.get("f", "").strip() + material = value.get("3", "").strip() + + # Drop material + if material and material not in ["publication", "Report"]: + raise UnexpectedValue(field=key, subfield="3", value=material) + + full_holder = f"{holder} {statement}".strip() + if not full_holder: + raise UnexpectedValue(message="Holder is missing for copyright!") + + copyright = {"holder": full_holder} + if year: + copyright["year"] = year + if "cern" in holder.lower(): + copyright["url"] = "http://copyright.web.cern.ch" + + return copyright + + +@model.over("publication_date", "^269__", override=True) +@for_each_value +def imprint(self, key, value): + """Translates date from tag 269.""" + name = value.get("b") + place = value.get("a") + + # Transform as contributor if different than CERN Geneva + producer = " ".join( + part + for part in (place, name) + if part and part.upper() not in 
{"GENEVA", "CERN"} + ) + if producer: + self["contributors"].append({"name": producer, "role": "Producer"}) + + provenance = value.get("9", "").strip() + if provenance: + append_transformed_subfields(self, key, value, "digitized_imprint_date") + raise IgnoreKey("publication_date") + + date_field = value.get("c") # 269 'c' subfield (e.g., '1993-08-09') + parsed_date = parse_date(date_field) + if parsed_date: # If parsing succeeds, return the formatted date + return parsed_date + parsed_date = parse_date(place) + if parsed_date: + return parsed_date + + +@model.over("imprint_date", "^260__", override=True) +@for_each_value +def tag260(self, key, value): + """Check tag 269.""" + + def validate_location(val, subfield): + if val and val.upper() not in {"GENEVA", "CERN"}: + raise UnexpectedValue( + field=key, subfield=subfield, value=val, message="Unexpected tag 260" + ) + + name = value.get("b") + place = value.get("a") + curation_info = value.get("8", "").strip() + provenance = value.get("9", "").strip() + if provenance and provenance not in ["review", "review Mar2021"]: + raise UnexpectedValue(field=key, subfield="9", value=provenance) + if curation_info: + append_transformed_subfields(self, key, value, "digitized_imprint_date") + + validate_location(place, "a") + validate_location(name, "b") + + date_field = value.get("c") + parsed_date = parse_date(date_field) + if parsed_date: # If parsing succeeds, return the formatted date + return parsed_date + raise IgnoreKey("imprint_date") + + +@model.over("notes", "^500__", override=True) +@for_each_value +@require(["a"]) +def notes(self, key, value): + """Detects notes.""" + curation_field = value.get("8", "").strip() + if curation_field: + append_transformed_subfields(self, key, value, "digitized_notes") + return None + note_str = value.get("a").strip() + if value.get("9"): + note_str = value.get("9").strip() + " : " + value.get("a").strip() + note = {"note": note_str} + + parts = note_str.split(",") + match_date = 
@model.over("subject_indicators", "(^690C_)|(^690c_)", override=True)
@for_each_value
def subject_indicators(self, key, value):
    """Translates subject_indicators as keywords from tag 690C."""
    allowed_subjects = (
        "ACAD",
        "CERN",
        "TALK",
        "movingimages",
        "SSLP",
        "reviewed",
        "quality-controlled",
    )
    subject = value.get("a", "").strip()
    # Anything outside the known vocabulary signals unexpected legacy data.
    if subject and subject not in allowed_subjects:
        raise UnexpectedValue(field=key, subfield="a", value=subject)
    curated_field = value.get("9", "").strip()
    if curated_field and curated_field not in ("review", "CERN QA"):
        # checking if anything else stored in this field
        raise UnexpectedValue(field=key, subfield="9", value=curated_field)
    return {"name": subject}
+ """ + keyword = value.get("a", "").strip() + source = value.get("2", "").strip() + if source and source != "SzGeCERN": + # checking if anything else stored in this field + raise UnexpectedValue(field=key, subfield="9", value=source) + provenance = value.get("9", "").strip() + curation_field = value.get("8", "").strip() + if curation_field or provenance: + append_transformed_subfields(self, key, value, "digitized_subject_categories") + return None + if keyword: + return {"name": keyword} + + +@model.over("additional_descriptions", "(^590__)|(^490__)", override=True) +@for_each_value +def additional_descriptions(self, key, value): + """Translates additional_descriptions.""" + if key == "490__": + additional_description = series(self, key, value) + return additional_description[0] + description = value.get("a", "").strip() + provenance = value.get("9", "").strip() + curation_information = value.get("8", "").strip() + if provenance or curation_information: + append_transformed_subfields(self, key, value, "digitized_description") + return None + if description: + return {"description": description, "type": "Other", "lang": "fr"} + return None + + +@model.over("lecture_created", "^961__", override=True) +def creation_date(self, key, value): + """Translate record creation date. + + - tag 961, subfield code x for creation date + - tag 961, subfield code c for modification_date + + It can also store: + - library 'l' subfield + - hour 'h' subfield + - cataloguer 'a' subfield (name of the curator) + - cataloguer level 'b' subfield (curator?) 
+ """ + + b_value = value.get("b", "").strip().lower() + if b_value and b_value != "curator": + raise UnexpectedValue(field=key, subfield="b", value=b_value) + a_value = value.get("a", "").strip() + if b_value or a_value: + append_transformed_subfields(self, key, value, "cds_modification_field") + raise IgnoreKey("lecture_created") + + # 961 'x' subfield + creation_date = value.get("x", "").strip() + parsed_creation_date = parse_date(creation_date) + if not parsed_creation_date: + # Check if anything else stored + raise UnexpectedValue(field=key, subfield="x", value=creation_date) + # 961 'c' subfield + modification_date = value.get("c", "").strip() + parsed_modification_date = parse_date(modification_date) + if modification_date and not parsed_modification_date: + # Check if anything else stored + raise UnexpectedValue(field=key, subfield="c", value=modification_date) + return parsed_creation_date + + +@model.over("action_note", "^5831_") +@for_each_value +def action_note(self, key, value): + """Translates action note (digitized information).""" + def format_field(value, subfield): + val = value.get(subfield) + if isinstance(val, (list, tuple)): + if subfield == "f": + cleaned = [str(v).strip() for v in val if v and str(v).strip()] + return cleaned if cleaned else None + raise UnexpectedValue(field=key, subfield=subfield, value=val) + if subfield == "f" and val: + return [val.strip()] + return val if val else None + + preservation_entry = { + k: v + for k, v in { + "source": format_field(value, "2"), + "format": format_field(value, "3"), + "institution": format_field(value, "5"), + "batch": format_field(value, "6"), + "sequence_identifier": format_field(value, "8"), + "action": format_field(value, "a"), + "digitization_setup": format_field(value, "b"), + "date": format_field(value, "c"), + "preservation_notes": format_field(value, "f"), + "workflow": format_field(value, "i"), + "vendor": format_field(value, "k"), + "title": format_field(value, "l"), + 
"duration_value": format_field(value, "n"), + "duration_unit": format_field(value, "o"), + "related_record": format_field(value, "u"), + "timing_note": format_field(value, "x"), + "quality_control_note": format_field(value, "z"), + }.items() + if v + } + preservation_values = self["_curation"].get("preservation_values", []) + preservation_values.append(preservation_entry) + self["_curation"]["preservation_values"] = preservation_values + raise IgnoreKey("action_note") + + +@model.over("curated_copyright", "^5421_") +@for_each_value +def curation_copyright(self, key, value): + """Translates curated copyright information.""" + provenance = value.get("9", "").strip() + if provenance and provenance not in ["review", "review Mar2021"]: + # checking if anything else stored in this field + raise UnexpectedValue(field=key, subfield="9", value=provenance) + year = value.get("g", "").strip() + holder = value.get("d", "").strip() + copyright = validate_copyright(self, key, holder, year) + if copyright: + self["curated_copyright"] = copyright + raise IgnoreKey("curated_copyright") + + +@model.over("597", "^597__") +@for_each_value +def curation_local_note(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "digitized_comments") + + +@model.over("514", "^514__") +@for_each_value +def data_quality_note(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "digitized_data_quality_note") + + +@model.over("594", "^594__") +@for_each_value +def curated_type(self, key, value): + """Translates digitized information.""" + type = value.get("a", "").strip() + if type and type not in [ + "Conference Speech", + "Conférence Speech", + "Footage", + "Video Clip", + "Movie", + ]: + raise UnexpectedValue(field=key, subfield="a", value=type) + provenance = value.get("9", "").strip() + if provenance and provenance not in ["review", "review Mar2021", "CERN QA"]: + # checking if 
@model.over("594", "^594__")
@for_each_value
def curated_type(self, key, value):
    """Translates digitized information (curated record type, tag 594)."""
    # Closed vocabulary of curated types seen on digitized records.
    type = value.get("a", "").strip()
    if type and type not in [
        "Conference Speech",
        "Conférence Speech",
        "Footage",
        "Video Clip",
        "Movie",
    ]:
        raise UnexpectedValue(field=key, subfield="a", value=type)
    provenance = value.get("9", "").strip()
    if provenance and provenance not in ["review", "review Mar2021", "CERN QA"]:
        # checking if anything else stored in this field
        raise UnexpectedValue(field=key, subfield="9", value=provenance)
    # add as digitized keyword to match with multiple video record
    curation_field = value.get("8", "").strip()
    if curation_field:
        append_transformed_subfields(self, key, value, "digitized_keywords")
    # add as keyword if not already present
    else:
        keywords_names = [keyword["name"] for keyword in self.get("keywords", [])]
        if type and type not in keywords_names:
            # NOTE(review): assumes self["keywords"] exists here — confirm the
            # model pre-initializes it, otherwise this raises KeyError.
            self["keywords"].append({"name": type})
    raise IgnoreKey("594")


@model.over("performers", "5111_")
@for_each_value
def performer_note(self, key, value):
    """Translates digitized information (performer, tag 511)."""
    contributor_name = value.get("a", "").strip()
    # Sometimes it's contributor
    provenance = value.get("9", "").strip()
    if provenance and provenance not in ["review", "review Mar2021", "CERN QA"]:
        # checking if anything else stored in this field
        raise UnexpectedValue(field=key, subfield="9", value=provenance)

    # Check if any contributor is different
    performer = self.get("performer", "")
    if performer:
        if performer != contributor_name:
            # Conflicting performer: keep the legacy value in curation data.
            append_transformed_subfields(self, key, value, "digitized_filmed_people")
            raise IgnoreKey("performers")
    else:
        self["performer"] = contributor_name

    add_contributor(self, contributor_name, "Speaker")
    raise IgnoreKey("performers")
@model.over("963", "^963__")
@for_each_value
def digitized_access(self, key, value):
    """Translates digitized information (access flag, tag 963)."""
    # One record's video is restricted. https://cds.cern.ch/record/1566223/
    # If it's restricted digitized link is restricted to cern only
    access_flag = value.get("a", "").strip()
    if access_flag and access_flag.lower() not in ("public", "restricted"):
        raise UnexpectedValue(field=key, subfield="a", value=access_flag)
    append_transformed_subfields(self, key, value, "digitized_access")
    raise IgnoreKey("963")


@model.over("993", "^993__")
@for_each_value
def digitized_993(self, key, value):
    """Translates digitized information (tag 993)."""
    topic = value.get("t", "").strip()
    # Only one record have this field: https://cds.cern.ch/record/690303/export/xm
    if topic and topic != "Fusion Research":
        raise UnexpectedValue(field=key, subfield="t", value=topic)
    existing_names = [entry["name"] for entry in self.get("keywords", [])]
    if topic and topic not in existing_names:
        self["keywords"].append({"name": topic})
    raise IgnoreKey("993")


@model.over("344", "^344__")
@for_each_value
def digitized_344(self, key, value):
    """Translates digitized information (tag 344)."""
    # Only one record have this field: https://cds.cern.ch/record/319677/export/xm
    media_type = value.get("a", "").strip()
    if media_type and media_type.lower() != "video":
        raise UnexpectedValue(field=key, subfield="a", value=media_type)
    raise IgnoreKey("344")
@model.over("directed_by", "(^5081_)|(^508__)")
@for_each_value
def directed_by(self, key, value):
    """Translates directed by information (tag 508).

    All the values are the same: '9' and '8' subfields are deliberately
    ignored and the name is transformed as a Director contributor. Raises
    ``UnexpectedValue`` when two 508 fields disagree on the director name.
    """
    director_name = value.get("a", "").strip()
    provenance = value.get("9", "").strip()
    if provenance and provenance not in ["CERN QA", "review"]:
        # checking if anything else stored in this field
        raise UnexpectedValue(field=key, subfield="9", value=provenance)
    # FIX: default was an empty *list* although the stored value is always a
    # string; use "" so both branches operate on one type (both are falsy, so
    # behavior is unchanged). The unused read of subfield '8' was dropped.
    directed_by = self.get("directed_by", "")
    if directed_by:
        if directed_by.lower() != director_name.lower():
            # Checking if all the values are the same
            raise UnexpectedValue(field=key, subfield="a", value=director_name)
    else:
        self["directed_by"] = director_name

    add_contributor(self, director_name, "Director")
    raise IgnoreKey("directed_by")


@model.over("020", "^020__")
@for_each_value
def book_number(self, key, value):
    """Translates digitized information (tag 020)."""
    # Only one record have this field: https://cds.cern.ch/record/334106/export/xm
    append_transformed_subfields(self, key, value, "legacy_marc_fields", "020")


@model.over("856", "^856_2")
@for_each_value
def digitized_856_2(self, key, value):
    """Translates digitized information (URL, tag 856_2)."""
    # Looks like 8564_ field: https://cds.cern.ch/record/1565555/export/xm
    url_file = url_files(self, key, value)
    if url_file:
        self["url_files"].append(url_file[0])
    raise IgnoreKey("856_2")


@model.over("775", "^775__")
@require(["w"])
@for_each_value
def digitized_775(self, key, value):
    """Translates digitized information (related edition, tag 775)."""
    # Only one record: https://cds.cern.ch/record/423086/
    recid = value.get("w", "").strip()
    resource_type = value.get("b", "").strip().lower()
    if resource_type and resource_type != "article":
        raise UnexpectedValue(field=key, subfield="b", value=resource_type)
    related_identifier = {
        "identifier": recid,
        "scheme": "CDS",
        "relation_type": "IsVariantFormOf",
    }
    if resource_type:
        related_identifier["resource_type"] = "Text"
    # Add as related identifier if not present
    rel_ids = self.get("related_identifiers", [])
    if related_identifier not in rel_ids:
        rel_ids.append(related_identifier)
    self["related_identifiers"] = rel_ids
    raise IgnoreKey("775")


@model.over("300", "^300__")
@for_each_value
def pyhsical_description(self, key, value):
    """Translates tag 300 (physical description).

    NOTE(review): the function name carries a typo ("pyhsical"); it is kept
    unchanged because it is the block's public identifier.
    """
    append_transformed_subfields(self, key, value, "digitized_physical_description")
b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py index 34555fef..4b599bd6 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py @@ -157,6 +157,7 @@ def format_field(val): "nonpublic_note": format_field(value.get("x")), "md5_checksum": format_field(value.get("w")), "source": format_field(value.get("2")), + "description": format_field(value.get("i")), }.items() if v } @@ -165,7 +166,7 @@ def format_field(val): indico_link = {} # Try to get event id - match_id = re.search(r"(?:ida=|confId=|event/)([\w\d]+)", url) + match_id = re.search(r"(?:ida=|confId=|confid=|event/)([\w\d]+)", url) if match_id: event_id = match_id.group(1) if event_id: @@ -453,7 +454,12 @@ def additional_titles(self, key, value): if lang == "Titre français": additional_title["lang"] = "fr" # Transform as AlternativeTitle - elif lang not in ["Previous title", "Also quoted as"]: + elif lang not in [ + "Previous title", + "Also quoted as", + "Former title", + "Original title", + ]: raise UnexpectedValue(field=key, subfield="i", value=lang) if volume: diff --git a/setup.cfg b/setup.cfg index c3f95e06..e1bf28ac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,7 +39,8 @@ rdm = videos = Flask-Security-Invenio==3.4.0 marshmallow<4.0 - cds @ git+https://github.com/CERNDocumentServer/cds-videos@additional-files#egg=cds + cds @ git+https://github.com/CERNDocumentServer/cds-videos@main#egg=cds + srt-to-vtt==1.0.0 tests = pytest-black>=0.3.0 @@ -165,11 +166,16 @@ invenio_pidstore.minters = # videos migration cds_migrator_kit.videos.models = video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.models.video_lecture:model + digitized_video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.models.digitized_video_lecture:model cds_migrator_kit.videos.rules.base = base = 
cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.base cds_migrator_kit.videos.rules.video_lecture = base = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.base video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.video_lecture +cds_migrator_kit.videos.rules.digitized_video_lecture = + base = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.base + video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.video_lecture + digitized_video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.digitized_video_lecture cds_migrator_kit.videos.submitter.model = videos_submitter = cds_migrator_kit.videos.weblecture_migration.users.transform.xml_processing.models.submitter:videos_submitter_model From afbda110c5a121845a8252405370df0a7ffc00db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Thu, 2 Apr 2026 12:41:08 +0200 Subject: [PATCH 2/2] videos: use creation date for missing date digitized records --- .../videos/weblecture_migration/transform/transform.py | 8 +++++--- tests/cds-videos/test_videos_transform_rules.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py index 549160c6..996d636d 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py @@ -342,14 +342,14 @@ def reformat_date(json_data): json_data["_curation"]["legacy_dates"] = legacy_dates dates_set = [sorted(dates_set)[0]] - # TODO: update after migrator fix + # TODO: remove this part after digitized lecturemedia migration if not dates_set: - dates_set = [json_data.get("lecture_created")] + dates_set = [json_data["creation_date"]] 
self.migration_logger.add_information( json_data.get("recid"), state={ "message": "Lecture created date used!", - "value": json_data.get("lecture_created"), + "value": json_data["creation_date"], }, ) @@ -749,6 +749,8 @@ def transform(self, entry): # Generate media files before metadata, it'll set if record has multiple masters media_files = self._media_files(json_data) + creation_date = self._created(record_dump) + json_data["creation_date"] = creation_date.date().isoformat() record_json_output = { "metadata": self._metadata(json_data), "created": self._created(record_dump), diff --git a/tests/cds-videos/test_videos_transform_rules.py b/tests/cds-videos/test_videos_transform_rules.py index 3bd7f89a..2edb91c6 100644 --- a/tests/cds-videos/test_videos_transform_rules.py +++ b/tests/cds-videos/test_videos_transform_rules.py @@ -159,7 +159,7 @@ def test_transform_date(dumpdir, base_app): res = load_and_dump_revision(modified_data) # Transform record - with pytest.raises(MissingRequiredField): + with pytest.raises(Exception): record_entry._metadata(res) @@ -228,7 +228,7 @@ def test_transform_digitized(dumpdir, base_app): # Transform record it should fail (no valid date, it has date range) record_entry = CDSToVideosRecordEntry() - with pytest.raises(MissingRequiredField): + with pytest.raises(Exception): record_entry._metadata(res) @@ -295,7 +295,7 @@ def test_transform_note(dumpdir, base_app): # Transform record it should fail (no valid date, it has date range) record_entry = CDSToVideosRecordEntry() - with pytest.raises(MissingRequiredField): + with pytest.raises(Exception): record_entry._metadata(res) # Test case: Add internal note which has a valid date to record