Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 6 additions & 34 deletions cds_migrator_kit/videos/weblecture_migration/load/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
MigrationProgressLogger,
RecordStateLogger,
)
from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.identifiers import (
transform_legacy_urls,
from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.multiple_video import (
update_metadata_multiple_video_record,
)

from .helpers import (
Expand Down Expand Up @@ -341,38 +341,10 @@ def create_publish_multiple_video_record(self, entry):
media_files = self._get_files(record["files"], json_data.get("files", []))
master_file_id = media_files["master_path"].split("/")[-1]

# Use the correct metadata for each record
event_id = record.get("event_id")
url = record.get("url")
date = record["date"]
location = record.get("location")

metadata = common_metadata.copy()
related_identifiers = list(metadata.get("related_identifiers", []))
if event_id:
# Insert event_id at the beginning
related_identifiers.insert(
0,
{
"scheme": "Indico",
"identifier": str(event_id),
"relation_type": "IsPartOf",
},
)
if url:
url = transform_legacy_urls(url, type="indico")
url_identifier = {
"scheme": "URL",
"identifier": url,
"relation_type": "IsPartOf",
}
if url_identifier not in related_identifiers:
related_identifiers.append(url_identifier)

metadata["related_identifiers"] = related_identifiers
metadata["date"] = date
if location:
metadata["location"] = location
# Update metadata for multiple video record
metadata = update_metadata_multiple_video_record(record, common_metadata)
# Mint report number if matched with video
report_number = metadata.get("report_number", None)

# Create video and flow
video_deposit, video_deposit_id, bucket_id, payload = (
Expand Down
9 changes: 9 additions & 0 deletions cds_migrator_kit/videos/weblecture_migration/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@ records:
transform:
files_dump_dir: /path_to/afs_files/
eos_file_paths_dir: cds_migrator_kit/videos/weblecture_migration/data/file_paths/
digitized-lectures:
data_dir: cds_migrator_kit/videos/weblecture_migration/data/
tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp
log_dir: cds_migrator_kit/videos/weblecture_migration/log
extract:
dirpath: cds_migrator_kit/videos/weblecture_migration/data/digitized-lectures/dump/
transform:
files_dump_dir: /path_to/afs_files/
eos_file_paths_dir: cds_migrator_kit/videos/weblecture_migration/data/file_paths/
submitters:
data_dir: cds_migrator_kit/videos/weblecture_migration/data/users
tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp/users
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
#
# This file is part of CERN Document Server.
# Copyright (C) 2026 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.


"""CDS-Videos Digitized Video Lecture model."""

from cds_migrator_kit.transform.overdo import CdsOverdo

from .video_lecture import model as video_lecture_model


class DigitizedVideoLecture(CdsOverdo):
    """Translation Index for CERN Digitized Video Lectures.

    Configuration-only model: it selects digitized lecture records from
    the legacy system and declares which MARC fields are ignored and
    which output fields always exist after transformation.
    """

    # Legacy search query selecting the records this model migrates:
    # records with an 'Absolute master path' under /mnt/master_share,
    # tagged as `digitized` (5831_.a), excluding records already marked
    # MIGRATED or DELETED in 980__.
    __query__ = "8567_.x:'Absolute master path' 8567_.d:/mnt/master_share* -980__.C:MIGRATED -980__.c:DELETED 5831_.a:digitized"

    # MARC (tag + subfield) keys deliberately dropped during the
    # transformation; each entry notes what the legacy field held.
    __ignore_keys__ = {
        "0248_a",  # oai identifier
        "0248_p",  # oai identifier
        "0248_q",  # oai
        "100__0",  # Author id (eg: AUTHOR|(CDS)2067852)
        "111__z",  # End date (indico)
        "250__a",  # Edition
        "337__a",  # Checked values only `Video`
        "511__0",  # Author id (eg: AUTHOR|(CDS)2067852)
        "5111_8",  # Video id for performer, multi video records.
        "518__l",  # Lectures: length of speech
        "518__h",  # Lectures: Starting time
        "700__0",  # Author id (eg: AUTHOR|(CDS)2067852)
        "518__e",  # Speaker (1 record has contributor in 518, it's also in 5111)
        "8564_8",  # File: bibdoc id
        "8564_s",  # File: file size
        "916__y",  # Status week year
        "960__a",  # Base number
        # CDS modification tag
        "961__h",
        "961__l",
    }

    # Baseline values merged into every transformed record so downstream
    # steps can rely on these keys being present even when the source
    # record did not provide them.
    _default_fields = {
        "language": "",
        "description": "",
        "performer": "",
        "url_files": [],
        "curated_copyright": {},
        "lecture_infos": [],
        "_curation": {
            "preservation_values": [],
        },
        "contributors": [],
        "alternate_identifiers": [],
        "additional_languages": [],
        "collections": [],
        "keywords": [],
    }


# Module-level model instance: inherits the generic video-lecture rules
# (`video_lecture_model`) and loads digitized-lecture-specific rules from
# the dedicated entry point group.
model = DigitizedVideoLecture(
    bases=(video_lecture_model,),
    entry_point_group="cds_migrator_kit.videos.rules.digitized_video_lecture",
)
91 changes: 80 additions & 11 deletions cds_migrator_kit/videos/weblecture_migration/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.multiple_video import (
transform_multiple_video_record,
transform_multiple_video_wihtout_indico,
try_to_match_metadata,
)

cli_logger = logging.getLogger("migrator")
Expand Down Expand Up @@ -300,7 +301,7 @@ def guess_dates(json_data, key, subkey=None):
### Returns:
- `set[str]`: A set of date strings.
"""
items = json_data.get(key, [])
items = get_values_in_json(json_data, key, type=list)
if subkey:
return {
item[subkey]["date"]
Expand All @@ -318,6 +319,7 @@ def reformat_date(json_data):
or get_values_in_json(json_data, "publication_date")
or guess_dates(json_data, "url_files", subkey="indico")
| guess_dates(json_data, "notes")
or get_values_in_json(json_data, "imprint_date")
)
# If multiple video record use lecture_infos or url_files
lecture_infos = [
Expand All @@ -331,6 +333,25 @@ def reformat_date(json_data):
and "date" in item["indico"]
and "event_id" in item["indico"]
]
if json_data.get("legacy_recid") in [319681, 319687]:
# Couldn't match date and video. Use the first date and add them to curation
legacy_dates = get_values_in_json(
json_data.get("_curation", {}), "legacy_dates", type=list
)
legacy_dates.extend(dates_set)
json_data["_curation"]["legacy_dates"] = legacy_dates
dates_set = [sorted(dates_set)[0]]

# TODO: remove this part after digitized lecturemedia migration
if not dates_set:
dates_set = [json_data["creation_date"]]
self.migration_logger.add_information(
json_data.get("recid"),
state={
"message": "Lecture created date used!",
"value": json_data["creation_date"],
},
)

# Return the valid date
if len(dates_set) >= 1 and not self.has_multiple_master:
Expand All @@ -353,9 +374,10 @@ def reformat_date(json_data):

def description(json_data):
"""Reformat the description for the cds-videos data model."""
if not json_data.get("description"):
description = json_data.get("description", "")
if not description:
return json_data.get("title").get("title")
return json_data.get("description")
return description

def format_contributors(json_data):
"""
Expand Down Expand Up @@ -394,7 +416,7 @@ def publication_date(json_data):

def notes(json_data):
"""Get the notes."""
notes = entry.get("notes")
notes = get_values_in_json(json_data, "notes", type=list)
if notes:
note_strings = [note.get("note") for note in notes]
return "\n".join(note_strings)
Expand Down Expand Up @@ -448,7 +470,7 @@ def location(json_data):
def get_report_number(json_data):
"""Return the report number."""
report_numbers = get_values_in_json(json_data, "report_number", type=list)
if len(report_numbers) > 1:
if len(report_numbers) >= 1:
# If report number exists put it in curation
report_number = report_numbers[0]
return report_numbers, self.check_pid_exists(
Expand All @@ -459,7 +481,9 @@ def get_report_number(json_data):
def get_keywords(json_data):
"""Return keywords."""
keywords = json_data.get("keywords", [])
subject_categories = json_data.get("subject_categories", [])
subject_categories = get_values_in_json(
json_data, "subject_categories", type=list
)
subject_indicators = json_data.get("subject_indicators", [])

all_keywords = [
Expand Down Expand Up @@ -578,7 +602,9 @@ def get_curation(json_data):
"""Return _curation."""
_curation = json_data.get("_curation", {})
# Add volumes
additional_titles = json_data.get("additional_titles", [])
additional_titles = get_values_in_json(
json_data, "additional_titles", type=list
)
volumes = [item["volume"] for item in additional_titles if "volume" in item]
if volumes:
_curation["volumes"] = volumes
Expand All @@ -594,7 +620,7 @@ def get_curation(json_data):

def get_additional_titles(json_data):
"""Return additional_titles."""
tag_246 = json_data.get("additional_titles", {})
tag_246 = get_values_in_json(json_data, "additional_titles", type=list)
_titles = [item for item in tag_246 if "title" in item]
additional_titles = []
for title_item in _titles:
Expand Down Expand Up @@ -626,10 +652,10 @@ def get_access(json_data):
"""Generate access permissions based on restrictions."""
# Update permissions
update = [current_app.config["WEBLECTURES_MIGRATION_SYSTEM_USER"]]
submitter = json_data.get("submitter")
submitter = json_data.get("submitters", [])
if submitter:
# Add submitter to update list
update.append(submitter)
update.extend(submitter)
collections = get_collections(json_data)
if "Lectures::CERN Accelerator School" in collections:
update.extend(current_app.config["CAS_LECTURES_ACCESS"])
Expand All @@ -643,14 +669,21 @@ def get_access(json_data):
access["read"] = all_emails
return access

def get_language(json_data):
    """Return the record's language; raise when it is not set.

    Raises:
        MissingRequiredField: if the record has no (truthy) language.
    """
    if json_data.get("language"):
        return json_data["language"]
    raise MissingRequiredField(message="Language is missing!")

record_dates = reformat_date(entry)
# Date will be None if record has multiple masters
record_date = record_dates[0] if not self.has_multiple_master else None
metadata = {
"title": entry["title"],
"description": description(entry),
"contributors": format_contributors(entry),
"language": entry.get("language"),
"language": get_language(entry),
"date": record_date,
"publication_date": publication_date(entry) or record_date,
"keywords": get_keywords(entry),
Expand Down Expand Up @@ -716,6 +749,8 @@ def transform(self, entry):

# Generate media files before metadata, it'll set if record has multiple masters
media_files = self._media_files(json_data)
creation_date = self._created(record_dump)
json_data["creation_date"] = creation_date.date().isoformat()
record_json_output = {
"metadata": self._metadata(json_data),
"created": self._created(record_dump),
Expand All @@ -726,6 +761,7 @@ def transform(self, entry):
}
if self.has_multiple_master:
record_json_output["is_multiple_video_record"] = True
self.multiple_video_record_entries["recid"] = json_data["legacy_recid"]
try:
mapped_multiple_video_record, common = transform_multiple_video_record(
self.multiple_video_record_entries
Expand All @@ -747,6 +783,10 @@ def transform(self, entry):
)
else:
raise e
mapped_multiple_video_record = sorted(
mapped_multiple_video_record,
key=lambda r: r["files"]["master_path"],
)
if len(mapped_multiple_video_record) > 10:
self.migration_logger.add_information(
json_data["recid"],
Expand Down Expand Up @@ -804,6 +844,35 @@ def transform(self, entry):
record_json_output["metadata"] = metadata
record_json_output["multiple_video_record"] = mapped_multiple_video_record

# Try to match metadata for multiple video record
curation = metadata.get("_curation", {})
tracked_fields = {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is the value of this dict used?

Copy link
Copy Markdown
Contributor Author

@zubeydecivelek zubeydecivelek Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These values are checked to match each one with the correct video; this code just logs whenever any of them matches a video. If any of these keys are present in curation, we attempt to match the values.

Image

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unmatched values are kept in curation

"digitized_description": "Digitized description matched!",
"digitized_language": "Digitized language matched!",
"digitized_keywords": "Digitized keywords matched!",
"legacy_report_number": "Legacy report number matched!",
}

if any(field in curation for field in tracked_fields):
mapped_multiple_video_record, new_curation = try_to_match_metadata(
mapped_multiple_video_record, curation
)

for field, message in tracked_fields.items():
if curation.get(field) != new_curation.get(field):
self.migration_logger.add_information(
json_data["recid"],
{
"message": message,
"value": field,
},
)
metadata["_curation"] = new_curation
record_json_output["metadata"] = metadata
record_json_output["multiple_video_record"] = (
mapped_multiple_video_record
)

return {
"created": self._created(record_dump),
"updated": self._updated(record_dump),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def get_contributor(key, value, contributor_role="", name=""):
contributor = {"name": name}
if affiliation:
if isinstance(affiliation, (list, tuple)):
contributor["affiliations"] = list(affiliation)
contributor["affiliations"] = list(set(affiliation))
elif isinstance(affiliation, str):
contributor["affiliations"] = [affiliation]
if contributor_role:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,17 @@
def transform_subfields(key, value):
"""Helper to transform MARC subfields into key-prefixed strings."""
value = dict(value)

priority = {
"9": 0,
"8": 1,
}

output = []
for subfield, subvalue in value.items():
for subfield, subvalue in sorted(
value.items(),
key=lambda item: (priority.get(item[0], 99), item[0]),
):
if isinstance(subvalue, (list, tuple)):
for item in subvalue:
output.append(f"{key}{subfield}:{item}")
Expand Down
Loading
Loading