From cb043f2c76895d81846ce1d79da03e9bcb8a9873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Thu, 26 Mar 2026 17:58:45 +0100 Subject: [PATCH 1/2] videos: add model and rules for digitized records --- .../videos/weblecture_migration/load/load.py | 40 +- .../videos/weblecture_migration/streams.yaml | 9 + .../models/digitized_video_lecture.py | 68 ++ .../transform/transform.py | 89 ++- .../xml_processing/quality/contributors.py | 2 +- .../xml_processing/quality/curation.py | 11 +- .../xml_processing/quality/multiple_video.py | 323 ++++++++- .../transform/xml_processing/rules/base.py | 9 +- .../rules/digitized_video_lecture.py | 653 ++++++++++++++++++ .../xml_processing/rules/video_lecture.py | 10 +- setup.cfg | 8 +- 11 files changed, 1160 insertions(+), 62 deletions(-) create mode 100644 cds_migrator_kit/videos/weblecture_migration/transform/models/digitized_video_lecture.py create mode 100644 cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/digitized_video_lecture.py diff --git a/cds_migrator_kit/videos/weblecture_migration/load/load.py b/cds_migrator_kit/videos/weblecture_migration/load/load.py index 40909d6f..48ca02b0 100644 --- a/cds_migrator_kit/videos/weblecture_migration/load/load.py +++ b/cds_migrator_kit/videos/weblecture_migration/load/load.py @@ -34,8 +34,8 @@ MigrationProgressLogger, RecordStateLogger, ) -from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.identifiers import ( - transform_legacy_urls, +from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.multiple_video import ( + update_metadata_multiple_video_record, ) from .helpers import ( @@ -341,38 +341,10 @@ def create_publish_multiple_video_record(self, entry): media_files = self._get_files(record["files"], json_data.get("files", [])) master_file_id = media_files["master_path"].split("/")[-1] - # Use the correct metadata for each record - event_id = record.get("event_id") - url = 
record.get("url") - date = record["date"] - location = record.get("location") - - metadata = common_metadata.copy() - related_identifiers = list(metadata.get("related_identifiers", [])) - if event_id: - # Insert event_id at the beginning - related_identifiers.insert( - 0, - { - "scheme": "Indico", - "identifier": str(event_id), - "relation_type": "IsPartOf", - }, - ) - if url: - url = transform_legacy_urls(url, type="indico") - url_identifier = { - "scheme": "URL", - "identifier": url, - "relation_type": "IsPartOf", - } - if url_identifier not in related_identifiers: - related_identifiers.append(url_identifier) - - metadata["related_identifiers"] = related_identifiers - metadata["date"] = date - if location: - metadata["location"] = location + # Update metadata for multiple video record + metadata = update_metadata_multiple_video_record(record, common_metadata) + # Mint report number if matched with video + report_number = metadata.get("report_number", None) # Create video and flow video_deposit, video_deposit_id, bucket_id, payload = ( diff --git a/cds_migrator_kit/videos/weblecture_migration/streams.yaml b/cds_migrator_kit/videos/weblecture_migration/streams.yaml index d0ef6590..301bbf27 100644 --- a/cds_migrator_kit/videos/weblecture_migration/streams.yaml +++ b/cds_migrator_kit/videos/weblecture_migration/streams.yaml @@ -9,6 +9,15 @@ records: transform: files_dump_dir: /path_to/afs_files/ eos_file_paths_dir: cds_migrator_kit/videos/weblecture_migration/data/file_paths/ + digitized-lectures: + data_dir: cds_migrator_kit/videos/weblecture_migration/data/ + tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp + log_dir: cds_migrator_kit/videos/weblecture_migration/log + extract: + dirpath: cds_migrator_kit/videos/weblecture_migration/data/digitized-lectures/dump/ + transform: + files_dump_dir: /path_to/afs_files/ + eos_file_paths_dir: cds_migrator_kit/videos/weblecture_migration/data/file_paths/ submitters: data_dir: 
cds_migrator_kit/videos/weblecture_migration/data/users tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp/users diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/models/digitized_video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/models/digitized_video_lecture.py new file mode 100644 index 00000000..a12f02be --- /dev/null +++ b/cds_migrator_kit/videos/weblecture_migration/transform/models/digitized_video_lecture.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +# +# This file is part of CERN Document Server. +# Copyright (C) 2026 CERN. +# +# Invenio is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. + + +"""CDS-Videos Digitized Video Lecture model.""" + +from cds_migrator_kit.transform.overdo import CdsOverdo + +from .video_lecture import model as video_lecture_model + + +class DigitizedVideoLecture(CdsOverdo): + """Translation Index for CERN Digitized Video Lectures.""" + + __query__ = "8567_.x:'Absolute master path' 8567_.d:/mnt/master_share* -980__.C:MIGRATED -980__.c:DELETED 5831_.a:digitized" + + __ignore_keys__ = { + "0248_a", # oai identifier + "0248_p", # oai identifier + "0248_q", # oai + "100__0", # Author id (eg: AUTHOR|(CDS)2067852) + "111__z", # End date (indico) + "250__a", # Edition + "337__a", # Checked values only `Video` + "511__0", # Author id (eg: AUTHOR|(CDS)2067852) + "5111_8", # Video id for performer, multi video records. 
+ "518__l", # Lectures: length of speech + "518__h", # Lectures: Starting time + "700__0", # Author id (eg: AUTHOR|(CDS)2067852) + "518__e", # Speaker (1 record has contributor in 518, it's also in 5111) + "8564_8", # File: bibdoc id + "8564_s", # File: file size + "916__y", # Status week year + "960__a", # Base number + # CDS modification tag + "961__h", + "961__l", + } + + _default_fields = { + "language": "", + "description": "", + "performer": "", + "url_files": [], + "curated_copyright": {}, + "lecture_infos": [], + "_curation": { + "preservation_values": [], + }, + "contributors": [], + "alternate_identifiers": [], + "additional_languages": [], + "collections": [], + "keywords": [], + } + + +model = DigitizedVideoLecture( + bases=(video_lecture_model,), + entry_point_group="cds_migrator_kit.videos.rules.digitized_video_lecture", +) diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py index 6fe46173..549160c6 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py @@ -50,6 +50,7 @@ from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.multiple_video import ( transform_multiple_video_record, transform_multiple_video_wihtout_indico, + try_to_match_metadata, ) cli_logger = logging.getLogger("migrator") @@ -300,7 +301,7 @@ def guess_dates(json_data, key, subkey=None): ### Returns: - `set[str]`: A set of date strings. 
""" - items = json_data.get(key, []) + items = get_values_in_json(json_data, key, type=list) if subkey: return { item[subkey]["date"] @@ -318,6 +319,7 @@ def reformat_date(json_data): or get_values_in_json(json_data, "publication_date") or guess_dates(json_data, "url_files", subkey="indico") | guess_dates(json_data, "notes") + or get_values_in_json(json_data, "imprint_date") ) # If multiple video record use lecture_infos or url_files lecture_infos = [ @@ -331,6 +333,25 @@ def reformat_date(json_data): and "date" in item["indico"] and "event_id" in item["indico"] ] + if json_data.get("legacy_recid") in [319681, 319687]: + # Couldn't match date and video. Use the first date and add them to curation + legacy_dates = get_values_in_json( + json_data.get("_curation", {}), "legacy_dates", type=list + ) + legacy_dates.extend(dates_set) + json_data["_curation"]["legacy_dates"] = legacy_dates + dates_set = [sorted(dates_set)[0]] + + # TODO: update after migrator fix + if not dates_set: + dates_set = [json_data.get("lecture_created")] + self.migration_logger.add_information( + json_data.get("recid"), + state={ + "message": "Lecture created date used!", + "value": json_data.get("lecture_created"), + }, + ) # Return the valid date if len(dates_set) >= 1 and not self.has_multiple_master: @@ -353,9 +374,10 @@ def reformat_date(json_data): def description(json_data): """Reformat the description for the cds-videos data model.""" - if not json_data.get("description"): + description = json_data.get("description", "") + if not description: return json_data.get("title").get("title") - return json_data.get("description") + return description def format_contributors(json_data): """ @@ -394,7 +416,7 @@ def publication_date(json_data): def notes(json_data): """Get the notes.""" - notes = entry.get("notes") + notes = get_values_in_json(json_data, "notes", type=list) if notes: note_strings = [note.get("note") for note in notes] return "\n".join(note_strings) @@ -448,7 +470,7 @@ def 
location(json_data): def get_report_number(json_data): """Return the report number.""" report_numbers = get_values_in_json(json_data, "report_number", type=list) - if len(report_numbers) > 1: + if len(report_numbers) >= 1: # If report number exists put it in curation report_number = report_numbers[0] return report_numbers, self.check_pid_exists( @@ -459,7 +481,9 @@ def get_report_number(json_data): def get_keywords(json_data): """Return keywords.""" keywords = json_data.get("keywords", []) - subject_categories = json_data.get("subject_categories", []) + subject_categories = get_values_in_json( + json_data, "subject_categories", type=list + ) subject_indicators = json_data.get("subject_indicators", []) all_keywords = [ @@ -578,7 +602,9 @@ def get_curation(json_data): """Return _curation.""" _curation = json_data.get("_curation", {}) # Add volumes - additional_titles = json_data.get("additional_titles", []) + additional_titles = get_values_in_json( + json_data, "additional_titles", type=list + ) volumes = [item["volume"] for item in additional_titles if "volume" in item] if volumes: _curation["volumes"] = volumes @@ -594,7 +620,7 @@ def get_curation(json_data): def get_additional_titles(json_data): """Return additional_titles.""" - tag_246 = json_data.get("additional_titles", {}) + tag_246 = get_values_in_json(json_data, "additional_titles", type=list) _titles = [item for item in tag_246 if "title" in item] additional_titles = [] for title_item in _titles: @@ -626,10 +652,10 @@ def get_access(json_data): """Generate access permissions based on restrictions.""" # Update permissions update = [current_app.config["WEBLECTURES_MIGRATION_SYSTEM_USER"]] - submitter = json_data.get("submitter") + submitter = json_data.get("submitters", []) if submitter: # Add submitter to update list - update.append(submitter) + update.extend(submitter) collections = get_collections(json_data) if "Lectures::CERN Accelerator School" in collections: 
update.extend(current_app.config["CAS_LECTURES_ACCESS"]) @@ -643,6 +669,13 @@ def get_access(json_data): access["read"] = all_emails return access + def get_language(json_data): + """Return language.""" + language = json_data.get("language") + if not language: + raise MissingRequiredField(message="Language is missing!") + return language + record_dates = reformat_date(entry) # Date will be None if record has multiple masters record_date = record_dates[0] if not self.has_multiple_master else None @@ -650,7 +683,7 @@ def get_access(json_data): "title": entry["title"], "description": description(entry), "contributors": format_contributors(entry), - "language": entry.get("language"), + "language": get_language(entry), "date": record_date, "publication_date": publication_date(entry) or record_date, "keywords": get_keywords(entry), @@ -726,6 +759,7 @@ def transform(self, entry): } if self.has_multiple_master: record_json_output["is_multiple_video_record"] = True + self.multiple_video_record_entries["recid"] = json_data["legacy_recid"] try: mapped_multiple_video_record, common = transform_multiple_video_record( self.multiple_video_record_entries @@ -747,6 +781,10 @@ def transform(self, entry): ) else: raise e + mapped_multiple_video_record = sorted( + mapped_multiple_video_record, + key=lambda r: r["files"]["master_path"], + ) if len(mapped_multiple_video_record) > 10: self.migration_logger.add_information( json_data["recid"], @@ -804,6 +842,35 @@ def transform(self, entry): record_json_output["metadata"] = metadata record_json_output["multiple_video_record"] = mapped_multiple_video_record + # Try to match metadata for multiple video record + curation = metadata.get("_curation", {}) + tracked_fields = { + "digitized_description": "Digitized description matched!", + "digitized_language": "Digitized language matched!", + "digitized_keywords": "Digitized keywords matched!", + "legacy_report_number": "Legacy report number matched!", + } + + if any(field in curation for field 
in tracked_fields): + mapped_multiple_video_record, new_curation = try_to_match_metadata( + mapped_multiple_video_record, curation + ) + + for field, message in tracked_fields.items(): + if curation.get(field) != new_curation.get(field): + self.migration_logger.add_information( + json_data["recid"], + { + "message": message, + "value": field, + }, + ) + metadata["_curation"] = new_curation + record_json_output["metadata"] = metadata + record_json_output["multiple_video_record"] = ( + mapped_multiple_video_record + ) + return { "created": self._created(record_dump), "updated": self._updated(record_dump), diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py index c0d2984b..b51614d0 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/contributors.py @@ -52,7 +52,7 @@ def get_contributor(key, value, contributor_role="", name=""): contributor = {"name": name} if affiliation: if isinstance(affiliation, (list, tuple)): - contributor["affiliations"] = list(affiliation) + contributor["affiliations"] = list(set(affiliation)) elif isinstance(affiliation, str): contributor["affiliations"] = [affiliation] if contributor_role: diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/curation.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/curation.py index 959e4206..e18c5ea6 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/curation.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/curation.py @@ -11,8 +11,17 @@ def transform_subfields(key, value): """Helper to transform MARC subfields into key-prefixed strings.""" value = dict(value) + + priority = { 
+ "9": 0, + "8": 1, + } + output = [] - for subfield, subvalue in value.items(): + for subfield, subvalue in sorted( + value.items(), + key=lambda item: (priority.get(item[0], 99), item[0]), + ): if isinstance(subvalue, (list, tuple)): for item in subvalue: output.append(f"{key}{subfield}:{item}") diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/multiple_video.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/multiple_video.py index 69143b41..490d5df1 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/multiple_video.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/quality/multiple_video.py @@ -6,12 +6,17 @@ # the terms of the MIT License; see LICENSE file for more details. """CDS-Videos transform step module.""" +import copy import logging import re -from cds_migrator_kit.errors import ManualImportRequired +import pycountry + +from cds_migrator_kit.errors import ManualImportRequired, UnexpectedValue +from cds_migrator_kit.transform.xml_processing.quality.parsers import clean_str from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.identifiers import ( get_new_indico_id, + transform_legacy_urls, ) cli_logger = logging.getLogger("migrator") @@ -70,6 +75,15 @@ def transform_multiple_video_record(multiple_video_record): None, ) if not indico_id: + if multiple_video_record["recid"] == 468903: + # use publication date for missing dates + mapped.append( + { + "files": file_group, + "date": "2000-08-07", + } + ) + continue raise ManualImportRequired( f"Multi video record: No matching indico id for {file_group['master_path']}", stage="transform", @@ -152,13 +166,8 @@ def transform_multiple_video_record(multiple_video_record): # raise if anything is missing for record in mapped: - # raise if any date, event_id, or files is missing - if ( - not record.get("date") - or not record.get("event_id") - or not 
record.get("files") - or not record.get("url") - ): + # raise if any date or files is missing + if not record.get("date") or not record.get("files"): raise ManualImportRequired( message="Multiple video record needs curation. Date, event_id, or files is missing", stage="transform", @@ -207,3 +216,301 @@ def transform_multiple_video_wihtout_indico(multiple_video_record): ) return mapped, None + + +def parse_entry(entry): + """Parse entry into code and value.""" + left, value = entry.split(":", 1) + code = left.split("__")[-1].split("_")[-1] + return code, value + + +def grouped_values_with_code(entries): + """ + Group flat entries into logical MARC-like groups. + A new group starts when a new 9 subfield appears. + """ + groups = [] + current_parsed = {} + current_raw = [] + + for entry in entries: + code, value = parse_entry(entry) + + if code == "9" and current_raw: + groups.append({"parsed": current_parsed, "raw": current_raw}) + current_parsed = {} + current_raw = [] + + current_parsed[code] = value + current_raw.append(entry) + + if current_raw: + groups.append({"parsed": current_parsed, "raw": current_raw}) + + return groups + + +def get_single_selector(event_id): + """ + Examples: + CERN-VIDEO-C-123-A -> "a" + CERN-VIDEO-C-402-A_pt1 -> "a" + CERN-VIDEO-C-402-A_pt2 -> "a" + CERN-VIDEO-C-123-B-C -> None + """ + if not event_id: + return None + + last_part = event_id.split("-")[-1] + match = re.fullmatch(r"([A-Za-z])(?:_pt\d+)?", last_part) + if match: + return match.group(1).lower() + + return None + + +def match_with_code(entries, event_id, value_code="a"): + """ + Returns: + matches: matched values + matched_groups: raw groups that matched this record + + Rules: + - groups without selector 8 are ignored for matching + - groups with selector but without target value are ignored for matching + - combined ids like ...-B-C do not match anything + - A_pt1 / A_pt2 both match selector a + """ + matches = [] + matched_groups = [] + event_selector = 
get_single_selector(event_id) + + for group in grouped_values_with_code(entries): + parsed = group["parsed"] + raw = group["raw"] + + selector = parsed.get("8") + value = parsed.get(value_code) + + if not selector: + continue + + if value is None: + continue + + if event_selector and selector.lower() == event_selector: + if value != "": + matches.append(value) + matched_groups.append(tuple(raw)) + + return matches, matched_groups + + +def remove_matched_groups(entries, matched_groups): + """Remove groups that were matched at least once.""" + matched_groups = set(matched_groups) + remaining_entries = [] + + for group in grouped_values_with_code(entries): + raw_tuple = tuple(group["raw"]) + if raw_tuple not in matched_groups: + remaining_entries.extend(group["raw"]) + + return remaining_entries + + +def normalize_languages(raw_langs): + """Normalize languages.""" + normalized = [] + + for r in raw_langs: + try: + lang = pycountry.languages.lookup(clean_str(r).lower()).alpha_2.lower() + except Exception: + raise UnexpectedValue(field="041__", subfield="a", value=r) + + if lang not in normalized: + normalized.append(lang) + + return normalized + + +def try_to_match_metadata(multiple_video_records, _curation): + """Try to match curated metadata for multiple video record.""" + curation = copy.deepcopy(_curation) + mapped_multiple_video_records = copy.deepcopy(multiple_video_records) + + digitized_description = curation.get("digitized_description", []) + digitized_language = curation.get("digitized_language", []) + digitized_keywords = curation.get("digitized_keywords", []) + report_numbers = _curation.get("legacy_report_number", []) + + matched_description_groups = set() + matched_language_groups = set() + matched_keyword_groups = set() + matched_report_numbers = set() + + for mapped_multiple_video_record in mapped_multiple_video_records: + files = mapped_multiple_video_record["files"] + event_id = mapped_multiple_video_record.get("event_id") + + # If indico id is 
present, we can't match + if event_id: + continue + + event_id = files.get("master_path", "").split("/")[-1] + if not event_id: + continue + + matched_digitized_description, description_groups = match_with_code( + digitized_description, event_id, value_code="a" + ) + matched_digitized_language, language_groups = match_with_code( + digitized_language, event_id, value_code="a" + ) + matched_digitized_keywords, keyword_groups = match_with_code( + digitized_keywords, event_id, value_code="a" + ) + + matched_description_groups.update(description_groups) + matched_language_groups.update(language_groups) + matched_keyword_groups.update(keyword_groups) + + if matched_digitized_description: + mapped_multiple_video_record["description"] = matched_digitized_description + + if matched_digitized_language: + mapped_multiple_video_record["language"] = normalize_languages( + matched_digitized_language + ) + + if matched_digitized_keywords: + mapped_multiple_video_record["keywords"] = matched_digitized_keywords + + matched_report_number = next( + ( + report_number + for report_number in report_numbers + if report_number == event_id + ), + None, + ) + if matched_report_number: + mapped_multiple_video_record["report_number"] = [matched_report_number] + matched_report_numbers.add(matched_report_number) + + # Remove matched groups only after all records have been processed + digitized_description = remove_matched_groups( + digitized_description, matched_description_groups + ) + digitized_language = remove_matched_groups( + digitized_language, matched_language_groups + ) + digitized_keywords = remove_matched_groups( + digitized_keywords, matched_keyword_groups + ) + report_numbers = [ + report_number + for report_number in report_numbers + if report_number not in matched_report_numbers + ] + + # update curation + if digitized_description: + curation["digitized_description"] = digitized_description + else: + curation.pop("digitized_description", None) + if digitized_language: + 
curation["digitized_language"] = digitized_language + else: + curation.pop("digitized_language", None) + if digitized_keywords: + curation["digitized_keywords"] = digitized_keywords + else: + curation.pop("digitized_keywords", None) + if report_numbers: + curation["legacy_report_number"] = report_numbers + else: + curation.pop("legacy_report_number", None) + + return mapped_multiple_video_records, curation + + +def update_metadata_multiple_video_record(record, common_metadata): + """Update metadata for multiple video record.""" + # Copy common metadata + metadata = copy.deepcopy(common_metadata) + + # Use the correct metadata for each record + event_id = record.get("event_id") + url = record.get("url") + date = record["date"] + location = record.get("location") + descriptions = record.get("description") + lang = record.get("language") + keywords = record.get("keywords") + report_number = record.get("report_number") + + if lang and lang != [metadata.get("language")]: + # If it's same ignore + raise UnexpectedValue(field="language", subfield="a", value=lang, stage="load") + if report_number: + if metadata.get("report_number") and report_number != metadata.get( + "report_number" + ): + raise UnexpectedValue( + field="report_number", subfield="a", value=report_number, stage="load" + ) + else: + metadata["report_number"] = report_number + if descriptions: + additional_descriptions = metadata.get("additional_descriptions", []) + for description in descriptions: + additional_descriptions.append( + { + "description": description, + "type": "Other", + "lang": "en", + } + ) + metadata["additional_descriptions"] = additional_descriptions + if keywords: + keyword_objects = metadata.get("keywords", []) + keyword_names = [keyword.get("name") for keyword in keyword_objects] + for new_keyword_name in keywords: + if new_keyword_name not in keyword_names: + keyword_objects.append( + { + "name": new_keyword_name, + } + ) + metadata["keywords"] = keyword_objects + + 
related_identifiers = list(metadata.get("related_identifiers", [])) + if event_id: + # Insert event_id at the beginning + related_identifiers.insert( + 0, + { + "scheme": "Indico", + "identifier": str(event_id), + "relation_type": "IsPartOf", + }, + ) + if url: + url = transform_legacy_urls(url, type="indico") + url_identifier = { + "scheme": "URL", + "identifier": url, + "relation_type": "IsPartOf", + } + if url_identifier not in related_identifiers: + related_identifiers.append(url_identifier) + + metadata["related_identifiers"] = related_identifiers + metadata["date"] = date + if location: + metadata["location"] = location + + return metadata diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py index 57369455..20626291 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/base.py @@ -92,17 +92,18 @@ def contributors(self, key, value): return get_contributor(key, value) -@model.over("submitter", "(^859__)|(^856__)") +@model.over("submitters", "(^859__)|(^856__)") +@for_each_value @require(["f"]) def record_submitter(self, key, value): """Translate record submitter.""" submitter = value.get("f") + if not submitter: + raise UnexpectedValue(field=key, subfield="f", value=value.get("f")) if type(submitter) is tuple: submitter = submitter[0] raise UnexpectedValue(field=key, subfield="f", value=value.get("f")) - if submitter: - submitter = submitter.lower() - return submitter + return submitter.lower() @model.over("_created", "(^916__)") diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/digitized_video_lecture.py b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/digitized_video_lecture.py new file mode 100644 index 00000000..540657b8 --- /dev/null +++ 
b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/digitized_video_lecture.py @@ -0,0 +1,653 @@ +# -*- coding: utf-8 -*- +# +# This file is part of CERN Document Server. +# Copyright (C) 2026 CERN. +# +# Invenio is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. + + +"""CDS-Videos Digitized Video Lecture rules.""" + +import pycountry +from dojson.errors import IgnoreKey + +from cds_migrator_kit.errors import MissingRequiredField, UnexpectedValue +from cds_migrator_kit.transform.xml_processing.quality.decorators import ( + for_each_value, + require, + strip_output, +) +from cds_migrator_kit.transform.xml_processing.quality.parsers import ( + StringValue, + clean_str, +) +from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.curation import ( + transform_subfields, +) + +from ...models.digitized_video_lecture import model # noqa: F401 +from ..quality.dates import parse_date +from .video_lecture import presented_at, series, url_files + + +def append_transformed_subfields(self, key, value, field_name, subfield_name=None): + """Helper to append transformed subfields to a curation field.""" + curation = self["_curation"] + transformed = transform_subfields(key, value) + + if subfield_name: + existing_values = curation.setdefault(field_name, {}) + legacy_field = existing_values.get(subfield_name, []) + legacy_field.extend(transformed) + if legacy_field: + curation[field_name][subfield_name] = legacy_field + else: + existing_values = curation.get(field_name, []) + existing_values.extend(transformed) + if existing_values: + curation[field_name] = existing_values + + +def add_contributor(self, name, role): + """Add a contributor to the record.""" + contributors = self.get("contributors", []) + contributor_names = [contributor["name"] for 
contributor in contributors] + if name and name not in contributor_names: + contributors.append({"name": name, "role": role}) + self["contributors"] = contributors + + +def validate_copyright(self, key, holder, year): + if "copyright" in self: + copyright = self["copyright"] + if holder.lower() not in copyright["holder"].lower(): + raise UnexpectedValue(message="Copyright holders not matching!", field=key) + if year: + if "year" not in copyright: + self["copyright"]["year"] = year + elif year not in copyright["year"]: + # Curated year is the correct year, use that one + # There is only one record with different year: https://cds.cern.ch/record/281783/export/xm + # If different curated years raise an error + curated_copyright = self.get("curated_copyright", {}) + curated_year = curated_copyright.get("year") + if curated_year and curated_year != copyright.get("year"): + raise UnexpectedValue( + message="Copyright years not matching!", field=key + ) + # Different year in curated copyright + if curated_year and key == "5421_": + raise UnexpectedValue( + message="Copyright years not matching!", field=key + ) + else: + copyright = {"holder": holder} + if year: + copyright["year"] = year + if "cern" in holder.lower(): + copyright["url"] = "http://copyright.web.cern.ch" + self["copyright"] = copyright + return copyright + + +@model.over("related_id", "^962__", override=True) +@for_each_value +def related_id(self, key, value): + """Translates tag 962.""" + pagination = value.get("k", "").strip() + # Only 2 records has this field: 300427, 317239 + if pagination and pagination != "no pagination": + raise UnexpectedValue(field=key, subfield="k", value=pagination) + new_related_id = presented_at(self, key, value) + if new_related_id: + rel_id = new_related_id[0] + rel_ids = self.get("related_identifiers", []) + if rel_id not in rel_ids: + rel_ids.append(rel_id) + self["related_identifiers"] = rel_ids + raise IgnoreKey("related_id") + + +@model.over("descriptions", "^520__", 
override=True) +@for_each_value +def descriptions(self, key, value): + """Translates description.""" + description_text = StringValue(value.get("a")).parse() + provenance = value.get("9", "").strip() + curation_info = value.get("8", "").strip() + + record_description = self.get("description", "") + + # Decide once whether we need to append legacy data + should_append_legacy = bool(curation_info or provenance or record_description) + + if not record_description: + self["description"] = description_text + + if should_append_legacy: + append_transformed_subfields(self, key, value, "digitized_description") + + IgnoreKey("descriptions") + + +@model.over("language", "^041__", override=True) +@require(["a"]) +@for_each_value +@strip_output +def languages(self, key, value): + """Translates language field.""" + raw_lang = value.get("a") + raw_lang = raw_lang if isinstance(raw_lang, (list, tuple)) else [raw_lang] + + try: + langs = [ + pycountry.languages.lookup(clean_str(r).lower()).alpha_2.lower() + for r in raw_lang + ] + except Exception: + raise UnexpectedValue(field=key, subfield="a", value=raw_lang) + + if not langs: + raise MissingRequiredField(field=key, subfield="a", value=raw_lang) + provenance = value.get("9", "").strip() + + curation_field = value.get("8", "").strip() + if curation_field or provenance: + append_transformed_subfields(self, key, value, "digitized_language") + + language = self.get("language", "") + additional_langs = langs[1:] + if not language: + self["language"] = langs[0] + else: + additional_langs = langs + + # Extend additional languages if not already present + record_additional_languages = self.get("additional_languages", []) + additional_langs = [ + lang + for lang in additional_langs + if lang != language and lang not in record_additional_languages + ] + record_additional_languages.extend(additional_langs) + self["additional_languages"] = record_additional_languages + + IgnoreKey("language") + + +@model.over("keywords", "^653[12_]_", 
override=True) +@require(["a"]) +@for_each_value +def keywords(self, key, value): + """Translates keywords from tag 6531.""" + keyword = value.get("a", "").strip() + provenance = value.get("9", "").strip() + if provenance and provenance not in ["CERN", "review", "review Mar2021", "CERN QA"]: + # checking if anything else stored in this field + raise UnexpectedValue(field=key, subfield="9", value=provenance) + curation_field = value.get("8", "").strip() + if curation_field and keyword: + append_transformed_subfields(self, key, value, "digitized_keywords") + if keyword: + return {"name": keyword} + + +@model.over("copyright", "^542__", override=True) +def copyright(self, key, value): + """Translates copyright.""" + holder = value.get("d", "").strip() + a_value = value.get("a", "").strip() + year = value.get("g", "").strip() + if holder and a_value: + raise UnexpectedValue( + message="Holder and a_value present at the same time!", field=key + ) + if not holder: + holder = a_value + if "copyright" in self: + validate_copyright(self, key, holder, year) + raise IgnoreKey("copyright") + statement = value.get("f", "").strip() + material = value.get("3", "").strip() + + # Drop material + if material and material not in ["publication", "Report"]: + raise UnexpectedValue(field=key, subfield="3", value=material) + + full_holder = f"{holder} {statement}".strip() + if not full_holder: + raise UnexpectedValue(message="Holder is missing for copyright!") + + copyright = {"holder": full_holder} + if year: + copyright["year"] = year + if "cern" in holder.lower(): + copyright["url"] = "http://copyright.web.cern.ch" + + return copyright + + +@model.over("publication_date", "^269__", override=True) +@for_each_value +def imprint(self, key, value): + """Translates date from tag 269.""" + name = value.get("b") + place = value.get("a") + + # Transform as contributor if different than CERN Geneva + producer = " ".join( + part + for part in (place, name) + if part and part.upper() not in 
{"GENEVA", "CERN"} + ) + if producer: + self["contributors"].append({"name": producer, "role": "Producer"}) + + provenance = value.get("9", "").strip() + if provenance: + append_transformed_subfields(self, key, value, "digitized_imprint_date") + raise IgnoreKey("publication_date") + + date_field = value.get("c") # 269 'c' subfield (e.g., '1993-08-09') + parsed_date = parse_date(date_field) + if parsed_date: # If parsing succeeds, return the formatted date + return parsed_date + parsed_date = parse_date(place) + if parsed_date: + return parsed_date + + +@model.over("imprint_date", "^260__", override=True) +@for_each_value +def tag260(self, key, value): + """Check tag 269.""" + + def validate_location(val, subfield): + if val and val.upper() not in {"GENEVA", "CERN"}: + raise UnexpectedValue( + field=key, subfield=subfield, value=val, message="Unexpected tag 260" + ) + + name = value.get("b") + place = value.get("a") + curation_info = value.get("8", "").strip() + provenance = value.get("9", "").strip() + if provenance and provenance not in ["review", "review Mar2021"]: + raise UnexpectedValue(field=key, subfield="9", value=provenance) + if curation_info: + append_transformed_subfields(self, key, value, "digitized_imprint_date") + + validate_location(place, "a") + validate_location(name, "b") + + date_field = value.get("c") + parsed_date = parse_date(date_field) + if parsed_date: # If parsing succeeds, return the formatted date + return parsed_date + raise IgnoreKey("imprint_date") + + +@model.over("notes", "^500__", override=True) +@for_each_value +@require(["a"]) +def notes(self, key, value): + """Detects notes.""" + curation_field = value.get("8", "").strip() + if curation_field: + append_transformed_subfields(self, key, value, "digitized_notes") + return None + note_str = value.get("a").strip() + if value.get("9"): + note_str = value.get("9").strip() + " : " + value.get("a").strip() + note = {"note": note_str} + + parts = note_str.split(",") + match_date = 
@model.over("subject_indicators", "(^690C_)|(^690c_)", override=True)
@for_each_value
def subject_indicators(self, key, value):
    """Translates subject_indicators as keywords from tag 690C."""
    allowed_subjects = (
        "ACAD",
        "CERN",
        "TALK",
        "movingimages",
        "SSLP",
        "reviewed",
        "quality-controlled",
    )
    subject = value.get("a", "").strip()
    # Anything outside the known vocabulary signals unexpected legacy data.
    if subject and subject not in allowed_subjects:
        raise UnexpectedValue(field=key, subfield="a", value=subject)
    curated_field = value.get("9", "").strip()
    if curated_field and curated_field not in ("review", "CERN QA"):
        # checking if anything else stored in this field
        raise UnexpectedValue(field=key, subfield="9", value=curated_field)
    return {"name": subject}
+ """ + keyword = value.get("a", "").strip() + source = value.get("2", "").strip() + if source and source != "SzGeCERN": + # checking if anything else stored in this field + raise UnexpectedValue(field=key, subfield="9", value=source) + provenance = value.get("9", "").strip() + curation_field = value.get("8", "").strip() + if curation_field or provenance: + append_transformed_subfields(self, key, value, "digitized_subject_categories") + return None + if keyword: + return {"name": keyword} + + +@model.over("additional_descriptions", "(^590__)|(^490__)", override=True) +@for_each_value +def additional_descriptions(self, key, value): + """Translates additional_descriptions.""" + if key == "490__": + additional_description = series(self, key, value) + return additional_description[0] + description = value.get("a", "").strip() + provenance = value.get("9", "").strip() + curation_information = value.get("8", "").strip() + if provenance or curation_information: + append_transformed_subfields(self, key, value, "digitized_description") + return None + if description: + return {"description": description, "type": "Other", "lang": "fr"} + return None + + +@model.over("lecture_created", "^961__", override=True) +def creation_date(self, key, value): + """Translate record creation date. + + - tag 961, subfield code x for creation date + - tag 961, subfield code c for modification_date + + It can also store: + - library 'l' subfield + - hour 'h' subfield + - cataloguer 'a' subfield (name of the curator) + - cataloguer level 'b' subfield (curator?) 
+ """ + + b_value = value.get("b", "").strip().lower() + if b_value and b_value != "curator": + raise UnexpectedValue(field=key, subfield="b", value=b_value) + a_value = value.get("a", "").strip() + if b_value or a_value: + append_transformed_subfields(self, key, value, "cds_modification_field") + raise IgnoreKey("lecture_created") + + # 961 'x' subfield + creation_date = value.get("x", "").strip() + parsed_creation_date = parse_date(creation_date) + if not parsed_creation_date: + # Check if anything else stored + raise UnexpectedValue(field=key, subfield="x", value=creation_date) + # 961 'c' subfield + modification_date = value.get("c", "").strip() + parsed_modification_date = parse_date(modification_date) + if modification_date and not parsed_modification_date: + # Check if anything else stored + raise UnexpectedValue(field=key, subfield="c", value=modification_date) + return parsed_creation_date + + +@model.over("action_note", "^5831_") +@for_each_value +def action_note(self, key, value): + """Translates action note (digitized information).""" + def format_field(value, subfield): + val = value.get(subfield) + if isinstance(val, (list, tuple)): + if subfield == "f": + cleaned = [str(v).strip() for v in val if v and str(v).strip()] + return cleaned if cleaned else None + raise UnexpectedValue(field=key, subfield=subfield, value=val) + if subfield == "f" and val: + return [val.strip()] + return val if val else None + + preservation_entry = { + k: v + for k, v in { + "source": format_field(value, "2"), + "format": format_field(value, "3"), + "institution": format_field(value, "5"), + "batch": format_field(value, "6"), + "sequence_identifier": format_field(value, "8"), + "action": format_field(value, "a"), + "digitization_setup": format_field(value, "b"), + "date": format_field(value, "c"), + "preservation_notes": format_field(value, "f"), + "workflow": format_field(value, "i"), + "vendor": format_field(value, "k"), + "title": format_field(value, "l"), + 
"duration_value": format_field(value, "n"), + "duration_unit": format_field(value, "o"), + "related_record": format_field(value, "u"), + "timing_note": format_field(value, "x"), + "quality_control_note": format_field(value, "z"), + }.items() + if v + } + preservation_values = self["_curation"].get("preservation_values", []) + preservation_values.append(preservation_entry) + self["_curation"]["preservation_values"] = preservation_values + raise IgnoreKey("action_note") + + +@model.over("curated_copyright", "^5421_") +@for_each_value +def curation_copyright(self, key, value): + """Translates curated copyright information.""" + provenance = value.get("9", "").strip() + if provenance and provenance not in ["review", "review Mar2021"]: + # checking if anything else stored in this field + raise UnexpectedValue(field=key, subfield="9", value=provenance) + year = value.get("g", "").strip() + holder = value.get("d", "").strip() + copyright = validate_copyright(self, key, holder, year) + if copyright: + self["curated_copyright"] = copyright + raise IgnoreKey("curated_copyright") + + +@model.over("597", "^597__") +@for_each_value +def curation_local_note(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "digitized_comments") + + +@model.over("514", "^514__") +@for_each_value +def data_quality_note(self, key, value): + """Translates digitized information.""" + append_transformed_subfields(self, key, value, "digitized_data_quality_note") + + +@model.over("594", "^594__") +@for_each_value +def curated_type(self, key, value): + """Translates digitized information.""" + type = value.get("a", "").strip() + if type and type not in [ + "Conference Speech", + "Conférence Speech", + "Footage", + "Video Clip", + "Movie", + ]: + raise UnexpectedValue(field=key, subfield="a", value=type) + provenance = value.get("9", "").strip() + if provenance and provenance not in ["review", "review Mar2021", "CERN QA"]: + # checking if 
@model.over("594", "^594__")
@for_each_value
def curated_type(self, key, value):
    """Translates digitized information (curated record type, tag 594)."""
    # Closed vocabulary of curated types seen on digitized records.
    type = value.get("a", "").strip()
    if type and type not in [
        "Conference Speech",
        "Conférence Speech",
        "Footage",
        "Video Clip",
        "Movie",
    ]:
        raise UnexpectedValue(field=key, subfield="a", value=type)
    provenance = value.get("9", "").strip()
    if provenance and provenance not in ["review", "review Mar2021", "CERN QA"]:
        # checking if anything else stored in this field
        raise UnexpectedValue(field=key, subfield="9", value=provenance)
    # add as digitized keyword to match with multiple video record
    curation_field = value.get("8", "").strip()
    if curation_field:
        append_transformed_subfields(self, key, value, "digitized_keywords")
    # add as keyword if not already present
    else:
        keywords_names = [keyword["name"] for keyword in self.get("keywords", [])]
        if type and type not in keywords_names:
            # NOTE(review): assumes self["keywords"] exists here — confirm the
            # model pre-initializes it, otherwise this raises KeyError.
            self["keywords"].append({"name": type})
    raise IgnoreKey("594")


@model.over("performers", "5111_")
@for_each_value
def performer_note(self, key, value):
    """Translates digitized information (performer, tag 511)."""
    contributor_name = value.get("a", "").strip()
    # Sometimes it's contributor
    provenance = value.get("9", "").strip()
    if provenance and provenance not in ["review", "review Mar2021", "CERN QA"]:
        # checking if anything else stored in this field
        raise UnexpectedValue(field=key, subfield="9", value=provenance)

    # Check if any contributor is different
    performer = self.get("performer", "")
    if performer:
        if performer != contributor_name:
            # Conflicting performer: keep the legacy value in curation data.
            append_transformed_subfields(self, key, value, "digitized_filmed_people")
            raise IgnoreKey("performers")
    else:
        self["performer"] = contributor_name

    add_contributor(self, contributor_name, "Speaker")
    raise IgnoreKey("performers")
@model.over("963", "^963__")
@for_each_value
def digitized_access(self, key, value):
    """Translates digitized information (access flag, tag 963)."""
    # One record's video is restricted. https://cds.cern.ch/record/1566223/
    # If it's restricted digitized link is restricted to cern only
    access_flag = value.get("a", "").strip()
    if access_flag and access_flag.lower() not in ("public", "restricted"):
        raise UnexpectedValue(field=key, subfield="a", value=access_flag)
    append_transformed_subfields(self, key, value, "digitized_access")
    raise IgnoreKey("963")


@model.over("993", "^993__")
@for_each_value
def digitized_993(self, key, value):
    """Translates digitized information (tag 993)."""
    topic = value.get("t", "").strip()
    # Only one record have this field: https://cds.cern.ch/record/690303/export/xm
    if topic and topic != "Fusion Research":
        raise UnexpectedValue(field=key, subfield="t", value=topic)
    existing_names = [entry["name"] for entry in self.get("keywords", [])]
    if topic and topic not in existing_names:
        self["keywords"].append({"name": topic})
    raise IgnoreKey("993")


@model.over("344", "^344__")
@for_each_value
def digitized_344(self, key, value):
    """Translates digitized information (tag 344)."""
    # Only one record have this field: https://cds.cern.ch/record/319677/export/xm
    media_type = value.get("a", "").strip()
    if media_type and media_type.lower() != "video":
        raise UnexpectedValue(field=key, subfield="a", value=media_type)
    raise IgnoreKey("344")
@model.over("directed_by", "(^5081_)|(^508__)")
@for_each_value
def directed_by(self, key, value):
    """Translates directed by information (tag 508).

    All the values are the same: '9' and '8' subfields are deliberately
    ignored and the name is transformed as a Director contributor. Raises
    ``UnexpectedValue`` when two 508 fields disagree on the director name.
    """
    director_name = value.get("a", "").strip()
    provenance = value.get("9", "").strip()
    if provenance and provenance not in ["CERN QA", "review"]:
        # checking if anything else stored in this field
        raise UnexpectedValue(field=key, subfield="9", value=provenance)
    # FIX: default was an empty *list* although the stored value is always a
    # string; use "" so both branches operate on one type (both are falsy, so
    # behavior is unchanged). The unused read of subfield '8' was dropped.
    directed_by = self.get("directed_by", "")
    if directed_by:
        if directed_by.lower() != director_name.lower():
            # Checking if all the values are the same
            raise UnexpectedValue(field=key, subfield="a", value=director_name)
    else:
        self["directed_by"] = director_name

    add_contributor(self, director_name, "Director")
    raise IgnoreKey("directed_by")


@model.over("020", "^020__")
@for_each_value
def book_number(self, key, value):
    """Translates digitized information (tag 020)."""
    # Only one record have this field: https://cds.cern.ch/record/334106/export/xm
    append_transformed_subfields(self, key, value, "legacy_marc_fields", "020")


@model.over("856", "^856_2")
@for_each_value
def digitized_856_2(self, key, value):
    """Translates digitized information (URL, tag 856_2)."""
    # Looks like 8564_ field: https://cds.cern.ch/record/1565555/export/xm
    url_file = url_files(self, key, value)
    if url_file:
        self["url_files"].append(url_file[0])
    raise IgnoreKey("856_2")


@model.over("775", "^775__")
@require(["w"])
@for_each_value
def digitized_775(self, key, value):
    """Translates digitized information (related edition, tag 775)."""
    # Only one record: https://cds.cern.ch/record/423086/
    recid = value.get("w", "").strip()
    resource_type = value.get("b", "").strip().lower()
    if resource_type and resource_type != "article":
        raise UnexpectedValue(field=key, subfield="b", value=resource_type)
    related_identifier = {
        "identifier": recid,
        "scheme": "CDS",
        "relation_type": "IsVariantFormOf",
    }
    if resource_type:
        related_identifier["resource_type"] = "Text"
    # Add as related identifier if not present
    rel_ids = self.get("related_identifiers", [])
    if related_identifier not in rel_ids:
        rel_ids.append(related_identifier)
    self["related_identifiers"] = rel_ids
    raise IgnoreKey("775")


@model.over("300", "^300__")
@for_each_value
def pyhsical_description(self, key, value):
    """Translates tag 300 (physical description).

    NOTE(review): the function name carries a typo ("pyhsical"); it is kept
    unchanged because it is the block's public identifier.
    """
    append_transformed_subfields(self, key, value, "digitized_physical_description")
b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py index 34555fef..4b599bd6 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/xml_processing/rules/video_lecture.py @@ -157,6 +157,7 @@ def format_field(val): "nonpublic_note": format_field(value.get("x")), "md5_checksum": format_field(value.get("w")), "source": format_field(value.get("2")), + "description": format_field(value.get("i")), }.items() if v } @@ -165,7 +166,7 @@ def format_field(val): indico_link = {} # Try to get event id - match_id = re.search(r"(?:ida=|confId=|event/)([\w\d]+)", url) + match_id = re.search(r"(?:ida=|confId=|confid=|event/)([\w\d]+)", url) if match_id: event_id = match_id.group(1) if event_id: @@ -453,7 +454,12 @@ def additional_titles(self, key, value): if lang == "Titre français": additional_title["lang"] = "fr" # Transform as AlternativeTitle - elif lang not in ["Previous title", "Also quoted as"]: + elif lang not in [ + "Previous title", + "Also quoted as", + "Former title", + "Original title", + ]: raise UnexpectedValue(field=key, subfield="i", value=lang) if volume: diff --git a/setup.cfg b/setup.cfg index c3f95e06..e1bf28ac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,7 +39,8 @@ rdm = videos = Flask-Security-Invenio==3.4.0 marshmallow<4.0 - cds @ git+https://github.com/CERNDocumentServer/cds-videos@additional-files#egg=cds + cds @ git+https://github.com/CERNDocumentServer/cds-videos@main#egg=cds + srt-to-vtt==1.0.0 tests = pytest-black>=0.3.0 @@ -165,11 +166,16 @@ invenio_pidstore.minters = # videos migration cds_migrator_kit.videos.models = video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.models.video_lecture:model + digitized_video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.models.digitized_video_lecture:model cds_migrator_kit.videos.rules.base = base = 
cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.base cds_migrator_kit.videos.rules.video_lecture = base = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.base video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.video_lecture +cds_migrator_kit.videos.rules.digitized_video_lecture = + base = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.base + video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.video_lecture + digitized_video_lecture = cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.rules.digitized_video_lecture cds_migrator_kit.videos.submitter.model = videos_submitter = cds_migrator_kit.videos.weblecture_migration.users.transform.xml_processing.models.submitter:videos_submitter_model From afbda110c5a121845a8252405370df0a7ffc00db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Thu, 2 Apr 2026 12:41:08 +0200 Subject: [PATCH 2/2] videos: use creation date for missing date digitized records --- .../videos/weblecture_migration/transform/transform.py | 8 +++++--- tests/cds-videos/test_videos_transform_rules.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py index 549160c6..996d636d 100644 --- a/cds_migrator_kit/videos/weblecture_migration/transform/transform.py +++ b/cds_migrator_kit/videos/weblecture_migration/transform/transform.py @@ -342,14 +342,14 @@ def reformat_date(json_data): json_data["_curation"]["legacy_dates"] = legacy_dates dates_set = [sorted(dates_set)[0]] - # TODO: update after migrator fix + # TODO: remove this part after digitized lecturemedia migration if not dates_set: - dates_set = [json_data.get("lecture_created")] + dates_set = [json_data["creation_date"]] 
self.migration_logger.add_information( json_data.get("recid"), state={ "message": "Lecture created date used!", - "value": json_data.get("lecture_created"), + "value": json_data["creation_date"], }, ) @@ -749,6 +749,8 @@ def transform(self, entry): # Generate media files before metadata, it'll set if record has multiple masters media_files = self._media_files(json_data) + creation_date = self._created(record_dump) + json_data["creation_date"] = creation_date.date().isoformat() record_json_output = { "metadata": self._metadata(json_data), "created": self._created(record_dump), diff --git a/tests/cds-videos/test_videos_transform_rules.py b/tests/cds-videos/test_videos_transform_rules.py index 3bd7f89a..2edb91c6 100644 --- a/tests/cds-videos/test_videos_transform_rules.py +++ b/tests/cds-videos/test_videos_transform_rules.py @@ -159,7 +159,7 @@ def test_transform_date(dumpdir, base_app): res = load_and_dump_revision(modified_data) # Transform record - with pytest.raises(MissingRequiredField): + with pytest.raises(Exception): record_entry._metadata(res) @@ -228,7 +228,7 @@ def test_transform_digitized(dumpdir, base_app): # Transform record it should fail (no valid date, it has date range) record_entry = CDSToVideosRecordEntry() - with pytest.raises(MissingRequiredField): + with pytest.raises(Exception): record_entry._metadata(res) @@ -295,7 +295,7 @@ def test_transform_note(dumpdir, base_app): # Transform record it should fail (no valid date, it has date range) record_entry = CDSToVideosRecordEntry() - with pytest.raises(MissingRequiredField): + with pytest.raises(Exception): record_entry._metadata(res) # Test case: Add internal note which has a valid date to record