Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 6 additions & 34 deletions cds_migrator_kit/videos/weblecture_migration/load/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
MigrationProgressLogger,
RecordStateLogger,
)
from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.identifiers import (
transform_legacy_urls,
from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.multiple_video import (
update_metadata_multiple_video_record,
)

from .helpers import (
Expand Down Expand Up @@ -341,38 +341,10 @@ def create_publish_multiple_video_record(self, entry):
media_files = self._get_files(record["files"], json_data.get("files", []))
master_file_id = media_files["master_path"].split("/")[-1]

# Use the correct metadata for each record
event_id = record.get("event_id")
url = record.get("url")
date = record["date"]
location = record.get("location")

metadata = common_metadata.copy()
related_identifiers = list(metadata.get("related_identifiers", []))
if event_id:
# Insert event_id at the beginning
related_identifiers.insert(
0,
{
"scheme": "Indico",
"identifier": str(event_id),
"relation_type": "IsPartOf",
},
)
if url:
url = transform_legacy_urls(url, type="indico")
url_identifier = {
"scheme": "URL",
"identifier": url,
"relation_type": "IsPartOf",
}
if url_identifier not in related_identifiers:
related_identifiers.append(url_identifier)

metadata["related_identifiers"] = related_identifiers
metadata["date"] = date
if location:
metadata["location"] = location
# Update metadata for multiple video record
metadata = update_metadata_multiple_video_record(record, common_metadata)
# Mint report number if matched with video
report_number = metadata.get("report_number", None)

# Create video and flow
video_deposit, video_deposit_id, bucket_id, payload = (
Expand Down
9 changes: 9 additions & 0 deletions cds_migrator_kit/videos/weblecture_migration/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@ records:
transform:
files_dump_dir: /path_to/afs_files/
eos_file_paths_dir: cds_migrator_kit/videos/weblecture_migration/data/file_paths/
digitized-lectures:
data_dir: cds_migrator_kit/videos/weblecture_migration/data/
tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp
log_dir: cds_migrator_kit/videos/weblecture_migration/log
extract:
dirpath: cds_migrator_kit/videos/weblecture_migration/data/digitized-lectures/dump/
transform:
files_dump_dir: /path_to/afs_files/
eos_file_paths_dir: cds_migrator_kit/videos/weblecture_migration/data/file_paths/
submitters:
data_dir: cds_migrator_kit/videos/weblecture_migration/data/users
tmp_dir: cds_migrator_kit/videos/weblecture_migration/tmp/users
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
#
# This file is part of CERN Document Server.
# Copyright (C) 2026 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.


"""CDS-Videos Digitized Video Lecture model."""

from cds_migrator_kit.transform.overdo import CdsOverdo

from .video_lecture import model as video_lecture_model


class DigitizedVideoLecture(CdsOverdo):
    """Translation Index for CERN Digitized Video Lectures.

    Configuration-only model: it selects digitized lecture records from
    the legacy system and declares which MARC fields are ignored and
    which output fields always exist after transformation.
    """

    # Legacy search query selecting the records this model migrates:
    # records with an 'Absolute master path' under /mnt/master_share,
    # tagged as `digitized` (5831_.a), excluding records already marked
    # MIGRATED or DELETED in 980__.
    __query__ = "8567_.x:'Absolute master path' 8567_.d:/mnt/master_share* -980__.C:MIGRATED -980__.c:DELETED 5831_.a:digitized"

    # MARC (tag + subfield) keys deliberately dropped during the
    # transformation; each entry notes what the legacy field held.
    __ignore_keys__ = {
        "0248_a",  # oai identifier
        "0248_p",  # oai identifier
        "0248_q",  # oai
        "100__0",  # Author id (eg: AUTHOR|(CDS)2067852)
        "111__z",  # End date (indico)
        "250__a",  # Edition
        "337__a",  # Checked values only `Video`
        "511__0",  # Author id (eg: AUTHOR|(CDS)2067852)
        "5111_8",  # Video id for performer, multi video records.
        "518__l",  # Lectures: length of speech
        "518__h",  # Lectures: Starting time
        "700__0",  # Author id (eg: AUTHOR|(CDS)2067852)
        "518__e",  # Speaker (1 record has contributor in 518, it's also in 5111)
        "8564_8",  # File: bibdoc id
        "8564_s",  # File: file size
        "916__y",  # Status week year
        "960__a",  # Base number
        # CDS modification tag
        "961__h",
        "961__l",
    }

    # Baseline values merged into every transformed record so downstream
    # steps can rely on these keys being present even when the source
    # record did not provide them.
    _default_fields = {
        "language": "",
        "description": "",
        "performer": "",
        "url_files": [],
        "curated_copyright": {},
        "lecture_infos": [],
        "_curation": {
            "preservation_values": [],
        },
        "contributors": [],
        "alternate_identifiers": [],
        "additional_languages": [],
        "collections": [],
        "keywords": [],
    }


# Module-level model instance: inherits the generic video-lecture rules
# (`video_lecture_model`) and loads digitized-lecture-specific rules from
# the dedicated entry point group.
model = DigitizedVideoLecture(
    bases=(video_lecture_model,),
    entry_point_group="cds_migrator_kit.videos.rules.digitized_video_lecture",
)
91 changes: 80 additions & 11 deletions cds_migrator_kit/videos/weblecture_migration/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from cds_migrator_kit.videos.weblecture_migration.transform.xml_processing.quality.multiple_video import (
transform_multiple_video_record,
transform_multiple_video_wihtout_indico,
try_to_match_metadata,
)

cli_logger = logging.getLogger("migrator")
Expand Down Expand Up @@ -300,7 +301,7 @@ def guess_dates(json_data, key, subkey=None):
### Returns:
- `set[str]`: A set of date strings.
"""
items = json_data.get(key, [])
items = get_values_in_json(json_data, key, type=list)
if subkey:
return {
item[subkey]["date"]
Expand All @@ -318,6 +319,7 @@ def reformat_date(json_data):
or get_values_in_json(json_data, "publication_date")
or guess_dates(json_data, "url_files", subkey="indico")
| guess_dates(json_data, "notes")
or get_values_in_json(json_data, "imprint_date")
)
# If multiple video record use lecture_infos or url_files
lecture_infos = [
Expand All @@ -331,6 +333,25 @@ def reformat_date(json_data):
and "date" in item["indico"]
and "event_id" in item["indico"]
]
if json_data.get("legacy_recid") in [319681, 319687]:
# Couldn't match date and video. Use the first date and add them to curation
legacy_dates = get_values_in_json(
json_data.get("_curation", {}), "legacy_dates", type=list
)
legacy_dates.extend(dates_set)
json_data["_curation"]["legacy_dates"] = legacy_dates
dates_set = [sorted(dates_set)[0]]

# TODO: remove this part after digitized lecturemedia migration
if not dates_set:
dates_set = [json_data["creation_date"]]
self.migration_logger.add_information(
json_data.get("recid"),
state={
"message": "Lecture created date used!",
"value": json_data["creation_date"],
},
)

# Return the valid date
if len(dates_set) >= 1 and not self.has_multiple_master:
Expand All @@ -353,9 +374,10 @@ def reformat_date(json_data):

def description(json_data):
"""Reformat the description for the cds-videos data model."""
if not json_data.get("description"):
description = json_data.get("description", "")
if not description:
return json_data.get("title").get("title")
return json_data.get("description")
return description

def format_contributors(json_data):
"""
Expand Down Expand Up @@ -394,7 +416,7 @@ def publication_date(json_data):

def notes(json_data):
"""Get the notes."""
notes = entry.get("notes")
notes = get_values_in_json(json_data, "notes", type=list)
if notes:
note_strings = [note.get("note") for note in notes]
return "\n".join(note_strings)
Expand Down Expand Up @@ -448,7 +470,7 @@ def location(json_data):
def get_report_number(json_data):
"""Return the report number."""
report_numbers = get_values_in_json(json_data, "report_number", type=list)
if len(report_numbers) > 1:
if len(report_numbers) >= 1:
# If report number exists put it in curation
report_number = report_numbers[0]
return report_numbers, self.check_pid_exists(
Expand All @@ -459,7 +481,9 @@ def get_report_number(json_data):
def get_keywords(json_data):
"""Return keywords."""
keywords = json_data.get("keywords", [])
subject_categories = json_data.get("subject_categories", [])
subject_categories = get_values_in_json(
json_data, "subject_categories", type=list
)
subject_indicators = json_data.get("subject_indicators", [])

all_keywords = [
Expand Down Expand Up @@ -578,7 +602,9 @@ def get_curation(json_data):
"""Return _curation."""
_curation = json_data.get("_curation", {})
# Add volumes
additional_titles = json_data.get("additional_titles", [])
additional_titles = get_values_in_json(
json_data, "additional_titles", type=list
)
volumes = [item["volume"] for item in additional_titles if "volume" in item]
if volumes:
_curation["volumes"] = volumes
Expand All @@ -594,7 +620,7 @@ def get_curation(json_data):

def get_additional_titles(json_data):
"""Return additional_titles."""
tag_246 = json_data.get("additional_titles", {})
tag_246 = get_values_in_json(json_data, "additional_titles", type=list)
_titles = [item for item in tag_246 if "title" in item]
additional_titles = []
for title_item in _titles:
Expand Down Expand Up @@ -626,10 +652,10 @@ def get_access(json_data):
"""Generate access permissions based on restrictions."""
# Update permissions
update = [current_app.config["WEBLECTURES_MIGRATION_SYSTEM_USER"]]
submitter = json_data.get("submitter")
submitter = json_data.get("submitters", [])
if submitter:
# Add submitter to update list
update.append(submitter)
update.extend(submitter)
collections = get_collections(json_data)
if "Lectures::CERN Accelerator School" in collections:
update.extend(current_app.config["CAS_LECTURES_ACCESS"])
Expand All @@ -643,14 +669,21 @@ def get_access(json_data):
access["read"] = all_emails
return access

def get_language(json_data):
    """Return the record's language; raise when it is not set.

    Raises:
        MissingRequiredField: if the record has no (truthy) language.
    """
    if json_data.get("language"):
        return json_data["language"]
    raise MissingRequiredField(message="Language is missing!")

record_dates = reformat_date(entry)
# Date will be None if record has multiple masters
record_date = record_dates[0] if not self.has_multiple_master else None
metadata = {
"title": entry["title"],
"description": description(entry),
"contributors": format_contributors(entry),
"language": entry.get("language"),
"language": get_language(entry),
"date": record_date,
"publication_date": publication_date(entry) or record_date,
"keywords": get_keywords(entry),
Expand Down Expand Up @@ -716,6 +749,8 @@ def transform(self, entry):

# Generate media files before metadata, it'll set if record has multiple masters
media_files = self._media_files(json_data)
creation_date = self._created(record_dump)
json_data["creation_date"] = creation_date.date().isoformat()
record_json_output = {
"metadata": self._metadata(json_data),
"created": self._created(record_dump),
Expand All @@ -726,6 +761,7 @@ def transform(self, entry):
}
if self.has_multiple_master:
record_json_output["is_multiple_video_record"] = True
self.multiple_video_record_entries["recid"] = json_data["legacy_recid"]
try:
mapped_multiple_video_record, common = transform_multiple_video_record(
self.multiple_video_record_entries
Expand All @@ -747,6 +783,10 @@ def transform(self, entry):
)
else:
raise e
mapped_multiple_video_record = sorted(
mapped_multiple_video_record,
key=lambda r: r["files"]["master_path"],
)
if len(mapped_multiple_video_record) > 10:
self.migration_logger.add_information(
json_data["recid"],
Expand Down Expand Up @@ -804,6 +844,35 @@ def transform(self, entry):
record_json_output["metadata"] = metadata
record_json_output["multiple_video_record"] = mapped_multiple_video_record

# Try to match metadata for multiple video record
curation = metadata.get("_curation", {})
tracked_fields = {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is the value of this dict used?

Copy link
Copy Markdown
Contributor Author

@zubeydecivelek zubeydecivelek Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These values are checked to match each one with the correct video; this code just logs whenever any of them matches a video. If any of these keys are present in curation, we attempt to match the values.

Image

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unmatched values are kept in curation

"digitized_description": "Digitized description matched!",
"digitized_language": "Digitized language matched!",
"digitized_keywords": "Digitized keywords matched!",
"legacy_report_number": "Legacy report number matched!",
}

if any(field in curation for field in tracked_fields):
mapped_multiple_video_record, new_curation = try_to_match_metadata(
mapped_multiple_video_record, curation
)

for field, message in tracked_fields.items():
if curation.get(field) != new_curation.get(field):
self.migration_logger.add_information(
json_data["recid"],
{
"message": message,
"value": field,
},
)
metadata["_curation"] = new_curation
record_json_output["metadata"] = metadata
record_json_output["multiple_video_record"] = (
mapped_multiple_video_record
)

return {
"created": self._created(record_dump),
"updated": self._updated(record_dump),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def get_contributor(key, value, contributor_role="", name=""):
contributor = {"name": name}
if affiliation:
if isinstance(affiliation, (list, tuple)):
contributor["affiliations"] = list(affiliation)
contributor["affiliations"] = list(set(affiliation))
elif isinstance(affiliation, str):
contributor["affiliations"] = [affiliation]
if contributor_role:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,17 @@
def transform_subfields(key, value):
"""Helper to transform MARC subfields into key-prefixed strings."""
value = dict(value)

priority = {
"9": 0,
"8": 1,
}

output = []
for subfield, subvalue in value.items():
for subfield, subvalue in sorted(
value.items(),
key=lambda item: (priority.get(item[0], 99), item[0]),
):
if isinstance(subvalue, (list, tuple)):
for item in subvalue:
output.append(f"{key}{subfield}:{item}")
Expand Down
Loading
Loading