From 7cc0f7aa01a900cbec8a86f5085d2de77d3b0afc Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 10:50:13 -0500 Subject: [PATCH 01/21] Initial issue commit. --- scan-batch-dir | 1119 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 874 insertions(+), 245 deletions(-) diff --git a/scan-batch-dir b/scan-batch-dir index 0b70f63..5a9fa25 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -1,4 +1,5 @@ -#!/usr/bin/python3 +#!/usr/bin/python3.12 + import yaml import argparse import os @@ -11,10 +12,14 @@ import logging import openpyxl import csv import io +import re +import requests import pandas as pd from google.oauth2 import service_account from googleapiclient.discovery import build +pd.set_option('future.no_silent_downcasting', True) + # Setup the log file format. log_formatter = logging.Formatter(fmt='%(asctime)s.%(msecs)03d %(levelname)s %(message)s',datefmt="%Y%m%d %H:%M:%S") @@ -172,7 +177,7 @@ def read_google_sheet(spreadsheet_id: str, sheet_name="Sheet1", credentials_file # Create DataFrame df = pd.DataFrame(padded_rows, columns=headers) - + return df except Exception as e: @@ -234,7 +239,7 @@ def update_dataframe(df: pd.DataFrame, match_column: str, match_value: str, upda Args: df (pandas.DataFrame): The DataFrame containing updated data. - match_column (str): + match_column (str): match_value (str): update_dict (dict): @@ -268,7 +273,7 @@ def update_dataframe(df: pd.DataFrame, match_column: str, match_value: str, upda if column in df_copy.columns: df_copy.loc[mask, column] = value else: - logger.warning(f"update_dataframe: Column:{column} does not exist - unable to update with value: {value}.") + logger.warning(f"update_dataframe: Column:{column} does not exist - unable to update with value: {value}.") print(f"update_dataframe: Column:{column} does not exist - unable to update with value: {value}.") # Clean Dataframe of NaN values. 
@@ -278,18 +283,18 @@ def update_dataframe(df: pd.DataFrame, match_column: str, match_value: str, upda except Exception as e: return df, False, f"update_dataframe - An error occurred: {str(e)}" - + def add_row_to_dataframe(df, row_data, key_column): """ Add a new row to a pandas DataFrame, ensuring the key column value is unique. - + Parameters: df (pandas.DataFrame): The input DataFrame row_data (dict): Dictionary containing the new row data with column names as keys key_column (str): The column name to check for uniqueness - + Returns: pandas.DataFrame: DataFrame with the new row appended if key is unique, original DataFrame otherwise """ @@ -302,26 +307,26 @@ def add_row_to_dataframe(df, row_data, key_column): # Check if key_column exists in DataFrame if key_column not in df.columns: raise ValueError(f"add_row_to_dataframe - Key column '{key_column}' not found in DataFrame") - + # Check if key_column exists in row_data if key_column not in row_data: raise ValueError(f"add_row_to_dataframe - Key column '{key_column}' not found in row_data") - + # Check if the key value already exists if row_data[key_column] in df[key_column].values: #print(f"Warning: Key value '{row_data[key_column]}' already exists in column '{key_column}'. Row not added.") return df, False, "add_row_to_dataframe - Row with Key column {key_column} already exists." - + # Convert row_data to DataFrame new_row = pd.DataFrame([row_data]) - + # Append the new row to DataFrame df_ret = pd.concat([df, new_row], ignore_index=True) # Remove any NaN values. 
df = df_ret.fillna('') - return df, True, f"add_row_to_dataframe - Successfully added dataframe row" + return df, True, f"add_row_to_dataframe - Successfully added dataframe row" except Exception as e: return df, False, f"add_row_to_dataframe - Failed to add dataframe row: {str(e)}" @@ -355,10 +360,10 @@ def value_exists_in_column(df, column_name, value): def scan_directory(directory): """ Recursively scan a directory and return a list of files and subdirectories. - + Args: directory (str): Path to the directory to scan. - + Returns: list: List of tuples containing file/subdirectory name, type, and path. """ @@ -381,7 +386,7 @@ def convert_tiff_to_jp2(tif_file_path, jp2_file_path): tif_file_path (str): A full path the TIFF file. jp2_file_path (str): A full path to the resulting JP2 file. Returns: - None + None """ args = [ "/usr/bin/gm", @@ -395,12 +400,12 @@ def convert_tiff_to_jp2(tif_file_path, jp2_file_path): "-define", "jp2:rate=1.0", "-define", - "jp2:lazy", + "jp2:lazy", "-define", "jp2:prg=rlcp", "-define", "jp2:mode=int", - "-define", + "-define", "jp2:ilyrrates='0.015625,0.01858,0.0221,0.025,0.03125,0.03716,0.04419,0.05,0.0625,0.075,0.088,0.1,0.125,0.15,0.18,0.21,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.84'", tif_file_path, jp2_file_path @@ -411,7 +416,8 @@ def convert_tiff_to_jp2(tif_file_path, jp2_file_path): if (result.returncode == 0): logger.info(f"Successfully converted TIFF to JP2: {tif_file_path}") except Exception as e: - raise Exception(f"Failed to convert TIFF to JP2: {str(e)}") + logger.error(f"Failed to convert TIFF to JP2: {str(e)}") + #raise Exception(f"Failed to convert TIFF to JP2: {str(e)}") def is_valid_image(image_path): """ @@ -510,12 +516,13 @@ def parse_arguments(): parser.add_argument('--config-file', dest="config_file", required=True, help='Path to the YAML configuration file.') parser.add_argument('--log-file', dest="log_file", required=True, help='Path to the log file.') parser.add_argument('--directory', dest="directory", required=True, 
help='Path to the directory to scan.') + parser.add_argument('--model', dest="model", required=True, help='The Islandora Model associated with the batch.') # Optional arguments parser.add_argument('--in-google-sheet-id', dest="in_gs_id", help='Google Sheet ID related to the directory.') parser.add_argument('--in-google-sheet-name', dest="in_gs_name", help='Google Sheet Tab Name.') parser.add_argument('--in-google-creds-file', dest="in_gs_creds", help='Google Credentials json file.') - + # Parse Arguments args = parser.parse_args() @@ -524,7 +531,7 @@ def parse_arguments(): def process_tiff(file_path:str): """ Processes a TIFF (.tif) file and converts it to a JP2 (.jp2) file. - + Args: file_path (str) The path to incoming TIFF file. @@ -543,9 +550,13 @@ def process_tiff(file_path:str): out_pid = f"{parent}-{pid}" logger.info(f"Tiff: Creating JP2: {jp2_path}") - print(f"Tiff: Creating JP2: {jp2_path} from {file_path}") if not os.path.exists(jp2_path): + logger.info("Creating JP2: In: {file_path} Out: {jp2_path}") + print(f" Creating JP2:\n In: {file_path}\n Out: {jp2_path}") convert_tiff_to_jp2(file_path, jp2_path) + else: + logger.warning("JP2 Already Exists: {jp2_path}") + print(f" JP2 Already Exists: {jp2_path}") return out_pid,jp2_path @@ -590,7 +601,7 @@ def process_transcript(file_path:str): def process_thumbnail(file_path:str): """ Determine the pid and path to the thumbnail file. - + Args: file_path (str) The path to the incoming thumbnail. @@ -622,7 +633,7 @@ def process_unknown(file_path:str): pid,parent,file,dir,ext = get_file_info(file_path) out_pid = f"{parent}-{pid}" unknown_path = f"{dir}/{pid}{ext}" - + return out_pid,unknown_path @@ -650,7 +661,31 @@ def get_file_info(file_path: str): return(pid,parent,file,dir,ext) -def add_update_dataframe(df: pd.DataFrame, pid: str, row_data, file_name: str): +def dump_df_columns(df: pd.DataFrame, columns: list): + """ + Prints a DataFrame containing only the specified columns. 
+ + Parameters: + df (pd.DataFrame): The original DataFrame. + columns (list): List of column names to include. + + Raises: + ValueError: If any requested column does not exist in the DataFrame. + """ + # Validate columns + missing_cols = [col for col in columns if col not in df.columns] + if missing_cols: + logger.error(f"The following columns do not exist in the DataFrame: {missing_cols}") + #raise ValueError(f"The following columns do not exist in the DataFrame: {missing_cols}") + + # Prints the DataFrame with selected columns + with pd.option_context( + 'display.max_rows', None, + ): + print(f"Dataframe:\n{df[columns]}") + + +def add_update_dataframe(df: pd.DataFrame, pid: str, row_data): """ Depending upon if the pid exists in the 'id' column of the dataframe either add or update the row to the dataframe. @@ -659,13 +694,12 @@ def add_update_dataframe(df: pd.DataFrame, pid: str, row_data, file_name: str): df (DataFrame) The Pandas Dataframe to work with. pid (str) The PID to find in the DataFrame 'id' column. row_data (dict) The row data to add/update the row with. - file_name (str) The file_name of the file we are working with. Returns: df (DataFrame) The updated DataFrame. 
""" if (value_exists_in_column(df,'id',pid)): - logger.info(f"Updating Dataframe: {pid},{file_name}") + logger.info(f"Updating Dataframe: {pid}") updated_df,success,msg = update_dataframe(df, 'id', pid, row_data) if not (success): logger.warning(f"Update DataFrame: Failed for PID: {pid} - {msg}") @@ -675,7 +709,7 @@ def add_update_dataframe(df: pd.DataFrame, pid: str, row_data, file_name: str): logger.info(f"Updating DataFrame: Success for PID: {pid}") return updated_df else: - logger.info(f"Adding to Dataframe: {pid},{file_name}") + logger.info(f"Adding to Dataframe: {pid}") updated_df,success,msg = add_row_to_dataframe(df,row_data,key_column='id') if not (success): logger.warning(f"Adding to DataFrame: Failed for PID: {pid} - {msg}") @@ -686,245 +720,635 @@ def add_update_dataframe(df: pd.DataFrame, pid: str, row_data, file_name: str): return updated_df -def process_object(file_type: str, file_path: str, parent: str, df: pd.DataFrame): +#def process_object(file_type: str, file_path: str, parent: str, df: pd.DataFrame): +# """ +# Process the incoming file object. +# +# Args: +# file_type (str) The type of object ('File','Directory') +# file_path (str) The file_path to the object. +# parent (str) The parent directory of the object. +# df (pd.DataFrame) The Pandas DataFrame we will be updating. +# +# Returns: +# df (pd.DataFrame) The Updated Pandas DataFrame. +# """ +# #logger.info(f"Processing an object.") +# # object files are kept within a folder of the top level folder. +# # files are .tif files which need to be converted. +# +# if not (is_valid_dataframe(df)): +# logger.warning("process_object: Incoming dataframe is invalid.") +# print(f"process_object: Incoming dataframe is invalid.") +# print(f"Invalid DataFrame: {df}") +# +# # Process File... 
+# if (file_type == "File" ) and ( parent != "" ): +# #print(f"Processing File") +# ignore_file_list = ["target.tif","manifest.csv","manifest.xlsx","manifest.xls"] +# +# dir,ext = os.path.splitext(file_path) +# file_name = os.path.basename(dir) +# pid = file_name +# +# if ( ext == ".tif" ): +# logger.info(f"Processing TIFF: {file_path}") +# print(f"Processing TIFF: {file_path}") +# outpid,outfile = process_tiff(file_path) +# if (is_valid_image(file_path) and is_valid_filesize(134,file_path) and is_valid_image(outfile) and is_valid_filesize(134,outfile)): +# row_data = {'id': outpid, 'file': outfile} +# updated_df = add_update_dataframe(df,outpid,row_data) +# return updated_df +# +# elif ( ext == ".mp3" ): +# logger.info(f"Processing MP3: {file_path}") +# print(f"Procesing MP3: {file_path}") +# outpid,outfile = process_mp3(file_path) +# row_data = {'id': outpid, 'file': outfile} +# updated_df = add_update_dataframe(df,outpid,row_data) +# return updated_df +# +# elif (( ext == ".vtt" ) or ( ext == ".srt" )): +# logger.info(f"Processing WebVTT/SRT: {file_path}") +# print(f"Processing WebVTT/SRT: {file_path}") +# outpid,outfile = process_transcript(file_path) +# row_data = {'id': outpid, 'transcript': outfile} +# updated_df = add_update_dataframe(df,outpid,row_data) +# return updated_df +# +# elif (( ext == ".jpg" ) or ( ext == ".png" )): +# logger.info(f"Processing Thumbnail: {file_path}") +# print(f"Processing Thumbnail: {file_path}") +# outpid,outfile = process_thumbnail(file_path) +# row_data = {'id': outpid, 'thumbnail': outfile} +# updated_df = add_update_dataframe(df,outpid,row_data) +# return updated_df +# +# else: +# logger.info(f"Processing Unknown: {file_path}") +# print(f"Unknown File extension: {ext} - Skipping.") +# outpid,outfile = process_unknown(file_path) +# row_data = {'id': outpid, 'file': outfile} +# updated_df = add_update_dataframe(df,outpid,row_data) +# return updated_df +# +# +# # Process Top Level file... 
+# elif (file_type == "File") and (parent == ""): +# # Top Level file. +# #print(f"Processing Top-Level File") +# ignore_file_list = ["ignore"] +# if file_path.casefold() in ignore_file_list: +# logger.info(f"Ingoring: {file_path}") +# else: +# logger.info(f"Processing Top Level file: {file_path}") +# dir,ext = os.path.splitext(file_path) +# pid = os.path.basename(dir) +# +# if ( ext == ".tif" ): +# # Top Level file is a .tif file. +# logger.info(f"Processing Top Level TIFF: {file_path}") +# print(f"Processing Top Level TIFF: {file_path}") +# outpid,outfile = process_tiff(file_path) +# row_data = {'id': pid, 'file': outfile} +# updated_df = add_update_dataframe(df,pid,row_data) +# return updated_df +# +# elif ( ext == ".mp3" ): +# logger.info(f"Processing MP3: {file_path}") +# print(f"Procesing MP3: {file_path}") +# outpid,outfile = process_mp3(file_path) +# row_data = {'id': pid, 'file': outfile} +# updated_df = add_update_dataframe(df,pid,row_data) +# return updated_df +# +# elif (( ext == ".vtt" ) or ( ext == ".srt" )): +# logger.info(f"Processing WebVTT/SRT: {file_path}") +# print(f"Processing WebVTT/SRT: {file_path}") +# outpid,outfile = process_transcript(file_path) +# row_data = {'id': pid, 'transcript': outfile} +# updated_df = add_update_dataframe(df,pid,row_data) +# return updated_df +# +# elif (( ext == ".jpg" ) or ( ext == ".png" )): +# logger.info(f"Processing Thumbnail: {file_path}") +# print(f"Processing Thumbnail: {file_path}") +# outpid,outfile = process_thumbnail(file_path) +# row_data = {'id': pid, 'thumbnail': outfile} +# updated_df = add_update_dataframe(df,pid,row_data) +# return updated_df +# +# else: +# logger.info(f"Processing Unknown: {file_path}") +# print(f"Unknown File extension: {ext} - Skipping.") +# outpid,outfile = process_unknown(file_path) +# row_data = {'id': pid, 'file': outfile} +# updated_df = add_update_dataframe(df,pid,row_data) +# return updated_df +# +# # Process Directory... 
+# elif (file_type == "Directory"): +# #print(f"Processing Directory") +# ignore_dir_list = ["ignore"] +# if file_path.casefold() in ignore_dir_list: +# logger.info(f"Ignoreing: {file_path}") +# else: +# # Continue +# logger.info(f"Processing Directory: {file_path}") +# dir,ext = os.path.splitext(file_path) +# pid = os.path.basename(dir) +# +# # Check if pid in Google Sheet. +# if (value_exists_in_column(df,'id',pid)): +# # Update Existing info. +# print(f"Found: {pid} in Google Sheet") +# logger.info(f"Found: {pid} in Google Sheet") +# +# # Set Row Data +# row_data = {'id': pid} +# +# # Update the dataframe. +# logger.info(f"Updating DataFrame: {pid}") +# #updated_df,success,msg = update_dataframe(df, 'id', pid, row_data) +# df,success,msg = update_dataframe(df, 'id', pid, row_data) +# +# if not (success): +# logger.warning(f"Update DataFrame: Failed for PID: {pid} - {msg}") +# print(f"Update DataFrame: Failed for PID: {pid} - {msg}") +# else: +# #return updated_df +# return df +# +# else: +# # Add new info. +# print(f"Not Found: Adding {pid} to DataFrame") +# logger.info(f"Not Found: Adding {pid} to DataFrame") +# +# # Set Row Data. +# row_data = {'id': pid} +# +# # Update the dataframe. +# logger.info(f"Updating DataFrame: {pid}") +# print(f"Adding: {row_data}") +# #updated_df,success,msg = add_row_to_dataframe(df,row_data,key_column='id') +# df,success,msg = add_row_to_dataframe(df,row_data,'id') +# +# if not (success): +# logger.warning(f"Add to DataFrame: failed for PID: {pid} - {msg}") +# print(f"Add to DataFrame: Failed for PID: {pid} - {msg}") +# else: +# return df +# +# else: +# print(f"Unknown object.") +# exit() + + +def get_value_from_df(df: pd.DataFrame,match_column: str,match_value,return_column:str): """ - Process the incoming file object. + Given a pandas DataFrame, find the row where match_column == match_value + and return the value from return_column. 
- Args: - file_type (str) The type of object ('File','Directory') - file_path (str) The file_path to the object. - parent (str) The parent directory of the object. - df (pd.DataFrame) The Pandas DataFrame we will be updating. + Parameters: + df (pd.DataFrame): The DataFrame to search. + match_column (str): Column name to match on. + match_value: Value to look for in match_column. + return_column (str): Column name whose value should be returned. Returns: - df (pd.DataFrame) The Updated Pandas DataFrame. - """ - #logger.info(f"Processing an object.") - # object files are kept within a folder of the top level folder. - # files are .tif files which need to be converted. - - if not (is_valid_dataframe(df)): - logger.warning("process_object: Incoming dataframe is invalid.") - print(f"process_object: Incoming dataframe is invalid.") - print(f"Invalid DataFrame: {df}") - - # Process File... - if (file_type == "File" ) and ( parent != "" ): - #print(f"Processing File") - ignore_file_list = ["target.tif","manifest.csv","manifest.xlsx","manifest.xls"] - - dir,ext = os.path.splitext(file_path) - file_name = os.path.basename(dir) - pid = file_name - - if ( ext == ".tif" ): - logger.info(f"Processing TIFF: {file_path}") - print(f"Processing TIFF: {file_path}") - outpid,outfile = process_tiff(file_path) - if (is_valid_image(file_path) and is_valid_filesize(134,file_path) and is_valid_image(outfile) and is_valid_filesize(134,outfile)): - row_data = {'id': outpid, 'file': outfile} - updated_df = add_update_dataframe(df,outpid,row_data,outfile) - return updated_df - - elif ( ext == ".mp3" ): - logger.info(f"Processing MP3: {file_path}") - print(f"Procesing MP3: {file_path}") - outpid,outfile = process_mp3(file_path) - row_data = {'id': outpid, 'file': outfile} - updated_df = add_update_dataframe(df,outpid,row_data,outfile) - return updated_df + The value from return_column if found, else None. 
+ """ - elif (( ext == ".vtt" ) or ( ext == ".srt" )): - logger.info(f"Processing WebVTT/SRT: {file_path}") - print(f"Processing WebVTT/SRT: {file_path}") - outpid,outfile = process_transcript(file_path) - row_data = {'id': outpid, 'transcript': outfile} - updated_df = add_update_dataframe(df,outpid,row_data,outfile) - return updated_df + # Validate columns + if match_column not in df.columns: + #raise ValueError(f"Column '{match_column}' does not exist in the DataFrame.") + logger.warning(f"Column '{match_column}' does not exist in the DataFrame.") + return None + if return_column not in df.columns: + #raise ValueError(f"Column '{return_column}' does not exist in the DataFrame.") + logger.warning(f"Column '{return_column}' does not exist in the DataFrame.") + return None - elif (( ext == ".jpg" ) or ( ext == ".png" )): - logger.info(f"Processing Thumbnail: {file_path}") - print(f"Processing Thumbnail: {file_path}") - outpid,outfile = process_thumbnail(file_path) - row_data = {'id': outpid, 'thumbnail': outfile} - updated_df = add_update_dataframe(df,outpid,row_data,outfile) - return updated_df + # Filter the DataFrame + filtered = df[df[match_column] == match_value] - else: - logger.info(f"Processing Unknown: {file_path}") - print(f"Unknown File extension: {ext} - Skipping.") - outpid,outfile = process_unknown(file_path) - row_data = {'id': outpid, 'file': outfile} - updated_df = add_update_dataframe(df,outpid,row_data,outfile) - return updated_df + if not filtered.empty: + return filtered.iloc[0][return_column] # Return first match + else: + return None + + +def get_taxonomy_tid(base_url, vocabulary, term_name, auth_token=None): + """ + Fetch taxonomy term ID (tid) from Drupal JSON:API given term name. + + Parameters: + base_url (str): Base URL of the Drupal site (e.g., 'https://example.com'). + vocabulary (str): Machine name of the vocabulary (e.g., 'tags'). + term_name (str): Name of the taxonomy term to search. 
+ auth_token (str): Optional Bearer token for authentication. + + Returns: + str or None: The taxonomy term ID if found, else None. + """ + # JSON:API endpoint for taxonomy terms + url = f"{base_url}/jsonapi/taxonomy_term/{vocabulary}" + + # Filter by term name + params = { + "filter[name]": term_name + } + + headers = { + "Accept": "application/vnd.api+json" + } + + if auth_token: + headers["Authorization"] = f"Bearer {auth_token}" + + try: + response = requests.get(url, headers=headers, params=params) + except Exception as e: + logger.error(f"get_taxonomy_tid - Unexpected error: {e}") + print(f"Unexpected error: {e}") - # Process Top Level file... - elif (file_type == "File") and (parent == ""): - # Top Level file. - #print(f"Processing Top-Level File") - ignore_file_list = ["ignore"] - if file_path.casefold() in ignore_file_list: - logger.info(f"Ingoring: {file_path}") + if response.status_code == 200: + data = response.json() + if data.get("data"): + return data["data"][0]["attributes"]["drupal_internal__tid"] else: - logger.info(f"Processing Top Level file: {file_path}") - dir,ext = os.path.splitext(file_path) - pid = os.path.basename(dir) - - if ( ext == ".tif" ): - # Top Level file is a .tif file. 
- logger.info(f"Processing Top Level TIFF: {file_path}") - print(f"Processing Top Level TIFF: {file_path}") - outpid,outfile = process_tiff(file_path) - row_data = {'id': pid, 'file': outfile} - updated_df = add_update_dataframe(df,pid,row_data,outfile) - return updated_df - - elif ( ext == ".mp3" ): - logger.info(f"Processing MP3: {file_path}") - print(f"Procesing MP3: {file_path}") - outpid,outfile = process_mp3(file_path) - row_data = {'id': pid, 'file': outfile} - updated_df = add_update_dataframe(df,pid,row_data,outfile) - return updated_df - - elif (( ext == ".vtt" ) or ( ext == ".srt" )): - logger.info(f"Processing WebVTT/SRT: {file_path}") - print(f"Processing WebVTT/SRT: {file_path}") - outpid,outfile = process_transcript(file_path) - row_data = {'id': pid, 'transcript': outfile} - updated_df = add_update_dataframe(df,pid,row_data,outfile) - return updated_df - - elif (( ext == ".jpg" ) or ( ext == ".png" )): - logger.info(f"Processing Thumbnail: {file_path}") - print(f"Processing Thumbnail: {file_path}") - outpid,outfile = process_thumbnail(file_path) - row_data = {'id': pid, 'thumbnail': outfile} - updated_df = add_update_dataframe(df,pid,row_data,outfile) - return updated_df + return None + else: + logger.error(f"get_taxonomy_tid - Error {response.status_code}: {response.text}") + + +def process_file(df, file_path: str, level): + logger.info(f"Function: process_file") + logger.info(f"Processing File: {file_path}") + print(f"Processing File: {file_path}") + + # Skip objects that we don't want to process. + if not any(sub in file_path for sub in skip): + file_folder = os.path.dirname(file_path) # The Full Path including Directory containing the file. + base_name = os.path.basename(file_path) # The Base filename with extension of the file give the file_path. + file_name,file_ext = os.path.splitext(base_name)# The Separated Filename and Extension of the Base filename. 
+ + # Set the file weight if the file_name is a digit or contains "-\d{4}" + file_weight = '' + if level > 1: + if file_name.isdigit(): + file_weight = int(file_name) else: - logger.info(f"Processing Unknown: {file_path}") - print(f"Unknown File extension: {ext} - Skipping.") - outpid,outfile = process_unknown(file_path) - row_data = {'id': pid, 'file': outfile} - updated_df = add_update_dataframe(df,pid,row_data,outfile) - return updated_df - - # Process Directory... - elif (file_type == "Directory"): - #print(f"Processing Directory") - ignore_dir_list = ["ignore"] - if file_path.casefold() in ignore_dir_list: - logger.info(f"Ignoreing: {file_path}") - else: - # Continue - logger.info(f"Processing Directory: {file_path}") - dir,ext = os.path.splitext(file_path) - pid = os.path.basename(dir) - - # Check if pid in Google Sheet. - if (value_exists_in_column(df,'id',pid)): - # Update Existing info. - print(f"Found: {pid} in Google Sheet") - logger.info(f"Found: {pid} in Google Sheet") - - # Set Row Data - row_data = {'id': pid} - - # Update the dataframe. - logger.info(f"Updating DataFrame: {pid}") - #updated_df,success,msg = update_dataframe(df, 'id', pid, row_data) - df,success,msg = update_dataframe(df, 'id', pid, row_data) - - if not (success): - logger.warning(f"Update DataFrame: Failed for PID: {pid} - {msg}") - print(f"Update DataFrame: Failed for PID: {pid} - {msg}") - else: - #return updated_df - return df + pattern = r'.*-(\d{4})' + match = re.search(pattern,file_name) + if match: + file_weight = int(match.group(1)) + + # Set parent information. + parent_path = os.path.dirname(file_path) + parent_folder = os.path.basename(parent_path) + + # Get the model from the map. 
+ my_model = get_model(level) + field_model = get_taxonomy_tid('https://i2.digital.library.pitt.edu','islandora_models',my_model) + logger.info(f"File is model: {my_model}, TID: {field_model}") + + # Get the resource_type from the map + model_info = get_model_info(my_model,models) + resource_type = model_info.get('resource_type','None') + + # Process any .tif files. + if (file_ext.lower() == ".tif"): + logger.info("File is type: TIFF") + print(f" Type: TIFF") + + # Create .jp2 file. + pid,outfile = process_tiff(file_path) + + # Handle top level files. + if not level == 1: + # Not in top level folder. + pid = f"{parent_folder}-{file_name}" + else: + pid = f"{file_name}" + parent_folder = '' + + # Build Row Data. + row_data = { + 'id': pid, + 'file': outfile, + 'parent_id': parent_folder, + 'field_weight': file_weight, + 'field_model': field_model, + 'model': my_model, + 'field_resource_type': resource_type, + 'level': level, + } + + # Add a page title if it is a model of 'Page'. + # if my_model == 'Page': + # row_data.update({ + # 'title': f"{pid}-{file_name}", + # }) + + logger.info(f"Row Data: {row_data}") + + # Update Dataframe. + df = add_update_dataframe(df,pid,row_data) + + # Process any .jp2 files. + if (file_ext.lower() == ".jp2"): + print(f" Type: JP2") + + # Handle Top Level files. + if not level == 1: + pid = f"{parent_folder}-{file_name}" + else: + pid = f"{file_name}" + parent_folder = '' + + # Build Row Data. + row_data = { + 'id': pid, + 'file': outfile, + 'parent_id': parent_folder, + 'field_weight': file_weight, + 'field_model': field_model, + 'model': my_model, + 'field_resource_type': resource_type, + 'level': level, + } + + logger.info(f"Row Data: {row_data}") + + # Update Dataframe. + df = add_update_dataframe(df,pid,row_data) + + # Process any audio files. + if (file_ext.lower() == ".mp3"): + print(f" Type: Audio") + + # Handle Top Level files. 
+ if not level == 1: + pid = f"{parent_folder}-{file_name}" + else: + pid = f"{file_name}" + parent_folder = '' + + # Build Row Data. + row_data = { + 'id': pid, + 'file': file_path, + 'parent_id': parent_folder, + 'level': level, + 'field_model': field_model, + 'field_weight': file_weight, + 'model': my_model, + 'field_resource_type': resource_type, + } + + logger.info(f"Row Data: {row_data}") + + # Update Dataframe. + df = add_update_dataframe(df,pid,row_data) + + # Process any video files. + if (file_ext.lower() == ".mkv" or file_ext.lower() == ".mp4"): + print(f" Type: Video") + + # Handle Top Level files. + if not level == 1: + pid = f"{parent_folder}-{file_name}" + else: + pid = f"{file_name}" + parent_folder = '' + + # Build Row Data. + row_data = { + 'id': pid, + 'file': file_path, + 'parent_id': parent_folder, + 'level': level, + 'field_model': field_model, + 'field_weight': file_weight, + 'model': my_model, + 'field_resource_type': resource_type, + } + + logger.info(f"Row Data: {row_data}") + + # Update Dataframe. + df = add_update_dataframe(df,pid,row_data) + + # Process any transcription files. + if (file_ext.lower() == ".srt" or file_ext.lower() == ".vtt"): + print(f" Type: Transcript") + + # Handle Top Level files. + if not level == 1: + pid = f"{parent_folder}-{file_name}" + else: + pid = f"{file_name}" + + # Build Row Data. + row_data = { + 'transcript': file_path, + } + logger.info(f"Row Data: {row_data}") + + # Update Dataframe. + df = add_update_dataframe(df,pid,row_data) + + # Process any PDF files. + if (file_ext.lower() == ".pdf"): + print(f" Type: PDF") + + # Handle Top Level files. + if not level == 1: + pid = f"{parent_folder}-{file_name}" else: - # Add new info. - print(f"Not Found: Adding {pid} to DataFrame") - logger.info(f"Not Found: Adding {pid} to DataFrame") - - # Set Row Data. - row_data = {'id': pid} - - # Update the dataframe. 
- logger.info(f"Updating DataFrame: {pid}") - print(f"Adding: {row_data}") - #updated_df,success,msg = add_row_to_dataframe(df,row_data,key_column='id') - df,success,msg = add_row_to_dataframe(df,row_data,'id') - - if not (success): - logger.warning(f"Add to DataFrame: failed for PID: {pid} - {msg}") - print(f"Add to DataFrame: Failed for PID: {pid} - {msg}") - else: - return df - - else: - print(f"Unknown object.") - exit() + pid = f"{file_name}" + + + # Build Row Data. + row_data = { + 'id': pid, + 'file': file_path, + } + + logger.info(f"Row Data: {row_data}") + + # Update Dataframe. + df = add_update_dataframe(df,pid,row_data) + + # Process any simple image files. + if (file_ext.lower() == ".png" or file_ext.lower() == ".jpg"): + print(f" Type: Simple Image") + + print(f"\n") + + return(df) -def process_objects(data, directory: str, df: pd.DataFrame): +def get_directory(directory): """ - Process a list of objects + Scan a directory and return a list of files and subdirectories. Args: - data (list of tupples) The result of scan_directory. - directory (str) The full directory path that we will be working with. - df (DataFrame) The Pandas DataFrame that we will be working with. + directory (str): Path to the directory to scan. Returns: - N/a. + list: List of tuples containing file/subdirectory name, type, and path. + """ + print(f"Processing directory: {directory}") + result = [] + for item in os.listdir(directory): + item_path = os.path.join(directory, item) + if os.path.isfile(item_path): + result.append((item, "File", item_path)) + elif os.path.isdir(item_path): + result.append((item, "Directory", item_path)) + result.sort(key=lambda x: (x[1] != "Directory", x[0].lower())) + return result + +def process_objects(df, directory: str, level): """ - #print(f"process_objects: Input DataFrame:") - #print(df) + Process the objects of the directory. - # Begin Processing the listing of the directory. 
- logger.info(f"Processing file data.") - print(f"Processing the directory: {directory}") + Args: + df: Pandas Dataframe + directory: Directory we are working with. + level: What level are we at directory wise. + """ + #print(f"Level: {level}, {directory}") + # Get the objects in the directory. + data = get_directory(directory) + + # Get the PID of the current directory. + pid = os.path.basename(directory) + + # Set parent information. + parent_path = os.path.dirname(directory) + parent_folder = os.path.basename(parent_path) + + # Set model. + my_model = get_model(level) + + # Get the resource_type from the map + model_info = get_model_info(my_model,models) + resource_type = model_info.get('resource_type','None') + + # if level > 1 + if level > 1: + # For this directory. + row_data = { + 'id': pid, + 'level': level, + 'parent_id': parent_folder, + 'model': my_model, + 'field_resource_type': resource_type, + } + df = add_update_dataframe(df,pid,row_data) + + if level == 1: + row_data = { + 'id': pid, + 'level': level, + 'model': my_model, + 'field_resource_type': resource_type, + } + df = add_update_dataframe(df,pid,row_data) - # Loop through the file listing. + # For each object in the directory do. for row, (file_name, file_type, file_path) in enumerate(data, start=2): - # Skip things we don't want to include. - if (( file_name == "target.tif" ) or ( file_name == "manifest.csv" ) or ( file_name == "manifest.xlsx" )): - next - else: - print(f"Inspecting: {file_path}") - # Define the parent - parent = file_path.replace(directory, "") - parent = parent.replace("/"+file_name, "") - parent = parent.replace("/","") - #print(f"File_type: {file_type}") - #print(f"File_Path: {file_path}") - #print(f"Parent: {parent}") + # Skip objects that we don't want to process. + if not any(sub in file_path for sub in skip): - df = process_object(file_type,file_path,parent,df) + # If the object is a directory. + if file_type == "Directory": + # process the directory. 
+ df = process_objects(df, file_path, level + 1) - print(f"\n") + # If the object is a file. + if file_type == "File": + # process the file. + df = process_file(df, file_path, level + 1) + #else: + # print(f"Skip: {file_path}") - # Display the DataFrame before sending it to Google Sheets. - #print(f"Result Dataframe:") - #print(df) + # End if-else + # End For - # Save the DataFrame to Google Sheets. - logger.info(f"Updating Google Sheet with DataFrame.") - print(f"Updating Google Sheet with DataFrame.") - success,msg = update_google_sheet(df, google_sheet_id, google_sheet_name, google_credentials) - if (success): - logger.info(f"Successfully Updated Google Sheet.") - print(f"Successfully Updated Google Sheet.") + return df + +def get_model_paths(start_model, models_dict): + """ + Recursively map paths from a starting model to all leaf models. + + Args: + start_model: The key of the starting model in the models dictionary + models_dict: The dictionary containing all model definitions + + Returns: + A list of paths, where each path is a list of model keys from start to leaf + """ + # Check if the starting model exists + if start_model not in models_dict: + return [] + + model_info = models_dict[start_model] + child = model_info.get('child', 'None') + model = model_info.get('model', 'None') + + # Base case: if there's no child or child is 'None', return path with just this model + if child == 'None' or child not in models_dict: + #return [[start_model]] + return [[model]] + + # Recursive case: get all paths from child and prepend current model + child_paths = get_model_paths(child, models_dict) + #return [[start_model] + path for path in child_paths] + return [[model] + path for path in child_paths] + +def get_model_info(model,models_dict): + # Check if the starting model exists + if model not in models_dict: + return [] + + model_info = models_dict[model] + return model_info + +def get_model(level): + #if model_paths[0][level-1] == "File": + # Return parent model + # 
return model_paths[0][level-2] + #else: + # Return model + model_length = len(model_paths[0]) + if model_length <= 2: + return model_paths[0][0] else: - logger.warning(f"Failed to update Google Sheet: {msg}") - print(f"Failed to update Google Sheet: {msg}") + return model_paths[0][level-1] + +def get_resource_type(level): + return 0 +def is_target_in_list(target,list): + return target in list + +def get_value(map, key, default=None): + return map.get(key, default) + +def add_column(df: pd.DataFrame, column_name: str, default=None): + if column_name not in df.columns: + df[column_name] = default + return df + def main(): """ Main Process that sets up the environment variables etc and kicks off @@ -936,6 +1360,174 @@ def main(): Returns: N/a """ + + # Build models list which produces a map based upon the model. + # Model should eventually arive at a "File" model. + # The 'model' is obtained from the list of models in Islandora 2. + # All 'model' should be referenced below as a possible type. + # The 'resource_type'... + # The 'child' points to another model with "File" model being the end. 
+ globals()['models'] = { + 'Compound Audio 1': { + 'model': 'Compound Object', + 'resource_type': 'Collection', + 'child': 'Audio', + }, + 'Compound Audio 2': { + 'model': 'Compound Object', + 'resource_type': 'Collection', + 'child': 'COA', + }, + 'COA': { + 'model': 'Compound Object', + 'resource_type': 'Collection', + 'child': 'Audio', + }, + 'Compound Book': { + 'model': 'Compound Object', + 'resource_type': 'Collection', + 'child': 'Paged Content', + }, + 'Compound Image 1': { + 'model': 'Paged Content', + 'resource_type': 'Collection', + 'child': 'Page', + }, + 'Compound Image 2': { + 'model': 'Compound Object', + 'resource_type': 'Collection', + 'child': 'Image', + }, + 'Compound Video 1': { + 'model': 'Compound Object', + 'resource_type': 'Collection', + 'child': 'Video', + }, + 'Compound Video 2': { + 'model': 'Compound Object', + 'resource_type': 'Collection', + 'child': 'COV', + }, + 'COV': { + 'model': 'Compound Object', + 'resource_type': 'Collection', + 'child': 'Video', + }, + 'Compound Object': { + 'model': 'Compound Object', + 'resource_type': 'Collection', + 'child': 'None', + }, + 'Collection': { + 'model': 'Collection', + 'resource_type': 'Collection', + 'child': 'None', + }, + 'Serial': { + 'model': 'Newspaper', + 'resource_type': 'Collection', + 'child': 'Issue', + }, + 'Newspaper': { + 'model': 'Newspaper', + 'resource_type': 'Collection', + 'child': 'Publication Issue', + }, + 'Issue': { + 'model': 'Publication Issue', + 'resource_type': 'Collection', + 'child': 'Page', + }, + 'Publication Issue': { + 'model': 'Publication Issue', + 'resource_type': 'Collection', + 'child': 'Page', + }, + 'Book': { + 'model': 'Paged Content', + 'resource_type': 'Collection', + 'child': 'Page', + }, + 'Paged Content': { + 'model': 'Paged Content', + 'resource_type': 'Collection', + 'child': 'Page', + }, + 'Digital Document': { + 'model': 'Digital Document', + 'resource_type': 'Text', + 'child': 'None', + }, + 'Page': { + 'model': 'Page', + 'resource_type': 
'Text', + 'child': 'File', + }, + 'Image': { + 'model': 'Image', + 'resource_type': 'Still Image', + 'child': 'File', + }, + 'Audio': { + 'model': 'Audio', + 'resource_type': 'Sound', + 'child': 'File', + }, + 'Video': { + 'model': 'Video', + 'resource_type': 'Moving Image', + 'child': 'File', + }, + 'PDF': { + 'model': 'Digital Document', + 'resource_type': 'Text', + 'child': 'File', + }, + 'Binary': { + 'model': 'Binary', + 'resource_type': 'Unspecified', + 'child': 'File', + }, + 'File': { + 'model': 'File', + 'child': 'None', + }, + } + + + # Valid models. + # This is the list of valid models that we will work with. + # + globals()['allowed_models'] = [ + "compound audio", + "compound video", + "compound image", + "compound book", + "compound object", + "collection", + "serial", + "newspaper", + "publication issue", + "issue", + "book", + "digital document", + "page", + "image", + "audio", + "video", + "pdf", + "binary" + ] + + ### Other models: + # "collection": "Collection", + # "digital document": "Digital Document", + # "paged content": "Paged Content", + # "publication issue": "Publication Issue", + # "compound object": "Compound Object", + # "newspaper": "Newspaper", + + # Setup the log file format. globals()['log_formatter'] = logging.Formatter(fmt='%(asctime)s.%(msecs)03d %(levelname)s %(message)s',datefmt="%Y%m%d %H:%M:%S") @@ -959,17 +1551,41 @@ def main(): globals()['google_credentials'] = args.in_gs_creds if args.in_gs_id is not None: globals()['google_sheet_id'] = args.in_gs_id - if args.in_gs_name is not None: + if args.in_gs_name is not None: globals()['google_sheet_name'] = args.in_gs_name if args.log_file is not None: globals()['log_file'] = args.log_file + # Required Sheet Columns: + globals()['required_columns'] = ["id","file","level","parent_id", + "field_weight","field_model","model","field_resource_type","transcript"] + + # Global file patterns to skip over. 
+ globals()['skip'] = ["ignore",".jp2",".metadata","meta",".opex",".fits", + "target.tif","metadata.csv","metadata.xlsx","manifest.xlsx","manifest.csv"] + # Create the Log file. - #print(f"Creating log file: {args.log_file}") - #globals()['logger'] = setup_logger('logger', args.log_file, level=logging.DEBUG) + print(f"Creating Log file: {log_file}") globals()['logger'] = setup_logger('logger', log_file, level=logging.DEBUG) logger.info(f"Begin log.") + # Read Content Models + # globals()['content_models'] = read_yaml_file('content_models.yml') + + # Get Batch Model. + globals()['model'] = args.model + logger.info(f"Batch Model: {model}") + if not is_target_in_list(model.lower(), allowed_models): + logger.error(f"Model: {model} is not in the list of allowed_models.") + print(f"Model: {model} is not in the list of allowed_models.") + + # Show Model path. + globals()['model_paths'] = get_model_paths(model,models) + logger.info(f"Model Path: {model_paths[0]}") + print(f"Model Path: {model_paths[0]}") + print(f"{json.dumps(model_paths[0],indent=4)}") + + # Get external command paths. gm_path = shutil.which("gm") @@ -977,8 +1593,8 @@ def main(): if gm_path: logger.info(f"GraphicsMagick Executable found at: {gm_path}") else: - logger.error(f"GraphicsMagick Executable 'gm' not found and is required.") - print(f"GraphicsMagick Executable 'gm' not found and is required.") + logger.error(f"GraphicsMagick Executable 'gm' not found in PATH and is required.") + print(f"GraphicsMagick Executable 'gm' not found in PATH and is required.") print(f"Exiting...") exit() @@ -987,13 +1603,28 @@ def main(): print(f"Reading Google Sheet: {google_sheet_id},{google_sheet_name}") df = read_google_sheet(google_sheet_id, google_sheet_name, google_credentials) - # Scan the directory and return a list of directory contents. - logger.info(f"Scan the directory: {args.directory}") - file_data = scan_directory(args.directory) + # Ensure all required_columns exist. 
+ for col in required_columns: + df = add_column(df,col) # Process the contents. - logger.info(f"Process the directory: {args.directory}") - process_objects(file_data,args.directory,df) + globals()['top'] = args.directory + logger.info(f"Process the directory: {top}") + updated_df = process_objects(df,top,0) + + # Save the DataFrame to Google Sheets. + logger.info(f"Updating Google Sheet with DataFrame.") + print(f"Updating Google Sheet with DataFrame.") + success,msg = update_google_sheet(updated_df, google_sheet_id, google_sheet_name, google_credentials) + if (success): + logger.info(f"Successfully Updated Google Sheet.") + print(f"Successfully Updated Google Sheet.") + else: + logger.warning(f"Failed to update Google Sheet: {msg}") + print(f"Failed to update Google Sheet: {msg}") + + # Display df + print(f"Dataframe:\n{updated_df}") exit() @@ -1005,5 +1636,3 @@ google_sheet_name = None if __name__ == "__main__": main() - - From 7075968fece02a683d402dea74f7538e971b7eb2 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 11:37:12 -0500 Subject: [PATCH 02/21] Adjusting Islandora Models. 
--- scan-batch-dir | 48 ++++++++++++++---------------------------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/scan-batch-dir b/scan-batch-dir index 5a9fa25..268a29c 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -1373,16 +1373,6 @@ def main(): 'resource_type': 'Collection', 'child': 'Audio', }, - 'Compound Audio 2': { - 'model': 'Compound Object', - 'resource_type': 'Collection', - 'child': 'COA', - }, - 'COA': { - 'model': 'Compound Object', - 'resource_type': 'Collection', - 'child': 'Audio', - }, 'Compound Book': { 'model': 'Compound Object', 'resource_type': 'Collection', @@ -1393,26 +1383,11 @@ def main(): 'resource_type': 'Collection', 'child': 'Page', }, - 'Compound Image 2': { - 'model': 'Compound Object', - 'resource_type': 'Collection', - 'child': 'Image', - }, 'Compound Video 1': { 'model': 'Compound Object', 'resource_type': 'Collection', 'child': 'Video', }, - 'Compound Video 2': { - 'model': 'Compound Object', - 'resource_type': 'Collection', - 'child': 'COV', - }, - 'COV': { - 'model': 'Compound Object', - 'resource_type': 'Collection', - 'child': 'Video', - }, 'Compound Object': { 'model': 'Compound Object', 'resource_type': 'Collection', @@ -1423,21 +1398,26 @@ def main(): 'resource_type': 'Collection', 'child': 'None', }, - 'Serial': { + 'Serial 1': { 'model': 'Newspaper', 'resource_type': 'Collection', - 'child': 'Issue', + 'child': 'Issue 1', }, - 'Newspaper': { + 'Serial 2': { 'model': 'Newspaper', 'resource_type': 'Collection', - 'child': 'Publication Issue', + 'child': 'Issue 2', }, - 'Issue': { + 'Issue 1': { 'model': 'Publication Issue', 'resource_type': 'Collection', 'child': 'Page', }, + 'Issue 2': { + 'model': 'Publication Issue', + 'resouce_type': 'Text', + 'child': 'File', + }, 'Publication Issue': { 'model': 'Publication Issue', 'resource_type': 'Collection', @@ -1505,10 +1485,11 @@ def main(): "compound book", "compound object", "collection", - "serial", - "newspaper", + "serial 1", + "serial 2", 
"publication issue", - "issue", + "issue 1", + "issue 2", "book", "digital document", "page", @@ -1585,7 +1566,6 @@ def main(): print(f"Model Path: {model_paths[0]}") print(f"{json.dumps(model_paths[0],indent=4)}") - # Get external command paths. gm_path = shutil.which("gm") From fb439896262a64f252942cd60e983fddfa81c234 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 11:54:47 -0500 Subject: [PATCH 03/21] A bit of cleanup. --- scan-batch-dir | 214 +------------------------------------------------ 1 file changed, 4 insertions(+), 210 deletions(-) diff --git a/scan-batch-dir b/scan-batch-dir index 268a29c..8c3d783 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -258,8 +258,6 @@ def update_dataframe(df: pd.DataFrame, match_column: str, match_value: str, upda # Verify columns exist if match_column not in df.columns: return df, False, f"update_dataframe - Match column '{match_column}' not found" - #if update_column not in df.columns: - # return df, False, f"update_dataframe - Update column '{update_column}' not found" # Make a copy of the dataframe df_copy = df.copy() @@ -314,7 +312,6 @@ def add_row_to_dataframe(df, row_data, key_column): # Check if the key value already exists if row_data[key_column] in df[key_column].values: - #print(f"Warning: Key value '{row_data[key_column]}' already exists in column '{key_column}'. Row not added.") return df, False, "add_row_to_dataframe - Row with Key column {key_column} already exists." 
# Convert row_data to DataFrame @@ -348,13 +345,11 @@ def value_exists_in_column(df, column_name, value): # Check if the column exists in the DataFrame if column_name not in df.columns: return False - #raise ValueError(f"The column '{column_name}' does not exist in the DataFrame") # Check if the value exists in the column return value in df[column_name].values except Exception as e: - #print(f"An unexpected error occurred: {e}") return False def scan_directory(directory): @@ -417,7 +412,6 @@ def convert_tiff_to_jp2(tif_file_path, jp2_file_path): logger.info(f"Successfully converted TIFF to JP2: {tif_file_path}") except Exception as e: logger.error(f"Failed to convert TIFF to JP2: {str(e)}") - #raise Exception(f"Failed to convert TIFF to JP2: {str(e)}") def is_valid_image(image_path): """ @@ -541,7 +535,6 @@ def process_tiff(file_path:str): """ # Process a .tif file. - #print(f"Processing a TIFF file: {file_path}.") pid,parent,file,dir,ext = get_file_info(file_path) # Create the JP2 derivative if it does not already exist. @@ -572,7 +565,6 @@ def process_mp3(file_path:str): mp3_path (str) The Path to the MP3 file. """ # Process a .mp3 file. - #print(f"Processing a MP3 file: {file_path}.") pid,parent,file,dir,ext = get_file_info(file_path) out_pid = f"{parent}-{pid}" mp3_path = f"{dir}/{pid}{ext}" @@ -591,7 +583,6 @@ def process_transcript(file_path:str): vtt_path (str) The Path to the Transcript file. """ # Process a .vtt or .srt file. - #print(f"Processing a WebVTT/SRT file: {file_path}.") pid,parent,file,dir,ext = get_file_info(file_path) out_pid = f"{parent}-{pid}" vtt_path = f"{dir}/{pid}{ext}" @@ -610,7 +601,6 @@ def process_thumbnail(file_path:str): tn_path (str) the Path to the thumbnail file. """ # Process a thumbnail (.jpg/.png). 
- #print(f"Processing a Thumbnail file: {file_path}.") pid,parent,file,dir,ext = get_file_info(file_path) out_pid = f"{parent}-{pid}" tn_path = f"{dir}/{pid}{ext}" @@ -629,7 +619,6 @@ def process_unknown(file_path:str): unknown_path (str) The path to the unknown file. """ # Process a unknown file. - #print(f"Processing an Unknown file: {file_path}.") pid,parent,file,dir,ext = get_file_info(file_path) out_pid = f"{parent}-{pid}" unknown_path = f"{dir}/{pid}{ext}" @@ -676,7 +665,6 @@ def dump_df_columns(df: pd.DataFrame, columns: list): missing_cols = [col for col in columns if col not in df.columns] if missing_cols: logger.error(f"The following columns do not exist in the DataFrame: {missing_cols}") - #raise ValueError(f"The following columns do not exist in the DataFrame: {missing_cols}") # Prints the DataFrame with selected columns with pd.option_context( @@ -719,191 +707,6 @@ def add_update_dataframe(df: pd.DataFrame, pid: str, row_data): logger.info(f"Adding to DataFrame: Success for PID: {pid}") return updated_df - -#def process_object(file_type: str, file_path: str, parent: str, df: pd.DataFrame): -# """ -# Process the incoming file object. -# -# Args: -# file_type (str) The type of object ('File','Directory') -# file_path (str) The file_path to the object. -# parent (str) The parent directory of the object. -# df (pd.DataFrame) The Pandas DataFrame we will be updating. -# -# Returns: -# df (pd.DataFrame) The Updated Pandas DataFrame. -# """ -# #logger.info(f"Processing an object.") -# # object files are kept within a folder of the top level folder. -# # files are .tif files which need to be converted. -# -# if not (is_valid_dataframe(df)): -# logger.warning("process_object: Incoming dataframe is invalid.") -# print(f"process_object: Incoming dataframe is invalid.") -# print(f"Invalid DataFrame: {df}") -# -# # Process File... 
-# if (file_type == "File" ) and ( parent != "" ): -# #print(f"Processing File") -# ignore_file_list = ["target.tif","manifest.csv","manifest.xlsx","manifest.xls"] -# -# dir,ext = os.path.splitext(file_path) -# file_name = os.path.basename(dir) -# pid = file_name -# -# if ( ext == ".tif" ): -# logger.info(f"Processing TIFF: {file_path}") -# print(f"Processing TIFF: {file_path}") -# outpid,outfile = process_tiff(file_path) -# if (is_valid_image(file_path) and is_valid_filesize(134,file_path) and is_valid_image(outfile) and is_valid_filesize(134,outfile)): -# row_data = {'id': outpid, 'file': outfile} -# updated_df = add_update_dataframe(df,outpid,row_data) -# return updated_df -# -# elif ( ext == ".mp3" ): -# logger.info(f"Processing MP3: {file_path}") -# print(f"Procesing MP3: {file_path}") -# outpid,outfile = process_mp3(file_path) -# row_data = {'id': outpid, 'file': outfile} -# updated_df = add_update_dataframe(df,outpid,row_data) -# return updated_df -# -# elif (( ext == ".vtt" ) or ( ext == ".srt" )): -# logger.info(f"Processing WebVTT/SRT: {file_path}") -# print(f"Processing WebVTT/SRT: {file_path}") -# outpid,outfile = process_transcript(file_path) -# row_data = {'id': outpid, 'transcript': outfile} -# updated_df = add_update_dataframe(df,outpid,row_data) -# return updated_df -# -# elif (( ext == ".jpg" ) or ( ext == ".png" )): -# logger.info(f"Processing Thumbnail: {file_path}") -# print(f"Processing Thumbnail: {file_path}") -# outpid,outfile = process_thumbnail(file_path) -# row_data = {'id': outpid, 'thumbnail': outfile} -# updated_df = add_update_dataframe(df,outpid,row_data) -# return updated_df -# -# else: -# logger.info(f"Processing Unknown: {file_path}") -# print(f"Unknown File extension: {ext} - Skipping.") -# outpid,outfile = process_unknown(file_path) -# row_data = {'id': outpid, 'file': outfile} -# updated_df = add_update_dataframe(df,outpid,row_data) -# return updated_df -# -# -# # Process Top Level file... 
-# elif (file_type == "File") and (parent == ""): -# # Top Level file. -# #print(f"Processing Top-Level File") -# ignore_file_list = ["ignore"] -# if file_path.casefold() in ignore_file_list: -# logger.info(f"Ingoring: {file_path}") -# else: -# logger.info(f"Processing Top Level file: {file_path}") -# dir,ext = os.path.splitext(file_path) -# pid = os.path.basename(dir) -# -# if ( ext == ".tif" ): -# # Top Level file is a .tif file. -# logger.info(f"Processing Top Level TIFF: {file_path}") -# print(f"Processing Top Level TIFF: {file_path}") -# outpid,outfile = process_tiff(file_path) -# row_data = {'id': pid, 'file': outfile} -# updated_df = add_update_dataframe(df,pid,row_data) -# return updated_df -# -# elif ( ext == ".mp3" ): -# logger.info(f"Processing MP3: {file_path}") -# print(f"Procesing MP3: {file_path}") -# outpid,outfile = process_mp3(file_path) -# row_data = {'id': pid, 'file': outfile} -# updated_df = add_update_dataframe(df,pid,row_data) -# return updated_df -# -# elif (( ext == ".vtt" ) or ( ext == ".srt" )): -# logger.info(f"Processing WebVTT/SRT: {file_path}") -# print(f"Processing WebVTT/SRT: {file_path}") -# outpid,outfile = process_transcript(file_path) -# row_data = {'id': pid, 'transcript': outfile} -# updated_df = add_update_dataframe(df,pid,row_data) -# return updated_df -# -# elif (( ext == ".jpg" ) or ( ext == ".png" )): -# logger.info(f"Processing Thumbnail: {file_path}") -# print(f"Processing Thumbnail: {file_path}") -# outpid,outfile = process_thumbnail(file_path) -# row_data = {'id': pid, 'thumbnail': outfile} -# updated_df = add_update_dataframe(df,pid,row_data) -# return updated_df -# -# else: -# logger.info(f"Processing Unknown: {file_path}") -# print(f"Unknown File extension: {ext} - Skipping.") -# outpid,outfile = process_unknown(file_path) -# row_data = {'id': pid, 'file': outfile} -# updated_df = add_update_dataframe(df,pid,row_data) -# return updated_df -# -# # Process Directory... 
-# elif (file_type == "Directory"): -# #print(f"Processing Directory") -# ignore_dir_list = ["ignore"] -# if file_path.casefold() in ignore_dir_list: -# logger.info(f"Ignoreing: {file_path}") -# else: -# # Continue -# logger.info(f"Processing Directory: {file_path}") -# dir,ext = os.path.splitext(file_path) -# pid = os.path.basename(dir) -# -# # Check if pid in Google Sheet. -# if (value_exists_in_column(df,'id',pid)): -# # Update Existing info. -# print(f"Found: {pid} in Google Sheet") -# logger.info(f"Found: {pid} in Google Sheet") -# -# # Set Row Data -# row_data = {'id': pid} -# -# # Update the dataframe. -# logger.info(f"Updating DataFrame: {pid}") -# #updated_df,success,msg = update_dataframe(df, 'id', pid, row_data) -# df,success,msg = update_dataframe(df, 'id', pid, row_data) -# -# if not (success): -# logger.warning(f"Update DataFrame: Failed for PID: {pid} - {msg}") -# print(f"Update DataFrame: Failed for PID: {pid} - {msg}") -# else: -# #return updated_df -# return df -# -# else: -# # Add new info. -# print(f"Not Found: Adding {pid} to DataFrame") -# logger.info(f"Not Found: Adding {pid} to DataFrame") -# -# # Set Row Data. -# row_data = {'id': pid} -# -# # Update the dataframe. 
-# logger.info(f"Updating DataFrame: {pid}") -# print(f"Adding: {row_data}") -# #updated_df,success,msg = add_row_to_dataframe(df,row_data,key_column='id') -# df,success,msg = add_row_to_dataframe(df,row_data,'id') -# -# if not (success): -# logger.warning(f"Add to DataFrame: failed for PID: {pid} - {msg}") -# print(f"Add to DataFrame: Failed for PID: {pid} - {msg}") -# else: -# return df -# -# else: -# print(f"Unknown object.") -# exit() - - def get_value_from_df(df: pd.DataFrame,match_column: str,match_value,return_column:str): """ Given a pandas DataFrame, find the row where match_column == match_value @@ -921,11 +724,9 @@ def get_value_from_df(df: pd.DataFrame,match_column: str,match_value,return_colu # Validate columns if match_column not in df.columns: - #raise ValueError(f"Column '{match_column}' does not exist in the DataFrame.") logger.warning(f"Column '{match_column}' does not exist in the DataFrame.") return None if return_column not in df.columns: - #raise ValueError(f"Column '{return_column}' does not exist in the DataFrame.") logger.warning(f"Column '{return_column}' does not exist in the DataFrame.") return None @@ -1172,7 +973,6 @@ def process_file(df, file_path: str, level): else: pid = f"{file_name}" - # Build Row Data. row_data = { 'id': pid, @@ -1223,7 +1023,6 @@ def process_objects(df, directory: str, level): directory: Directory we are working with. level: What level are we at directory wise. """ - #print(f"Level: {level}, {directory}") # Get the objects in the directory. data = get_directory(directory) @@ -1277,10 +1076,7 @@ def process_objects(df, directory: str, level): if file_type == "File": # process the file. 
df = process_file(df, file_path, level + 1) - #else: - # print(f"Skip: {file_path}") - - # End if-else + # End if # End For return df @@ -1306,12 +1102,10 @@ def get_model_paths(start_model, models_dict): # Base case: if there's no child or child is 'None', return path with just this model if child == 'None' or child not in models_dict: - #return [[start_model]] return [[model]] # Recursive case: get all paths from child and prepend current model child_paths = get_model_paths(child, models_dict) - #return [[start_model] + path for path in child_paths] return [[model] + path for path in child_paths] def get_model_info(model,models_dict): @@ -1550,7 +1344,7 @@ def main(): globals()['logger'] = setup_logger('logger', log_file, level=logging.DEBUG) logger.info(f"Begin log.") - # Read Content Models + # Read Content Models - Proposed for later. # globals()['content_models'] = read_yaml_file('content_models.yml') # Get Batch Model. @@ -1569,7 +1363,7 @@ def main(): # Get external command paths. gm_path = shutil.which("gm") - # Check 'gm' exists. + # Check that 'gm' program exists. if gm_path: logger.info(f"GraphicsMagick Executable found at: {gm_path}") else: @@ -1606,7 +1400,7 @@ def main(): # Display df print(f"Dataframe:\n{updated_df}") - exit() + sys.exit() # Setup global variables. From 97078c791c104671f8c08bb7151252079f6af29a Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 15:30:33 -0500 Subject: [PATCH 04/21] Built out the PDF model, updated the model map, and explicitly set the islandora model in the model map. 
--- scan-batch-dir | 59 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/scan-batch-dir b/scan-batch-dir index 8c3d783..cd20558 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -798,7 +798,7 @@ def process_file(df, file_path: str, level): # Set the file weight if the file_name is a digit or contains "-\d{4}" file_weight = '' - if level > 1: + if level >= 1: if file_name.isdigit(): file_weight = int(file_name) else: @@ -813,12 +813,17 @@ def process_file(df, file_path: str, level): # Get the model from the map. my_model = get_model(level) - field_model = get_taxonomy_tid('https://i2.digital.library.pitt.edu','islandora_models',my_model) - logger.info(f"File is model: {my_model}, TID: {field_model}") + #field_model = get_taxonomy_tid('https://i2.digital.library.pitt.edu','islandora_models',my_model) + #logger.info(f"File is model: {my_model}, TID: {field_model}") - # Get the resource_type from the map + # Get model info from the map. model_info = get_model_info(my_model,models) resource_type = model_info.get('resource_type','None') + imodel = model_info.get('imodel','None') + + # Get the field_model from JSONAPI + field_model = get_taxonomy_tid('https://i2.digital.library.pitt.edu','islandora_models',imodel) + logger.info(f"File is model: {my_model}, TID: {field_model}") # Process any .tif files. 
if (file_ext.lower() == ".tif"): @@ -843,7 +848,7 @@ def process_file(df, file_path: str, level): 'parent_id': parent_folder, 'field_weight': file_weight, 'field_model': field_model, - 'model': my_model, + 'model': imodel, 'field_resource_type': resource_type, 'level': level, } @@ -877,7 +882,7 @@ def process_file(df, file_path: str, level): 'parent_id': parent_folder, 'field_weight': file_weight, 'field_model': field_model, - 'model': my_model, + 'model': imodel, 'field_resource_type': resource_type, 'level': level, } @@ -906,7 +911,7 @@ def process_file(df, file_path: str, level): 'level': level, 'field_model': field_model, 'field_weight': file_weight, - 'model': my_model, + 'model': imodel, 'field_resource_type': resource_type, } @@ -934,9 +939,9 @@ def process_file(df, file_path: str, level): 'level': level, 'field_model': field_model, 'field_weight': file_weight, - 'model': my_model, + 'model': imodel, 'field_resource_type': resource_type, - } + } logger.info(f"Row Data: {row_data}") @@ -977,6 +982,9 @@ def process_file(df, file_path: str, level): row_data = { 'id': pid, 'file': file_path, + 'field_weight': file_weight, + 'field_model': field_model, + 'model': imodel, } logger.info(f"Row Data: {row_data}") @@ -1164,106 +1172,133 @@ def main(): globals()['models'] = { 'Compound Audio 1': { 'model': 'Compound Object', + 'imodel': 'Compound Object', 'resource_type': 'Collection', 'child': 'Audio', }, 'Compound Book': { 'model': 'Compound Object', + 'imodel': 'Compound Object', 'resource_type': 'Collection', 'child': 'Paged Content', }, 'Compound Image 1': { 'model': 'Paged Content', + 'imodel': 'Paged Content', 'resource_type': 'Collection', 'child': 'Page', }, 'Compound Video 1': { 'model': 'Compound Object', + 'imodel': 'Compound Object', 'resource_type': 'Collection', 'child': 'Video', }, 'Compound Object': { 'model': 'Compound Object', + 'imodel': 'Compound Object', 'resource_type': 'Collection', 'child': 'None', }, 'Collection': { 'model': 'Collection', + 
'imodel': 'Collection', 'resource_type': 'Collection', 'child': 'None', }, 'Serial 1': { 'model': 'Newspaper', + 'imodel': 'Newspaper', 'resource_type': 'Collection', 'child': 'Issue 1', }, 'Serial 2': { 'model': 'Newspaper', + 'imodel': 'Newspaper', 'resource_type': 'Collection', 'child': 'Issue 2', }, 'Issue 1': { - 'model': 'Publication Issue', + 'model': 'Publication Issue 1', + 'imodel': 'Publication Issue', 'resource_type': 'Collection', 'child': 'Page', }, 'Issue 2': { - 'model': 'Publication Issue', + 'model': 'Publication Issue 2', + 'imodel': 'Publication Issue', 'resouce_type': 'Text', 'child': 'File', }, - 'Publication Issue': { + 'Publication Issue 1': { 'model': 'Publication Issue', + 'imodel': 'Publication Issue', 'resource_type': 'Collection', 'child': 'Page', }, + 'Publication Issue 2': { + 'model': 'Publication Issue', + 'imodel': 'Publication Issue', + 'resource_type': 'Text', + 'child': 'PDF', + }, 'Book': { 'model': 'Paged Content', + 'imodel': 'Paged Content', 'resource_type': 'Collection', 'child': 'Page', }, 'Paged Content': { 'model': 'Paged Content', + 'imodel': 'Paged Content', 'resource_type': 'Collection', 'child': 'Page', }, 'Digital Document': { 'model': 'Digital Document', + 'imodel': 'Digital Document', 'resource_type': 'Text', 'child': 'None', }, 'Page': { 'model': 'Page', + 'imodel': 'Page', 'resource_type': 'Text', 'child': 'File', }, 'Image': { 'model': 'Image', + 'imodel': 'Image', 'resource_type': 'Still Image', 'child': 'File', }, 'Audio': { 'model': 'Audio', + 'imodel': 'Audio', 'resource_type': 'Sound', 'child': 'File', }, 'Video': { 'model': 'Video', + 'imodel': 'Video', 'resource_type': 'Moving Image', 'child': 'File', }, 'PDF': { 'model': 'Digital Document', + 'imodel': 'Digital Document', 'resource_type': 'Text', 'child': 'File', }, 'Binary': { 'model': 'Binary', + 'imodel': 'Binary', 'resource_type': 'Unspecified', 'child': 'File', }, 'File': { 'model': 'File', + 'imodel': 'File', 'child': 'None', }, } From 
2d83e102e8c4062130e6aab3c6c2170085f82e98 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 15:56:39 -0500 Subject: [PATCH 05/21] Added additional fields for PDFs and added minimal row_data for Simple Images. --- scan-batch-dir | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/scan-batch-dir b/scan-batch-dir index cd20558..7cd2899 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -823,7 +823,7 @@ def process_file(df, file_path: str, level): # Get the field_model from JSONAPI field_model = get_taxonomy_tid('https://i2.digital.library.pitt.edu','islandora_models',imodel) - logger.info(f"File is model: {my_model}, TID: {field_model}") + logger.info(f"File is model: {imodel}, TID: {field_model}") # Process any .tif files. if (file_ext.lower() == ".tif"): @@ -985,6 +985,8 @@ def process_file(df, file_path: str, level): 'field_weight': file_weight, 'field_model': field_model, 'model': imodel, + 'field_resource_type': resource_type, + 'level': level, } logger.info(f"Row Data: {row_data}") @@ -995,6 +997,29 @@ def process_file(df, file_path: str, level): # Process any simple image files. if (file_ext.lower() == ".png" or file_ext.lower() == ".jpg"): print(f" Type: Simple Image") + + # Handle Top Level files. + if not level == 1: + pid = f"{parent_folder}-{file_name}" + else: + pid = f"{file_name}" + + # Build Row Data. + row_data = { + 'id': pid, + 'file': file_path, + 'field_weight': file_weight, + 'field_model': field_model, + 'model': imodel, + 'field_resource_type': resource_type, + 'level': level, + } + + logger.info(f"Row Data: {row_data}") + + # Update Dataframe. + df = add_update_dataframe(df,pid,row_data) + print(f"\n") @@ -1378,6 +1403,7 @@ def main(): print(f"Creating Log file: {log_file}") globals()['logger'] = setup_logger('logger', log_file, level=logging.DEBUG) logger.info(f"Begin log.") + logger.info(f"Running User: {username}") # Read Content Models - Proposed for later. 
# globals()['content_models'] = read_yaml_file('content_models.yml') From 0a9261063f8b5beb72272c9d32d40afe81e35f99 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 16:20:16 -0500 Subject: [PATCH 06/21] Added tables to the README.md file. --- README.md | 61 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 7f1c448..28f4bbf 100644 --- a/README.md +++ b/README.md @@ -7,15 +7,23 @@ few types of file directory layouts relating to these types of objects: - Books +- Compound Books + - Manuscripts -- Newspaper Issues +- Newspaper/Serial Issues (Paged) + +- Newspaper/Serial Issues (PDF) - Audio (Oral Histories or General Audio files) - Video (Oral Histories or General Videos) -- Images +- Image (Single Image) + +- PDF (Single File) + +- Page (Single Page) Currently no other object types are addressed, but the script will identify and add them as it finds them. @@ -35,27 +43,40 @@ contents): Sheet Columns: - ----------------------------------------------------------------------- - Required Columns Description - ------------------------ ---------------------------------------------- - 'id' The PID of the object. This column must exist. - - 'file' Leave this empty but the column must exist. + |------------------------|----------------------------------------------| + |Required Columns | Description | + |------------------------|----------------------------------------------| + |'id' |The PID of the object. This column must exist.| + |--- |--- | + |'file' |Leave this empty but the column must exist. This field will be updated by the script with - the full path to the file. - - Optional Columns - - 'thumbnail' Used for A/V media. If a .jpg or .png file is + the full path to the file.| + |--- |--- | + |'model' | This column will be added. This is the Islandora + Model.| + |--- |--- | + |'field_model' |This column will be added. 
This is the Islanodra + Model Taxonomy ID.| + |--- |--- | + |'field_weight' |This column will be added. This is the order of + the objects in the sheet if it can be determined.| + |--- |--- | + |'field_resource_type' |This column will be added. This is the Resource + Type of the object in the sheet.| + |--- |--- | + + + |------------------------|-----------------------------------------------| + |Optional Columns | Description | + |------------------------|-----------------------------------------------| + |'thumbnail' |Used for A/V media. If a .jpg or .png file is found the full path to the file will be added - to this column. - - 'transcript' Used for A/V media. If a .srt or .vtt file is + to this column.| + |--- |--- | + |'transcript' |Used for A/V media. If a .srt or .vtt file is found the full path to the file will be added - to this column. - - - ----------------------------------------------------------------------- + to this column.| + |------------------------|-----------------------------------------------| Script Parameters: From e946d26b539d28fb9a9c5b0bf66d5249c91664dc Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 16:22:32 -0500 Subject: [PATCH 07/21] Updated the tables in the README.md file. --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 28f4bbf..8b8410b 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,9 @@ contents): Sheet Columns: - |------------------------|----------------------------------------------| + |--- |--- | |Required Columns | Description | - |------------------------|----------------------------------------------| + |--- |--- | |'id' |The PID of the object. This column must exist.| |--- |--- | |'file' |Leave this empty but the column must exist. 
@@ -66,9 +66,9 @@ Sheet Columns: |--- |--- | - |------------------------|-----------------------------------------------| + |--- |--- | |Optional Columns | Description | - |------------------------|-----------------------------------------------| + |--- |--- | |'thumbnail' |Used for A/V media. If a .jpg or .png file is found the full path to the file will be added to this column.| @@ -76,7 +76,7 @@ Sheet Columns: |'transcript' |Used for A/V media. If a .srt or .vtt file is found the full path to the file will be added to this column.| - |------------------------|-----------------------------------------------| + |--- |--- | Script Parameters: From cd956a3d37412a6d0d2fb2fad2195fbc6be8fe62 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 16:24:56 -0500 Subject: [PATCH 08/21] Updated the tables in the README.md file. --- README.md | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 8b8410b..66f5285 100644 --- a/README.md +++ b/README.md @@ -41,9 +41,8 @@ contents): ## Google Sheet requirements: -Sheet Columns: +### Sheet Columns: - |--- |--- | |Required Columns | Description | |--- |--- | |'id' |The PID of the object. This column must exist.| @@ -52,33 +51,22 @@ Sheet Columns: This field will be updated by the script with the full path to the file.| |--- |--- | - |'model' | This column will be added. This is the Islandora - Model.| - |--- |--- | - |'field_model' |This column will be added. This is the Islanodra - Model Taxonomy ID.| + |'model' |This column will be added. This is the Islandora Model.| |--- |--- | - |'field_weight' |This column will be added. This is the order of - the objects in the sheet if it can be determined.| + |'field_model' |This column will be added. This is the Islanodra Model Taxonomy ID.| |--- |--- | - |'field_resource_type' |This column will be added. This is the Resource - Type of the object in the sheet.| + |'field_weight' |This column will be added. 
This is the order of the objects in the sheet if it can be determined.| |--- |--- | + |'field_resource_type' |This column will be added. This is the Resource Type of the object in the sheet.| - |--- |--- | |Optional Columns | Description | |--- |--- | - |'thumbnail' |Used for A/V media. If a .jpg or .png file is - found the full path to the file will be added - to this column.| + |'thumbnail' |Used for A/V media. If a .jpg or .png file is found the full path to the file will be added to this column.| |--- |--- | - |'transcript' |Used for A/V media. If a .srt or .vtt file is - found the full path to the file will be added - to this column.| - |--- |--- | + |'transcript' |Used for A/V media. If a .srt or .vtt file is found the full path to the file will be added to this column.| -Script Parameters: +### Script Parameters: ------------------------------------------------------------------------- Required Parameters Description From a6cce3653706fd9c6f5fd67135615dc4115576d4 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 16:26:17 -0500 Subject: [PATCH 09/21] Updated the tables in the README.md file. --- README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/README.md b/README.md index 66f5285..aa0f930 100644 --- a/README.md +++ b/README.md @@ -46,24 +46,15 @@ contents): |Required Columns | Description | |--- |--- | |'id' |The PID of the object. This column must exist.| - |--- |--- | - |'file' |Leave this empty but the column must exist. - This field will be updated by the script with - the full path to the file.| - |--- |--- | + |'file' |Leave this empty but the column must exist. This field will be updated by the script with the full path to the file.| |'model' |This column will be added. This is the Islandora Model.| - |--- |--- | |'field_model' |This column will be added. This is the Islanodra Model Taxonomy ID.| - |--- |--- | |'field_weight' |This column will be added. 
This is the order of the objects in the sheet if it can be determined.| - |--- |--- | |'field_resource_type' |This column will be added. This is the Resource Type of the object in the sheet.| |Optional Columns | Description | - |--- |--- | |'thumbnail' |Used for A/V media. If a .jpg or .png file is found the full path to the file will be added to this column.| - |--- |--- | |'transcript' |Used for A/V media. If a .srt or .vtt file is found the full path to the file will be added to this column.| ### Script Parameters: From 05ed05cc3b1fcc18a411be39a32ac5beb87ffcb1 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 16:28:05 -0500 Subject: [PATCH 10/21] Updated the tables in the README.md file. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index aa0f930..c00b3c0 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ contents): ## Google Sheet requirements: -### Sheet Columns: +### Sheet Required Columns: |Required Columns | Description | |--- |--- | @@ -52,8 +52,10 @@ contents): |'field_weight' |This column will be added. This is the order of the objects in the sheet if it can be determined.| |'field_resource_type' |This column will be added. This is the Resource Type of the object in the sheet.| +### Sheet Optional Columns: |Optional Columns | Description | + |--- |--- | |'thumbnail' |Used for A/V media. If a .jpg or .png file is found the full path to the file will be added to this column.| |'transcript' |Used for A/V media. If a .srt or .vtt file is found the full path to the file will be added to this column.| From 90a062479a2051c739ac2318792e7cf32e1c0a73 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Wed, 18 Feb 2026 16:32:31 -0500 Subject: [PATCH 11/21] Updated the tables in the README.md file. 
--- README.md | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index c00b3c0..ae48099 100644 --- a/README.md +++ b/README.md @@ -59,31 +59,21 @@ contents): |'thumbnail' |Used for A/V media. If a .jpg or .png file is found the full path to the file will be added to this column.| |'transcript' |Used for A/V media. If a .srt or .vtt file is found the full path to the file will be added to this column.| -### Script Parameters: +### Script Required Parameters: - ------------------------------------------------------------------------- - Required Parameters Description - ------------------------- ----------------------------------------------- - \--config-file Full or relative path to the configuration file - used for the script. - - \--log-file Full or relative path to the log file that will - be generated. - - \--directory Full path of the directory we wish to scan. - - - - Optional Parameters - - \--in-google-sheet-id The ID number of the Google Sheet. + |Required Parameters | Description | + |--- |--- | + |\--config-file |Full or relative path to the configuration file used for the script.| + |\--log-file |Full or relative path to the log file that will be generated.| + |\--directory |Full path of the directory we wish to scan.| - \--in-google-sheet-name The Name of the Tab in the Google Sheet (E.g.: - Sheet1) +### Script Optional Parameters: - \--in-google-creds-file The full or relative path to the Google - Credentials File. 
- ------------------------------------------------------------------------- + |Optional Parameters | Description | + |--- |--- | + |\--in-google-sheet-id |The ID number of the Google Sheet.| + |\--in-google-sheet-name |The Name of the Tab in the Google Sheet (E.g.: Sheet1)| + |\--in-google-creds-file |The full or relative path to the Google Credentials File.| ## Google Credentials File: From ea1316c393709b8e6fc6cf02e2099f2a98468d54 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Thu, 26 Feb 2026 13:12:12 -0500 Subject: [PATCH 12/21] Fixed spacing in function call parameters. --- scan-batch-dir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scan-batch-dir b/scan-batch-dir index 7cd2899..eb0f9f1 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -707,7 +707,7 @@ def add_update_dataframe(df: pd.DataFrame, pid: str, row_data): logger.info(f"Adding to DataFrame: Success for PID: {pid}") return updated_df -def get_value_from_df(df: pd.DataFrame,match_column: str,match_value,return_column:str): +def get_value_from_df(df: pd.DataFrame, match_column: str, match_value, return_column: str): """ Given a pandas DataFrame, find the row where match_column == match_value and return the value from return_column. From a7226d5d2cb4a7ee47a434c168d41eeb6edced02 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Thu, 26 Feb 2026 13:15:32 -0500 Subject: [PATCH 13/21] Adjust function documentation to correctly describe the return value as being the first value from the return column. --- scan-batch-dir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scan-batch-dir b/scan-batch-dir index eb0f9f1..a80b2e5 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -710,7 +710,7 @@ def add_update_dataframe(df: pd.DataFrame, pid: str, row_data): def get_value_from_df(df: pd.DataFrame, match_column: str, match_value, return_column: str): """ Given a pandas DataFrame, find the row where match_column == match_value - and return the value from return_column. 
+ and return the first value from return_column. Parameters: df (pd.DataFrame): The DataFrame to search. From d2c1561e34ec5a85ebfd4f14e59a9c966b470f57 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Thu, 26 Feb 2026 13:20:32 -0500 Subject: [PATCH 14/21] Added function documentation to process_file. --- scan-batch-dir | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scan-batch-dir b/scan-batch-dir index a80b2e5..ecf9238 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -785,6 +785,17 @@ def get_taxonomy_tid(base_url, vocabulary, term_name, auth_token=None): def process_file(df, file_path: str, level): + """ + Process the found file. + + Parameters: + df: The Pandas dataframe. + file_path (str): The path to the file. + level: The level of the path in relation to the starting directory. + + Returns: + df: The updated dataframe. + """ logger.info(f"Function: process_file") logger.info(f"Processing File: {file_path}") print(f"Processing File: {file_path}") From f04de97ca1524fd4faa8e1d9114edfb06295970e Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Thu, 26 Feb 2026 13:46:23 -0500 Subject: [PATCH 15/21] Adding some argument signatures to functions. --- scan-batch-dir | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scan-batch-dir b/scan-batch-dir index ecf9238..694bab0 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -784,7 +784,7 @@ def get_taxonomy_tid(base_url, vocabulary, term_name, auth_token=None): logger.error(f"get_taxonomy_tid - Error {response.status_code}: {response.text}") -def process_file(df, file_path: str, level): +def process_file(df: pd.DataFrame, file_path: str, level): """ Process the found file. @@ -1037,7 +1037,7 @@ def process_file(df, file_path: str, level): return(df) -def get_directory(directory): +def get_directory(directory: str): """ Scan a directory and return a list of files and subdirectories. 
@@ -1058,7 +1058,7 @@ def get_directory(directory): result.sort(key=lambda x: (x[1] != "Directory", x[0].lower())) return result -def process_objects(df, directory: str, level): +def process_objects(df: pd.DataFrame, directory: str, level): """ Process the objects of the directory. @@ -1125,7 +1125,7 @@ def process_objects(df, directory: str, level): return df -def get_model_paths(start_model, models_dict): +def get_model_paths(start_model: str, models_dict: dict): """ Recursively map paths from a starting model to all leaf models. @@ -1152,7 +1152,7 @@ def get_model_paths(start_model, models_dict): child_paths = get_model_paths(child, models_dict) return [[model] + path for path in child_paths] -def get_model_info(model,models_dict): +def get_model_info(model: str,models_dict: dict): # Check if the starting model exists if model not in models_dict: return [] @@ -1160,7 +1160,7 @@ def get_model_info(model,models_dict): model_info = models_dict[model] return model_info -def get_model(level): +def get_model(level:int): #if model_paths[0][level-1] == "File": # Return parent model # return model_paths[0][level-2] @@ -1172,11 +1172,11 @@ def get_model(level): else: return model_paths[0][level-1] -def get_resource_type(level): +def get_resource_type(level:int): return 0 -def is_target_in_list(target,list): +def is_target_in_list(target, list): return target in list def get_value(map, key, default=None): From 4726124c0dc830e5b5f67e7b8c4be29292157e9b Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Thu, 26 Feb 2026 13:54:51 -0500 Subject: [PATCH 16/21] Removed some unused functions. 
--- scan-batch-dir | 7 ------- 1 file changed, 7 deletions(-) diff --git a/scan-batch-dir b/scan-batch-dir index 694bab0..8323606 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -1172,16 +1172,9 @@ def get_model(level:int): else: return model_paths[0][level-1] -def get_resource_type(level:int): - return 0 - - def is_target_in_list(target, list): return target in list -def get_value(map, key, default=None): - return map.get(key, default) - def add_column(df: pd.DataFrame, column_name: str, default=None): if column_name not in df.columns: df[column_name] = default From 736b240c1761caebf99188f3e1c90b29c93126dc Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Thu, 26 Feb 2026 15:49:04 -0500 Subject: [PATCH 17/21] Added the missing $ in the regex. --- scan-batch-dir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scan-batch-dir b/scan-batch-dir index 8323606..639054e 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -813,7 +813,7 @@ def process_file(df: pd.DataFrame, file_path: str, level): if file_name.isdigit(): file_weight = int(file_name) else: - pattern = r'.*-(\d{4})' + pattern = r'.*-(\d{4})$' match = re.search(pattern,file_name) if match: file_weight = int(match.group(1)) From 607681e67c80b20677fb03cb5e612f5a024ee23d Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Thu, 26 Feb 2026 15:56:59 -0500 Subject: [PATCH 18/21] Remove function in preference for in-line code. --- scan-batch-dir | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scan-batch-dir b/scan-batch-dir index 639054e..1a24009 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -1172,9 +1172,6 @@ def get_model(level:int): else: return model_paths[0][level-1] -def is_target_in_list(target, list): - return target in list - def add_column(df: pd.DataFrame, column_name: str, default=None): if column_name not in df.columns: df[column_name] = default @@ -1415,7 +1412,7 @@ def main(): # Get Batch Model. 
globals()['model'] = args.model logger.info(f"Batch Model: {model}") - if not is_target_in_list(model.lower(), allowed_models): + if not (model.lower() in allowed_models): logger.error(f"Model: {model} is not in the list of allowed_models.") print(f"Model: {model} is not in the list of allowed_models.") From 91d8a30ee94f2522dae45c4c2f7680c3770ae8c6 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Thu, 26 Feb 2026 16:02:26 -0500 Subject: [PATCH 19/21] Removed unused function dump_df_columns. --- scan-batch-dir | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/scan-batch-dir b/scan-batch-dir index 1a24009..ba8734a 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -650,29 +650,6 @@ def get_file_info(file_path: str): return(pid,parent,file,dir,ext) -def dump_df_columns(df: pd.DataFrame, columns: list): - """ - Prints a DataFrame containing only the specified columns. - - Parameters: - df (pd.DataFrame): The original DataFrame. - columns (list): List of column names to include. - - Raises: - ValueError: If any requested column does not exist in the DataFrame. - """ - # Validate columns - missing_cols = [col for col in columns if col not in df.columns] - if missing_cols: - logger.error(f"The following columns do not exist in the DataFrame: {missing_cols}") - - # Prints the DataFrame with selected columns - with pd.option_context( - 'display.max_rows', None, - ): - print(f"Dataframe:\n{df[columns]}") - - def add_update_dataframe(df: pd.DataFrame, pid: str, row_data): """ Depending upon if the pid exists in the 'id' column of the dataframe From fb5ca84faf61fe57905f7c90e40d5044c7184502 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Fri, 27 Feb 2026 10:13:31 -0500 Subject: [PATCH 20/21] Moved skip patterns to the config file to allow for customization. 
--- scan-batch-dir | 12 ++++++++---- scan-batch-dir.conf-sample | 3 +++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/scan-batch-dir b/scan-batch-dir index ba8734a..1449f08 100755 --- a/scan-batch-dir +++ b/scan-batch-dir @@ -1358,6 +1358,7 @@ def main(): globals()['google_sheet_id'] = cfg['google_sheet_id'] globals()['google_sheet_name'] = cfg['google_sheet_name'] globals()['log_file'] = cfg['log_file'] + globals()['skip'] = cfg['skip'].split(",") # Override config file variables with command line parameters. if args.in_gs_creds is not None: @@ -1373,16 +1374,19 @@ def main(): globals()['required_columns'] = ["id","file","level","parent_id", "field_weight","field_model","model","field_resource_type","transcript"] - # Global file patterns to skip over. - globals()['skip'] = ["ignore",".jp2",".metadata","meta",".opex",".fits", - "target.tif","metadata.csv","metadata.xlsx","manifest.xlsx","manifest.csv"] - # Create the Log file. print(f"Creating Log file: {log_file}") globals()['logger'] = setup_logger('logger', log_file, level=logging.DEBUG) logger.info(f"Begin log.") logger.info(f"Running User: {username}") + # Log running config values. + logger.info(f"Log File: {log_file}") + logger.info(f"Skip Patterns: {skip}") + logger.info(f"Google Credentials: {google_credentials}") + logger.info(f"Google Sheet ID: {google_sheet_id}") + logger.info(f"Google Sheet Name: {google_sheet_name}") + # Read Content Models - Proposed for later. # globals()['content_models'] = read_yaml_file('content_models.yml') diff --git a/scan-batch-dir.conf-sample b/scan-batch-dir.conf-sample index 9ed7d6d..a6efd3e 100644 --- a/scan-batch-dir.conf-sample +++ b/scan-batch-dir.conf-sample @@ -14,3 +14,6 @@ google_sheet_name: Sheet1 # Log file log_file: /path/to/file.log +#-------------------------------------------------------------------- +# Skip patterns - Directory/File patterns to be ignored. 
+skip: ignore,meta,.jp2,.metadata,.opex,.fits,target.tif,metadata.csv,metadata.xlsx,manifest.xlsx,manifest.csv From e356d520e86ef4e3cd3e21faf4a94ad170713867 Mon Sep 17 00:00:00 2001 From: Brian Gregg Date: Fri, 27 Feb 2026 10:18:42 -0500 Subject: [PATCH 21/21] Updated the README.md file to address the \'skip\' parameter. --- README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/README.md b/README.md index ae48099..b793839 100644 --- a/README.md +++ b/README.md @@ -28,16 +28,7 @@ few types of file directory layouts relating to these types of objects: Currently no other object types are addressed, but the script will identify and add them as it finds them. -The script will ignore the following files and directories (including -contents): - -- Directory named 'ignore' - -- File named 'manuscript.csv' - -- File named 'manuscript.xls' - -- File named 'manuscript.xlsx' +The script will ignore the files and directories (including their contents) that match the patterns in the configuration file's 'skip' parameter. This is a comma-separated list of file patterns that you do not want the script to process. ## Google Sheet requirements: