Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 69 additions & 4 deletions scripts/1-fetch/smithsonian_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,77 @@
"TOTAL_OBJECTS",
]
HEADER_2_UNITS = [
"UNIT",
"UNIT_CODE",
"DATA_SOURCE",
"CC0_RECORDS",
"CC0_RECORDS_WITH_CC0_MEDIA",
"TOTAL_OBJECTS",
]
QUARTER = os.path.basename(PATHS["data_quarter"])

# Manually compiled unit code and name from URL
# 'https://github.com/Smithsonian/OpenAccess'
# Maps Smithsonian unit codes (as returned by the stats API) to the
# human-readable names written to the DATA_SOURCE column.
# Fix: "Musuem" corrected to "Museum" in the NMNH entries.
UNIT_MAP = {
    "AAA": "Archives of American Art",
    "AAG": "Archives of American Gardens",
    "ACM": "Anacostia Community Museum",
    "ACMA": "Anacostia Community Museum Archives",
    "CFCHFOLKLIFE": "Ralph Rinzler Folklife Archives and Collections",
    "CHNDM": "Cooper Hewitt, Smithsonian Design Museum",
    "FBR": "Smithsonian Field Book Project",
    "FSG": "Freer Gallery of Art and Arthur M. Sackler Gallery",
    "HAC": "Smithsonian Gardens",
    "HMSG": "Hirshhorn Museum and Sculpture Garden",
    "HSFA": "Human Studies Film Archives",
    "NASM": "National Air and Space Museum",
    "NMAAHC": "National Museum of African American History and Culture",
    "NMAH": "National Museum of American History",
    "NMAI": "National Museum of the American Indian",
    "NMAfA": "National Museum of African Art",
    "NMNHANTHRO": "National Museum of Natural History - Anthropology Dept.",
    "NMNHBIRDS": (
        "National Museum of Natural History - Vertebrate Zoology - Birds"
        " Division"
    ),
    "NMNHBOTANY": "National Museum of Natural History - Botany Dept.",
    "NMNHEDUCATION": (
        "National Museum of Natural History - Education & Outreach"
    ),
    "NMNHENTO": "National Museum of Natural History - Entomology Dept.",
    "NMNHFISHES": (
        "National Museum of Natural History - Vertebrate Zoology - Fishes"
        " Division"
    ),
    "NMNHHERPS": (
        "National Museum of Natural History - Vertebrate Zoology"
        " - Herpetology Division"
    ),
    "NMNHINV": (
        "National Museum of Natural History - Invertebrate Zoology Dept."
    ),
    "NMNHMAMMALS": (
        "National Museum of Natural History"
        " - Vertebrate Zoology - Mammals Division"
    ),
    "NMNHMINSCI": (
        "National Museum of Natural History - Mineral Sciences Dept."
    ),
    "NMNHPALEO": "National Museum of Natural History - Paleobiology Dept.",
    "NPG": "National Portrait Gallery",
    "NPM": "National Postal Museum",
    "NZP": "Smithsonian's National Zoo & Conservation Biology Institute",
    "OCIO_DPO3D": "OCIO Digital Preservation & 3D Team",
    "OFEO-SG": "Office of Facilities Engineering &"
    " Operations – Smithsonian Gardens",
    "SAAM": "Smithsonian American Art Museum",
    "SIA": "Smithsonian Institution Archives",
    "SIL": "Smithsonian Libraries",
    "SILAF": "Smithsonian Institution Libraries, African Section",
    "SILNMAHTL": "Smithsonian Institution Libraries,"
    " National Museum of American History, Library",
    "SLA_SRO": "Smithsonian Libraries Archives, Special Research/Operations",
}


def parse_arguments():
"""
Expand Down Expand Up @@ -102,7 +166,7 @@ def query_smithsonian(args, session):
" API key is set in .env",
1,
)
LOGGER.info("Fetch CC0 metrics and units from units from Smithsonain")
LOGGER.info("Fetch CC0 metrics and units from units from Smithsonian")
url = "https://api.si.edu/openaccess/api/v1.0/stats"
params = {"api_key": DATA_GOV_API_KEY}
try:
Expand Down Expand Up @@ -132,15 +196,16 @@ def query_smithsonian(args, session):
continue
data_units.append(
{
"UNIT": unit["unit"],
"UNIT_CODE": unit["unit"],
"DATA_SOURCE": UNIT_MAP.get(unit["unit"], unit["unit"]),
"CC0_RECORDS": unit["metrics"]["CC0_records"],
"CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
"CC0_records_with_CC0_media"
],
"TOTAL_OBJECTS": unit["total_objects"],
}
)
data_units = sorted(data_units, key=itemgetter("UNIT"))
data_units = sorted(data_units, key=itemgetter("UNIT_CODE"))
LOGGER.info(f"Fetched stats for {len(data_units)} units")
return data_metrics, data_units

Expand Down
203 changes: 203 additions & 0 deletions scripts/2-process/smithsonian_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#!/usr/bin/env python
"""
This file is dedicated to processing Smithsonian data
for analysis and comparison between quarters.
"""

# Standard library
import argparse
import os
import sys
import traceback

# Third-party
import pandas as pd

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])
FILE_PATHS = [
shared.path_join(PATHS["data_phase"], "smithsonian_totals_by_units.csv"),
shared.path_join(PATHS["data_phase"], "smithsonian_totals_by_records.csv"),
]


def parse_arguments():
    """
    Parse command-line options and return the argument namespace.

    When a non-default --quarter is given, re-points the module globals
    (FILE_PATHS, PATHS, QUARTER) at that quarter. The logger and paths
    are attached to the returned namespace for downstream helpers.
    """
    global QUARTER, FILE_PATHS, PATHS
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    # All remaining options are plain on/off switches; register in bulk.
    switches = [
        ("--enable-save", "Enable saving results (default: False)"),
        (
            "--enable-git",
            "Enable git actions such as fetch, merge, add, commit, and push"
            " (default: False)",
        ),
        ("--force", "Regenerate data even if processed files already exist"),
    ]
    for option, help_text in switches:
        parser.add_argument(option, action="store_true", help=help_text)

    args = parser.parse_args()
    # Git actions imply saved output; reject the inconsistent combination.
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    if args.quarter != QUARTER:
        # Rewrite module-level paths to target the requested quarter.
        FILE_PATHS = shared.paths_list_update(
            LOGGER, FILE_PATHS, QUARTER, args.quarter
        )
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
        QUARTER = args.quarter
    args.logger = LOGGER
    args.paths = PATHS
    return args


def process_totals_by_units(args, count_data):
    """
    Processing count data: totals by units
    """
    # NOTE: the docstring above is logged verbatim; keep it stable.
    LOGGER.info(process_totals_by_units.__doc__.strip())
    # Aggregate TOTAL_OBJECTS per DATA_SOURCE. Summing (rather than
    # overwriting, which silently dropped all but the last row for a
    # duplicated source) keeps this consistent with
    # process_totals_by_records, which also accumulates per unit.
    data = {}

    for row in count_data.itertuples(index=False):
        unit = str(row.DATA_SOURCE)
        total_objects = int(row.TOTAL_OBJECTS)

        data[unit] = data.get(unit, 0) + total_objects

    data = pd.DataFrame(data.items(), columns=["Data_source", "Total_objects"])
    data.sort_values("Data_source", ascending=True, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "smithsonian_totals_by_units.csv"
    )
    shared.dataframe_to_csv(args, data, file_path)


def process_totals_by_records(args, count_data):
    """
    Processing count data: totals by records
    """
    # NOTE: the docstring above is logged verbatim; keep it stable.
    LOGGER.info(process_totals_by_records.__doc__.strip())
    # Accumulate per-unit counts; rows sharing a DATA_SOURCE are summed.
    totals = {}

    for row in count_data.itertuples(index=False):
        entry = totals.setdefault(
            str(row.DATA_SOURCE),
            {
                "CC0_records": 0,
                "CC0_records_with_CC0_media": 0,
                "Total_objects": 0,
            },
        )
        entry["CC0_records"] += int(row.CC0_RECORDS)
        entry["CC0_records_with_CC0_media"] += int(
            row.CC0_RECORDS_WITH_CC0_MEDIA
        )
        entry["Total_objects"] += int(row.TOTAL_OBJECTS)

    frame = (
        pd.DataFrame.from_dict(totals, orient="index")
        .reset_index()
        .rename(columns={"index": "Data_source"})
    )

    # Percentage breakdown of each unit's objects by CC0/media status.
    cc0 = frame["CC0_records"]
    cc0_media = frame["CC0_records_with_CC0_media"]
    total = frame["Total_objects"]
    frame["CC0_without_media_percentage"] = (
        (cc0 - cc0_media) / total * 100
    ).round(2)
    frame["CC0_with_media_percentage"] = (cc0_media / total * 100).round(2)
    frame["Others_percentage"] = ((total - cc0) / total * 100).round(2)

    frame.sort_values("Data_source", ascending=True, inplace=True)
    frame.reset_index(drop=True, inplace=True)

    file_path = shared.path_join(
        PATHS["data_phase"], "smithsonian_totals_by_records.csv"
    )
    shared.dataframe_to_csv(args, frame, file_path)


def main():
    """
    Load the fetched Smithsonian per-unit CSV, derive the processed
    totals files, and (when enabled) commit and push the results.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    # Skips regeneration if the processed files already exist (unless --force).
    shared.check_completion_file_exists(args, FILE_PATHS)
    file_count = shared.path_join(
        PATHS["data_1-fetch"], "smithsonian_2_units.csv"
    )
    count_data = shared.open_data_file(
        LOGGER,
        file_count,
        usecols=[
            "UNIT_CODE",
            "DATA_SOURCE",
            "CC0_RECORDS",
            "CC0_RECORDS_WITH_CC0_MEDIA",
            "TOTAL_OBJECTS",
        ],
    )
    process_totals_by_units(args, count_data)
    process_totals_by_records(args, count_data)

    # Push changes
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        # Fixed copy-paste: this script processes Smithsonian data, not GitHub.
        f"Add and commit new Smithsonian data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # Exit code 0 signals a controlled early-out, not a failure.
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Logger.exception() appends the active traceback itself; the
        # previous explicit traceback.format_exc() duplicated it in logs.
        LOGGER.exception("(1) Unhandled exception")
        sys.exit(1)
Loading