From 9042c7848dc18420894c97cbbe39c31ecc7e0797 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Fri, 27 Feb 2026 19:32:51 -0800 Subject: [PATCH 01/30] feat: cleans noisy features (doi, numbered references, figure captions, etc) --- src/preprocessing/text_cleaner.py | 237 ++++++++++++++++++++++++++++++ tests/test_text_cleaner.py | 231 +++++++++++++++++++++++++++++ 2 files changed, 468 insertions(+) create mode 100644 src/preprocessing/text_cleaner.py create mode 100644 tests/test_text_cleaner.py diff --git a/src/preprocessing/text_cleaner.py b/src/preprocessing/text_cleaner.py new file mode 100644 index 0000000..267bd61 --- /dev/null +++ b/src/preprocessing/text_cleaner.py @@ -0,0 +1,237 @@ +"""Noise removal for raw .txt files extracted from scientific PDFs. + +Strips content that adds no value to structured data extraction: + - Reference / bibliography sections (everything from the header onward) + - Acknowledgement and funding sections + - Author affiliation blocks (department names, university lines, emails) + - Figure and table captions + - Copyright and licence lines + - Standalone page-number lines + - DOI / URL lines + - Journal metadata lines (volume, issue, ISSN, received/accepted dates) + - Excessive blank lines and leading/trailing whitespace + +Exposes one primary function:: + + clean_text(text: str) -> str + +The function is safe to call on text that already has ``[PAGE N]`` markers; +those markers are preserved so the downstream ``extract_key_sections()`` +function can still do page-priority ranking. 
+ +Usage (standalone):: + + python text_cleaner.py path/to/file.txt + python text_cleaner.py path/to/file.txt --max-chars 5000 +""" + +import re +import sys +from pathlib import Path +from typing import List + +# --------------------------------------------------------------------------- +# Section-level drop patterns +# When a line matches one of these, the entire remainder of that page/section +# block is discarded (until the next [PAGE N] marker or end of text). +# --------------------------------------------------------------------------- +_SECTION_DROP_HEADERS: re.Pattern = re.compile( + r"(?i)^\s*(" + r"acknowledge?ments?" + r"|literature\s+cited" + r"|references?\s+cited" + r"|references?" + r"|bibliography" + r"|appendix\b" + r"|supplementary\s+(data|material|information)" + r"|supporting\s+information" + r"|conflict\s+of\s+interest" + r"|competing\s+interests?" + r"|author\s+contributions?" + r"|funding(?:\s+(?:sources?|information))?" + r"|data\s+availability" + r"|ethics\s+(statement|declaration)" + r")\s*[:\.]?\s*$" +) + +# --------------------------------------------------------------------------- +# Line-level drop patterns +# Lines that individually match one of these are removed regardless of where +# they appear in the document. +# --------------------------------------------------------------------------- +_LINE_DROP_PATTERNS: List[re.Pattern] = [ + # Standalone page numbers (digit-only line, 1–4 digits, optional spaces) + re.compile(r"^\s*\d{1,4}\s*$"), + + # Reference list entries: "[1] Smith ...", "1. Smith ...", "1) Smith ..." 
+ re.compile(r"^\s*\[\d+\]\s+[A-Z]"), + re.compile(r"^\s*\d{1,3}[.)]\s{1,4}[A-Z][a-z]"), + + # DOI and bare URLs + re.compile(r"(?i)(https?://|doi\.org/|www\.)\S+"), + + # Email addresses + re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"), + + # Copyright / licence lines + re.compile(r"(?i)(©|\(c\)\s*\d{4}|copyright\s+\d{4}|all\s+rights\s+reserved" + r"|published\s+by\s+elsevier|creative\s+commons|open\s+access" + r"|this\s+(article|paper|is)\s+(is\s+)?published)"), + + # Journal metadata: volume, issue, ISSN, page range + re.compile(r"(?i)^\s*(vol(ume)?\.?\s*\d|issue\s*\d|pp\.\s*\d|issn\s*[\d\-]" + r"|journal\s+of|proceedings\s+of)"), + + # Received / accepted / revised / available-online timestamps + re.compile(r"(?i)^\s*(received|accepted|revised|available\s+online|" + r"published\s+online|handling\s+editor)\s*[:;]"), + + # Keywords line + re.compile(r"(?i)^\s*key\s*-?\s*words?\s*[:\-]"), + + # Figure / table / plate captions + re.compile(r"(?i)^\s*(fig(ure)?\.?\s*\d|table\s*\d|plate\s*\d|" + r"fig\.\s*s\d|supplemental?\s+(table|figure)\s*\d)" + r"[\s.\-–—:]"), + + # Author affiliation lines (institution / department / lab names) + re.compile(r"(?i)^\s*(department\s+of|faculty\s+of|institute\s+(of|for)|" + r"division\s+of|school\s+of|laboratory\s+of|lab\s+of|" + r"centre?\s+(for|of)|program(me)?\s+(in|of)|" + r"universidad|universit[éy]|université|universidade|" + r"university\s+of|college\s+of)"), + + # Running page headers / footers: short all-caps lines + re.compile(r"^[A-Z\s\d\.\-–—:,]{5,60}$"), +] + +# --------------------------------------------------------------------------- +# Whitespace normalisation +# --------------------------------------------------------------------------- +_MULTI_BLANK = re.compile(r"\n{3,}") +_TRAILING_SPACES = re.compile(r"[ \t]+\n") + + +def _drop_line(line: str) -> bool: + """Return True if this line should be removed from the output.""" + stripped = line.strip() + if not stripped: + return False # preserve 
blank lines for now; collapsed later + for pat in _LINE_DROP_PATTERNS: + if pat.search(stripped): + return True + return False + + +def clean_text(text: str) -> str: + """Strip noise from raw extracted text. + + Processes the text in two passes: + + 1. **Section-level pass** — scans for section headers that mark the start + of non-useful content (References, Acknowledgements, etc.) and discards + everything from that header to the next ``[PAGE N]`` marker (or end of + text). This preserves other sections on subsequent pages. + + 2. **Line-level pass** — removes individual noisy lines (page numbers, + DOIs, email addresses, figure captions, affiliation lines, copyright + notices, reference-list entries, journal metadata). + + Finally, excess blank lines are collapsed to a single blank line and + leading/trailing whitespace is stripped. + + Args: + text: Raw text content, optionally containing ``[PAGE N]`` markers. + + Returns: + Cleaned text with noise removed. + """ + # ── Pass 1: section-level drop ────────────────────────────────────────── + # Split on [PAGE N] markers and process each page block independently. + # Within each block, once a drop-section header is found, discard the + # rest of that block. + page_split = re.split(r"(\[PAGE\s+\d+\])", text) + # page_split alternates: [pre-marker-text, marker, text, marker, text, ...] 
+ + cleaned_parts: List[str] = [] + for segment in page_split: + if re.match(r"\[PAGE\s+\d+\]", segment): + # Keep the marker itself + cleaned_parts.append(segment) + continue + + lines = segment.split("\n") + kept: List[str] = [] + in_drop_section = False + for line in lines: + if in_drop_section: + # Skip until end of this page block + continue + if _SECTION_DROP_HEADERS.match(line): + in_drop_section = True + continue + kept.append(line) + cleaned_parts.append("\n".join(kept)) + + after_section_pass = "".join(cleaned_parts) + + # ── Pass 2: line-level drop ────────────────────────────────────────── + output_lines: List[str] = [] + for line in after_section_pass.split("\n"): + if _drop_line(line): + continue + output_lines.append(line) + + after_line_pass = "\n".join(output_lines) + + # ── Normalise whitespace ──────────────────────────────────────────────── + after_line_pass = _TRAILING_SPACES.sub("\n", after_line_pass) + after_line_pass = _MULTI_BLANK.sub("\n\n", after_line_pass) + return after_line_pass.strip() + + +# --------------------------------------------------------------------------- +# Standalone usage +# --------------------------------------------------------------------------- + +def main() -> None: # pragma: no cover + import argparse + + parser = argparse.ArgumentParser( + description="Strip noise from a raw .txt file extracted from a scientific PDF." + ) + parser.add_argument("input", type=str, help="Path to the .txt file to clean.") + parser.add_argument( + "--max-chars", + type=int, + default=None, + help="If set, truncate the cleaned output to this many characters.", + ) + parser.add_argument( + "--output", + type=str, + default=None, + help="Optional path to write the cleaned text. 
Defaults to stdout.", + ) + args = parser.parse_args() + + input_path = Path(args.input) + if not input_path.exists(): + print(f"[ERROR] File not found: {input_path}", file=sys.stderr) + sys.exit(1) + + raw = input_path.read_text(encoding="utf-8") + cleaned = clean_text(raw) + + if args.max_chars and len(cleaned) > args.max_chars: + cleaned = cleaned[: args.max_chars] + + if args.output: + Path(args.output).write_text(cleaned, encoding="utf-8") + print(f"[INFO] Cleaned text written to {args.output} ({len(cleaned)} chars)", file=sys.stderr) + else: + print(cleaned) + + +if __name__ == "__main__": + main() diff --git a/tests/test_text_cleaner.py b/tests/test_text_cleaner.py new file mode 100644 index 0000000..0e621c8 --- /dev/null +++ b/tests/test_text_cleaner.py @@ -0,0 +1,231 @@ +"""Unit tests for src/preprocessing/text_cleaner.py""" + +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from src.preprocessing.text_cleaner import clean_text + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_page(n: int, body: str) -> str: + return f"[PAGE {n}]\n{body}" + + +# --------------------------------------------------------------------------- +# Reference / bibliography section removal +# --------------------------------------------------------------------------- + + +class TestReferenceSectionRemoval: + def test_drops_references_header_and_trailing_content(self): + text = ( + "Methods\nWe examined 50 stomachs from Canis lupus.\n\n" + "References\nSmith J. 2001. J Ecol 10:1-5.\nDoe A. 2002. Nature 400:1.\n" + ) + result = clean_text(text) + assert "Smith J." not in result + assert "Doe A." not in result + + def test_keeps_content_before_references(self): + text = ( + "Results\nSample size was 30.\n\n" + "References\n1. Author A. 2000. Title. 
Journal.\n" + ) + result = clean_text(text) + assert "Sample size was 30" in result + + def test_drops_literature_cited(self): + text = "Discussion\nSee below.\n\nLiterature Cited\nFoo B. 1999.\n" + result = clean_text(text) + assert "Foo B." not in result + + def test_drops_bibliography(self): + text = "Results\nN=42.\n\nBibliography\nBar C. 2005.\n" + result = clean_text(text) + assert "Bar C." not in result + + def test_references_on_separate_page_doesnt_poison_next_page(self): + text = ( + "[PAGE 3]\nResults\nSample size = 30.\n" + "[PAGE 4]\nReferences\n1. Smith 2001.\n" + "[PAGE 5]\nDiscussion\nThis study found important results.\n" + ) + result = clean_text(text) + assert "Sample size = 30" in result + assert "Smith 2001" not in result + assert "This study found important results" in result + + +# --------------------------------------------------------------------------- +# Acknowledgement / funding section removal +# --------------------------------------------------------------------------- + + +class TestAcknowledgementRemoval: + def test_drops_acknowledgements(self): + text = "Methods\nWe sampled fish.\n\nAcknowledgements\nWe thank our funders.\n" + result = clean_text(text) + assert "We thank our funders" not in result + + def test_drops_acknowledgments_us_spelling(self): + text = "Results\nN=10.\n\nAcknowledgments\nFunded by NSF.\n" + result = clean_text(text) + assert "Funded by NSF" not in result + + def test_drops_funding_section(self): + text = "Abstract\nWe studied diets.\n\nFunding\nGrant XYZ-123.\n" + result = clean_text(text) + assert "Grant XYZ-123" not in result + + def test_keeps_content_before_acknowledgements(self): + text = "Results\n42 stomachs examined.\n\nAcknowledgements\nThanks.\n" + result = clean_text(text) + assert "42 stomachs examined" in result + + +# --------------------------------------------------------------------------- +# Line-level noise removal +# 
--------------------------------------------------------------------------- + + +class TestPageNumberRemoval: + def test_removes_standalone_page_number(self): + text = "Results\n\n42\n\nMore text here.\n" + result = clean_text(text) + assert "\n42\n" not in result + + def test_keeps_number_inside_sentence(self): + text = "We examined 42 stomachs.\n" + result = clean_text(text) + assert "We examined 42 stomachs" in result + + +class TestReferenceEntryRemoval: + def test_removes_bracketed_ref_entry(self): + result = clean_text("[1] Smith J. 2001. Nature.\n") + assert "[1] Smith" not in result + + def test_removes_numbered_ref_entry(self): + result = clean_text("1. Jones A. 1999. J Ecol.\n") + assert "Jones A." not in result + + def test_keeps_regular_sentences_starting_with_number(self): + # Sentences like "40 specimens were examined" should stay + text = "A total of 40 specimens were examined in 2005.\n" + result = clean_text(text) + assert "40 specimens" in result + + +class TestUrlAndDoiRemoval: + def test_removes_http_url(self): + result = clean_text("See https://www.example.com/paper for details.\n") + assert "https://" not in result + + def test_removes_doi(self): + result = clean_text("doi.org/10.1016/j.biocon.2001.01.001\n") + assert "doi.org" not in result + + +class TestEmailRemoval: + def test_removes_email_address(self): + result = clean_text("Contact: author@university.edu for more info.\n") + assert "@university.edu" not in result + + +class TestCopyrightRemoval: + def test_removes_copyright_symbol(self): + result = clean_text("© 2021 Elsevier Ltd. All rights reserved.\n") + assert "Elsevier" not in result + + def test_removes_copyright_word(self): + result = clean_text("Copyright 2020 The Authors.\n") + assert "Copyright 2020" not in result + + +class TestFigureCaptionRemoval: + def test_removes_figure_caption(self): + result = clean_text("Figure 1. 
Map of study area showing sampling sites.\n") + assert "Map of study area" not in result + + def test_removes_table_caption(self): + result = clean_text("Table 2. Diet composition of Vulpes vulpes.\n") + assert "Diet composition" not in result + + def test_removes_fig_abbreviation(self): + result = clean_text("Fig. 3. Distribution of stomach contents.\n") + assert "Distribution of stomach" not in result + + +class TestAffiliationRemoval: + def test_removes_department_line(self): + result = clean_text("Department of Ecology, University of Oslo, Norway.\n") + assert "Department of Ecology" not in result + + def test_removes_university_of_line(self): + result = clean_text("University of Cambridge, Cambridge CB2 1TN, UK.\n") + assert "University of Cambridge" not in result + + +class TestReceivedAcceptedRemoval: + def test_removes_received_line(self): + result = clean_text("Received: 12 March 2023; Accepted: 5 June 2023\n") + assert "12 March 2023" not in result + + +class TestKeywordsRemoval: + def test_removes_keywords_line(self): + result = clean_text("Keywords: predator, diet, stomach contents, ecology\n") + assert "predator, diet" not in result + + +# --------------------------------------------------------------------------- +# Whitespace normalisation +# --------------------------------------------------------------------------- + + +class TestWhitespaceNormalisation: + def test_collapses_multiple_blank_lines(self): + text = "Line one.\n\n\n\n\nLine two.\n" + result = clean_text(text) + assert "\n\n\n" not in result + + def test_strips_leading_and_trailing_whitespace(self): + text = " \n\nSome content.\n\n " + result = clean_text(text) + assert result == result.strip() + + +# --------------------------------------------------------------------------- +# Pass-through for clean text +# --------------------------------------------------------------------------- + + +class TestCleanPassThrough: + def test_clean_text_passes_through_unaffected_content(self): + text = 
( + "Abstract\nThis study examined stomach contents of Canis lupus.\n\n" + "Methods\nWe collected 50 specimens from Yellowstone, USA.\n\n" + "Results\nOf 50 stomachs, 8 were empty and 42 contained prey.\n" + ) + result = clean_text(text) + for fragment in ["Canis lupus", "50 specimens", "Yellowstone", "8 were empty"]: + assert fragment in result, f"Expected '{fragment}' to be preserved" + + def test_page_markers_preserved(self): + text = "[PAGE 1]\nAbstract\nDiet study.\n[PAGE 2]\nMethods\nWe sampled.\n" + result = clean_text(text) + assert "[PAGE 1]" in result + assert "[PAGE 2]" in result + + def test_empty_string_returns_empty(self): + assert clean_text("") == "" + + def test_whitespace_only_returns_empty(self): + assert clean_text(" \n\n ") == "" From fa6145973729f3e1d7df346eff2c242e48c9875e Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Fri, 27 Feb 2026 20:06:43 -0800 Subject: [PATCH 02/30] fix: text cleaning left blank lines --- src/preprocessing/text_cleaner.py | 98 +++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 5 deletions(-) diff --git a/src/preprocessing/text_cleaner.py b/src/preprocessing/text_cleaner.py index 267bd61..d9c1511 100644 --- a/src/preprocessing/text_cleaner.py +++ b/src/preprocessing/text_cleaner.py @@ -54,6 +54,17 @@ r")\s*[:\.]?\s*$" ) +# Structured-header block patterns common in two-column journal PDFs. +# When a line matches, subsequent short lines (< 60 chars) that look like +# keyword / metadata values are dropped until a long content line resumes. +# NOTE: requires actual whitespace between letters to avoid matching "Abstract". 
+_STRUCTURED_HEADER_START: re.Pattern = re.compile( + r"(?i)^\s*(" + r"a\s+b\s+s\s+t\s+r\s+a\s+c\s+t" # spaced "A B S T R A C T" + r"|a\s+r\s+t\s+i\s+c\s+l\s+e\s+i\s+n\s+f\s+o" # spaced "A R T I C L E I N F O" + r")\s*$" +) + # --------------------------------------------------------------------------- # Line-level drop patterns # Lines that individually match one of these are removed regardless of where @@ -67,8 +78,8 @@ re.compile(r"^\s*\[\d+\]\s+[A-Z]"), re.compile(r"^\s*\d{1,3}[.)]\s{1,4}[A-Z][a-z]"), - # DOI and bare URLs - re.compile(r"(?i)(https?://|doi\.org/|www\.)\S+"), + # DOI and bare URLs (doi.org/, bare doi:, https://, www.) + re.compile(r"(?i)(https?://|doi\.org/|\bdoi:\s*10\.|www\.)\S*"), # Email addresses re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"), @@ -83,19 +94,30 @@ r"|journal\s+of|proceedings\s+of)"), # Received / accepted / revised / available-online timestamps + # Match with or without trailing colon/semicolon re.compile(r"(?i)^\s*(received|accepted|revised|available\s+online|" - r"published\s+online|handling\s+editor)\s*[:;]"), + r"published\s+online|handling\s+editor)" + r"(\s*[:;]|\s+\d|\s+in)"), + + # Article history block header + re.compile(r"(?i)^\s*article\s+history\s*[:\.]?\s*$"), - # Keywords line + # Keywords header line AND single-word keyword-style lines that follow it re.compile(r"(?i)^\s*key\s*-?\s*words?\s*[:\-]"), + # Journal / publisher metadata lines + re.compile(r"(?i)^\s*(contents?\s+lists?\s+available|journal\s+homepage|" + r"elsevier\.com|sciencedirect\.com|springer\.|wiley\.com)"), + # Figure / table / plate captions re.compile(r"(?i)^\s*(fig(ure)?\.?\s*\d|table\s*\d|plate\s*\d|" r"fig\.\s*s\d|supplemental?\s+(table|figure)\s*\d)" r"[\s.\-–—:]"), # Author affiliation lines (institution / department / lab names) - re.compile(r"(?i)^\s*(department\s+of|faculty\s+of|institute\s+(of|for)|" + # Allow for leading special characters (e.g. 
⁎, *, †) + re.compile(r"(?i)^[\s\*⁎†‡#]*" + r"(department\s+of|faculty\s+of|institute\s+(of|for)|" r"division\s+of|school\s+of|laboratory\s+of|lab\s+of|" r"centre?\s+(for|of)|program(me)?\s+(in|of)|" r"universidad|universit[éy]|université|universidade|" @@ -110,6 +132,52 @@ # --------------------------------------------------------------------------- _MULTI_BLANK = re.compile(r"\n{3,}") _TRAILING_SPACES = re.compile(r"[ \t]+\n") +# Line ending with a soft hyphen or mid-word break (next line starts lowercase +# or with punctuation that continues the word). +_SOFT_HYPHEN_END = re.compile(r"-$") +_CONTINUATION_LINE = re.compile(r"^[a-z,\.\)\];:!\?]") + + +def _rejoin_broken_lines(text: str) -> str: + """Rejoin lines that were broken mid-sentence or mid-word by column wrapping. + + Two cases are handled: + 1. Hard hyphenation: line ends with ``-`` and next line starts with a + lowercase letter → join without any space (remove the hyphen). + 2. Soft wrap: line ends without sentence-terminating punctuation and next + line starts with lowercase → join with a single space. + + ``[PAGE N]`` marker lines are never joined. 
+ """ + lines = text.split("\n") + out: List[str] = [] + i = 0 + while i < len(lines): + line = lines[i] + # Never merge PAGE markers + if re.match(r"\[PAGE\s+\d+\]", line.strip()): + out.append(line) + i += 1 + continue + + rstripped = line.rstrip() + if i + 1 < len(lines): + next_line = lines[i + 1].lstrip() + next_is_page = re.match(r"\[PAGE\s+\d+\]", next_line) + if not next_is_page and next_line and _CONTINUATION_LINE.match(next_line): + # Hard hyphen: remove hyphen and join directly + if _SOFT_HYPHEN_END.search(rstripped): + out.append(rstripped[:-1] + next_line) + i += 2 + continue + # Soft wrap: line doesn't end with sentence-ending punctuation + if rstripped and rstripped[-1] not in ".!?:;)]": + out.append(rstripped + " " + next_line) + i += 2 + continue + out.append(line) + i += 1 + return "\n".join(out) def _drop_line(line: str) -> bool: @@ -175,6 +243,25 @@ def clean_text(text: str) -> str: after_section_pass = "".join(cleaned_parts) + # ── Pass 1b: strip structured-header blocks (spaced-letter headers + + # short keyword/metadata lines that follow them in two-column PDFs) ────── + structured_lines: List[str] = [] + in_structured_block = False + for line in after_section_pass.split("\n"): + stripped = line.strip() + if _STRUCTURED_HEADER_START.match(stripped): + in_structured_block = True + continue # drop the header itself + if in_structured_block: + # Exit block once a long line (real content) appears + if len(stripped) > 60: + in_structured_block = False + structured_lines.append(line) + # Drop short keyword/metadata values + continue + structured_lines.append(line) + after_section_pass = "\n".join(structured_lines) + # ── Pass 2: line-level drop ────────────────────────────────────────── output_lines: List[str] = [] for line in after_section_pass.split("\n"): @@ -186,6 +273,7 @@ def clean_text(text: str) -> str: # ── Normalise whitespace ──────────────────────────────────────────────── after_line_pass = _TRAILING_SPACES.sub("\n", after_line_pass) + 
after_line_pass = _rejoin_broken_lines(after_line_pass) after_line_pass = _MULTI_BLANK.sub("\n\n", after_line_pass) return after_line_pass.strip() From 688deaa99e8287142a79b47493ecf135221517be Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Fri, 27 Feb 2026 20:14:57 -0800 Subject: [PATCH 03/30] feat: temp pipeline for testing --- data/results/metrics/Adams_1989_results.json | 18 + data/results/metrics/Fisher_2008_results.json | 14 + extract-from-txt.py | 310 ++++++++++++++++++ 3 files changed, 342 insertions(+) create mode 100644 data/results/metrics/Adams_1989_results.json create mode 100644 data/results/metrics/Fisher_2008_results.json create mode 100644 extract-from-txt.py diff --git a/data/results/metrics/Adams_1989_results.json b/data/results/metrics/Adams_1989_results.json new file mode 100644 index 0000000..27a2c4c --- /dev/null +++ b/data/results/metrics/Adams_1989_results.json @@ -0,0 +1,18 @@ +{ + "source_file": "Adams_1989.txt", + "file_type": ".txt", + "metrics": { + "species_name": "Pygoscelis papua", + "study_location": "Marion Island, sub-Antarctic", + "study_date": "1984-1985", + "num_empty_stomachs": null, + "num_nonempty_stomachs": 144, + "sample_size": 144, + "fraction_feeding": 1.0, + "source_pages": [ + 1, + 2, + 6 + ] + } +} \ No newline at end of file diff --git a/data/results/metrics/Fisher_2008_results.json b/data/results/metrics/Fisher_2008_results.json new file mode 100644 index 0000000..2cf1d09 --- /dev/null +++ b/data/results/metrics/Fisher_2008_results.json @@ -0,0 +1,14 @@ +{ + "source_file": "Fisher_2008.txt", + "file_type": ".txt", + "metrics": { + "species_name": null, + "study_location": null, + "study_date": null, + "num_empty_stomachs": null, + "num_nonempty_stomachs": null, + "sample_size": null, + "fraction_feeding": null, + "source_pages": null + } +} \ No newline at end of file diff --git a/extract-from-txt.py b/extract-from-txt.py new file mode 100644 index 0000000..1e074d9 --- /dev/null +++ b/extract-from-txt.py @@ -0,0 
+1,310 @@ +"""Extract-from-TXT Pipeline + +Processes pre-classified useful .txt files through noise cleaning, text +trimming, and LLM extraction — bypassing the XGBoost classifier entirely. + +Every .txt file fed to this script is assumed to have already been confirmed +as useful (e.g. by the classifier in classify-extract.py or by manual review). +The pipeline: + + 1. Read raw .txt file + 2. Strip noise (references, acknowledgements, affiliations, captions, …) + via src/preprocessing/text_cleaner.py + 3. Trim to the character budget using section-priority ranking + via src/llm/llm_text.py::extract_key_sections() + 4. Call Ollama for structured extraction via src/llm/llm_client.py + 5. Save result JSON per file and a summary CSV + +Usage:: + + # Process the default directory (data/processed-text/) + python extract-from-txt.py + + # Custom input directory + python extract-from-txt.py --input-dir path/to/txt_files/ + + # Full options + python extract-from-txt.py \\ + --input-dir data/processed-text/ \\ + --output-dir data/results/ \\ + --llm-model llama3.1:8b \\ + --max-chars 5000 \\ + --num-ctx 4096 + +Output: + - data/results/metrics/_results.json per file + - data/results/summaries/txt_pipeline_summary_.csv overall +""" + +import argparse +import sys +from pathlib import Path + +# Ensure the project root is on sys.path regardless of where this script is +# invoked from. 
+_PROJECT_ROOT = Path(__file__).resolve().parent +sys.path.insert(0, str(_PROJECT_ROOT)) + +from src.preprocessing.text_cleaner import clean_text +from src.llm.llm_text import extract_key_sections +from src.llm.llm_client import extract_metrics_from_text, save_extraction_result + + +# --------------------------------------------------------------------------- +# Core pipeline function +# --------------------------------------------------------------------------- + +def run_txt_pipeline( + input_dir: Path, + output_dir: Path, + llm_model: str, + max_chars: int, + num_ctx: int, + single_file: Path = None, +) -> None: + """Process every .txt file in *input_dir* through clean → trim → extract. + + Args: + input_dir: Directory containing pre-classified useful .txt files. + Ignored when *single_file* is provided. + output_dir: Root output directory for JSON results and summary CSV. + llm_model: Ollama model name (e.g. ``"llama3.1:8b"``). + max_chars: Character budget for the text sent to Ollama. + num_ctx: Context window size requested from Ollama. + single_file: If set, process only this one .txt file. 
+ """ + if single_file is not None: + txt_paths = [single_file] + else: + txt_paths = sorted(input_dir.glob("*.txt")) + if not txt_paths: + print(f"[ERROR] No .txt files found in: {input_dir}", file=sys.stderr) + sys.exit(1) + + print(f"[INFO] Found {len(txt_paths)} .txt file(s) to process", file=sys.stderr) + output_dir.mkdir(parents=True, exist_ok=True) + summary_rows = [] + + for idx, txt_path in enumerate(txt_paths, start=1): + print(f"\n[{idx}/{len(txt_paths)}] {txt_path.name}", file=sys.stderr) + + row: dict = { + "filename": txt_path.name, + "raw_chars": "", + "cleaned_chars": "", + "trimmed_chars": "", + "extraction_status": "", + "species_name": "", + "study_location": "", + "study_date": "", + "sample_size": "", + "num_empty_stomachs": "", + "num_nonempty_stomachs": "", + "fraction_feeding": "", + } + + # ── Step 1: Read ──────────────────────────────────────────────────── + try: + raw_text = txt_path.read_text(encoding="utf-8") + except Exception as exc: + print(f" [ERROR] Could not read file: {exc}", file=sys.stderr) + row["extraction_status"] = "read_failed" + summary_rows.append(row) + continue + + row["raw_chars"] = len(raw_text) + print(f" [INFO] Raw size : {len(raw_text):,} chars", file=sys.stderr) + + if not raw_text.strip(): + print(f" [WARN] File is empty — skipping.", file=sys.stderr) + row["extraction_status"] = "empty_file" + summary_rows.append(row) + continue + + # ── Step 2: Clean ─────────────────────────────────────────────────── + cleaned = clean_text(raw_text) + row["cleaned_chars"] = len(cleaned) + print(f" [INFO] After clean: {len(cleaned):,} chars", file=sys.stderr) + + if not cleaned.strip(): + print(f" [WARN] Nothing left after cleaning — skipping.", file=sys.stderr) + row["extraction_status"] = "empty_after_clean" + summary_rows.append(row) + continue + + # ── Step 3: Trim to LLM budget ────────────────────────────────────── + if len(cleaned) > max_chars: + trimmed = extract_key_sections(cleaned, max_chars) + print( + f" [INFO] 
After trim : {len(trimmed):,} chars " + f"(budget {max_chars:,})", + file=sys.stderr, + ) + else: + trimmed = cleaned + + row["trimmed_chars"] = len(trimmed) + + # ── Step 4: LLM extraction ────────────────────────────────────────── + print(f" [INFO] Calling Ollama ({llm_model})…", file=sys.stderr) + try: + metrics = extract_metrics_from_text( + text=trimmed, + model=llm_model, + num_ctx=num_ctx, + ) + except Exception as exc: + print(f" [ERROR] Ollama extraction failed: {exc}", file=sys.stderr) + row["extraction_status"] = "extraction_failed" + summary_rows.append(row) + continue + + # ── Step 5: Save JSON ─────────────────────────────────────────────── + try: + result = save_extraction_result( + metrics=metrics, + source_file=txt_path, + original_text=raw_text, # keep full text for page resolution + output_dir=output_dir, + ) + except Exception as exc: + print(f" [ERROR] Could not save result: {exc}", file=sys.stderr) + row["extraction_status"] = "save_failed" + summary_rows.append(row) + continue + + m = result["metrics"] + row["extraction_status"] = "success" + row["species_name"] = m.get("species_name") or "" + row["study_location"] = m.get("study_location") or "" + row["study_date"] = m.get("study_date") or "" + row["sample_size"] = ( + "" if m.get("sample_size") is None else m["sample_size"] + ) + row["num_empty_stomachs"] = ( + "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"] + ) + row["num_nonempty_stomachs"] = ( + "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"] + ) + row["fraction_feeding"] = ( + "" if m.get("fraction_feeding") is None else m["fraction_feeding"] + ) + + print( + f" [OK] species={m.get('species_name')} " + f"n={m.get('sample_size')} " + f"date={m.get('study_date')}", + file=sys.stderr, + ) + + summary_rows.append(row) + + # ── Final report ──────────────────────────────────────────────────────── + total = len(summary_rows) + succeeded = sum(1 for r in summary_rows if r["extraction_status"] 
== "success") + failed = total - succeeded + + print("\n" + "=" * 55, file=sys.stderr) + print("TXT EXTRACTION PIPELINE COMPLETE", file=sys.stderr) + print("=" * 55, file=sys.stderr) + print(f" Files processed : {total}", file=sys.stderr) + print(f" Successful : {succeeded}", file=sys.stderr) + print(f" Failed / skipped : {failed}", file=sys.stderr) + print("=" * 55, file=sys.stderr) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser( + description=( + "Extract structured predator-diet metrics from pre-classified " + "useful .txt files using Ollama." + ), + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + Default (data/processed-text/ → data/results/): + python extract-from-txt.py + + Custom directories: + python extract-from-txt.py --input-dir data/useful-txt/ --output-dir out/ + + Different model / tighter budget: + python extract-from-txt.py --llm-model mistral:7b --max-chars 4500 + """, + ) + parser.add_argument( + "--file", + type=str, + default=None, + help="Path to a single .txt file to process. 
Overrides --input-dir.", + ) + parser.add_argument( + "--input-dir", + type=str, + default="data/processed-text", + help="Directory of .txt files to process (default: data/processed-text).", + ) + parser.add_argument( + "--output-dir", + type=str, + default="data/results", + help="Root output directory for JSON results and CSV summary " + "(default: data/results).", + ) + parser.add_argument( + "--llm-model", + type=str, + default="llama3.1:8b", + help="Ollama model name (default: llama3.1:8b).", + ) + parser.add_argument( + "--max-chars", + type=int, + default=5000, + help="Maximum characters to send to Ollama after cleaning (default: 5000).", + ) + parser.add_argument( + "--num-ctx", + type=int, + default=4096, + help="Ollama context window size (default: 4096).", + ) + + args = parser.parse_args() + + single_file = None + if args.file: + single_file = Path(args.file) + if not single_file.exists(): + print(f"[ERROR] File not found: {single_file}", file=sys.stderr) + sys.exit(1) + if single_file.suffix.lower() != ".txt": + print(f"[ERROR] --file must point to a .txt file: {single_file}", file=sys.stderr) + sys.exit(1) + input_dir = single_file.parent + else: + input_dir = Path(args.input_dir) + if not input_dir.exists(): + print(f"[ERROR] Input directory not found: {input_dir}", file=sys.stderr) + sys.exit(1) + if not input_dir.is_dir(): + print(f"[ERROR] --input-dir must be a directory: {input_dir}", file=sys.stderr) + sys.exit(1) + + run_txt_pipeline( + input_dir=input_dir, + output_dir=Path(args.output_dir), + llm_model=args.llm_model, + max_chars=args.max_chars, + num_ctx=args.num_ctx, + single_file=single_file, + ) + + +if __name__ == "__main__": + main() From af2836fd8097ed560406c9f246dee0b3c97bb630 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Fri, 27 Feb 2026 20:23:41 -0800 Subject: [PATCH 04/30] fix: section headers being removed --- src/preprocessing/text_cleaner.py | 34 ++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) 
diff --git a/src/preprocessing/text_cleaner.py b/src/preprocessing/text_cleaner.py index d9c1511..b694824 100644 --- a/src/preprocessing/text_cleaner.py +++ b/src/preprocessing/text_cleaner.py @@ -123,10 +123,38 @@ r"universidad|universit[éy]|université|universidade|" r"university\s+of|college\s+of)"), - # Running page headers / footers: short all-caps lines + # Running page headers / footers: short all-caps lines that are NOT + # known section headings (those are whitelisted in _drop_line below). re.compile(r"^[A-Z\s\d\.\-–—:,]{5,60}$"), ] +# --------------------------------------------------------------------------- +# Section-header whitelist +# Lines matching this pattern are structural anchors the LLM needs; they must +# never be removed even if they look like all-caps noise. +# --------------------------------------------------------------------------- +_SECTION_HEADER_WHITELIST: re.Pattern = re.compile( + r"(?i)^\s*" + # optional section number prefix e.g. "1.", "2.1.", "3.2.1 " + r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" + r"(" + r"abstract" + r"|summary" + r"|introduction" + r"|background" + r"|methods?" + r"|materials?\s*(?:and|&)\s*methods?" + r"|methodology" + r"|study\s*(?:area|site|design|region|period)" + r"|results?" + r"|findings?" + r"|discussion" + r"|conclusions?" + r"|summary\s+and\s+discussion" + r"|acknowledge?ments?" + r")\s*[:\.\-]?\s*$" +) + # --------------------------------------------------------------------------- # Whitespace normalisation # --------------------------------------------------------------------------- @@ -185,6 +213,10 @@ def _drop_line(line: str) -> bool: stripped = line.strip() if not stripped: return False # preserve blank lines for now; collapsed later + # Never drop structural section headings — the LLM and section-ranker + # both depend on them to orient in the document. 
+ if _SECTION_HEADER_WHITELIST.match(stripped): + return False for pat in _LINE_DROP_PATTERNS: if pat.search(stripped): return True From 0687e629bff8bfb57a03d5ddf61aeaa387765a38 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Fri, 27 Feb 2026 20:47:33 -0800 Subject: [PATCH 05/30] use section priority rankings instead --- extract-from-txt.py | 28 ++++++++ src/llm/llm_text.py | 155 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 157 insertions(+), 26 deletions(-) diff --git a/extract-from-txt.py b/extract-from-txt.py index 1e074d9..1cb3760 100644 --- a/extract-from-txt.py +++ b/extract-from-txt.py @@ -37,7 +37,9 @@ """ import argparse +import csv import sys +from datetime import datetime from pathlib import Path # Ensure the project root is on sys.path regardless of where this script is @@ -200,6 +202,31 @@ def run_txt_pipeline( summary_rows.append(row) + # ── Write summary CSV ─────────────────────────────────────────────────── + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + summaries_dir = output_dir / "summaries" + summaries_dir.mkdir(parents=True, exist_ok=True) + summary_path = summaries_dir / f"txt_pipeline_summary_{timestamp}.csv" + + fieldnames = [ + "filename", + "raw_chars", + "cleaned_chars", + "trimmed_chars", + "extraction_status", + "species_name", + "study_location", + "study_date", + "sample_size", + "num_empty_stomachs", + "num_nonempty_stomachs", + "fraction_feeding", + ] + with open(summary_path, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(summary_rows) + # ── Final report ──────────────────────────────────────────────────────── total = len(summary_rows) succeeded = sum(1 for r in summary_rows if r["extraction_status"] == "success") @@ -211,6 +238,7 @@ def run_txt_pipeline( print(f" Files processed : {total}", file=sys.stderr) print(f" Successful : {succeeded}", file=sys.stderr) print(f" Failed / skipped : {failed}", file=sys.stderr) + 
print(f" Summary CSV : {summary_path}", file=sys.stderr) print("=" * 55, file=sys.stderr) diff --git a/src/llm/llm_text.py b/src/llm/llm_text.py index d311abd..39789d9 100644 --- a/src/llm/llm_text.py +++ b/src/llm/llm_text.py @@ -8,7 +8,7 @@ import re import sys from pathlib import Path -from typing import List, Tuple +from typing import List, Optional, Tuple # Add project root to path project_root = Path(__file__).parent.parent.parent @@ -16,6 +16,64 @@ from src.preprocessing.pdf_text_extraction import extract_text_from_pdf +# --------------------------------------------------------------------------- +# Section-boundary splitting helpers +# --------------------------------------------------------------------------- + +# Optional numeric prefix shared by all section patterns, e.g. "1.", "2.1.", "3.2.1 " +_NUM_PREFIX = r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" + +# (pattern, priority) — lower number = kept first when budget is tight +_SECTION_PRIORITIES: List[Tuple[re.Pattern, int]] = [ + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(abstract|summary)\s*[:\.]?\s*$"), 0), + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(results?|findings?)\s*[:\.]?\s*$"), 1), + (re.compile( + r"(?i)^\s*" + _NUM_PREFIX + + r"(materials?\s*(?:and|&)\s*methods?|methods?|methodology" + r"|study\s*(?:area|site|design|region|period))\s*[:\.]?\s*$" + ), 2), + (re.compile(r"(?i)^\s*table\s*\d"), 3), + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(introduction|background)\s*[:\.]?\s*$"), 4), + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(discussion|conclusions?|summary\s+and\s+discussion)\s*[:\.]?\s*$"), 5), +] + +_DROP_SECTION_RE: re.Pattern = re.compile( + r"(?i)^\s*" + r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" # optional numeric prefix + r"(" + r"acknowledge?ments?" + r"|literature\s+cited" + r"|references?\s+cited" + r"|references?" + r"|bibliography" + r"|appendix\b" + r"|supplementary\s+(data|material|information)" + r"|supporting\s+information" + r"|conflict\s+of\s+interest" + r"|competing\s+interests?" 
+ r"|author\s+contributions?" + r"|funding(?:\s+(?:sources?|information))?" + r"|data\s+availability" + r"|ethics\s+(statement|declaration)" + r")\s*[:\.]?\s*$" +) + + +def _section_priority(heading: str) -> int: + """Return the priority integer for a section heading (lower = more important). + Unknown / un-labelled sections get priority 6. + Drop sections return 999 and should be excluded before calling this. + """ + for pat, pri in _SECTION_PRIORITIES: + if pat.match(heading.strip()): + return pri + return 6 + + +# --------------------------------------------------------------------------- +# Legacy page-split helpers (kept for source-page resolution in llm_client.py) +# --------------------------------------------------------------------------- + # Section headers commonly found in scientific diet / stomach-content papers. # Order matters: earlier entries are higher priority when budget is tight. SECTION_PATTERNS: List[re.Pattern[str]] = [ @@ -80,17 +138,22 @@ def extract_key_sections(text: str, max_chars: int) -> str: """Return the most informative portion of text within the character budget. Strategy: - 1. Split the paper into pages using [PAGE N] markers - 2. Drop pages belonging to References/Acknowledgements/Appendix - 3. Rank remaining pages by section priority: + 1. Scan the cleaned text for section headings (Abstract, Results, Methods …) + regardless of [PAGE N] markers, giving section-level rather than + page-level granularity. + 2. Drop Reference / Acknowledgement / Appendix sections entirely. + 3. Rank remaining sections by content priority: Abstract > Results > Methods > Tables > Introduction > Discussion > other - 4. Greedily pack pages in priority order until the budget is spent - 5. Re-order selected pages by their original page number so the LLM - sees them in reading order + 4. Greedily pack sections in priority order until the budget is spent. + 5. 
Re-order selected sections in their original reading order so the LLM + receives coherent, in-document-order text. + + Falls back to simple character truncation if no section headings are found + (e.g. very short files or files with no structural markers). Args: - text: Full text of the document - max_chars: Maximum character budget for the output + text: Cleaned text of the document (may contain [PAGE N] markers). + max_chars: Maximum character budget for the output. Returns: Extracted text containing the most relevant sections within the budget. @@ -99,32 +162,72 @@ def extract_key_sections(text: str, max_chars: int) -> str: if len(text) <= max_chars: return text - pages = split_into_pages(text) - scored: List[Tuple[int, int, str]] = [] # (priority, page_num, page_text) - for page_num, page_text in pages: - useful, priority = classify_page(page_text) - if useful: - scored.append((priority, page_num, page_text)) + lines = text.split("\n") + + # ── Build section list ───────────────────────────────────────────────── + # Each entry: (original_line_index, heading_str, content_str) + sections: List[Tuple[int, str, str]] = [] + current_heading: str = "[PREAMBLE]" + current_start: int = 0 + current_lines: List[str] = [] + + for i, line in enumerate(lines): + stripped = line.strip() + is_drop = bool(_DROP_SECTION_RE.match(stripped)) if stripped else False + is_known = any(pat.match(stripped) for pat, _ in _SECTION_PRIORITIES) if stripped else False + + if is_drop or is_known: + # Flush the in-progress section + sections.append((current_start, current_heading, "\n".join(current_lines))) + current_heading = stripped + current_start = i + current_lines = [] + else: + current_lines.append(line) + + # Flush the final section + sections.append((current_start, current_heading, "\n".join(current_lines))) + + # Fall back to simple truncation when no headings were detected + meaningful = [s for s in sections if s[1] != "[PREAMBLE]"] + if not meaningful: + return text[:max_chars] + 
+ # ── Score and filter ─────────────────────────────────────────────────── + scored: List[Tuple[int, int, str, str]] = [] # (priority, orig_idx, heading, content) + for orig_idx, (start, heading, content) in enumerate(sections): + if _DROP_SECTION_RE.match(heading.strip()) if heading.strip() else False: + continue # hard-drop references, acknowledgements, appendix, … + if heading == "[PREAMBLE]": + # The preamble (everything before the first section heading) + # almost always contains the abstract. Treat it as priority 0 + # so it is packed first, ahead of all other sections. + priority = 0 + else: + priority = _section_priority(heading) + scored.append((priority, orig_idx, heading, content)) - # Sort by priority (ascending = most important first) + # Sort by priority: most important sections first scored.sort(key=lambda t: t[0]) - selected: List[Tuple[int, str]] = [] + # ── Greedily fill budget ─────────────────────────────────────────────── + selected: List[Tuple[int, str]] = [] # (orig_idx, chunk) budget = max_chars - for _priority, page_num, page_text in scored: - page_with_marker = f"[PAGE {page_num}]\n{page_text}" - if len(page_with_marker) <= budget: - selected.append((page_num, page_with_marker)) - budget -= len(page_with_marker) + for _pri, orig_idx, heading, content in scored: + chunk = (f"{heading}\n{content}").strip() if heading != "[PREAMBLE]" else content.strip() + if not chunk: + continue + if len(chunk) <= budget: + selected.append((orig_idx, chunk)) + budget -= len(chunk) elif budget > 200: - # Partially include the page up to the remaining budget - selected.append((page_num, page_with_marker[:budget])) + selected.append((orig_idx, chunk[:budget])) budget = 0 break - # Re-sort by page number so the LLM sees content in reading order + # Re-sort by original index so the LLM reads content in document order selected.sort(key=lambda t: t[0]) - return "\n".join(chunk for _, chunk in selected) + return "\n\n".join(chunk for _, chunk in selected) def 
load_document(file_path: Path) -> str: From 317387dcf52699b67a737ed73eb8279bfdda41e8 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Fri, 27 Feb 2026 20:49:13 -0800 Subject: [PATCH 06/30] save cleaned text to folder --- extract-from-txt.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/extract-from-txt.py b/extract-from-txt.py index 1cb3760..bd33196 100644 --- a/extract-from-txt.py +++ b/extract-from-txt.py @@ -32,6 +32,7 @@ --num-ctx 4096 Output: + - data/cleaned-text/_.txt trimmed text passed to Ollama - data/results/metrics/_results.json per file - data/results/summaries/txt_pipeline_summary_.csv overall """ @@ -85,6 +86,8 @@ def run_txt_pipeline( print(f"[INFO] Found {len(txt_paths)} .txt file(s) to process", file=sys.stderr) output_dir.mkdir(parents=True, exist_ok=True) + cleaned_text_dir = output_dir.parent / "cleaned-text" + cleaned_text_dir.mkdir(parents=True, exist_ok=True) summary_rows = [] for idx, txt_path in enumerate(txt_paths, start=1): @@ -147,7 +150,16 @@ def run_txt_pipeline( row["trimmed_chars"] = len(trimmed) - # ── Step 4: LLM extraction ────────────────────────────────────────── + # ── Step 4: Save cleaned text snapshot ───────────────────────────── + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + cleaned_path = cleaned_text_dir / f"{txt_path.stem}_{ts}.txt" + try: + cleaned_path.write_text(trimmed, encoding="utf-8") + print(f" [INFO] Cleaned text : {cleaned_path.name}", file=sys.stderr) + except Exception as exc: + print(f" [WARN] Could not save cleaned text: {exc}", file=sys.stderr) + + # ── Step 5: LLM extraction ────────────────────────────────────────── print(f" [INFO] Calling Ollama ({llm_model})…", file=sys.stderr) try: metrics = extract_metrics_from_text( @@ -161,7 +173,7 @@ def run_txt_pipeline( summary_rows.append(row) continue - # ── Step 5: Save JSON ─────────────────────────────────────────────── + # ── Step 6: Save JSON ─────────────────────────────────────────────── try: result = 
save_extraction_result( metrics=metrics, From 17b7b55e27d22886382e8660a280e38336967aa7 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Fri, 27 Feb 2026 21:13:35 -0800 Subject: [PATCH 07/30] use parargraph scoring --- src/llm/llm_text.py | 233 +++++++++++++++++++++++++++++++++----------- 1 file changed, 178 insertions(+), 55 deletions(-) diff --git a/src/llm/llm_text.py b/src/llm/llm_text.py index 39789d9..3005519 100644 --- a/src/llm/llm_text.py +++ b/src/llm/llm_text.py @@ -70,6 +70,96 @@ def _section_priority(heading: str) -> int: return 6 +# --------------------------------------------------------------------------- +# Paragraph-level keyword scoring +# --------------------------------------------------------------------------- +# Each tuple is (compiled pattern, score weight). A paragraph’s total score +# is the sum of weights for every pattern that matches anywhere in it. +# Higher-scoring paragraphs are packed into the LLM budget first. + +_FIELD_PATTERNS: List[Tuple[re.Pattern, int]] = [ + # sample_size — explicit counts of stomachs / specimens / individuals + (re.compile( + r"(?i)(\bn\s*=\s*\d+" + r"|total\s+of\s+\d+" + r"|\d+\s+stomachs?" + r"|\d+\s+specimens?" + r"|\d+\s+individuals?" + r"|\d+\s+birds?" + r"|\d+\s+fish" + r"|\d+\s+samples?" + r"|sample\s+size\s+(of\s+)?\d+" + r"|examined\s+\d+" + r"|\d+\s+(were|was)\s+(examined|collected|analysed|analyzed|sampled))" + ), 4), + # num_empty_stomachs — explicit empty-stomach language + (re.compile( + r"(?i)(empty\s+stomachs?" + r"|stomachs?\s+(were\s+)?empty" + r"|had\s+empty" + r"|without\s+food" + r"|without\s+(stomach\s+)?contents?" + r"|zero\s+prey" + r"|no\s+food\s+(items?|remains?)" + r"|vacuous|vacant\s+stomachs?)" + ), 5), + # num_nonempty_stomachs / fraction_feeding + (re.compile( + r"(?i)(non.?empty" + r"|contained\s+(food|prey|items?)" + r"|with\s+food" + r"|with\s+(stomach\s+)?contents?" 
+ r"|had\s+(food|prey)" + r"|feeding\s+rate" + r"|proportion\s+(feeding|with\s+food)" + r"|percent\s+(feeding|with\s+food)" + r"|\d+\s*%\s+of\s+(stomachs?|individuals?|birds?|fish|specimens?))" + ), 5), + # general percentage / fraction near gut/stomach context + (re.compile( + r"(?i)(\d+\.?\d*\s*%|\d+\s+percent" + r"|\d+\s+of\s+\d+\s+(were|had|contained)" + r"|proportion\s+of\s+\d+)" + ), 2), + # study date — collection period + (re.compile( + r"(?i)(collected\s+(in|during|between)" + r"|sampled\s+(in|during|between)" + r"|field\s+season" + r"|study\s+period" + r"|between\s+\d{4}\s+and\s+\d{4}" + r"|\d{4}[\-\u2013]\d{4}" + r"|sampling\s+period)" + ), 2), + # study location + (re.compile( + r"(?i)(study\s+(area|site|region)" + r"|specimens?\s+(were\s+)?(obtained|collected|caught)\s+(from|at|in)" + r"|sampling\s+(location|site|area))" + ), 2), + # any binomial species name (used as a weak relevance signal) + (re.compile(r"\b[A-Z][a-z]+\s+[a-z]{3,}\b"), 1), +] + +# Maximum characters reserved for the pinned abstract/preamble. Any +# remaining budget is filled by keyword-scored paragraphs. +_ABSTRACT_CAP: int = 2000 + + +def _score_paragraph(para: str) -> int: + """Return a keyword-relevance score for a single paragraph of text. + + Paragraphs that mention many extraction targets (stomach counts, empty / + non-empty language, sample sizes, dates, locations) score higher and are + preferentially included in the LLM prompt. 
+ """ + score = 0 + for pat, weight in _FIELD_PATTERNS: + if pat.search(para): + score += weight + return score + + # --------------------------------------------------------------------------- # Legacy page-split helpers (kept for source-page resolution in llm_client.py) # --------------------------------------------------------------------------- @@ -137,35 +227,42 @@ def classify_page(page_text: str) -> Tuple[bool, int]: def extract_key_sections(text: str, max_chars: int) -> str: """Return the most informative portion of text within the character budget. - Strategy: - 1. Scan the cleaned text for section headings (Abstract, Results, Methods …) - regardless of [PAGE N] markers, giving section-level rather than - page-level granularity. - 2. Drop Reference / Acknowledgement / Appendix sections entirely. - 3. Rank remaining sections by content priority: - Abstract > Results > Methods > Tables > Introduction > Discussion > other - 4. Greedily pack sections in priority order until the budget is spent. - 5. Re-order selected sections in their original reading order so the LLM - receives coherent, in-document-order text. - - Falls back to simple character truncation if no section headings are found - (e.g. very short files or files with no structural markers). + Two-phase strategy + ------------------ + Phase 1 — Abstract pin + Always include the preamble (everything before the first section + heading), which almost always contains the abstract, title, and key + study metadata. Capped at ``_ABSTRACT_CAP`` characters so it cannot + crowd out the data-rich content below. + + Phase 2 — Keyword-scored paragraph mining + Split all remaining non-dropped section text into blank-line-separated + paragraphs. Score each paragraph by how many extraction-relevant + keywords it contains (sample counts, empty/non-empty stomach language, + percentages, dates, locations, species names). Pack the + highest-scoring paragraphs first until the remaining budget is full. 
+ Re-order selected paragraphs to their original document position so + the LLM receives coherent, in-order text. + + This approach guarantees that sentences like + “A total of 144 stomach samples… 58% contained food” + are always included regardless of which named section they fall in. Args: text: Cleaned text of the document (may contain [PAGE N] markers). max_chars: Maximum character budget for the output. Returns: - Extracted text containing the most relevant sections within the budget. - If the full text fits within max_chars, it is returned as-is. + Extracted text fitting within *max_chars*. + If the full text already fits, it is returned unchanged. """ if len(text) <= max_chars: return text lines = text.split("\n") - # ── Build section list ───────────────────────────────────────────────── - # Each entry: (original_line_index, heading_str, content_str) + # ── Phase 1: Split document into named sections ──────────────────────── + # Each entry: (start_line_idx, heading, content) sections: List[Tuple[int, str, str]] = [] current_heading: str = "[PREAMBLE]" current_start: int = 0 @@ -174,60 +271,86 @@ def extract_key_sections(text: str, max_chars: int) -> str: for i, line in enumerate(lines): stripped = line.strip() is_drop = bool(_DROP_SECTION_RE.match(stripped)) if stripped else False - is_known = any(pat.match(stripped) for pat, _ in _SECTION_PRIORITIES) if stripped else False - + is_known = ( + any(pat.match(stripped) for pat, _ in _SECTION_PRIORITIES) + if stripped else False + ) if is_drop or is_known: - # Flush the in-progress section sections.append((current_start, current_heading, "\n".join(current_lines))) current_heading = stripped current_start = i current_lines = [] else: current_lines.append(line) - - # Flush the final section sections.append((current_start, current_heading, "\n".join(current_lines))) - # Fall back to simple truncation when no headings were detected - meaningful = [s for s in sections if s[1] != "[PREAMBLE]"] - if not meaningful: - 
return text[:max_chars] - - # ── Score and filter ─────────────────────────────────────────────────── - scored: List[Tuple[int, int, str, str]] = [] # (priority, orig_idx, heading, content) - for orig_idx, (start, heading, content) in enumerate(sections): + # ── Phase 1 result: pin the abstract/preamble ────────────────────────── + preamble_text = "" + body_sections: List[Tuple[int, str, str]] = [] # (start, heading, content) + for start, heading, content in sections: if _DROP_SECTION_RE.match(heading.strip()) if heading.strip() else False: - continue # hard-drop references, acknowledgements, appendix, … + continue # hard-drop references / acknowledgements / appendix if heading == "[PREAMBLE]": - # The preamble (everything before the first section heading) - # almost always contains the abstract. Treat it as priority 0 - # so it is packed first, ahead of all other sections. - priority = 0 + preamble_text = content.strip()[:_ABSTRACT_CAP] else: - priority = _section_priority(heading) - scored.append((priority, orig_idx, heading, content)) - - # Sort by priority: most important sections first - scored.sort(key=lambda t: t[0]) - - # ── Greedily fill budget ─────────────────────────────────────────────── - selected: List[Tuple[int, str]] = [] # (orig_idx, chunk) - budget = max_chars - for _pri, orig_idx, heading, content in scored: - chunk = (f"{heading}\n{content}").strip() if heading != "[PREAMBLE]" else content.strip() - if not chunk: + body_sections.append((start, heading, content)) + + budget = max_chars - len(preamble_text) + + # ── Phase 2: keyword-scored paragraph mining ─────────────────────────── + # Collect every paragraph from ALL body sections (blank-line separated). + # Each paragraph remembers its position so we can restore reading order. 
+ # (start_line, paragraph_text, keyword_score) + raw_paragraphs: List[Tuple[int, str, int]] = [] + + for sec_start, heading, content in body_sections: + # Prepend the section heading so the LLM sees which section it’s in. + block = (f"{heading}\n{content}").strip() if heading else content.strip() + if not block: continue - if len(chunk) <= budget: - selected.append((orig_idx, chunk)) - budget -= len(chunk) + # Split on blank lines into paragraphs + para_lines: List[str] = [] + para_start = sec_start + for j, ln in enumerate(block.split("\n")): + if ln.strip(): + para_lines.append(ln) + else: + if para_lines: + para_text = "\n".join(para_lines).strip() + raw_paragraphs.append( + (sec_start + j, para_text, _score_paragraph(para_text)) + ) + para_lines = [] + para_start = sec_start + j + 1 + if para_lines: + para_text = "\n".join(para_lines).strip() + raw_paragraphs.append( + (sec_start + len(block.split("\n")), para_text, _score_paragraph(para_text)) + ) + + # Sort by score descending; use original position as tiebreaker (earlier first) + raw_paragraphs.sort(key=lambda t: (-t[2], t[0])) + + # Greedily fill budget with highest-scoring paragraphs + selected_paras: List[Tuple[int, str]] = [] # (orig_pos, text) + for pos, para_text, score in raw_paragraphs: + if budget <= 0: + break + if len(para_text) <= budget: + selected_paras.append((pos, para_text)) + budget -= len(para_text) elif budget > 200: - selected.append((orig_idx, chunk[:budget])) + selected_paras.append((pos, para_text[:budget])) budget = 0 - break - # Re-sort by original index so the LLM reads content in document order - selected.sort(key=lambda t: t[0]) - return "\n\n".join(chunk for _, chunk in selected) + # Re-sort to original document order so the LLM reads coherent text + selected_paras.sort(key=lambda t: t[0]) + + parts: List[str] = [] + if preamble_text: + parts.append(preamble_text) + parts.extend(p for _, p in selected_paras) + return "\n\n".join(parts) def load_document(file_path: Path) -> str: 
From f738892a4a89700c5fb4952f46b73c540675c217 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Fri, 27 Feb 2026 21:14:03 -0800 Subject: [PATCH 08/30] update data for comparison later --- .../Adams_1989_20260227_210332.txt | 75 +++++++++++++++++++ data/results/metrics/Adams_1989_results.json | 16 ++-- data/results/metrics/Fisher_2008_results.json | 14 ---- .../txt_pipeline_summary_20260227_211255.csv | 2 + 4 files changed, 83 insertions(+), 24 deletions(-) create mode 100644 data/cleaned-text/Adams_1989_20260227_210332.txt delete mode 100644 data/results/metrics/Fisher_2008_results.json create mode 100644 data/results/summaries/txt_pipeline_summary_20260227_211255.csv diff --git a/data/cleaned-text/Adams_1989_20260227_210332.txt b/data/cleaned-text/Adams_1989_20260227_210332.txt new file mode 100644 index 0000000..61b7abc --- /dev/null +++ b/data/cleaned-text/Adams_1989_20260227_210332.txt @@ -0,0 +1,75 @@ +[PAGE 1] +Temporal Variation in the Diet of the Gentoo Penguin Pygoscelis papua at Sub-Antarctic +Marion Island + +Author(s): N. J. Adams and N. T. Klages +Source: Colonial Waterbirds , 1989, Vol. 12, No. 1 (1989), pp. 30-36 +Published by: Waterbird Society + [PAGE 2] + Temporal variation in the diet of the Gentoo Penguin + Pygoscelis papua at sub-Antarctic Marion Island + N. J. ADAMS & N. T. KLAGES' + Percy FitzPatrick Institute of African Ornithology, University of Cape Town, + Rondebosch 7700, South Africa + 'Port Elizabeth Museum, P. 0. Box 13147, Humewood 6013, South Africa + Abstract.-The diet of the Gentoo Penguin Pygoscelis papua at sub-Antarctic Marion Island was sampled by stomach pumping at monthly intervals from March 1984 to March 1985. Overall, fish accounted for 53% + of the diet by mass, crustaceans 44% and cephalopods 2%. Crustaceans predominated between March and + June 1984; a marked increase in the proportion of fish in July coincided with the start of egg laying by Gentoo + Penguins. 
Fish accounted for almost all of the diet in January and March 1985. Juvenile nototheniid fish, in particular Notothenia squamifrons, formed the bulk of the fish component; myctophids and Channichthys + rhinoceratus were less common. The pelagic euphausiid Euphausia vallentini accounted for about 50% by mass of the overall crustacean component lumped over the entire study period. The decapod shrimp Nauticaris + marionis was the most important crustacean species consumed during June to September 1984. Cephalopods, predominantly octopods, were taken mainly in February and March 1985. The hypothesis that seasonal + changes in diet occur in direct response to the arrival of crested penguins (Eudyptes spp) early in summer is not supported by the data. Changes in diet appear to reflect local changes in the availability of prey species + within the inshore waters exploited by Gentoo Penguins. Received 10 March 1988, accepted 25 November 1988. + Key words.-diet, Gentoo Penguin, Pygoscelis papua, + +RESULTS + Mean mass of food samples was 139.2 + + 130.5 g (range: 8- 650 g, n= 144), similar to the 147 g recorded by La Cock et al. + (1984). Mean monthly sample mass was highest in December at 332 g. + General Composition of the Diet + Thirty species or species groups were identified (Table 1), with fish comprising + the largest single group (11 species). However, samples from individual penguins + were largely homogeneous consisting of a single species of crustacean or fish (see also + Jablonski 1985, Croxall et al. 1988). Fiftyeight percent of stomach samples con- + tained only a single prey taxon, 26% contained two taxa and 10% contained three + taxa (analysis based on taxa comprising at least 5% by mass of individual samples). + Fish and crustaceans accounted for + 53.5 and 44.4% respectively of the total annual diet by mass (Table 2). However, con- + siderable changes in proportions of fish, crustaceans and cephalopods occurred + over the year (Table 3, Fig. 1). 
Crustaceans accounted for over 75% of the diet by mass + from March to June 1984 but decreased to + 0% by March 1985. Cephalopods accounted for more than 10% of the diet + only in February and March 1985. + Fish + The fish component of the diet was dominated by the family Nototheniidae, + particularly Notothenia squamifrons. It is likely that most of the unidentified + juvenile nototheniids were also this species + (Table 1). Nototheniids accounted for over + 90% of the fish component during June to + October 1984. Myctophids and the channichthyid Channichthys rhinoceratus ap- + peared more commonly in the diet from + October and accounted for most of the food items in March 1985 (Table 3). + Fish of 78.8 - 84.1 mm (5.9 - 7.3 were the most frequently taken size clas + When plotted on a monthly basis, the was no evidence of any progressive chan + in size-class distribution throughout t year. However, during January and Fe + ruary smaller fish between 28.6 - 41.8 m + (0.2 - 0.7 g) accounted for the most fr quently taken size class, whereas larger + dividuals (76.2 - 84.1 mm, 5.3 - 7.3 g) we more numerous during the remaini + months. Standard parameters for all fish species are given in Table 4. + Crustaceans + Euphausia vallentini and Nauticaris marionis made up nearly all of the crusta + cean component. During winter (March to + August 1984), N. marionis formed the largest part of the crustacean component, but E. + vallentini subsequently increased to nearly + 100% in January and February 1985 + (Table 3). Mean length of E. vallentini was + 22.5 mm ? 1.74 (n = 681), considerably smaller than that of N. marionis (35.1 mm + + 5.18, n = 1481). + Cephalopods + Cephalopods accounted for just over + 10% of the diet during February and + March 1985 but seldom occurred in the remaining months (Table 3). Most + cephalopods were small octopods (DML < + 15 mm). 
The only squid identified were juveniles of the species Kondakovia lon- + gimana of estimated dorsal mantle length and mass of 58.4 ? 11.4 mm and 9.6 ? + 3.8 g (n = 4) respectively. \ No newline at end of file diff --git a/data/results/metrics/Adams_1989_results.json b/data/results/metrics/Adams_1989_results.json index 27a2c4c..851776d 100644 --- a/data/results/metrics/Adams_1989_results.json +++ b/data/results/metrics/Adams_1989_results.json @@ -2,17 +2,13 @@ "source_file": "Adams_1989.txt", "file_type": ".txt", "metrics": { - "species_name": "Pygoscelis papua", + "species_name": null, "study_location": "Marion Island, sub-Antarctic", - "study_date": "1984-1985", + "study_date": null, "num_empty_stomachs": null, - "num_nonempty_stomachs": 144, - "sample_size": 144, - "fraction_feeding": 1.0, - "source_pages": [ - 1, - 2, - 6 - ] + "num_nonempty_stomachs": null, + "sample_size": null, + "fraction_feeding": null, + "source_pages": null } } \ No newline at end of file diff --git a/data/results/metrics/Fisher_2008_results.json b/data/results/metrics/Fisher_2008_results.json deleted file mode 100644 index 2cf1d09..0000000 --- a/data/results/metrics/Fisher_2008_results.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "source_file": "Fisher_2008.txt", - "file_type": ".txt", - "metrics": { - "species_name": null, - "study_location": null, - "study_date": null, - "num_empty_stomachs": null, - "num_nonempty_stomachs": null, - "sample_size": null, - "fraction_feeding": null, - "source_pages": null - } -} \ No newline at end of file diff --git a/data/results/summaries/txt_pipeline_summary_20260227_211255.csv b/data/results/summaries/txt_pipeline_summary_20260227_211255.csv new file mode 100644 index 0000000..b88726f --- /dev/null +++ b/data/results/summaries/txt_pipeline_summary_20260227_211255.csv @@ -0,0 +1,2 @@ 
+filename,raw_chars,cleaned_chars,trimmed_chars,extraction_status,species_name,study_location,study_date,sample_size,num_empty_stomachs,num_nonempty_stomachs,fraction_feeding +Adams_1989.txt,27673,22739,4999,success,,"Marion Island, sub-Antarctic",,,,, From ff34c749cf3b913191133409a721d077ecdabd1b Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 1 Mar 2026 20:10:54 -0800 Subject: [PATCH 09/30] added another filter to drop entire paragraphs that contain irrelevant info --- src/preprocessing/section_filter.py | 383 +++++++++++++++++++++++++++ tests/test_section_filter.py | 394 ++++++++++++++++++++++++++++ 2 files changed, 777 insertions(+) create mode 100644 src/preprocessing/section_filter.py create mode 100644 tests/test_section_filter.py diff --git a/src/preprocessing/section_filter.py b/src/preprocessing/section_filter.py new file mode 100644 index 0000000..eac6813 --- /dev/null +++ b/src/preprocessing/section_filter.py @@ -0,0 +1,383 @@ +"""Coarse relevance filtering for cleaned scientific-paper text. + +Runs between ``text_cleaner.clean_text()`` and +``llm_text.extract_key_sections()`` to drop entire paragraphs that are very +unlikely to contain any of the target extraction metrics: + + - predator species names + - study locations + - collection dates + - sample sizes (n=, stomachs examined) + - empty / non-empty stomach counts + - feeding fraction data + +Paragraphs about pure taxonomy debates, phylogenetic analysis, habitat ecology +without location data, literature reviews of other studies, detailed prey-ID +methodology, morphometric measurements, or statistical test descriptions are +dropped. + +**Conservative by design** — a paragraph is kept unless it matches at least one +negative pattern *and* scores zero on every positive pattern. Borderline +paragraphs are always retained. 
+ +Exposes one primary function:: + + filter_relevant_sections(text: str) -> str + +The function preserves ``[PAGE N]`` markers and section headings so the +downstream ``extract_key_sections()`` can still do its section-priority ranking. +""" + +import re +from typing import List, Tuple + +# --------------------------------------------------------------------------- +# Positive (keep) patterns — if ANY matches, the paragraph is kept regardless +# of negative signals. Mirrors the field patterns in llm_text.py but cast as +# a binary keep gate rather than a weighted scorer. +# --------------------------------------------------------------------------- + +_POSITIVE_PATTERNS: List[re.Pattern] = [ + # sample_size — explicit counts of stomachs / specimens / individuals + re.compile( + r"(?i)(\bn\s*=\s*\d+" + r"|total\s+of\s+\d+" + r"|\d+\s+stomachs?" + r"|\d+\s+specimens?" + r"|\d+\s+individuals?" + r"|\d+\s+birds?" + r"|\d+\s+fish" + r"|\d+\s+samples?" + r"|sample\s+size\s+(of\s+)?\d+" + r"|examined\s+\d+" + r"|\d+\s+(were|was)\s+(examined|collected|analysed|analyzed|sampled))" + ), + # empty-stomach language + re.compile( + r"(?i)(empty\s+stomachs?" + r"|stomachs?\s+(were\s+)?empty" + r"|had\s+empty" + r"|without\s+food" + r"|without\s+(stomach\s+)?contents?" + r"|zero\s+prey" + r"|no\s+food\s+(items?|remains?)" + r"|vacuous|vacant\s+stomachs?)" + ), + # non-empty / fraction feeding + re.compile( + r"(?i)(non.?empty" + r"|contained\s+(food|prey|items?)" + r"|with\s+food" + r"|with\s+(stomach\s+)?contents?" 
+ r"|had\s+(food|prey)" + r"|feeding\s+rate" + r"|proportion\s+(feeding|with\s+food)" + r"|percent\s+(feeding|with\s+food)" + r"|\d+\s*%\s+of\s+(stomachs?|individuals?|birds?|fish|specimens?))" + ), + # percentage / fraction near gut/stomach context + re.compile( + r"(?i)(\d+\.?\d*\s*%|\d+\s+percent" + r"|\d+\s+of\s+\d+\s+(were|had|contained)" + r"|proportion\s+of\s+\d+)" + ), + # study date — collection period + re.compile( + r"(?i)(collected\s+(in|during|between)" + r"|sampled\s+(in|during|between)" + r"|field\s+season" + r"|study\s+period" + r"|between\s+\d{4}\s+and\s+\d{4}" + r"|\d{4}[\-\u2013]\d{4}" + r"|sampling\s+period)" + ), + # study location + re.compile( + r"(?i)(study\s+(area|site|region)" + r"|specimens?\s+(were\s+)?(obtained|collected|caught)\s+(from|at|in)" + r"|sampling\s+(location|site|area))" + ), + # gut / stomach / diet core vocabulary + re.compile( + r"(?i)(stomach\s+content" + r"|diet\s+(composition|analysis|study)" + r"|gut\s+content" + r"|food\s+(items?|habits?|composition)" + r"|prey\s+(items?|composition|species|frequency)" + r"|trophic\s+(level|ecology|niche)" + r"|feeding\s+(ecology|habits?|behaviour|behavior)" + r"|gastrointestinal|foregut|hindgut|crop\s+content)" + ), + # predator species name — binomial in a sentence with diet/food/stomach + re.compile( + r"(?i)\b[A-Z][a-z]+\s+[a-z]{3,}\b.{0,80}" + r"(stomach|diet|prey|food|feeding|gut|trophic|forag)" + ), + # geographic coordinates or explicit lat/lon + re.compile( + r"(\d{1,3}[°º]\s*\d{0,2}['′]?\s*[NS]" + r"|\d{1,3}[°º]\s*\d{0,2}['′]?\s*[EW]" + r"|latitude|longitude" + r"|\d+\.\d+\s*[°º]?\s*[NS],?\s*\d+\.\d+\s*[°º]?\s*[EW])" + ), + # table-like numeric data (rows of numbers separated by whitespace/tabs) + re.compile(r"(?m)^.*(\d+\s*[\t|]\s*){2,}\d+"), +] + +# --------------------------------------------------------------------------- +# Negative (drop-candidate) patterns — a paragraph is dropped ONLY when it +# matches at least one negative pattern AND matches ZERO positive 
patterns. +# --------------------------------------------------------------------------- + +_NEGATIVE_PATTERNS: List[re.Pattern] = [ + # taxonomy / systematics debates + re.compile( + r"(?i)(phylogenet(ic|ics)" + r"|cladistic" + r"|taxonom(y|ic)" + r"|systemat(ic|ics)" + r"|monophyl(y|etic)" + r"|paraphyl(y|etic)" + r"|polyphyl(y|etic)" + r"|sister\s+(group|taxon|clade)" + r"|molecular\s+(clock|phylogen)" + r"|bayesian\s+(inference|analysis|tree)" + r"|maximum\s+likelihood\s+tree" + r"|bootstrap\s+(support|value)" + r"|posterior\s+probabilit)" + ), + # habitat ecology descriptions (without study-site info) + re.compile( + r"(?i)(habitat\s+(type|preference|selection|use|suitability)" + r"|home\s+range\s+(size|area|overlap)" + r"|territory\s+(size|defense|overlap)" + r"|canopy\s+(cover|closure|height)" + r"|vegetation\s+(type|structure|cover|composition|survey)" + r"|understory|understorey" + r"|basal\s+area" + r"|tree\s+(density|dbh|diameter)" + r"|forest\s+type\s+(was|is|include))" + ), + # literature review / citation-heavy passages + re.compile( + r"(?i)(several\s+(studies|authors|investigators)\s+(have|found|reported|showed)" + r"|previous(ly)?\s+(reported|described|documented|found|studied)" + r"|has\s+been\s+(reported|documented|described)\s+by" + r"|according\s+to\s+\w+\s*(\(\d{4}\)|\d{4})" + r"|consistent\s+with\s+(the\s+)?(findings?|results?)\s+of" + r"|\(\s*see\s+(also\s+)?\w+\s*(et\s+al\.?)?\s*,?\s*\d{4}\)" + r"|reviewed\s+(by|in)\s+\w+)" + ), + # detailed prey identification methodology + re.compile( + r"(?i)(prey\s+(were\s+)?identified\s+(to|using|by|under)" + r"|identification\s+(key|guide|manual)" + r"|taxonomic\s+(key|identification)" + r"|dichotomous\s+key" + r"|stereomicroscope" + r"|dissecting\s+microscope" + r"|otolith\s+(identification|catalogue|reference)" + r"|diagnostic\s+(bones?|fragments?|structures?)" + r"|reference\s+collection" + r"|hard\s+(parts?|remains?)\s+(were\s+)?(identified|compared|matched))" + ), + # morphometric / biometric 
measurements + re.compile( + r"(?i)(morphometric" + r"|snout[\-\s]vent\s+length" + r"|total\s+length\s+(was\s+)?measured" + r"|body\s+(mass|weight)\s+(was\s+)?measured" + r"|wing\s+(chord|length)\s+(was\s+)?measured" + r"|bill\s+(length|depth|width)\s+(was\s+)?measured" + r"|tarsus\s+length" + r"|carapace\s+(length|width)" + r"|standard\s+length\s+\(SL\)" + r"|fork\s+length\s+\(FL\)" + r"|total\s+length\s+\(TL\))" + ), + # statistical methods (not results) + re.compile( + r"(?i)(anova|ancova|manova" + r"|chi[\-\s]?squared?\s+test" + r"|kruskal[\-\s]wallis" + r"|mann[\-\s]whitney" + r"|wilcoxon" + r"|tukey('?s)?\s+(hsd|post[\-\s]?hoc)" + r"|bonferroni\s+correction" + r"|generali[sz]ed\s+linear\s+(model|mixed)" + r"|linear\s+regression\s+(was|were)\s+used" + r"|principal\s+component\s+analysis" + r"|canonical\s+correspondence" + r"|multivariate\s+analysis\s+of" + r"|permutational\s+anova" + r"|rarefaction\s+curve" + r"|shannon[\-\s]wiener|simpson('?s)?\s+(diversity|index))" + ), + # conservation / management policy (not data) + re.compile( + r"(?i)(conservation\s+(implication|strateg|management|priority|action)" + r"|management\s+(implication|recommendation|strateg|plan)" + r"|red\s+list\s+(status|categor)" + r"|endangered\s+species\s+act" + r"|iucn\s+(status|categor|red\s+list)" + r"|population\s+(viability|modelling|decline|trend)" + r"|threat\s+(status|assessment|categor))" + ), + # genetic / molecular methods (not diet data) + re.compile( + r"(?i)(pcr\s+(amplif|reaction|protocol|conditions)" + r"|dna\s+(extract|amplif|sequenc|barcod)" + r"|mitochondrial\s+(dna|gene|region|marker)" + r"|microsatellite" + r"|primer\s+(pair|sequence|set)" + r"|gel\s+electrophoresis" + r"|nucleotide\s+(sequence|substitution)" + r"|genbank\s+accession)" + ), +] + +# --------------------------------------------------------------------------- +# Section headings — recognise both known-priority and drop-candidate headers +# so we can preserve them in output while filtering paragraph 
# ---------------------------------------------------------------------------
# Section headings — recognise both known-priority and drop-candidate headers
# so we can preserve them in output while filtering paragraph content.
# ---------------------------------------------------------------------------

# Optional numeric prefix ("1. ", "2.3 ", "10.1.2. ") allowed before a heading.
_NUM_PREFIX = r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?"

# A line consisting solely of a (possibly numbered) section heading, with an
# optional trailing ":", "." or "-".
_SECTION_HEADING_RE: re.Pattern = re.compile(
    r"(?i)^\s*" + _NUM_PREFIX +
    r"("
    r"abstract|summary"
    r"|introduction|background"
    r"|methods?|materials?\s*(?:and|&)\s*methods?"
    r"|methodology"
    r"|study\s*(?:area|site|design|region|period)"
    r"|results?|findings?"
    r"|discussion|conclusions?"
    r"|summary\s+and\s+discussion"
    r"|acknowledge?ments?"
    r"|literature\s+cited"
    r"|references?\s+cited"
    r"|references?"
    r"|bibliography"
    r"|appendix"
    r"|supplementary\s+(data|material|information)"
    r"|supporting\s+information"
    r"|conflict\s+of\s+interest"
    r"|competing\s+interests?"
    r"|author\s+contributions?"
    r"|funding(?:\s+(?:sources?|information))?"
    r"|data\s+availability"
    r"|ethics\s+(statement|declaration)"
    r"|table\s*\d"
    r")\s*[:\.\-]?\s*$",
)

# Page markers are always preserved.
_PAGE_MARKER_RE: re.Pattern = re.compile(r"^\s*\[PAGE\s+\d+\]\s*$")


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def filter_relevant_sections(text: str) -> str:
    """Remove paragraphs unlikely to contain target diet metrics.

    The function splits *text* into blank-line-separated paragraphs, scores
    each one against positive (keep) and negative (drop) pattern lists, and
    removes paragraphs that trigger negative patterns while scoring zero on
    positive patterns.

    ``[PAGE N]`` markers and section headings are always preserved so
    ``extract_key_sections()`` can still perform section-priority ranking
    downstream.

    Args:
        text: Noise-cleaned text (output of ``clean_text()``). May contain
            ``[PAGE N]`` markers and section headings.

    Returns:
        Filtered text with irrelevant paragraphs removed. All structural
        markers are preserved. Falsy or whitespace-only input is returned
        unchanged (including ``None``).
    """
    if not text or not text.strip():
        return text

    # Split into blocks on blank lines while tracking structure.
    blocks = _split_into_blocks(text)

    kept: List[str] = []
    for block in blocks:
        stripped = block.strip()
        if not stripped:
            # Preserve blank-line spacing.
            kept.append("")
            continue

        # Always keep page markers and section headings.
        if _PAGE_MARKER_RE.match(stripped) or _SECTION_HEADING_RE.match(stripped):
            kept.append(block)
            continue

        # Score the paragraph.
        if _should_keep(stripped):
            kept.append(block)

    result = "\n\n".join(kept)

    # Collapse excessive blank lines (more than 2 newlines → 2).
    result = re.sub(r"\n{3,}", "\n\n", result)

    return result.strip()


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _split_into_blocks(text: str) -> List[str]:
    """Split text into paragraph blocks separated by blank lines.

    ``[PAGE N]`` markers that appear on their own line are treated as their
    own block so they are never merged with surrounding text.

    NOTE(review): the body of this function was truncated mid-statement in
    the copy under review; it is reconstructed here from its docstring and
    the ``_split_into_blocks`` unit tests — confirm against the original
    source before relying on edge-case behaviour.
    """
    # Surround every stand-alone [PAGE N] marker line with blank lines so
    # the blank-line split below always isolates it in its own block.
    normalized = re.sub(
        r"(?m)^(\[PAGE\s+\d+\])[ \t]*$",
        r"\n\1\n",
        text,
    )
    # A block boundary is a newline, optional whitespace, then a newline.
    return re.split(r"\n\s*\n", normalized)


def _has_positive_signal(text: str) -> bool:
    """Return True if *text* matches any positive/keep pattern."""
    return any(pat.search(text) for pat in _POSITIVE_PATTERNS)


def _has_negative_signal(text: str) -> bool:
    """Return True if *text* matches any negative/drop-candidate pattern."""
    return any(pat.search(text) for pat in _NEGATIVE_PATTERNS)


def _should_keep(text: str) -> bool:
    """Decide whether a paragraph should be kept.

    Decision logic (conservative — defaults to keep):
        1. If the paragraph has ANY positive signal → **keep**.
        2. If it has a negative signal AND no positive signal → **drop**.
        3. If it has neither signal → **keep** (borderline).
    """
    if _has_positive_signal(text):
        return True
    if _has_negative_signal(text):
        return False
    # No signal either way — keep to be safe.
    return True
Introduction" in result + assert "2. Methods" in result + + def test_empty_input_returns_empty(self): + assert filter_relevant_sections("") == "" + assert filter_relevant_sections(" ") == " " + assert filter_relevant_sections(None) is None + + def test_short_text_preserved_fully(self): + text = "A brief note about Canis lupus diet." + result = filter_relevant_sections(text) + assert result == text + + +# --------------------------------------------------------------------------- +# Positive-signal paragraphs are always kept +# --------------------------------------------------------------------------- + + +class TestPositiveSignalKept: + """Paragraphs with target-metric language must always be retained.""" + + def test_keeps_sample_size_paragraph(self): + para = "A total of 144 stomach samples were collected from Canis lupus." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "144 stomach samples" in result + + def test_keeps_empty_stomach_paragraph(self): + para = "Of the 100 stomachs examined, 23 were empty." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "23 were empty" in result + + def test_keeps_nonempty_paragraph(self): + para = "58% of stomachs contained food items." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "58%" in result + + def test_keeps_feeding_rate(self): + para = "The feeding rate was 72% across all seasons." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "feeding rate" in result + + def test_keeps_collection_date(self): + para = "Specimens were collected between 2005 and 2010." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "collected between 2005 and 2010" in result + + def test_keeps_study_location(self): + para = "The study area was located in the Okavango Delta, Botswana." 
+ text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "Okavango Delta" in result + + def test_keeps_diet_composition(self): + para = "Diet composition of Panthera leo was dominated by ungulates." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "Diet composition" in result + + def test_keeps_percentage_data(self): + para = "Mammals constituted 45% of the total prey items." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "45%" in result + + def test_keeps_n_equals(self): + para = "We analysed gut contents (n = 87) from adult specimens." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "n = 87" in result + + def test_keeps_coordinates(self): + para = "The sampling site (34°15'S, 18°29'E) was coastal." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "34°15'S" in result + + +# --------------------------------------------------------------------------- +# Negative-signal paragraphs are dropped (when no positive signal) +# --------------------------------------------------------------------------- + + +class TestNegativeSignalDropped: + """Paragraphs with only negative signals should be removed.""" + + def test_drops_phylogenetic_paragraph(self): + para = ( + "Bayesian inference of the phylogenetic relationships among " + "the sister group taxa revealed strong bootstrap support for " + "the monophyletic clade." + ) + text = f"Discussion\n\n{para}" + result = filter_relevant_sections(text) + assert "Bayesian inference" not in result + + def test_drops_habitat_description(self): + para = ( + "Vegetation type was classified as tropical dry forest with " + "canopy cover dense and understory dominated by shrubs." 
+ ) + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "canopy cover" not in result + + def test_drops_literature_review(self): + para = ( + "Several studies have reported similar findings. " + "Previously reported by Smith (2001) and consistent with " + "the findings of Jones et al. (2005)." + ) + text = f"Discussion\n\n{para}" + result = filter_relevant_sections(text) + assert "Several studies have reported" not in result + + def test_drops_prey_id_methodology(self): + para = ( + "Prey were identified to the lowest taxonomic level using " + "a stereomicroscope and reference collection of diagnostic bones." + ) + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "stereomicroscope" not in result + + def test_drops_morphometric_paragraph(self): + para = ( + "Snout-vent length was measured to the nearest 0.1 mm. " + "Total length was measured for each individual using calipers." + ) + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "Snout-vent length" not in result + + def test_drops_statistical_methods(self): + para = ( + "Differences were tested using Kruskal-Wallis tests with " + "Bonferroni correction for multiple comparisons. A generalized " + "linear model was used to assess trends." + ) + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "Kruskal-Wallis" not in result + + def test_drops_conservation_policy(self): + para = ( + "Conservation implications suggest the species should be " + "upgraded to a higher IUCN Red List category given the " + "observed population decline." + ) + text = f"Discussion\n\n{para}" + result = filter_relevant_sections(text) + assert "IUCN Red List" not in result + + def test_drops_genetic_methods(self): + para = ( + "DNA was extracted using a commercial kit. PCR amplification " + "was performed using primers targeting the mitochondrial gene " + "region. Products were separated by gel electrophoresis." 
+ ) + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "PCR amplification" not in result + + +# --------------------------------------------------------------------------- +# Conservative behaviour — borderline paragraphs kept +# --------------------------------------------------------------------------- + + +class TestConservativeBehaviour: + """Paragraphs with no signals should be kept (borderline = safe).""" + + def test_keeps_neutral_paragraph(self): + para = "The region experiences a temperate maritime climate." + text = f"Introduction\n\n{para}" + result = filter_relevant_sections(text) + assert "temperate maritime climate" in result + + def test_keeps_paragraph_with_mixed_signals(self): + """Positive signal overrides co-occurring negative signal.""" + para = ( + "A total of 50 stomachs were examined. Prey were identified " + "using a stereomicroscope and diagnostic bones from a " + "reference collection." + ) + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "50 stomachs" in result + assert "stereomicroscope" in result + + def test_keeps_species_diet_sentence(self): + para = "Feeding ecology of Vulpes vulpes in agricultural landscapes." 
+ text = f"Abstract\n\n{para}" + result = filter_relevant_sections(text) + assert "Vulpes vulpes" in result + + +# --------------------------------------------------------------------------- +# Integration-style tests with multi-section documents +# --------------------------------------------------------------------------- + + +class TestFullDocumentFiltering: + """Test the filter on realistic multi-section document text.""" + + def test_realistic_document(self): + text = ( + "[PAGE 1]\n" + "Abstract\n\n" + "We examined the diet of Canis lupus in Yellowstone.\n\n" + "[PAGE 2]\n" + "Methods\n\n" + "Specimens were collected between 2010 and 2015 from Yellowstone " + "National Park (44°36'N, 110°30'W).\n\n" + "Snout-vent length was measured to the nearest millimetre " + "using digital calipers.\n\n" + "DNA was extracted from tissue samples. PCR amplification of " + "mitochondrial DNA markers was performed.\n\n" + "[PAGE 3]\n" + "Results\n\n" + "A total of 200 stomachs were examined, of which 42 were empty.\n\n" + "Diet composition included 65% ungulates and 20% lagomorphs.\n\n" + "[PAGE 4]\n" + "Discussion\n\n" + "Previous work has reported broadly concordant results. Previously " + "reported by Mech (1970) and consistent with findings of " + "Black et al. 
(2003).\n\n" + "Conservation implications suggest continued monitoring of wolf " + "population viability in the Greater Yellowstone area.\n" + ) + result = filter_relevant_sections(text) + + # Structural markers preserved + assert "[PAGE 1]" in result + assert "[PAGE 2]" in result + assert "[PAGE 3]" in result + assert "[PAGE 4]" in result + assert "Abstract" in result + assert "Methods" in result + assert "Results" in result + assert "Discussion" in result + + # Positive-signal paragraphs kept + assert "diet of Canis lupus" in result + assert "200 stomachs" in result + assert "42 were empty" in result + assert "65%" in result + assert "collected between 2010 and 2015" in result + + # Negative-only paragraphs dropped + assert "Snout-vent length" not in result + assert "PCR amplification" not in result + assert "Previous work has reported" not in result + assert "population viability" not in result + + def test_does_not_over_filter_short_doc(self): + """A short document with mostly positive content should lose very little.""" + text = ( + "Abstract\n\n" + "We examined stomach contents of 50 Accipiter nisus.\n\n" + "Results\n\n" + "Sample size was n=50. 
Of these, 12 stomachs were empty.\n" + "Birds comprised 78% of prey items.\n" + ) + result = filter_relevant_sections(text) + # Everything here is positive — nothing should be lost + assert "50 Accipiter nisus" in result + assert "n=50" in result + assert "12 stomachs were empty" in result + assert "78%" in result + + +# --------------------------------------------------------------------------- +# Internal helper tests +# --------------------------------------------------------------------------- + + +class TestInternalHelpers: + """Test the internal scoring/splitting functions.""" + + def test_has_positive_signal_true(self): + assert _has_positive_signal("A total of 30 stomachs were examined.") + assert _has_positive_signal("Specimens were collected during 2018.") + assert _has_positive_signal("The study area was in Kenya.") + assert _has_positive_signal("Empty stomachs accounted for 15%.") + + def test_has_positive_signal_false(self): + assert not _has_positive_signal("The weather was mild that year.") + assert not _has_positive_signal( + "Bayesian inference supported the monophyletic clade." + ) + + def test_has_negative_signal_true(self): + assert _has_negative_signal( + "Phylogenetic analysis using maximum likelihood trees." + ) + assert _has_negative_signal( + "Habitat type was classified as open grassland." + ) + assert _has_negative_signal( + "Mann-Whitney tests were used for comparisons." + ) + + def test_has_negative_signal_false(self): + assert not _has_negative_signal("We counted 30 prey items in the gut.") + assert not _has_negative_signal("The study was conducted in Brazil.") + + def test_should_keep_positive_overrides_negative(self): + text = ( + "A total of 100 stomachs were examined using a stereomicroscope " + "and reference collection." 
+ ) + assert _should_keep(text) is True + + def test_should_keep_neutral_kept(self): + assert _should_keep("The area has a subtropical climate.") is True + + def test_should_keep_negative_only_dropped(self): + assert _should_keep( + "Bayesian inference of phylogenetic relationships." + ) is False + + def test_split_into_blocks_basic(self): + text = "Para one.\n\nPara two.\n\nPara three." + blocks = _split_into_blocks(text) + assert len(blocks) == 3 + assert "Para one." in blocks[0] + assert "Para two." in blocks[1] + assert "Para three." in blocks[2] + + def test_split_into_blocks_page_markers(self): + text = "[PAGE 1]\nContent one.\n\n[PAGE 2]\nContent two." + blocks = _split_into_blocks(text) + page_blocks = [b for b in blocks if "[PAGE" in b] + assert len(page_blocks) >= 2 From b3b4abfa693f91950dbae6fc08a1e7cdc39048d8 Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 1 Mar 2026 20:21:36 -0800 Subject: [PATCH 10/30] update instructions on files --- extract-from-txt.py | 75 ++++++++++++++++++++++++++++++++------------- src/llm/llm_text.py | 5 ++- 2 files changed, 58 insertions(+), 22 deletions(-) diff --git a/extract-from-txt.py b/extract-from-txt.py index bd33196..bb0203a 100644 --- a/extract-from-txt.py +++ b/extract-from-txt.py @@ -1,7 +1,8 @@ """Extract-from-TXT Pipeline -Processes pre-classified useful .txt files through noise cleaning, text -trimming, and LLM extraction — bypassing the XGBoost classifier entirely. +Processes pre-classified useful .txt files through noise cleaning, section +filtering, text trimming, and LLM extraction — bypassing the XGBoost +classifier entirely. Every .txt file fed to this script is assumed to have already been confirmed as useful (e.g. by the classifier in classify-extract.py or by manual review). @@ -10,10 +11,12 @@ 1. Read raw .txt file 2. Strip noise (references, acknowledgements, affiliations, captions, …) via src/preprocessing/text_cleaner.py - 3. Trim to the character budget using section-priority ranking + 3. 
Drop irrelevant paragraphs (taxonomy, morphometrics, stats methods, …) + via src/preprocessing/section_filter.py + 4. Trim to the character budget using section-priority ranking via src/llm/llm_text.py::extract_key_sections() - 4. Call Ollama for structured extraction via src/llm/llm_client.py - 5. Save result JSON per file and a summary CSV + 5. Call Ollama for structured extraction via src/llm/llm_client.py + 6. Save result JSON per file and a summary CSV Usage:: @@ -32,7 +35,9 @@ --num-ctx 4096 Output: - - data/cleaned-text/_.txt trimmed text passed to Ollama + - data/cleaned-text/text_cleaner/_.txt noise-stripped text + - data/cleaned-text/section_filter/_.txt section-filtered text + - data/cleaned-text/llm_text/_.txt trimmed text passed to Ollama - data/results/metrics/_results.json per file - data/results/summaries/txt_pipeline_summary_.csv overall """ @@ -49,6 +54,7 @@ sys.path.insert(0, str(_PROJECT_ROOT)) from src.preprocessing.text_cleaner import clean_text +from src.preprocessing.section_filter import filter_relevant_sections from src.llm.llm_text import extract_key_sections from src.llm.llm_client import extract_metrics_from_text, save_extraction_result @@ -65,7 +71,7 @@ def run_txt_pipeline( num_ctx: int, single_file: Path = None, ) -> None: - """Process every .txt file in *input_dir* through clean → trim → extract. + """Process every .txt file in *input_dir* through clean → filter → trim → extract. Args: input_dir: Directory containing pre-classified useful .txt files. 
@@ -86,8 +92,12 @@ def run_txt_pipeline( print(f"[INFO] Found {len(txt_paths)} .txt file(s) to process", file=sys.stderr) output_dir.mkdir(parents=True, exist_ok=True) - cleaned_text_dir = output_dir.parent / "cleaned-text" - cleaned_text_dir.mkdir(parents=True, exist_ok=True) + cleaner_text_dir = output_dir.parent / "cleaned-text" / "text_cleaner" + filter_text_dir = output_dir.parent / "cleaned-text" / "section_filter" + llm_text_dir = output_dir.parent / "cleaned-text" / "llm_text" + cleaner_text_dir.mkdir(parents=True, exist_ok=True) + filter_text_dir.mkdir(parents=True, exist_ok=True) + llm_text_dir.mkdir(parents=True, exist_ok=True) summary_rows = [] for idx, txt_path in enumerate(txt_paths, start=1): @@ -97,6 +107,7 @@ def run_txt_pipeline( "filename": txt_path.name, "raw_chars": "", "cleaned_chars": "", + "filtered_chars": "", "trimmed_chars": "", "extraction_status": "", "species_name": "", @@ -137,29 +148,50 @@ def run_txt_pipeline( summary_rows.append(row) continue - # ── Step 3: Trim to LLM budget ────────────────────────────────────── - if len(cleaned) > max_chars: - trimmed = extract_key_sections(cleaned, max_chars) + # ── Step 3: Save text_cleaner output ──────────────────────────────── + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + cleaner_path = cleaner_text_dir / f"{txt_path.stem}_{ts}.txt" + try: + cleaner_path.write_text(cleaned, encoding="utf-8") + print(f" [INFO] Cleaner text : {cleaner_path.name}", file=sys.stderr) + except Exception as exc: + print(f" [WARN] Could not save cleaner text: {exc}", file=sys.stderr) + + # ── Step 4: Section filter ────────────────────────────────────────── + filtered = filter_relevant_sections(cleaned) + row["filtered_chars"] = len(filtered) + print(f" [INFO] After filter: {len(filtered):,} chars", file=sys.stderr) + + # ── Step 4b: Save section_filter output ───────────────────────────── + filter_path = filter_text_dir / f"{txt_path.stem}_{ts}.txt" + try: + filter_path.write_text(filtered, encoding="utf-8") + 
print(f" [INFO] Filter text : {filter_path.name}", file=sys.stderr) + except Exception as exc: + print(f" [WARN] Could not save filter text: {exc}", file=sys.stderr) + + # ── Step 5: Trim to LLM budget ────────────────────────────────────── + if len(filtered) > max_chars: + trimmed = extract_key_sections(filtered, max_chars) print( f" [INFO] After trim : {len(trimmed):,} chars " f"(budget {max_chars:,})", file=sys.stderr, ) else: - trimmed = cleaned + trimmed = filtered row["trimmed_chars"] = len(trimmed) - # ── Step 4: Save cleaned text snapshot ───────────────────────────── - ts = datetime.now().strftime("%Y%m%d_%H%M%S") - cleaned_path = cleaned_text_dir / f"{txt_path.stem}_{ts}.txt" + # ── Step 5b: Save llm_text output ────────────────────────────────── + llm_path = llm_text_dir / f"{txt_path.stem}_{ts}.txt" try: - cleaned_path.write_text(trimmed, encoding="utf-8") - print(f" [INFO] Cleaned text : {cleaned_path.name}", file=sys.stderr) + llm_path.write_text(trimmed, encoding="utf-8") + print(f" [INFO] LLM text : {llm_path.name}", file=sys.stderr) except Exception as exc: - print(f" [WARN] Could not save cleaned text: {exc}", file=sys.stderr) + print(f" [WARN] Could not save LLM text: {exc}", file=sys.stderr) - # ── Step 5: LLM extraction ────────────────────────────────────────── + # ── Step 6: LLM extraction ────────────────────────────────────────── print(f" [INFO] Calling Ollama ({llm_model})…", file=sys.stderr) try: metrics = extract_metrics_from_text( @@ -173,7 +205,7 @@ def run_txt_pipeline( summary_rows.append(row) continue - # ── Step 6: Save JSON ─────────────────────────────────────────────── + # ── Step 7: Save JSON ─────────────────────────────────────────────── try: result = save_extraction_result( metrics=metrics, @@ -224,6 +256,7 @@ def run_txt_pipeline( "filename", "raw_chars", "cleaned_chars", + "filtered_chars", "trimmed_chars", "extraction_status", "species_name", diff --git a/src/llm/llm_text.py b/src/llm/llm_text.py index 3005519..a88e2e9 
100644 --- a/src/llm/llm_text.py +++ b/src/llm/llm_text.py @@ -14,7 +14,9 @@ project_root = Path(__file__).parent.parent.parent sys.path.insert(0, str(project_root)) -from src.preprocessing.pdf_text_extraction import extract_text_from_pdf +# NOTE: pdf_text_extraction is imported lazily inside read_file_text() to +# avoid pulling in heavy PDF dependencies (camelot, fitz) when only the +# text-based pipeline is used. # --------------------------------------------------------------------------- # Section-boundary splitting helpers @@ -369,6 +371,7 @@ def load_document(file_path: Path) -> str: if suffix == '.pdf': print(f"[INFO] Reading PDF file...", file=sys.stderr) + from src.preprocessing.pdf_text_extraction import extract_text_from_pdf return extract_text_from_pdf(str(file_path)) elif suffix in ['.txt', '.text']: print(f"[INFO] Reading text file...", file=sys.stderr) From af639733acb2d6e1680f643073f277e3d8b6e68c Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 1 Mar 2026 20:25:12 -0800 Subject: [PATCH 11/30] added retry logic to retry null returns --- src/llm/llm_client.py | 64 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/src/llm/llm_client.py b/src/llm/llm_client.py index 5b6c414..4e3e8a3 100644 --- a/src/llm/llm_client.py +++ b/src/llm/llm_client.py @@ -35,14 +35,20 @@ def extract_metrics_from_text( text: str, model: str = "llama3.1:8b", - num_ctx: int = 4096, + num_ctx: int = 8192, + _retry: bool = False, ) -> PredatorDietMetrics: """Extract structured metrics from text using Ollama. + On the first call, if ``species_name`` or ``sample_size`` come back null + the function automatically retries once with a focused follow-up prompt + that asks the model to look more carefully. + Args: text: Preprocessed text content from a scientific publication. model: Name of the Ollama model to use. num_ctx: Context window size to request from Ollama (lower = less memory). 
+ _retry: Internal flag — True when this is the automatic retry attempt. Returns: PredatorDietMetrics object with extracted data. @@ -78,6 +84,7 @@ def extract_metrics_from_text( - Ignore page markers [PAGE N]. - Prioritize Abstract, Methods, and Results sections. - Be especially careful to distinguish collection dates from publication dates. +- For each non-null field, also look for a short verbatim phrase (5-15 words) from the text that supports your answer. This helps verify accuracy. EXAMPLES @@ -103,9 +110,62 @@ def extract_metrics_from_text( messages=[{"role": "user", "content": prompt}], model=model, format=PredatorDietMetrics.model_json_schema(), + options={"num_ctx": num_ctx}, ) metrics = PredatorDietMetrics.model_validate_json(response.message.content) + + # ── Retry once if critical fields are null ────────────────────────────── + if not _retry and (metrics.species_name is None or metrics.sample_size is None): + missing = [] + if metrics.species_name is None: + missing.append("species_name") + if metrics.sample_size is None: + missing.append("sample_size") + print( + f" [INFO] Retry: {', '.join(missing)} came back null — re-prompting", + file=sys.stderr, + ) + retry_prompt = ( + "The following fields were returned as null but are very likely present " + "in the text. Please re-read the text carefully — especially the Abstract, " + "Methods, and Results sections — and try again.\n\n" + f"Missing fields: {', '.join(missing)}\n\n" + "Hints:\n" + ) + if "species_name" in missing: + retry_prompt += ( + "- species_name: Look for the first binomial Latin name (Genus species) " + "mentioned in the title or abstract. This is the PREDATOR, not its prey.\n" + ) + if "sample_size" in missing: + retry_prompt += ( + "- sample_size: Look for phrases like 'N stomachs', 'N specimens', " + "'a total of N', 'n=N', 'N individuals were examined'. 
Check Results " + "and Methods sections.\n" + ) + retry_prompt += f"\nTEXT\n{text}" + + retry_response = chat( + messages=[{"role": "user", "content": retry_prompt}], + model=model, + format=PredatorDietMetrics.model_json_schema(), + options={"num_ctx": num_ctx}, + ) + retry_metrics = PredatorDietMetrics.model_validate_json( + retry_response.message.content + ) + + # Merge: prefer retry values for fields that were null, keep originals otherwise + merged = metrics.model_dump() + retry_dict = retry_metrics.model_dump() + for field in ["species_name", "study_location", "study_date", + "num_empty_stomachs", "num_nonempty_stomachs", "sample_size"]: + if merged.get(field) is None and retry_dict.get(field) is not None: + merged[field] = retry_dict[field] + + metrics = PredatorDietMetrics.model_validate(merged) + return metrics @@ -172,7 +232,7 @@ def main(): parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)") parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results/metrics)") parser.add_argument("--max-chars", type=int, default=12000, help="Maximum characters of text to send to the model (default: 12000). Reduce if you hit CUDA/OOM errors.") - parser.add_argument("--num-ctx", type=int, default=4096, help="Context window size for the model (default: 4096). Lower values use less memory.") + parser.add_argument("--num-ctx", type=int, default=8192, help="Context window size for the model (default: 8192). 
Lower values use less memory.") args = parser.parse_args() From af6aa3a4e43b0405c73dedc1bfd93b65b28c885b Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 1 Mar 2026 20:28:14 -0800 Subject: [PATCH 12/30] increased char and num-ctx limit on ollama --- extract-from-txt.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/extract-from-txt.py b/extract-from-txt.py index bb0203a..34ebf0f 100644 --- a/extract-from-txt.py +++ b/extract-from-txt.py @@ -31,8 +31,8 @@ --input-dir data/processed-text/ \\ --output-dir data/results/ \\ --llm-model llama3.1:8b \\ - --max-chars 5000 \\ - --num-ctx 4096 + --max-chars 10000 \\ + --num-ctx 8192 Output: - data/cleaned-text/text_cleaner/_.txt noise-stripped text @@ -338,14 +338,14 @@ def main() -> None: parser.add_argument( "--max-chars", type=int, - default=5000, - help="Maximum characters to send to Ollama after cleaning (default: 5000).", + default=10000, + help="Maximum characters to send to Ollama after cleaning (default: 10000).", ) parser.add_argument( "--num-ctx", type=int, - default=4096, - help="Ollama context window size (default: 4096).", + default=8192, + help="Ollama context window size (default: 8192).", ) args = parser.parse_args() From cde64a146fcaa69dedb931cacffe42508780f3e0 Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 1 Mar 2026 20:29:04 -0800 Subject: [PATCH 13/30] drops paragraphs with no neg or pos signal --- src/preprocessing/section_filter.py | 32 +++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/section_filter.py b/src/preprocessing/section_filter.py index eac6813..09f2d94 100644 --- a/src/preprocessing/section_filter.py +++ b/src/preprocessing/section_filter.py @@ -276,6 +276,16 @@ _PAGE_MARKER_RE: re.Pattern = re.compile(r"^\s*\[PAGE\s+\d+\]\s*$") +# --------------------------------------------------------------------------- +# Long-document threshold — above this many chars the filter becomes stricter, +# 
dropping paragraphs that have *no* signal at all (neither positive nor +# negative). This prevents very long papers from passing through unfiltered +# when most paragraphs simply lack a negative keyword. +# --------------------------------------------------------------------------- + +_LONG_DOC_THRESHOLD: int = 15_000 + + # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- @@ -288,6 +298,10 @@ def filter_relevant_sections(text: str) -> str: removes paragraphs that trigger negative patterns while scoring zero on positive patterns. + For documents longer than ``_LONG_DOC_THRESHOLD`` characters, paragraphs + with *no signal at all* (neither positive nor negative) are also dropped. + This prevents very long papers from passing through almost unfiltered. + ``[PAGE N]`` markers and section headings are always preserved so ``extract_key_sections()`` can still perform section-priority ranking downstream. @@ -303,6 +317,8 @@ def filter_relevant_sections(text: str) -> str: if not text or not text.strip(): return text + strict = len(text) > _LONG_DOC_THRESHOLD + # Split into blocks on blank lines while tracking structure blocks = _split_into_blocks(text) @@ -320,7 +336,7 @@ def filter_relevant_sections(text: str) -> str: continue # Score the paragraph - if _should_keep(stripped): + if _should_keep(stripped, strict=strict): kept.append(block) result = "\n\n".join(kept) @@ -367,17 +383,25 @@ def _has_negative_signal(text: str) -> bool: return False -def _should_keep(text: str) -> bool: +def _should_keep(text: str, *, strict: bool = False) -> bool: """Decide whether a paragraph should be kept. Decision logic (conservative — defaults to keep): 1. If the paragraph has ANY positive signal → **keep**. 2. If the paragraph has a negative signal AND no positive signal → **drop**. - 3. If the paragraph has neither signal → **keep** (borderline). + 3. 
If ``strict`` is True and there is NO signal either way → **drop**. + 4. Otherwise (no signal, not strict) → **keep** (borderline). + + Args: + text: Paragraph text to evaluate. + strict: When True (used for long documents), paragraphs with zero + signal are dropped rather than kept by default. """ if _has_positive_signal(text): return True if _has_negative_signal(text): return False - # No signal either way — keep to be safe + # No signal either way + if strict: + return False # long doc — drop borderline paragraphs return True From 00bcf41e74f3c04d1f6596656537f2cad76c5fef Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 1 Mar 2026 20:29:13 -0800 Subject: [PATCH 14/30] added long doc test --- tests/test_section_filter.py | 88 ++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/tests/test_section_filter.py b/tests/test_section_filter.py index ae3e7b9..ba3b844 100644 --- a/tests/test_section_filter.py +++ b/tests/test_section_filter.py @@ -13,6 +13,7 @@ _has_negative_signal, _should_keep, _split_into_blocks, + _LONG_DOC_THRESHOLD, ) @@ -392,3 +393,90 @@ def test_split_into_blocks_page_markers(self): blocks = _split_into_blocks(text) page_blocks = [b for b in blocks if "[PAGE" in b] assert len(page_blocks) >= 2 + + +# --------------------------------------------------------------------------- +# Long-document strict filtering +# --------------------------------------------------------------------------- + + +class TestLongDocumentStrictMode: + """Documents > _LONG_DOC_THRESHOLD drop zero-signal paragraphs.""" + + def _make_long_doc(self, extra_paragraphs: list[str]) -> str: + """Build a document that exceeds the threshold with filler + extras.""" + # Use positive-signal filler so the base text is kept + filler_line = "A total of 50 specimens were examined." 
+ repeat_count = (_LONG_DOC_THRESHOLD // len(filler_line)) + 2 + filler = "\n\n".join([filler_line] * repeat_count) + return filler + "\n\n" + "\n\n".join(extra_paragraphs) + + def test_neutral_para_kept_in_short_doc(self): + """Below threshold, neutral paragraphs survive.""" + neutral = "The weather was mild and skies were overcast." + text = f"Abstract\n\n{neutral}" + assert len(text) < _LONG_DOC_THRESHOLD + result = filter_relevant_sections(text) + assert neutral in result + + def test_neutral_para_dropped_in_long_doc(self): + """Above threshold, neutral paragraphs are dropped.""" + neutral = "The weather was mild and skies were overcast." + text = self._make_long_doc([neutral]) + assert len(text) > _LONG_DOC_THRESHOLD + result = filter_relevant_sections(text) + assert neutral not in result + + def test_positive_para_kept_in_long_doc(self): + """Positive-signal paragraphs are STILL kept in long docs.""" + positive = "A total of 200 stomachs were examined." + text = self._make_long_doc([positive]) + result = filter_relevant_sections(text) + assert positive in result + + def test_negative_para_dropped_in_long_doc(self): + """Negative-signal paragraphs still dropped in long docs.""" + negative = ( + "Bayesian inference of the phylogenetic relationships " + "revealed strong bootstrap support." 
+ ) + text = self._make_long_doc([negative]) + result = filter_relevant_sections(text) + assert "Bayesian inference" not in result + + def test_headings_preserved_in_long_doc(self): + """Section headings never dropped, even in strict mode.""" + text = self._make_long_doc(["Methods", "Results"]) + result = filter_relevant_sections(text) + assert "Methods" in result + assert "Results" in result + + def test_page_markers_preserved_in_long_doc(self): + text = self._make_long_doc(["[PAGE 99]"]) + result = filter_relevant_sections(text) + assert "[PAGE 99]" in result + + def test_should_keep_strict_drops_neutral(self): + """Direct test of _should_keep with strict=True.""" + neutral = "The area has a subtropical climate." + assert _should_keep(neutral, strict=False) is True + assert _should_keep(neutral, strict=True) is False + + def test_should_keep_strict_keeps_positive(self): + positive = "A total of 100 stomachs were analyzed." + assert _should_keep(positive, strict=True) is True + + def test_long_doc_reduces_size(self): + """A long document with lots of neutral text gets meaningfully reduced.""" + neutral = "The atmospheric conditions were unremarkable that day." + positive = "We examined 85 stomachs from Vulpes vulpes." 
+ # 50 neutral paras + a few positive ones + paras = [neutral] * 50 + [positive] * 3 + text = "\n\n".join(paras) + # Make sure it's over the threshold + while len(text) < _LONG_DOC_THRESHOLD: + text += f"\n\n{neutral}" + result = filter_relevant_sections(text) + assert len(result) < len(text) + assert positive in result + assert neutral not in result From d6efaf348e747017e0f19441f82e9639db8cf6eb Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 1 Mar 2026 20:40:53 -0800 Subject: [PATCH 15/30] switched to qwen2.5:7b --- extract-from-txt.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/extract-from-txt.py b/extract-from-txt.py index 34ebf0f..94e6f4a 100644 --- a/extract-from-txt.py +++ b/extract-from-txt.py @@ -332,8 +332,9 @@ def main() -> None: parser.add_argument( "--llm-model", type=str, - default="llama3.1:8b", - help="Ollama model name (default: llama3.1:8b).", + # default="llama3.1:8b", + default="qwen2.5:7b", + help="Ollama model name (default: qwen2.5:7b).", ) parser.add_argument( "--max-chars", From b3dcf4675469451a70b80c97b9d5f5f3fb8dc0f8 Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 1 Mar 2026 20:54:56 -0800 Subject: [PATCH 16/30] rewrote the system prompt to handle diverse study methods beyond stomach dissection --- src/llm/llm_client.py | 154 +++++++++++++++++++++++++++--------------- 1 file changed, 101 insertions(+), 53 deletions(-) diff --git a/src/llm/llm_client.py b/src/llm/llm_client.py index 4e3e8a3..f039bae 100644 --- a/src/llm/llm_client.py +++ b/src/llm/llm_client.py @@ -34,15 +34,16 @@ def extract_metrics_from_text( text: str, - model: str = "llama3.1:8b", + # model: str = "llama3.1:8b", + model: str = "qwen2.5:7b", num_ctx: int = 8192, _retry: bool = False, ) -> PredatorDietMetrics: """Extract structured metrics from text using Ollama. 
- On the first call, if ``species_name`` or ``sample_size`` come back null - the function automatically retries once with a focused follow-up prompt - that asks the model to look more carefully. + On the first call, if any fields come back null the function automatically + retries once with a focused follow-up prompt that gives method-specific + hints for finding the missing data. Args: text: Preprocessed text content from a scientific publication. @@ -53,7 +54,7 @@ def extract_metrics_from_text( Returns: PredatorDietMetrics object with extracted data. """ - prompt = f"""You are a scientific data extraction assistant. Your task is to read a predator diet survey publication and return a single flat JSON object with exactly these fields: + prompt = f"""You are a scientific data extraction assistant. Your task is to read a predator diet study and return a single flat JSON object with exactly these fields: species_name - string or null study_location - string or null @@ -62,47 +63,67 @@ def extract_metrics_from_text( num_nonempty_stomachs - integer (>= 0) or null sample_size - integer (> 0) or null -Use null for any field whose value cannot be confidently determined from the text. +Use null ONLY when the value truly cannot be determined from any part of the text. FIELD DEFINITIONS -species_name: Binomial Latin name (Genus species) of the PRIMARY PREDATOR whose diet is studied. This is the animal whose stomachs/guts were examined, not its prey. Return exactly one species. If multiple predators are studied, choose the one with the most stomach samples. Capitalize the genus, lowercase the specific epithet (e.g., "Pygoscelis papua"). - -study_location: Geographic area where predator specimens were collected. Include site, region, and country if available (e.g., "Marion Island, sub-Antarctic"). Check Methods, Study Area, or Study Site sections. - -study_date: Year or year-range of specimen collection, NOT publication year. Format "YYYY" or "YYYY-YYYY". 
Look for phrases like "specimens collected in", "sampling period", "field season", "between [year] and [year]". Return null if only publication year is visible. - -num_empty_stomachs: Number of predators with stomachs containing no food. Synonyms: "empty", "vacant", "without food", "zero prey items", "stomachs with no contents", "N individuals had empty stomachs". - -num_nonempty_stomachs: Number of predators with stomachs containing food. Synonyms: "non-empty", "with food", "containing prey", "with contents", "fed", "N contained food", "N had prey items". - -sample_size: Total number of predator individuals examined. When both num_empty_stomachs and num_nonempty_stomachs are available, sample_size equals their sum. Look for phrases like "N stomachs were examined", "a total of N individuals", "N specimens", "n=", "sample size of N". +species_name: Binomial Latin name (Genus species) of the PRIMARY PREDATOR whose diet is studied. This is the animal being studied, not its prey. Return exactly one species. If multiple predators appear, choose the one with the most samples. Capitalize genus, lowercase epithet (e.g., "Pygoscelis papua"). + +study_location: Geographic area where specimens were collected. Include site, region, and country if available (e.g., "Marion Island, sub-Antarctic"). Check Methods, Study Area, Study Site, and Abstract. + +study_date: Year or year-range of specimen collection, NOT publication year. Format "YYYY" or "YYYY-YYYY". + Where to look: + - "specimens collected in", "sampling period", "field season", "between [year] and [year]" + - "from March 1984 to March 1985" → "1984-1985" + - "Received 23 November 2007" in article info suggests collection was ~2005-2007 + - If the only dates are "Received" or "Accepted" submission dates and no collection dates are stated, estimate the collection period as 1-2 years before submission. + +num_empty_stomachs: Number of predators with NO food in their digestive tract. 
Apply broadly across study methods: + - Stomach dissection: "empty", "vacant", "without food", "zero prey items" + - Stomach pumping / lavage: "yielded no food", "no contents obtained", "produced no material" + - Scat / fecal analysis: "scats with no identifiable prey", "empty scats" + - Regurgitation: "failed to regurgitate", "no pellet produced" + - Immunoassay / molecular: "tested negative for all prey", "no prey detected" + If the study uses stomach pumping and ALL samples contained food, set this to 0. + +num_nonempty_stomachs: Number of predators with food in their digestive tract. Same method mapping as above: + - "non-empty", "with food", "containing prey", "with contents", "fed" + - Stomach pumping: "food samples collected", "samples containing prey" + - Scat: "scats with identifiable prey remains" + - Immunoassay: "tested positive for prey", "positive reactions" + If study says "a total of N food samples was collected" and implies ALL had food, set num_nonempty_stomachs = N. + +sample_size: Total number of predator individuals examined. Equals num_empty + num_nonempty when both are known. + - "N stomachs examined", "N individuals", "N specimens", "n=N", "a total of N" + - "N food samples" when all sampled animals contributed one sample + - "two groups of 225" → sample_size = 450 + - Check Abstract, Methods, and Results. RULES -- Do not invent data; use null if ambiguous or missing. +- Do not invent data; use null only if truly ambiguous or missing. - Return a single JSON object; do not return arrays. - Ignore page markers [PAGE N]. - Prioritize Abstract, Methods, and Results sections. -- Be especially careful to distinguish collection dates from publication dates. -- For each non-null field, also look for a short verbatim phrase (5-15 words) from the text that supports your answer. This helps verify accuracy. +- Carefully distinguish collection dates from publication/submission dates. 
+- If ALL samples had food (e.g., stomach pumping where every sample yielded prey), set num_empty_stomachs = 0 and num_nonempty_stomachs = sample_size. EXAMPLES -1. Simple complete case: -{{"species_name": "Pygoscelis papua", "study_location": "Marion Island, sub-Antarctic", "study_date": "1984-1985", "num_empty_stomachs": 5, "num_nonempty_stomachs": 15, "sample_size": 20}} +1. Traditional stomach dissection: +{{"species_name": "Canis lupus", "study_location": "Yellowstone National Park, Wyoming, USA", "study_date": "2019", "num_empty_stomachs": 5, "num_nonempty_stomachs": 47, "sample_size": 52}} + +2. Stomach pumping (all samples had food): +{{"species_name": "Pygoscelis papua", "study_location": "Marion Island, sub-Antarctic", "study_date": "1984-1985", "num_empty_stomachs": 0, "num_nonempty_stomachs": 144, "sample_size": 144}} -2. Missing empty stomach data (can infer from sample_size): -{{"species_name": "Canis lupus", "study_location": "Yellowstone National Park, Wyoming, USA", "study_date": "2019", "num_empty_stomachs": null, "num_nonempty_stomachs": 47, "sample_size": 52}} +3. Immunoassay / molecular detection: +{{"species_name": "Nucella lapillus", "study_location": "Swans Island, Maine, USA", "study_date": "2005-2007", "num_empty_stomachs": null, "num_nonempty_stomachs": null, "sample_size": 450}} -3. Multi-year study: +4. Scat / fecal analysis: {{"species_name": "Vulpes vulpes", "study_location": "Bristol, UK", "study_date": "2015-2018", "num_empty_stomachs": 12, "num_nonempty_stomachs": 88, "sample_size": 100}} -4. Minimal data available: +5. Minimal data: {{"species_name": "Ursus arctos", "study_location": null, "study_date": "2020", "num_empty_stomachs": null, "num_nonempty_stomachs": null, "sample_size": 23}} -5. 
Only some fields extractable: -{{"species_name": "Zalophus californianus", "study_location": "California coast", "study_date": null, "num_empty_stomachs": 8, "num_nonempty_stomachs": 34, "sample_size": 42}} - TEXT {text} """ @@ -115,35 +136,62 @@ def extract_metrics_from_text( metrics = PredatorDietMetrics.model_validate_json(response.message.content) - # ── Retry once if critical fields are null ────────────────────────────── - if not _retry and (metrics.species_name is None or metrics.sample_size is None): - missing = [] - if metrics.species_name is None: - missing.append("species_name") - if metrics.sample_size is None: - missing.append("sample_size") + # ── Retry once if any extractable fields are null ─────────────────────── + _retryable = [ + "species_name", "study_location", "study_date", + "num_empty_stomachs", "num_nonempty_stomachs", "sample_size", + ] + missing = [f for f in _retryable if getattr(metrics, f) is None] + + if not _retry and missing: print( f" [INFO] Retry: {', '.join(missing)} came back null — re-prompting", file=sys.stderr, ) + + # Build targeted hints for each missing field + _hints = { + "species_name": ( + "- species_name: Look for the first binomial Latin name (Genus species) " + "in the title or abstract. This is the PREDATOR, not its prey.\n" + ), + "study_location": ( + "- study_location: Check Methods or Study Area sections for place names, " + "islands, countries, or coordinates.\n" + ), + "study_date": ( + "- study_date: Look for phrases like 'collected in', 'sampled during', " + "'field season', 'from [month] [year] to [month] [year]'. " + "If no collection date is explicit, infer from 'Received [date]' — " + "collection is typically 1-2 years before manuscript submission.\n" + ), + "num_empty_stomachs": ( + "- num_empty_stomachs: Look for 'empty', 'no food', 'no contents', " + "'negative for prey'. 
If ALL samples had food (e.g., stomach pumping " + "where every sample produced material), return 0.\n" + ), + "num_nonempty_stomachs": ( + "- num_nonempty_stomachs: Look for 'contained food', 'with prey', " + "'non-empty', 'food samples collected'. If ALL samples had food, " + "this equals sample_size.\n" + ), + "sample_size": ( + "- sample_size: Look for 'N stomachs', 'N specimens', 'a total of N', " + "'n=N', 'N individuals examined', 'two groups of N'. Check Abstract, " + "Methods, and Results.\n" + ), + } + retry_prompt = ( - "The following fields were returned as null but are very likely present " - "in the text. Please re-read the text carefully — especially the Abstract, " - "Methods, and Results sections — and try again.\n\n" + "The following fields were returned as null. Please re-read the text " + "carefully — especially the Abstract, Methods, and Results sections — " + "and try harder to find values for them. Think about different study " + "methods (stomach pumping, scat analysis, immunoassays, etc.).\n\n" f"Missing fields: {', '.join(missing)}\n\n" "Hints:\n" ) - if "species_name" in missing: - retry_prompt += ( - "- species_name: Look for the first binomial Latin name (Genus species) " - "mentioned in the title or abstract. This is the PREDATOR, not its prey.\n" - ) - if "sample_size" in missing: - retry_prompt += ( - "- sample_size: Look for phrases like 'N stomachs', 'N specimens', " - "'a total of N', 'n=N', 'N individuals were examined'. 
Check Results " - "and Methods sections.\n" - ) + for field in missing: + retry_prompt += _hints.get(field, "") retry_prompt += f"\nTEXT\n{text}" retry_response = chat( @@ -159,8 +207,7 @@ def extract_metrics_from_text( # Merge: prefer retry values for fields that were null, keep originals otherwise merged = metrics.model_dump() retry_dict = retry_metrics.model_dump() - for field in ["species_name", "study_location", "study_date", - "num_empty_stomachs", "num_nonempty_stomachs", "sample_size"]: + for field in _retryable: if merged.get(field) is None and retry_dict.get(field) is not None: merged[field] = retry_dict[field] @@ -229,7 +276,8 @@ def save_extraction_result( def main(): parser = argparse.ArgumentParser(description="Extract predator diet metrics from PDFs or text files using LLM") parser.add_argument("input_file", type=str, help="Path to the input file (.pdf or .txt)") - parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)") + # parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)") + parser.add_argument("--model", type=str, default="qwen2.5:7b", help="Ollama model to use (default: qwen2.5:7b)") parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results/metrics)") parser.add_argument("--max-chars", type=int, default=12000, help="Maximum characters of text to send to the model (default: 12000). Reduce if you hit CUDA/OOM errors.") parser.add_argument("--num-ctx", type=int, default=8192, help="Context window size for the model (default: 8192). 
Lower values use less memory.") From ad49f3c858fc16ad267bb670d13fa247ea98ab01 Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 1 Mar 2026 20:58:27 -0800 Subject: [PATCH 17/30] reformat --- classify-extract.py | 27 ++++-- extract-from-txt.py | 35 +++----- src/llm/llm_client.py | 32 +++---- src/llm/llm_text.py | 132 +++++++++++++--------------- src/preprocessing/section_filter.py | 29 ++---- src/preprocessing/text_cleaner.py | 51 ++++------- tests/test_section_filter.py | 96 ++++---------------- tests/test_text_cleaner.py | 17 +--- 8 files changed, 149 insertions(+), 270 deletions(-) diff --git a/classify-extract.py b/classify-extract.py index 84fd24a..be71e7c 100644 --- a/classify-extract.py +++ b/classify-extract.py @@ -41,6 +41,7 @@ # Pipeline # --------------------------------------------------------------------------- + def run_pipeline( input_path: Path, model_dir: str, @@ -190,15 +191,25 @@ def run_pipeline( # ── Write summary CSV ───────────────────────────────────────────────── from datetime import datetime + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") summaries_dir = output_dir / "summaries" summaries_dir.mkdir(parents=True, exist_ok=True) summary_path = summaries_dir / f"pipeline_summary_{timestamp}.csv" - + fieldnames = [ - "filename", "classification", "confidence", "pred_prob", - "extraction_status", "species_name", "study_location", "study_date", - "sample_size", "num_empty_stomachs", "num_nonempty_stomachs", "fraction_feeding", + "filename", + "classification", + "confidence", + "pred_prob", + "extraction_status", + "species_name", + "study_location", + "study_date", + "sample_size", + "num_empty_stomachs", + "num_nonempty_stomachs", + "fraction_feeding", ] with open(summary_path, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) @@ -228,12 +239,10 @@ def run_pipeline( # CLI entry point # --------------------------------------------------------------------------- + def main(): parser = 
argparse.ArgumentParser( - description=( - "Classify PDFs as useful/not-useful, then extract structured diet " - "metrics from useful ones using an LLM." - ), + description=("Classify PDFs as useful/not-useful, then extract structured diet " "metrics from useful ones using an LLM."), formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -312,4 +321,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/extract-from-txt.py b/extract-from-txt.py index 94e6f4a..8b0be91 100644 --- a/extract-from-txt.py +++ b/extract-from-txt.py @@ -63,6 +63,7 @@ # Core pipeline function # --------------------------------------------------------------------------- + def run_txt_pipeline( input_dir: Path, output_dir: Path, @@ -174,8 +175,7 @@ def run_txt_pipeline( if len(filtered) > max_chars: trimmed = extract_key_sections(filtered, max_chars) print( - f" [INFO] After trim : {len(trimmed):,} chars " - f"(budget {max_chars:,})", + f" [INFO] After trim : {len(trimmed):,} chars " f"(budget {max_chars:,})", file=sys.stderr, ) else: @@ -210,7 +210,7 @@ def run_txt_pipeline( result = save_extraction_result( metrics=metrics, source_file=txt_path, - original_text=raw_text, # keep full text for page resolution + original_text=raw_text, # keep full text for page resolution output_dir=output_dir, ) except Exception as exc: @@ -224,23 +224,13 @@ def run_txt_pipeline( row["species_name"] = m.get("species_name") or "" row["study_location"] = m.get("study_location") or "" row["study_date"] = m.get("study_date") or "" - row["sample_size"] = ( - "" if m.get("sample_size") is None else m["sample_size"] - ) - row["num_empty_stomachs"] = ( - "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"] - ) - row["num_nonempty_stomachs"] = ( - "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"] - ) - row["fraction_feeding"] = ( - "" if m.get("fraction_feeding") is None else m["fraction_feeding"] - ) + 
row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"] + row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"] + row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"] + row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"] print( - f" [OK] species={m.get('species_name')} " - f"n={m.get('sample_size')} " - f"date={m.get('study_date')}", + f" [OK] species={m.get('species_name')} " f"n={m.get('sample_size')} " f"date={m.get('study_date')}", file=sys.stderr, ) @@ -291,12 +281,10 @@ def run_txt_pipeline( # CLI # --------------------------------------------------------------------------- + def main() -> None: parser = argparse.ArgumentParser( - description=( - "Extract structured predator-diet metrics from pre-classified " - "useful .txt files using Ollama." - ), + description=("Extract structured predator-diet metrics from pre-classified " "useful .txt files using Ollama."), formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -326,8 +314,7 @@ def main() -> None: "--output-dir", type=str, default="data/results", - help="Root output directory for JSON results and CSV summary " - "(default: data/results).", + help="Root output directory for JSON results and CSV summary " "(default: data/results).", ) parser.add_argument( "--llm-model", diff --git a/src/llm/llm_client.py b/src/llm/llm_client.py index f039bae..bb68137 100644 --- a/src/llm/llm_client.py +++ b/src/llm/llm_client.py @@ -138,8 +138,12 @@ def extract_metrics_from_text( # ── Retry once if any extractable fields are null ─────────────────────── _retryable = [ - "species_name", "study_location", "study_date", - "num_empty_stomachs", "num_nonempty_stomachs", "sample_size", + "species_name", + "study_location", + "study_date", + "num_empty_stomachs", + "num_nonempty_stomachs", + "sample_size", ] missing = [f for f in _retryable if 
getattr(metrics, f) is None] @@ -151,14 +155,8 @@ def extract_metrics_from_text( # Build targeted hints for each missing field _hints = { - "species_name": ( - "- species_name: Look for the first binomial Latin name (Genus species) " - "in the title or abstract. This is the PREDATOR, not its prey.\n" - ), - "study_location": ( - "- study_location: Check Methods or Study Area sections for place names, " - "islands, countries, or coordinates.\n" - ), + "species_name": ("- species_name: Look for the first binomial Latin name (Genus species) " "in the title or abstract. This is the PREDATOR, not its prey.\n"), + "study_location": ("- study_location: Check Methods or Study Area sections for place names, " "islands, countries, or coordinates.\n"), "study_date": ( "- study_date: Look for phrases like 'collected in', 'sampled during', " "'field season', 'from [month] [year] to [month] [year]'. " @@ -171,15 +169,9 @@ def extract_metrics_from_text( "where every sample produced material), return 0.\n" ), "num_nonempty_stomachs": ( - "- num_nonempty_stomachs: Look for 'contained food', 'with prey', " - "'non-empty', 'food samples collected'. If ALL samples had food, " - "this equals sample_size.\n" - ), - "sample_size": ( - "- sample_size: Look for 'N stomachs', 'N specimens', 'a total of N', " - "'n=N', 'N individuals examined', 'two groups of N'. Check Abstract, " - "Methods, and Results.\n" + "- num_nonempty_stomachs: Look for 'contained food', 'with prey', " "'non-empty', 'food samples collected'. If ALL samples had food, " "this equals sample_size.\n" ), + "sample_size": ("- sample_size: Look for 'N stomachs', 'N specimens', 'a total of N', " "'n=N', 'N individuals examined', 'two groups of N'. 
Check Abstract, " "Methods, and Results.\n"), } retry_prompt = ( @@ -200,9 +192,7 @@ def extract_metrics_from_text( format=PredatorDietMetrics.model_json_schema(), options={"num_ctx": num_ctx}, ) - retry_metrics = PredatorDietMetrics.model_validate_json( - retry_response.message.content - ) + retry_metrics = PredatorDietMetrics.model_validate_json(retry_response.message.content) # Merge: prefer retry values for fields that were null, keep originals otherwise merged = metrics.model_dump() diff --git a/src/llm/llm_text.py b/src/llm/llm_text.py index a88e2e9..6573232 100644 --- a/src/llm/llm_text.py +++ b/src/llm/llm_text.py @@ -29,11 +29,7 @@ _SECTION_PRIORITIES: List[Tuple[re.Pattern, int]] = [ (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(abstract|summary)\s*[:\.]?\s*$"), 0), (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(results?|findings?)\s*[:\.]?\s*$"), 1), - (re.compile( - r"(?i)^\s*" + _NUM_PREFIX + - r"(materials?\s*(?:and|&)\s*methods?|methods?|methodology" - r"|study\s*(?:area|site|design|region|period))\s*[:\.]?\s*$" - ), 2), + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(materials?\s*(?:and|&)\s*methods?|methods?|methodology" r"|study\s*(?:area|site|design|region|period))\s*[:\.]?\s*$"), 2), (re.compile(r"(?i)^\s*table\s*\d"), 3), (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(introduction|background)\s*[:\.]?\s*$"), 4), (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(discussion|conclusions?|summary\s+and\s+discussion)\s*[:\.]?\s*$"), 5), @@ -41,7 +37,7 @@ _DROP_SECTION_RE: re.Pattern = re.compile( r"(?i)^\s*" - r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" # optional numeric prefix + r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" # optional numeric prefix r"(" r"acknowledge?ments?" r"|literature\s+cited" @@ -81,64 +77,68 @@ def _section_priority(heading: str) -> int: _FIELD_PATTERNS: List[Tuple[re.Pattern, int]] = [ # sample_size — explicit counts of stomachs / specimens / individuals - (re.compile( - r"(?i)(\bn\s*=\s*\d+" - r"|total\s+of\s+\d+" - r"|\d+\s+stomachs?" - r"|\d+\s+specimens?" 
- r"|\d+\s+individuals?" - r"|\d+\s+birds?" - r"|\d+\s+fish" - r"|\d+\s+samples?" - r"|sample\s+size\s+(of\s+)?\d+" - r"|examined\s+\d+" - r"|\d+\s+(were|was)\s+(examined|collected|analysed|analyzed|sampled))" - ), 4), + ( + re.compile( + r"(?i)(\bn\s*=\s*\d+" + r"|total\s+of\s+\d+" + r"|\d+\s+stomachs?" + r"|\d+\s+specimens?" + r"|\d+\s+individuals?" + r"|\d+\s+birds?" + r"|\d+\s+fish" + r"|\d+\s+samples?" + r"|sample\s+size\s+(of\s+)?\d+" + r"|examined\s+\d+" + r"|\d+\s+(were|was)\s+(examined|collected|analysed|analyzed|sampled))" + ), + 4, + ), # num_empty_stomachs — explicit empty-stomach language - (re.compile( - r"(?i)(empty\s+stomachs?" - r"|stomachs?\s+(were\s+)?empty" - r"|had\s+empty" - r"|without\s+food" - r"|without\s+(stomach\s+)?contents?" - r"|zero\s+prey" - r"|no\s+food\s+(items?|remains?)" - r"|vacuous|vacant\s+stomachs?)" - ), 5), + ( + re.compile( + r"(?i)(empty\s+stomachs?" + r"|stomachs?\s+(were\s+)?empty" + r"|had\s+empty" + r"|without\s+food" + r"|without\s+(stomach\s+)?contents?" + r"|zero\s+prey" + r"|no\s+food\s+(items?|remains?)" + r"|vacuous|vacant\s+stomachs?)" + ), + 5, + ), # num_nonempty_stomachs / fraction_feeding - (re.compile( - r"(?i)(non.?empty" - r"|contained\s+(food|prey|items?)" - r"|with\s+food" - r"|with\s+(stomach\s+)?contents?" - r"|had\s+(food|prey)" - r"|feeding\s+rate" - r"|proportion\s+(feeding|with\s+food)" - r"|percent\s+(feeding|with\s+food)" - r"|\d+\s*%\s+of\s+(stomachs?|individuals?|birds?|fish|specimens?))" - ), 5), + ( + re.compile( + r"(?i)(non.?empty" + r"|contained\s+(food|prey|items?)" + r"|with\s+food" + r"|with\s+(stomach\s+)?contents?" 
+ r"|had\s+(food|prey)" + r"|feeding\s+rate" + r"|proportion\s+(feeding|with\s+food)" + r"|percent\s+(feeding|with\s+food)" + r"|\d+\s*%\s+of\s+(stomachs?|individuals?|birds?|fish|specimens?))" + ), + 5, + ), # general percentage / fraction near gut/stomach context - (re.compile( - r"(?i)(\d+\.?\d*\s*%|\d+\s+percent" - r"|\d+\s+of\s+\d+\s+(were|had|contained)" - r"|proportion\s+of\s+\d+)" - ), 2), + (re.compile(r"(?i)(\d+\.?\d*\s*%|\d+\s+percent" r"|\d+\s+of\s+\d+\s+(were|had|contained)" r"|proportion\s+of\s+\d+)"), 2), # study date — collection period - (re.compile( - r"(?i)(collected\s+(in|during|between)" - r"|sampled\s+(in|during|between)" - r"|field\s+season" - r"|study\s+period" - r"|between\s+\d{4}\s+and\s+\d{4}" - r"|\d{4}[\-\u2013]\d{4}" - r"|sampling\s+period)" - ), 2), + ( + re.compile( + r"(?i)(collected\s+(in|during|between)" + r"|sampled\s+(in|during|between)" + r"|field\s+season" + r"|study\s+period" + r"|between\s+\d{4}\s+and\s+\d{4}" + r"|\d{4}[\-\u2013]\d{4}" + r"|sampling\s+period)" + ), + 2, + ), # study location - (re.compile( - r"(?i)(study\s+(area|site|region)" - r"|specimens?\s+(were\s+)?(obtained|collected|caught)\s+(from|at|in)" - r"|sampling\s+(location|site|area))" - ), 2), + (re.compile(r"(?i)(study\s+(area|site|region)" r"|specimens?\s+(were\s+)?(obtained|collected|caught)\s+(from|at|in)" r"|sampling\s+(location|site|area))"), 2), # any binomial species name (used as a weak relevance signal) (re.compile(r"\b[A-Z][a-z]+\s+[a-z]{3,}\b"), 1), ] @@ -273,10 +273,7 @@ def extract_key_sections(text: str, max_chars: int) -> str: for i, line in enumerate(lines): stripped = line.strip() is_drop = bool(_DROP_SECTION_RE.match(stripped)) if stripped else False - is_known = ( - any(pat.match(stripped) for pat, _ in _SECTION_PRIORITIES) - if stripped else False - ) + is_known = any(pat.match(stripped) for pat, _ in _SECTION_PRIORITIES) if stripped else False if is_drop or is_known: sections.append((current_start, current_heading, 
"\n".join(current_lines))) current_heading = stripped @@ -319,16 +316,12 @@ def extract_key_sections(text: str, max_chars: int) -> str: else: if para_lines: para_text = "\n".join(para_lines).strip() - raw_paragraphs.append( - (sec_start + j, para_text, _score_paragraph(para_text)) - ) + raw_paragraphs.append((sec_start + j, para_text, _score_paragraph(para_text))) para_lines = [] para_start = sec_start + j + 1 if para_lines: para_text = "\n".join(para_lines).strip() - raw_paragraphs.append( - (sec_start + len(block.split("\n")), para_text, _score_paragraph(para_text)) - ) + raw_paragraphs.append((sec_start + len(block.split("\n")), para_text, _score_paragraph(para_text))) # Sort by score descending; use original position as tiebreaker (earlier first) raw_paragraphs.sort(key=lambda t: (-t[2], t[0])) @@ -372,6 +365,7 @@ def load_document(file_path: Path) -> str: if suffix == '.pdf': print(f"[INFO] Reading PDF file...", file=sys.stderr) from src.preprocessing.pdf_text_extraction import extract_text_from_pdf + return extract_text_from_pdf(str(file_path)) elif suffix in ['.txt', '.text']: print(f"[INFO] Reading text file...", file=sys.stderr) diff --git a/src/preprocessing/section_filter.py b/src/preprocessing/section_filter.py index 09f2d94..5ba03d3 100644 --- a/src/preprocessing/section_filter.py +++ b/src/preprocessing/section_filter.py @@ -76,11 +76,7 @@ r"|\d+\s*%\s+of\s+(stomachs?|individuals?|birds?|fish|specimens?))" ), # percentage / fraction near gut/stomach context - re.compile( - r"(?i)(\d+\.?\d*\s*%|\d+\s+percent" - r"|\d+\s+of\s+\d+\s+(were|had|contained)" - r"|proportion\s+of\s+\d+)" - ), + re.compile(r"(?i)(\d+\.?\d*\s*%|\d+\s+percent" r"|\d+\s+of\s+\d+\s+(were|had|contained)" r"|proportion\s+of\s+\d+)"), # study date — collection period re.compile( r"(?i)(collected\s+(in|during|between)" @@ -92,11 +88,7 @@ r"|sampling\s+period)" ), # study location - re.compile( - r"(?i)(study\s+(area|site|region)" - 
r"|specimens?\s+(were\s+)?(obtained|collected|caught)\s+(from|at|in)" - r"|sampling\s+(location|site|area))" - ), + re.compile(r"(?i)(study\s+(area|site|region)" r"|specimens?\s+(were\s+)?(obtained|collected|caught)\s+(from|at|in)" r"|sampling\s+(location|site|area))"), # gut / stomach / diet core vocabulary re.compile( r"(?i)(stomach\s+content" @@ -109,17 +101,9 @@ r"|gastrointestinal|foregut|hindgut|crop\s+content)" ), # predator species name — binomial in a sentence with diet/food/stomach - re.compile( - r"(?i)\b[A-Z][a-z]+\s+[a-z]{3,}\b.{0,80}" - r"(stomach|diet|prey|food|feeding|gut|trophic|forag)" - ), + re.compile(r"(?i)\b[A-Z][a-z]+\s+[a-z]{3,}\b.{0,80}" r"(stomach|diet|prey|food|feeding|gut|trophic|forag)"), # geographic coordinates or explicit lat/lon - re.compile( - r"(\d{1,3}[°º]\s*\d{0,2}['′]?\s*[NS]" - r"|\d{1,3}[°º]\s*\d{0,2}['′]?\s*[EW]" - r"|latitude|longitude" - r"|\d+\.\d+\s*[°º]?\s*[NS],?\s*\d+\.\d+\s*[°º]?\s*[EW])" - ), + re.compile(r"(\d{1,3}[°º]\s*\d{0,2}['′]?\s*[NS]" r"|\d{1,3}[°º]\s*\d{0,2}['′]?\s*[EW]" r"|latitude|longitude" r"|\d+\.\d+\s*[°º]?\s*[NS],?\s*\d+\.\d+\s*[°º]?\s*[EW])"), # table-like numeric data (rows of numbers separated by whitespace/tabs) re.compile(r"(?m)^.*(\d+\s*[\t|]\s*){2,}\d+"), ] @@ -244,8 +228,7 @@ _NUM_PREFIX = r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" _SECTION_HEADING_RE: re.Pattern = re.compile( - r"(?i)^\s*" + _NUM_PREFIX + - r"(" + r"(?i)^\s*" + _NUM_PREFIX + r"(" r"abstract|summary" r"|introduction|background" r"|methods?|materials?\s*(?:and|&)\s*methods?" @@ -290,6 +273,7 @@ # Public API # --------------------------------------------------------------------------- + def filter_relevant_sections(text: str) -> str: """Remove paragraphs unlikely to contain target diet metrics. 
@@ -351,6 +335,7 @@ def filter_relevant_sections(text: str) -> str: # Internal helpers # --------------------------------------------------------------------------- + def _split_into_blocks(text: str) -> List[str]: """Split text into paragraph blocks separated by blank lines. diff --git a/src/preprocessing/text_cleaner.py b/src/preprocessing/text_cleaner.py index b694824..efe4662 100644 --- a/src/preprocessing/text_cleaner.py +++ b/src/preprocessing/text_cleaner.py @@ -60,7 +60,7 @@ # NOTE: requires actual whitespace between letters to avoid matching "Abstract". _STRUCTURED_HEADER_START: re.Pattern = re.compile( r"(?i)^\s*(" - r"a\s+b\s+s\s+t\s+r\s+a\s+c\s+t" # spaced "A B S T R A C T" + r"a\s+b\s+s\s+t\s+r\s+a\s+c\s+t" # spaced "A B S T R A C T" r"|a\s+r\s+t\s+i\s+c\s+l\s+e\s+i\s+n\s+f\s+o" # spaced "A R T I C L E I N F O" r")\s*$" ) @@ -73,56 +73,38 @@ _LINE_DROP_PATTERNS: List[re.Pattern] = [ # Standalone page numbers (digit-only line, 1–4 digits, optional spaces) re.compile(r"^\s*\d{1,4}\s*$"), - # Reference list entries: "[1] Smith ...", "1. Smith ...", "1) Smith ..." re.compile(r"^\s*\[\d+\]\s+[A-Z]"), re.compile(r"^\s*\d{1,3}[.)]\s{1,4}[A-Z][a-z]"), - # DOI and bare URLs (doi.org/, bare doi:, https://, www.) 
re.compile(r"(?i)(https?://|doi\.org/|\bdoi:\s*10\.|www\.)\S*"), - # Email addresses re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"), - # Copyright / licence lines - re.compile(r"(?i)(©|\(c\)\s*\d{4}|copyright\s+\d{4}|all\s+rights\s+reserved" - r"|published\s+by\s+elsevier|creative\s+commons|open\s+access" - r"|this\s+(article|paper|is)\s+(is\s+)?published)"), - + re.compile(r"(?i)(©|\(c\)\s*\d{4}|copyright\s+\d{4}|all\s+rights\s+reserved" r"|published\s+by\s+elsevier|creative\s+commons|open\s+access" r"|this\s+(article|paper|is)\s+(is\s+)?published)"), # Journal metadata: volume, issue, ISSN, page range - re.compile(r"(?i)^\s*(vol(ume)?\.?\s*\d|issue\s*\d|pp\.\s*\d|issn\s*[\d\-]" - r"|journal\s+of|proceedings\s+of)"), - + re.compile(r"(?i)^\s*(vol(ume)?\.?\s*\d|issue\s*\d|pp\.\s*\d|issn\s*[\d\-]" r"|journal\s+of|proceedings\s+of)"), # Received / accepted / revised / available-online timestamps # Match with or without trailing colon/semicolon - re.compile(r"(?i)^\s*(received|accepted|revised|available\s+online|" - r"published\s+online|handling\s+editor)" - r"(\s*[:;]|\s+\d|\s+in)"), - + re.compile(r"(?i)^\s*(received|accepted|revised|available\s+online|" r"published\s+online|handling\s+editor)" r"(\s*[:;]|\s+\d|\s+in)"), # Article history block header re.compile(r"(?i)^\s*article\s+history\s*[:\.]?\s*$"), - # Keywords header line AND single-word keyword-style lines that follow it re.compile(r"(?i)^\s*key\s*-?\s*words?\s*[:\-]"), - # Journal / publisher metadata lines - re.compile(r"(?i)^\s*(contents?\s+lists?\s+available|journal\s+homepage|" - r"elsevier\.com|sciencedirect\.com|springer\.|wiley\.com)"), - + re.compile(r"(?i)^\s*(contents?\s+lists?\s+available|journal\s+homepage|" r"elsevier\.com|sciencedirect\.com|springer\.|wiley\.com)"), # Figure / table / plate captions - re.compile(r"(?i)^\s*(fig(ure)?\.?\s*\d|table\s*\d|plate\s*\d|" - r"fig\.\s*s\d|supplemental?\s+(table|figure)\s*\d)" - r"[\s.\-–—:]"), - + 
re.compile(r"(?i)^\s*(fig(ure)?\.?\s*\d|table\s*\d|plate\s*\d|" r"fig\.\s*s\d|supplemental?\s+(table|figure)\s*\d)" r"[\s.\-–—:]"), # Author affiliation lines (institution / department / lab names) # Allow for leading special characters (e.g. ⁎, *, †) - re.compile(r"(?i)^[\s\*⁎†‡#]*" - r"(department\s+of|faculty\s+of|institute\s+(of|for)|" - r"division\s+of|school\s+of|laboratory\s+of|lab\s+of|" - r"centre?\s+(for|of)|program(me)?\s+(in|of)|" - r"universidad|universit[éy]|université|universidade|" - r"university\s+of|college\s+of)"), - + re.compile( + r"(?i)^[\s\*⁎†‡#]*" + r"(department\s+of|faculty\s+of|institute\s+(of|for)|" + r"division\s+of|school\s+of|laboratory\s+of|lab\s+of|" + r"centre?\s+(for|of)|program(me)?\s+(in|of)|" + r"universidad|universit[éy]|université|universidade|" + r"university\s+of|college\s+of)" + ), # Running page headers / footers: short all-caps lines that are NOT # known section headings (those are whitelisted in _drop_line below). re.compile(r"^[A-Z\s\d\.\-–—:,]{5,60}$"), @@ -314,12 +296,11 @@ def clean_text(text: str) -> str: # Standalone usage # --------------------------------------------------------------------------- + def main() -> None: # pragma: no cover import argparse - parser = argparse.ArgumentParser( - description="Strip noise from a raw .txt file extracted from a scientific PDF." 
- ) + parser = argparse.ArgumentParser(description="Strip noise from a raw .txt file extracted from a scientific PDF.") parser.add_argument("input", type=str, help="Path to the .txt file to clean.") parser.add_argument( "--max-chars", diff --git a/tests/test_section_filter.py b/tests/test_section_filter.py index ba3b844..423bbae 100644 --- a/tests/test_section_filter.py +++ b/tests/test_section_filter.py @@ -21,6 +21,7 @@ # Helpers # --------------------------------------------------------------------------- + def _make_page(n: int, body: str) -> str: return f"[PAGE {n}]\n{body}" @@ -40,11 +41,7 @@ def test_preserves_page_markers(self): assert "[PAGE 2]" in result def test_preserves_section_headings(self): - text = ( - "Abstract\n\nWe studied diets.\n\n" - "Methods\n\nWe sampled fish.\n\n" - "Results\n\nN=42 stomachs examined." - ) + text = "Abstract\n\nWe studied diets.\n\n" "Methods\n\nWe sampled fish.\n\n" "Results\n\nN=42 stomachs examined." result = filter_relevant_sections(text) assert "Abstract" in result assert "Methods" in result @@ -145,78 +142,49 @@ class TestNegativeSignalDropped: """Paragraphs with only negative signals should be removed.""" def test_drops_phylogenetic_paragraph(self): - para = ( - "Bayesian inference of the phylogenetic relationships among " - "the sister group taxa revealed strong bootstrap support for " - "the monophyletic clade." - ) + para = "Bayesian inference of the phylogenetic relationships among " "the sister group taxa revealed strong bootstrap support for " "the monophyletic clade." text = f"Discussion\n\n{para}" result = filter_relevant_sections(text) assert "Bayesian inference" not in result def test_drops_habitat_description(self): - para = ( - "Vegetation type was classified as tropical dry forest with " - "canopy cover dense and understory dominated by shrubs." - ) + para = "Vegetation type was classified as tropical dry forest with " "canopy cover dense and understory dominated by shrubs." 
text = f"Methods\n\n{para}" result = filter_relevant_sections(text) assert "canopy cover" not in result def test_drops_literature_review(self): - para = ( - "Several studies have reported similar findings. " - "Previously reported by Smith (2001) and consistent with " - "the findings of Jones et al. (2005)." - ) + para = "Several studies have reported similar findings. " "Previously reported by Smith (2001) and consistent with " "the findings of Jones et al. (2005)." text = f"Discussion\n\n{para}" result = filter_relevant_sections(text) assert "Several studies have reported" not in result def test_drops_prey_id_methodology(self): - para = ( - "Prey were identified to the lowest taxonomic level using " - "a stereomicroscope and reference collection of diagnostic bones." - ) + para = "Prey were identified to the lowest taxonomic level using " "a stereomicroscope and reference collection of diagnostic bones." text = f"Methods\n\n{para}" result = filter_relevant_sections(text) assert "stereomicroscope" not in result def test_drops_morphometric_paragraph(self): - para = ( - "Snout-vent length was measured to the nearest 0.1 mm. " - "Total length was measured for each individual using calipers." - ) + para = "Snout-vent length was measured to the nearest 0.1 mm. " "Total length was measured for each individual using calipers." text = f"Methods\n\n{para}" result = filter_relevant_sections(text) assert "Snout-vent length" not in result def test_drops_statistical_methods(self): - para = ( - "Differences were tested using Kruskal-Wallis tests with " - "Bonferroni correction for multiple comparisons. A generalized " - "linear model was used to assess trends." - ) + para = "Differences were tested using Kruskal-Wallis tests with " "Bonferroni correction for multiple comparisons. A generalized " "linear model was used to assess trends." 
text = f"Methods\n\n{para}" result = filter_relevant_sections(text) assert "Kruskal-Wallis" not in result def test_drops_conservation_policy(self): - para = ( - "Conservation implications suggest the species should be " - "upgraded to a higher IUCN Red List category given the " - "observed population decline." - ) + para = "Conservation implications suggest the species should be " "upgraded to a higher IUCN Red List category given the " "observed population decline." text = f"Discussion\n\n{para}" result = filter_relevant_sections(text) assert "IUCN Red List" not in result def test_drops_genetic_methods(self): - para = ( - "DNA was extracted using a commercial kit. PCR amplification " - "was performed using primers targeting the mitochondrial gene " - "region. Products were separated by gel electrophoresis." - ) + para = "DNA was extracted using a commercial kit. PCR amplification " "was performed using primers targeting the mitochondrial gene " "region. Products were separated by gel electrophoresis." text = f"Methods\n\n{para}" result = filter_relevant_sections(text) assert "PCR amplification" not in result @@ -238,11 +206,7 @@ def test_keeps_neutral_paragraph(self): def test_keeps_paragraph_with_mixed_signals(self): """Positive signal overrides co-occurring negative signal.""" - para = ( - "A total of 50 stomachs were examined. Prey were identified " - "using a stereomicroscope and diagnostic bones from a " - "reference collection." - ) + para = "A total of 50 stomachs were examined. Prey were identified " "using a stereomicroscope and diagnostic bones from a " "reference collection." 
text = f"Methods\n\n{para}" result = filter_relevant_sections(text) assert "50 stomachs" in result @@ -315,13 +279,7 @@ def test_realistic_document(self): def test_does_not_over_filter_short_doc(self): """A short document with mostly positive content should lose very little.""" - text = ( - "Abstract\n\n" - "We examined stomach contents of 50 Accipiter nisus.\n\n" - "Results\n\n" - "Sample size was n=50. Of these, 12 stomachs were empty.\n" - "Birds comprised 78% of prey items.\n" - ) + text = "Abstract\n\n" "We examined stomach contents of 50 Accipiter nisus.\n\n" "Results\n\n" "Sample size was n=50. Of these, 12 stomachs were empty.\n" "Birds comprised 78% of prey items.\n" result = filter_relevant_sections(text) # Everything here is positive — nothing should be lost assert "50 Accipiter nisus" in result @@ -346,39 +304,26 @@ def test_has_positive_signal_true(self): def test_has_positive_signal_false(self): assert not _has_positive_signal("The weather was mild that year.") - assert not _has_positive_signal( - "Bayesian inference supported the monophyletic clade." - ) + assert not _has_positive_signal("Bayesian inference supported the monophyletic clade.") def test_has_negative_signal_true(self): - assert _has_negative_signal( - "Phylogenetic analysis using maximum likelihood trees." - ) - assert _has_negative_signal( - "Habitat type was classified as open grassland." - ) - assert _has_negative_signal( - "Mann-Whitney tests were used for comparisons." 
- ) + assert _has_negative_signal("Phylogenetic analysis using maximum likelihood trees.") + assert _has_negative_signal("Habitat type was classified as open grassland.") + assert _has_negative_signal("Mann-Whitney tests were used for comparisons.") def test_has_negative_signal_false(self): assert not _has_negative_signal("We counted 30 prey items in the gut.") assert not _has_negative_signal("The study was conducted in Brazil.") def test_should_keep_positive_overrides_negative(self): - text = ( - "A total of 100 stomachs were examined using a stereomicroscope " - "and reference collection." - ) + text = "A total of 100 stomachs were examined using a stereomicroscope " "and reference collection." assert _should_keep(text) is True def test_should_keep_neutral_kept(self): assert _should_keep("The area has a subtropical climate.") is True def test_should_keep_negative_only_dropped(self): - assert _should_keep( - "Bayesian inference of phylogenetic relationships." - ) is False + assert _should_keep("Bayesian inference of phylogenetic relationships.") is False def test_split_into_blocks_basic(self): text = "Para one.\n\nPara two.\n\nPara three." @@ -436,10 +381,7 @@ def test_positive_para_kept_in_long_doc(self): def test_negative_para_dropped_in_long_doc(self): """Negative-signal paragraphs still dropped in long docs.""" - negative = ( - "Bayesian inference of the phylogenetic relationships " - "revealed strong bootstrap support." - ) + negative = "Bayesian inference of the phylogenetic relationships " "revealed strong bootstrap support." 
text = self._make_long_doc([negative]) result = filter_relevant_sections(text) assert "Bayesian inference" not in result diff --git a/tests/test_text_cleaner.py b/tests/test_text_cleaner.py index 0e621c8..da6798d 100644 --- a/tests/test_text_cleaner.py +++ b/tests/test_text_cleaner.py @@ -14,6 +14,7 @@ # Helpers # --------------------------------------------------------------------------- + def _make_page(n: int, body: str) -> str: return f"[PAGE {n}]\n{body}" @@ -25,19 +26,13 @@ def _make_page(n: int, body: str) -> str: class TestReferenceSectionRemoval: def test_drops_references_header_and_trailing_content(self): - text = ( - "Methods\nWe examined 50 stomachs from Canis lupus.\n\n" - "References\nSmith J. 2001. J Ecol 10:1-5.\nDoe A. 2002. Nature 400:1.\n" - ) + text = "Methods\nWe examined 50 stomachs from Canis lupus.\n\n" "References\nSmith J. 2001. J Ecol 10:1-5.\nDoe A. 2002. Nature 400:1.\n" result = clean_text(text) assert "Smith J." not in result assert "Doe A." not in result def test_keeps_content_before_references(self): - text = ( - "Results\nSample size was 30.\n\n" - "References\n1. Author A. 2000. Title. Journal.\n" - ) + text = "Results\nSample size was 30.\n\n" "References\n1. Author A. 2000. Title. Journal.\n" result = clean_text(text) assert "Sample size was 30" in result @@ -52,11 +47,7 @@ def test_drops_bibliography(self): assert "Bar C." not in result def test_references_on_separate_page_doesnt_poison_next_page(self): - text = ( - "[PAGE 3]\nResults\nSample size = 30.\n" - "[PAGE 4]\nReferences\n1. Smith 2001.\n" - "[PAGE 5]\nDiscussion\nThis study found important results.\n" - ) + text = "[PAGE 3]\nResults\nSample size = 30.\n" "[PAGE 4]\nReferences\n1. 
Smith 2001.\n" "[PAGE 5]\nDiscussion\nThis study found important results.\n" result = clean_text(text) assert "Sample size = 30" in result assert "Smith 2001" not in result From 0b3a3d88e5fc5edb0980e417fd19da56091545b3 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Thu, 12 Mar 2026 12:54:00 -0700 Subject: [PATCH 18/30] improved truncation --- .../llm_text/Adams_1989_20260312_115204.txt | 160 +++++++ .../Adams_1989_20260312_115204.txt | 433 ++++++++++++++++++ .../Adams_1989_20260312_115204.txt | 433 ++++++++++++++++++ src/llm/llm_text.py | 58 ++- src/preprocessing/text_cleaner.py | 11 +- 5 files changed, 1088 insertions(+), 7 deletions(-) create mode 100644 data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt create mode 100644 data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt create mode 100644 data/cleaned-text/text_cleaner/Adams_1989_20260312_115204.txt diff --git a/data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt b/data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt new file mode 100644 index 0000000..124e9ce --- /dev/null +++ b/data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt @@ -0,0 +1,160 @@ +[PAGE 1] +Temporal Variation in the Diet of the Gentoo Penguin Pygoscelis papua at Sub-Antarctic +Marion Island + +Author(s): N. J. Adams and N. T. Klages +Source: Colonial Waterbirds , 1989, Vol. 12, No. 1 (1989), pp. 30-36 +Published by: Waterbird Society + [PAGE 2] + Temporal variation in the diet of the Gentoo Penguin + Pygoscelis papua at sub-Antarctic Marion Island + N. J. ADAMS & N. T. KLAGES' + Percy FitzPatrick Institute of African Ornithology, University of Cape Town, + Rondebosch 7700, South Africa + 'Port Elizabeth Museum, P. 0. Box 13147, Humewood 6013, South Africa + Abstract.-The diet of the Gentoo Penguin Pygoscelis papua at sub-Antarctic Marion Island was sampled by stomach pumping at monthly intervals from March 1984 to March 1985. 
Overall, fish accounted for 53% + of the diet by mass, crustaceans 44% and cephalopods 2%. Crustaceans predominated between March and + June 1984; a marked increase in the proportion of fish in July coincided with the start of egg laying by Gentoo + Penguins. Fish accounted for almost all of the diet in January and March 1985. Juvenile nototheniid fish, in particular Notothenia squamifrons, formed the bulk of the fish component; myctophids and Channichthys + rhinoceratus were less common. The pelagic euphausiid Euphausia vallentini accounted for about 50% by mass of the overall crustacean component lumped over the entire study period. The decapod shrimp Nauticaris + marionis was the most important crustacean species consumed during June to September 1984. Cephalopods, predominantly octopods, were taken mainly in February and March 1985. The hypothesis that seasonal + changes in diet occur in direct response to the arrival of crested penguins (Eudyptes spp) early in summer is not supported by the data. Changes in diet appear to reflect local changes in the availability of prey species + within the inshore waters exploited by Gentoo Penguins. Received 10 March 1988, accepted 25 November 1988. + Key words.-diet, Gentoo Penguin, Pygoscelis papua, + +METHODS + A total of 144 stomach samples was collected fr + Gentoo Penguins at three colonies distributed alo a nine kilometre stretch of the east coast of Marion + Island (46? 53'S, 37?54'E) from March 1984 to March + 1985 inclusive. Using the technique of Wilson (1984), we stomach pumped an average of 11 birds a month + as they returned to their landing beaches in the evening. We did not attempt to distinguish the breeding + status of the birds. Gentoo Penguins at Marion Island tend chicks from late July to December. Con- + sequently, during this period, we probably sampled both breeding and non-breeding birds. + Immediately after collection, samples were drained through a 0.5 mm sieve. 
Subsequently, + drained samples were weighed to the nearest gram and then stored at 4?C until sorting, generally com- + pleted within 24 h of collection. Soft identifiable material was sorted into fish, cephalopod and crustacean + components (further separated into shrimps, euphausiids and amphipods) and weighed. Since fish + muscle is digested more rapidly than squid, which is in turn digested more rapidly than crustacea muscle + (Jackson et al. 1987), we may have overestimated, in particular, the mass of crustaceans in mixed samples. + However, most stomach samples from Gentoo Penguins contained one prey type only (see below). + Most fish were recovered with heads separated from the remainder of the body. Consequently, + analyses were based on identification and measure- +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +RESULTS + Mean mass of food samples was 139.2 + + 130.5 g (range: 8- 650 g, n= 144), similar to the 147 g recorded by La Cock et al. + (1984). Mean monthly sample mass was highest in December at 332 g. + General Composition of the Diet + Thirty species or species groups were identified (Table 1), with fish comprising + the largest single group (11 species). However, samples from individual penguins + were largely homogeneous consisting of a single species of crustacean or fish (see also + Jablonski 1985, Croxall et al. 1988). Fiftyeight percent of stomach samples con- + tained only a single prey taxon, 26% contained two taxa and 10% contained three + taxa (analysis based on taxa comprising at least 5% by mass of individual samples). + Fish and crustaceans accounted for + 53.5 and 44.4% respectively of the total annual diet by mass (Table 2). However, con- + siderable changes in proportions of fish, crustaceans and cephalopods occurred + over the year (Table 3, Fig. 1). Crustaceans accounted for over 75% of the diet by mass + from March to June 1984 but decreased to + 0% by March 1985. 
Cephalopods accounted for more than 10% of the diet + only in February and March 1985. + Fish + The fish component of the diet was dominated by the family Nototheniidae, + particularly Notothenia squamifrons. It is likely that most of the unidentified + juvenile nototheniids were also this species + (Table 1). Nototheniids accounted for over + 90% of the fish component during June to + October 1984. Myctophids and the channichthyid Channichthys rhinoceratus ap- + peared more commonly in the diet from + October and accounted for most of the food items in March 1985 (Table 3). + Fish of 78.8 - 84.1 mm (5.9 - 7.3 were the most frequently taken size clas + When plotted on a monthly basis, the was no evidence of any progressive chan + in size-class distribution throughout t year. However, during January and Fe + ruary smaller fish between 28.6 - 41.8 m + (0.2 - 0.7 g) accounted for the most fr quently taken size class, whereas larger + dividuals (76.2 - 84.1 mm, 5.3 - 7.3 g) we more numerous during the remaini + months. Standard parameters for all fish species are given in Table 4. + Crustaceans + Euphausia vallentini and Nauticaris marionis made up nearly all of the crusta + cean component. During winter (March to + August 1984), N. marionis formed the largest part of the crustacean component, but E. + vallentini subsequently increased to nearly + 100% in January and February 1985 + (Table 3). Mean length of E. vallentini was + 22.5 mm ? 1.74 (n = 681), considerably smaller than that of N. marionis (35.1 mm + + 5.18, n = 1481). + Cephalopods + Cephalopods accounted for just over + 10% of the diet during February and + March 1985 but seldom occurred in the remaining months (Table 3). Most + cephalopods were small octopods (DML < + 15 mm). The only squid identified were juveniles of the species Kondakovia lon- + gimana of estimated dorsal mantle length and mass of 58.4 ? 11.4 mm and 9.6 ? + 3.8 g (n = 4) respectively. 
+ +[PAGE 5] + ADAMS & KLAGES * GENTOO PENGUIN DIET by month. + Species Mar Apr May Jun Jul Aug Sep Oct Nov Dec Jan Feb Mar + Crustaceans + Euphausia va lentini 16.5 42.8 47.1 4.5 4.2 8.9 17.6 23.6 29.7 11.6 1.0 36.2 0 + Nauticaris marionis 65.8 42.8 51.0 70.8 23.9 26.7 38.1 4.7 4.2 7.0 0 0 0 + Themisto gaudichaudii 0 0 0 0 0 0 0 0 0 0 0 1.8 0 + Fish + Nototheniidae 0 0 0 23.5 66.4 62.5 44.3 68.4 45.1 79.1 71.4 35.2 13.5 + Myctophidae 0 14.4 0 0 0.5 1.9 0 1.6 20.8 1.9 4.8 10.0 48.2 + Channichythidae 0 0 0 0 0 0 0 0 0 0 21.9 6.1 23.7 + Muraenolpidae 0 0 0 0 4.3 0 0 0 0 0 0 0 0 + Unidentified 16.8 0 1.4 0 0 0 0 0 0 0 0 0 0 + Cephalopods 0 0 0.5 1.2 0.7 0 0 1.7 0.2 0.4 0.9 10.7 14.6 diet of Gentoo Penguins at Marion Island + was Notothenia squamifrons (incorrectly identified as Harpagifer georgianus in La + Cock et al. 1984, N. T. Klages pers. obs.) + whereas N. rossi and N. larseni were consumed at southerly sites (Croxall and Pr- + ince 1980a, Jablonski 1985). N. squamifrons is widely distributed occurring around the + islands of the southern Indian and, less commonly, the South Atlantic Oceans + MAMJJii A S N D J F M guin at Marion Island throughout a single year. Un- + shaded segment: crustaceans, stippled segment: + fish, shaded segment: cephalopods. + (Duhamel et al. 1983). N. squamifrons taken by Gentoo Penguins at Marion Island, all + in the range 28 - 134 mm, were larval or juvenile fish of 0 - 4 years (Duhamel and + Ozouf-Costaz 1985). The occurrence of large numbers of unidentified juvenile + nototheniids in the diet in November and + December preceded an increase in the relative abundance of small-size class N. + squamifrons in the diet in January and Feb ruary 1985. This may reflect growth o + these larval fish. + Juvenile Channichthys rhinoceratus, were the largest and second most abundant fish + in the diet. Adults were presumably too large for consumption. 
The species had + previously been considered a demersal species endemic to the colder waters + around Kerguelen (48?27'-50?S, 60?27'- + 70?35'E) and Heard Islands (53?01'S, + 73?23'E) (Kock et al. 1985). The presence of this species in the relatively warm waters + around Marion Island was surprising. + However, hydrographical evidence (Benon and Murail 1979) and sampling of + zooplankton (Grindley and Lane 1979, + Boden and Parker 1986) suggests that advection of foreign water masses past the + island with associated fauna may occur + (Boden & Parker 1986). + The appearance of myctophids in the diet of Gentoo Penguins coincided with + the increase in relative abundance of these fish in the diet of King Penguins Apteno- + dytes patagonicus during summer, suggesting increased availability (Adams and + Klages 1987). Small numbers of myctophids have previously been reported in + the diet of Gentoo Penguins of unknown status at Marion Island (La Cock et al. + 1984). + U9 + - + CO a + C, +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 6] + recovered from Gentoo Penguin stomach samples at Marion Island. + Species OD (mm) Estimated SL (mm) Estimated mass (g) + N Mean SD Range Mean SD Range Mean SD Range + Nototheniidae + Notothenia squamifrons 1109 2.43 0.87 (0.8-5.8) 71.1 ? 18.7 (23.3-155.5) 5.6?4.1 (0.1-56.3) + Notothenia acuta 47 3.19?0.64 (1.38-4.7) 85.0+20.3 (38.6-126.4) 9.2?6.6 (0.6-28.3) + Dissostichus eleginoides 17 3.52?0.73 (2.19-5.27) 105.7?42.6 (42.4-215.1) 10.6+ 16.9 (0.3-65.0) + Channichthyidae + Channichthys rhinoceratus 170 1.48?0.38 (0.97-2.45) 170.3+36. 
\ No newline at end of file diff --git a/data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt b/data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt new file mode 100644 index 0000000..2282198 --- /dev/null +++ b/data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt @@ -0,0 +1,433 @@ +[PAGE 1] +Temporal Variation in the Diet of the Gentoo Penguin Pygoscelis papua at Sub-Antarctic +Marion Island + +Author(s): N. J. Adams and N. T. Klages +Source: Colonial Waterbirds , 1989, Vol. 12, No. 1 (1989), pp. 30-36 +Published by: Waterbird Society + [PAGE 2] + Temporal variation in the diet of the Gentoo Penguin + Pygoscelis papua at sub-Antarctic Marion Island + N. J. ADAMS & N. T. KLAGES' + Percy FitzPatrick Institute of African Ornithology, University of Cape Town, + Rondebosch 7700, South Africa + 'Port Elizabeth Museum, P. 0. Box 13147, Humewood 6013, South Africa + Abstract.-The diet of the Gentoo Penguin Pygoscelis papua at sub-Antarctic Marion Island was sampled by stomach pumping at monthly intervals from March 1984 to March 1985. Overall, fish accounted for 53% + of the diet by mass, crustaceans 44% and cephalopods 2%. Crustaceans predominated between March and + June 1984; a marked increase in the proportion of fish in July coincided with the start of egg laying by Gentoo + Penguins. Fish accounted for almost all of the diet in January and March 1985. Juvenile nototheniid fish, in particular Notothenia squamifrons, formed the bulk of the fish component; myctophids and Channichthys + rhinoceratus were less common. The pelagic euphausiid Euphausia vallentini accounted for about 50% by mass of the overall crustacean component lumped over the entire study period. The decapod shrimp Nauticaris + marionis was the most important crustacean species consumed during June to September 1984. Cephalopods, predominantly octopods, were taken mainly in February and March 1985. 
The hypothesis that seasonal + changes in diet occur in direct response to the arrival of crested penguins (Eudyptes spp) early in summer is not supported by the data. Changes in diet appear to reflect local changes in the availability of prey species + within the inshore waters exploited by Gentoo Penguins. Received 10 March 1988, accepted 25 November 1988. + Key words.-diet, Gentoo Penguin, Pygoscelis papua, sub-Antarctic. + Colonial Waterbirds 12(1): 30-36, 1989 + Studies of the diets of both migrant and resident penguins in the Southern + Ocean region have concentrated on birds during the chick-rearing period, with little + information for the non-breeding season + (see Croxall and Lishman 1987). The Gentoo Penguin Pygoscelis papua is a year- + round resident at sub-Antarctic Marion Island (van Zinderen Bakker 1971), and at + some other breeding sites in the Southern + Ocean (Watson 1975), affording opportunities for sampling its diet throughout + the year. + Williams (1981) suggested that seasonal changes in the diet and earlier, winter + breeding by Gentoo Penguins at Marion + Island, compared to populations at higher latitudes, ensured a crustacean diet during + incubation and early chick rearing. He suggested that a switch by Gentoo Pen- + guins to a fish diet later in the season, deduced from a change in color of the + excreta, was a response to the arrival of large numbers of two species of potentially + competing, summer-resident crested penguins, the Macaroni Eudyptes chrysolophus + and the Rockhopper E. chrysocome. However, food samples collected at Marion Is- + land in September 1982 showed that fish are common in the diet of Gentoo Pen- + guins before the arrival of the other penguin species (La Cock et al. 1984). We re- + port here on a more comprehensive set o samples collected throughout a single ye + at Marion Island. Our principal objecti was to investigate temporal changes i + diet. 
+ METHODS + A total of 144 stomach samples was collected fr + Gentoo Penguins at three colonies distributed alo a nine kilometre stretch of the east coast of Marion + Island (46? 53'S, 37?54'E) from March 1984 to March + 1985 inclusive. Using the technique of Wilson (1984), we stomach pumped an average of 11 birds a month + as they returned to their landing beaches in the evening. We did not attempt to distinguish the breeding + status of the birds. Gentoo Penguins at Marion Island tend chicks from late July to December. Con- + sequently, during this period, we probably sampled both breeding and non-breeding birds. + Immediately after collection, samples were drained through a 0.5 mm sieve. Subsequently, + drained samples were weighed to the nearest gram and then stored at 4?C until sorting, generally com- + pleted within 24 h of collection. Soft identifiable material was sorted into fish, cephalopod and crustacean + components (further separated into shrimps, euphausiids and amphipods) and weighed. Since fish + muscle is digested more rapidly than squid, which is in turn digested more rapidly than crustacea muscle + (Jackson et al. 1987), we may have overestimated, in particular, the mass of crustaceans in mixed samples. + However, most stomach samples from Gentoo Penguins contained one prey type only (see below). + Most fish were recovered with heads separated from the remainder of the body. Consequently, + analyses were based on identification and measure- +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 3] + ADAMS & KLAGES * GENTOO PENGUIN DIET ment of otoliths (Adams and Klages, 1987). We esti- + mated standard length (SL) and mass of fish from regressions relating otolith diameter to these + parameters. Regressions not available in Adams and + Klages (1987) and Brown and Klages (1987) are given in Duhamel (1981), Hecht and Cooper (1986), and + Hecht (1987). 
Since otoliths were removed from intact fish crania, digestion and hence measurement + error were minimal. + Crustacean species were identified with the aid of published keys (Bowman and Gruner, 1973; + Kirkwood, 1982; 1984). Total length of intact individuals was measured from the anterior margin of an + eye to the tip of the telson. Numbers of crustaceans in each sample were estimated by dividing total mass + by the average mass of an individual crustacean of each respective species. Actual numbers and fresh + mass of prey ingested will be underestimated in welldigested samples. + Most of the small cephalopods were recovered intact, and numbers were counted directly. The iden- + tification of squid beaks was facilitated by a reference collection at the Port Elizabeth Museum, and con- + firmed by comparison with the literature (Clarke, + 1986). We estimated dorsal mantle length (DML) and mass of squid from regressions relating lower ros- + trum length (LRL) to these parameters (Adams and + Klages, 1987). Juvenile octopods were not identified. + samples at Marion Island. + Prey species Total % numbers % frequency number of prey class of occurrence + FISH + Notothiniidae + Notothenia squamifrons + Unident.juv. nototheniids + Notothenia acuta + Dissostichus eleginoides + Channichthyidae + Channichthys rhinoceratus + Myctophidae + Protomyctophum normani + Gymnoscopelus nicholsi + Protomyctophum tenisoni + Krefftichthys anderssoni + Protomyctophum bolini unid. myctophids + Electrona carlsbergi + Gymnoscopelus sp. + Muraenolepidae + Muraenolepts sp. + Unident. fish + Euphausiacea + Euphausia vallentini + Hippolytidae + Nauticaris marionis + Amphipoda + Themisto gaudichaudii + Hyperiella sp. + Unident. amphipods + Vibilia sp. + Primno sp. + Nematocarcinidae + Nematocarcinus longirostris + Octopoda + Unident.juv. octopods + Decapoda + Unident.juv. 
squid + Kondakovia longimana + Onychoteuthidae LRL >2mm + Onychoteuthidae LRL < 2mm + Margarella expansa + Ophiuroidea + 58.6 + 21.1 + 1.3 + 0.5 + 50.7 + 14.6 + 11.1 + 6.3 + 6.3 + 5.8 + 1.5 + 1.4 + 1.1 + 1.0 + 0.7 + 0.2 + <0.1 + 12.5 + 4.2 + 4.2 + 4.9 + 3.5 + 4.2 + 4.9 + 1.4 + 0.7 + 0.2 + 0.1 + 0.7 + 1.4 + 86.4 + 25.7 + 33.3 + 13.5 + 0.1 + <0.1 + <0.1 + <0.1 + <0.1 + <0.1 + 4.9 + 2.1 + 2.1 + 1.4 + 1.4 + 1.4 + 94.9 + 2.9 + 1.1 + 0.6 + 0.6 + 18.8 + 9.0 + 3.5 + 3.5 + 3.5 + 2.1 +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 4] + RESULTS + Mean mass of food samples was 139.2 + + 130.5 g (range: 8- 650 g, n= 144), similar to the 147 g recorded by La Cock et al. + (1984). Mean monthly sample mass was highest in December at 332 g. + General Composition of the Diet + Thirty species or species groups were identified (Table 1), with fish comprising + the largest single group (11 species). However, samples from individual penguins + were largely homogeneous consisting of a single species of crustacean or fish (see also + Jablonski 1985, Croxall et al. 1988). Fiftyeight percent of stomach samples con- + tained only a single prey taxon, 26% contained two taxa and 10% contained three + taxa (analysis based on taxa comprising at least 5% by mass of individual samples). + Fish and crustaceans accounted for + 53.5 and 44.4% respectively of the total annual diet by mass (Table 2). However, con- + siderable changes in proportions of fish, crustaceans and cephalopods occurred + over the year (Table 3, Fig. 1). Crustaceans accounted for over 75% of the diet by mass + from March to June 1984 but decreased to + 0% by March 1985. Cephalopods accounted for more than 10% of the diet + only in February and March 1985. + Fish + The fish component of the diet was dominated by the family Nototheniidae, + particularly Notothenia squamifrons. It is likely that most of the unidentified + juvenile nototheniids were also this species + (Table 1). 
Nototheniids accounted for over + 90% of the fish component during June to + October 1984. Myctophids and the channichthyid Channichthys rhinoceratus ap- + peared more commonly in the diet from + October and accounted for most of the food items in March 1985 (Table 3). + Fish of 78.8 - 84.1 mm (5.9 - 7.3 were the most frequently taken size clas + When plotted on a monthly basis, the was no evidence of any progressive chan + in size-class distribution throughout t year. However, during January and Fe + ruary smaller fish between 28.6 - 41.8 m + (0.2 - 0.7 g) accounted for the most fr quently taken size class, whereas larger + dividuals (76.2 - 84.1 mm, 5.3 - 7.3 g) we more numerous during the remaini + months. Standard parameters for all fish species are given in Table 4. + Crustaceans + Euphausia vallentini and Nauticaris marionis made up nearly all of the crusta + cean component. During winter (March to + August 1984), N. marionis formed the largest part of the crustacean component, but E. + vallentini subsequently increased to nearly + 100% in January and February 1985 + (Table 3). Mean length of E. vallentini was + 22.5 mm ? 1.74 (n = 681), considerably smaller than that of N. marionis (35.1 mm + + 5.18, n = 1481). + Cephalopods + Cephalopods accounted for just over + 10% of the diet during February and + March 1985 but seldom occurred in the remaining months (Table 3). Most + cephalopods were small octopods (DML < + 15 mm). The only squid identified were juveniles of the species Kondakovia lon- + gimana of estimated dorsal mantle length and mass of 58.4 ? 11.4 mm and 9.6 ? + 3.8 g (n = 4) respectively. + DISCUSSION + Fish (Ealey 1954, Volkman et al. 198 and, in particular, nototheniids may fo + a substantial portion of the diet of the G too Penguin throughout its range. 
T + most important prey species by mass in + Fish Crustaceans Cephalopods + Wet mass (%) Mean 53.5 44.4 2.1 + Range (0-100) (0-100) (0-51.8) +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 5] + ADAMS & KLAGES * GENTOO PENGUIN DIET by month. + Species Mar Apr May Jun Jul Aug Sep Oct Nov Dec Jan Feb Mar + Crustaceans + Euphausia va lentini 16.5 42.8 47.1 4.5 4.2 8.9 17.6 23.6 29.7 11.6 1.0 36.2 0 + Nauticaris marionis 65.8 42.8 51.0 70.8 23.9 26.7 38.1 4.7 4.2 7.0 0 0 0 + Themisto gaudichaudii 0 0 0 0 0 0 0 0 0 0 0 1.8 0 + Fish + Nototheniidae 0 0 0 23.5 66.4 62.5 44.3 68.4 45.1 79.1 71.4 35.2 13.5 + Myctophidae 0 14.4 0 0 0.5 1.9 0 1.6 20.8 1.9 4.8 10.0 48.2 + Channichythidae 0 0 0 0 0 0 0 0 0 0 21.9 6.1 23.7 + Muraenolpidae 0 0 0 0 4.3 0 0 0 0 0 0 0 0 + Unidentified 16.8 0 1.4 0 0 0 0 0 0 0 0 0 0 + Cephalopods 0 0 0.5 1.2 0.7 0 0 1.7 0.2 0.4 0.9 10.7 14.6 diet of Gentoo Penguins at Marion Island + was Notothenia squamifrons (incorrectly identified as Harpagifer georgianus in La + Cock et al. 1984, N. T. Klages pers. obs.) + whereas N. rossi and N. larseni were consumed at southerly sites (Croxall and Pr- + ince 1980a, Jablonski 1985). N. squamifrons is widely distributed occurring around the + islands of the southern Indian and, less commonly, the South Atlantic Oceans + MAMJJii A S N D J F M guin at Marion Island throughout a single year. Un- + shaded segment: crustaceans, stippled segment: + fish, shaded segment: cephalopods. + (Duhamel et al. 1983). N. squamifrons taken by Gentoo Penguins at Marion Island, all + in the range 28 - 134 mm, were larval or juvenile fish of 0 - 4 years (Duhamel and + Ozouf-Costaz 1985). The occurrence of large numbers of unidentified juvenile + nototheniids in the diet in November and + December preceded an increase in the relative abundance of small-size class N. + squamifrons in the diet in January and Feb ruary 1985. This may reflect growth o + these larval fish. 
+ Juvenile Channichthys rhinoceratus, were the largest and second most abundant fish + in the diet. Adults were presumably too large for consumption. The species had + previously been considered a demersal species endemic to the colder waters + around Kerguelen (48?27'-50?S, 60?27'- + 70?35'E) and Heard Islands (53?01'S, + 73?23'E) (Kock et al. 1985). The presence of this species in the relatively warm waters + around Marion Island was surprising. + However, hydrographical evidence (Benon and Murail 1979) and sampling of + zooplankton (Grindley and Lane 1979, + Boden and Parker 1986) suggests that advection of foreign water masses past the + island with associated fauna may occur + (Boden & Parker 1986). + The appearance of myctophids in the diet of Gentoo Penguins coincided with + the increase in relative abundance of these fish in the diet of King Penguins Apteno- + dytes patagonicus during summer, suggesting increased availability (Adams and + Klages 1987). Small numbers of myctophids have previously been reported in + the diet of Gentoo Penguins of unknown status at Marion Island (La Cock et al. + 1984). + U9 + - + CO a + C, +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 6] + recovered from Gentoo Penguin stomach samples at Marion Island. + Species OD (mm) Estimated SL (mm) Estimated mass (g) + N Mean SD Range Mean SD Range Mean SD Range + Nototheniidae + Notothenia squamifrons 1109 2.43 0.87 (0.8-5.8) 71.1 ? 18.7 (23.3-155.5) 5.6?4.1 (0.1-56.3) + Notothenia acuta 47 3.19?0.64 (1.38-4.7) 85.0+20.3 (38.6-126.4) 9.2?6.6 (0.6-28.3) + Dissostichus eleginoides 17 3.52?0.73 (2.19-5.27) 105.7?42.6 (42.4-215.1) 10.6+ 16.9 (0.3-65.0) + Channichthyidae + Channichthys rhinoceratus 170 1.48?0.38 (0.97-2.45) 170.3+36.8 (127.0-238.3) 21.1 ? 18.9 (6.8-61.9) + Myctophidae + Protomvctophum normani 61 2.38?0.10 (2.19-2.6) 77.7? 3.7 (70.0-85.2) 6.4? 
0.9 (4.7-8.3) + Protomyctophum tenisoni 57 1.63?0.13 (1.3-1.78) 49.2 ?4.9 (37.0-54.8) 1.8 ?0.5 (0.8-2.3) + Krefftichthys anderssoni 35 1.51 0.25 (0.97-1.87) 49.6? 11.2 (25.2-66.0) 2.0? 1.1 (0.2-4.3) + Gymnoscopelus nicholsi 12 6.20?0.19 (5.91-6.55) 156.2+5.7 (147.4-166.7) 44.3?4.8 (37.4-53.3) + Protomvctophum bolini 4 2.00?0.38 (1.7-2.35) 57.7 12.2 (46.1-67.3) 1.5?0.8 (0.8-2.2) + Muraenolepidae + Muraenolepis sp. 7 2.7?0.58 (1.87-3.4) 141.1 46.6 (76.8-200.8) 24.9?20.8 (2.3-60.6) + Two species of crustacean Euphausia vallentini and Nauticaris marionis, were of + almost equal importance by mass at Marion Island. This is in contrast to the situa- + tion at sites south of the Antarctic Polar + Front, where the crustacean component of the diet of the Gentoo Penguin is domi- + nated by a single species, E. superba. E. vallentini was the most important prey con- + sumed by Macaroni and Rockhopper penguins during the 1984-1985 summer, and + remained abundant in the diet of Rockhopper Penguins in March 1985 (Brown + and Klages 1987) when it was absent from the diet of the Gentoo Penguin. Both + species of crested penguins probably forage farther offshore during late chick- + rearing than do Gentoo Penguins (Brown + 1987, Adams and Wilson 1987). This may indicate a movement of euphausiids out of + the immediate inshore waters during February and March 1985. + Due to its benthic nature, adult N. + marionis may be available to Gentoo Penguins only within a few kilometers of the + shore (Adams and Wilson 1987). The size + (average length: 35.1 mm) of N. marionis taken by Gentoo Penguins at Marion Is- + land in 1984-1985 is clearly greater than juveniles (maximum length: 23 mm) taken + by crested penguins during December + 1983 to February 1984 (Brown and Klages + 1987) and by Gentoo Penguins in September 1982 (La Cock et al. 1984). The ab- + sence of juvenile N. 
marionis in the diet of + Gentoo and crested penguins at Marion Island during January to March 1985 is dif- + ficult to explain, since at least some adult individuals recovered during April to Sep- + tember 1984 were ovigerous (N. J. Adams pers. obs.). + The occurrence of octopods in the diet of Gentoo Penguins is apparently unique + to the Marion Island site. The appearance of juvenile octopods in the diet coincided + with their occurrence in the diet of Rockhopper Penguins late in the chick-rearing + period (Brown and Klages 1987). Adult octopods are generally benthic and solit- + ary. The appearance of large numbers of juveniles in the diet suggests highly sea- + sonal spawning, coupled with a tendency to form dense aggregations in shallower + water. In contrast, the small number of juvenile squid in the diet of the inshore + foraging Gentoo Penguin again emphasizes the generally pelagic nature of + squid (Adams and Klages 1987). +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 7] + ADAMS & KLAGES * GENTOO PENGUIN DIET + Gentoo Penguins apparently switched from a largely crustacean diet during + March to June 1984 to a largely fish diet during July 1984 to March 1985. This + change coincided with peak egg laying and could not be considered as a direct re- + sponse to the arrival, in October and + November, of the largely, euphausiid consuming and hence potentially competing + crested penguins (cf. Williams 1981). + Moreover, the most important crustacean component by mass in the diet during + March 1984 to September 1984 was adult + N. marionis (not taken by Macaroni and + Rockhopper penguins) and not krill + (euphausiids) as intimated by Williams + (1981). 
The large variation in abundance and prey-species composition of crusta- + ceans in penguin diet from year-to-year, indicated by the differences in the diet of + the Macaroni and Rockhopper Penguins in two years at Marion Island (Brown and + Klages 1987) and Gentoo Penguins in + March 1984 and March 1985 may reflect a greater degree of unpredictability in + availability of crustacean prey than at higher latitudes. Switches in diet may + merely reflect local changes in availability of particular prey within the inshore area + exploited by Gentoo Penguins. + Average meal size of Gentoo Penguins at Marion Island is small (La Cock et al. + 1984, this study) compared to those recovered from breeding penguins at higher + latitudes (Croxall and Prince 1980a, Jablonski 1985, Volkman et al. 1980). Evalua- + tion of the magnitude of this difference is complicated by the unknown ratio of + breeders to non-breeders sampled at Marion Island. However, the difference ap- + pears real and may reflect reduced food availability compared to more southerly + sites. This view is supported by the small total breeding population (Adams and + Wilson 1987), low breeding success and long growth period (Williams 1980) of + Gentoo Penguins at Marion Island and suggests that the population is food limited + (La Cock et al. 1982). However, in contrast to the southern populations which breed + in summer (Croxall and Prince 1980b), + Gentoo Penguins at Marion Island begin breeding in the austral winter (Williams + 1980). The infrequent feeding of King penguin chicks (Cherel et al. 1987, Pers. + obs.) and dispersal of Eudyptes penguins away from the island during winter + suggest food availability is low. Consequently, having excluded direct competi- + tion with crested penguins for crustacean prey (Williams 1981), the reason for winter + breeding by Gentoo Penguins remains unclear.[PAGE 8] + Duhamel, G. 1981. 
Characteristiques biologiques des principales espeches de poissons du plateau conti- + nental des iles Kerguelen. Cybium 3e serie 5: + Duhamel, G., Hureau, J. C. and Ozouf-Costaz, C. + 1983. Ecological survey of notothenioid fishes in the Southern Ocean from Bouvet to Kerguelen + Islands. Memoirs of the National Institute of Polar + Research Special Issue 27: 176-182. + Duhamel, G. and Ozouf-Costaz, C. 1985. Age, growth and reproductive biology of Notothenia squamifrons + Gunther 1880 from the Indian sector of the + Southern Ocean. Polar Biology 4: 143-153. + Ealey, E. H. M. 1954. Analysis of stomach contents of some Heard Island birds. Emu 54: 204-210. + Grindley, J. R. and Lane, S. B. 1979. Zooplankton around Marion and Prince Edward Islands. + Comite National Francais des Rescherches Antartiques 44: 111-126. + Hecht, T. 1987. A guide to the otoliths of Southern + Ocean fishes. South African Journal of Antarctica + Research 17: 1-87. + Hecht, T. and Cooper, J. 1986. Length/mass relationships, energetic content and otoliths of Antarctic + cod Paranotothenia magellanica (Nototheniidae: + Pisces) at sub-Antarctic Marion Island. South African Journal of Zoology 21: 294-296. + Jablonski, B. 1985. The diet of penguins on King + George Island, South Shetland Islands. Acta + Zoologica Cracoviensia 29: 117-186. + Jackson, S., Duffy, D. C. and Jenkins, J. F. G. 1987. + Gastric digestion in marine vertebrate predators: + in vitro standards. Functional Ecology 1: 287-291. + Kirkwood, J. M. 1982. A guide to the Euphausiacea of the Southern Ocean. ANARE Research Notes + Kirkwood, J. M. 1984. A guide to the Decap'da of the Southern Ocean. ANARE Research Notes 11: + Kock, K. H., Duhamel, G. and Hureau, J. C. 1985. + Biology and status of exploited Antarctic fish stocks: a review. BIOMASS Scientific Series 6: + La Cock, G. D., Hecht, T. and Klages, N. 1984.The winter diet of Gentoo Penguins at Marion Island. + Ostrich 55: 188-191. + Volkman, N. J., Presler, P. and Trivelpiece, W. 
1980. + Diets of pygoscelid penguins at King George Island, Antarctica. Condor 82: 373-378. + Williams, A. J. 1980. Aspects of the breeding biology of the Gentoo Penguin Pygoscelis papua. Gerfaut + Williams, A. J. 1981. Factors affecting the time of breeding of Gentoo Penguins at Marion Island. In + Cooper J. (ed.) Proceedings of the symposium on birds of the sea and shore, 1979. 451-459. Cape + Town: African Seabird Group. + Wilson, R. P. 1984. An improved stomach pump for penguins and other seabirds. Journal of Field Or- + nithology 55: 109-112. +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC \ No newline at end of file diff --git a/data/cleaned-text/text_cleaner/Adams_1989_20260312_115204.txt b/data/cleaned-text/text_cleaner/Adams_1989_20260312_115204.txt new file mode 100644 index 0000000..2282198 --- /dev/null +++ b/data/cleaned-text/text_cleaner/Adams_1989_20260312_115204.txt @@ -0,0 +1,433 @@ +[PAGE 1] +Temporal Variation in the Diet of the Gentoo Penguin Pygoscelis papua at Sub-Antarctic +Marion Island + +Author(s): N. J. Adams and N. T. Klages +Source: Colonial Waterbirds , 1989, Vol. 12, No. 1 (1989), pp. 30-36 +Published by: Waterbird Society + [PAGE 2] + Temporal variation in the diet of the Gentoo Penguin + Pygoscelis papua at sub-Antarctic Marion Island + N. J. ADAMS & N. T. KLAGES' + Percy FitzPatrick Institute of African Ornithology, University of Cape Town, + Rondebosch 7700, South Africa + 'Port Elizabeth Museum, P. 0. Box 13147, Humewood 6013, South Africa + Abstract.-The diet of the Gentoo Penguin Pygoscelis papua at sub-Antarctic Marion Island was sampled by stomach pumping at monthly intervals from March 1984 to March 1985. Overall, fish accounted for 53% + of the diet by mass, crustaceans 44% and cephalopods 2%. Crustaceans predominated between March and + June 1984; a marked increase in the proportion of fish in July coincided with the start of egg laying by Gentoo + Penguins. 
Fish accounted for almost all of the diet in January and March 1985. Juvenile nototheniid fish, in particular Notothenia squamifrons, formed the bulk of the fish component; myctophids and Channichthys + rhinoceratus were less common. The pelagic euphausiid Euphausia vallentini accounted for about 50% by mass of the overall crustacean component lumped over the entire study period. The decapod shrimp Nauticaris + marionis was the most important crustacean species consumed during June to September 1984. Cephalopods, predominantly octopods, were taken mainly in February and March 1985. The hypothesis that seasonal + changes in diet occur in direct response to the arrival of crested penguins (Eudyptes spp) early in summer is not supported by the data. Changes in diet appear to reflect local changes in the availability of prey species + within the inshore waters exploited by Gentoo Penguins. Received 10 March 1988, accepted 25 November 1988. + Key words.-diet, Gentoo Penguin, Pygoscelis papua, sub-Antarctic. + Colonial Waterbirds 12(1): 30-36, 1989 + Studies of the diets of both migrant and resident penguins in the Southern + Ocean region have concentrated on birds during the chick-rearing period, with little + information for the non-breeding season + (see Croxall and Lishman 1987). The Gentoo Penguin Pygoscelis papua is a year- + round resident at sub-Antarctic Marion Island (van Zinderen Bakker 1971), and at + some other breeding sites in the Southern + Ocean (Watson 1975), affording opportunities for sampling its diet throughout + the year. + Williams (1981) suggested that seasonal changes in the diet and earlier, winter + breeding by Gentoo Penguins at Marion + Island, compared to populations at higher latitudes, ensured a crustacean diet during + incubation and early chick rearing. 
He suggested that a switch by Gentoo Pen- + guins to a fish diet later in the season, deduced from a change in color of the + excreta, was a response to the arrival of large numbers of two species of potentially + competing, summer-resident crested penguins, the Macaroni Eudyptes chrysolophus + and the Rockhopper E. chrysocome. However, food samples collected at Marion Is- + land in September 1982 showed that fish are common in the diet of Gentoo Pen- + guins before the arrival of the other penguin species (La Cock et al. 1984). We re- + port here on a more comprehensive set o samples collected throughout a single ye + at Marion Island. Our principal objecti was to investigate temporal changes i + diet. + METHODS + A total of 144 stomach samples was collected fr + Gentoo Penguins at three colonies distributed alo a nine kilometre stretch of the east coast of Marion + Island (46? 53'S, 37?54'E) from March 1984 to March + 1985 inclusive. Using the technique of Wilson (1984), we stomach pumped an average of 11 birds a month + as they returned to their landing beaches in the evening. We did not attempt to distinguish the breeding + status of the birds. Gentoo Penguins at Marion Island tend chicks from late July to December. Con- + sequently, during this period, we probably sampled both breeding and non-breeding birds. + Immediately after collection, samples were drained through a 0.5 mm sieve. Subsequently, + drained samples were weighed to the nearest gram and then stored at 4?C until sorting, generally com- + pleted within 24 h of collection. Soft identifiable material was sorted into fish, cephalopod and crustacean + components (further separated into shrimps, euphausiids and amphipods) and weighed. Since fish + muscle is digested more rapidly than squid, which is in turn digested more rapidly than crustacea muscle + (Jackson et al. 1987), we may have overestimated, in particular, the mass of crustaceans in mixed samples. 
+ However, most stomach samples from Gentoo Penguins contained one prey type only (see below). + Most fish were recovered with heads separated from the remainder of the body. Consequently, + analyses were based on identification and measure- +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 3] + ADAMS & KLAGES * GENTOO PENGUIN DIET ment of otoliths (Adams and Klages, 1987). We esti- + mated standard length (SL) and mass of fish from regressions relating otolith diameter to these + parameters. Regressions not available in Adams and + Klages (1987) and Brown and Klages (1987) are given in Duhamel (1981), Hecht and Cooper (1986), and + Hecht (1987). Since otoliths were removed from intact fish crania, digestion and hence measurement + error were minimal. + Crustacean species were identified with the aid of published keys (Bowman and Gruner, 1973; + Kirkwood, 1982; 1984). Total length of intact individuals was measured from the anterior margin of an + eye to the tip of the telson. Numbers of crustaceans in each sample were estimated by dividing total mass + by the average mass of an individual crustacean of each respective species. Actual numbers and fresh + mass of prey ingested will be underestimated in welldigested samples. + Most of the small cephalopods were recovered intact, and numbers were counted directly. The iden- + tification of squid beaks was facilitated by a reference collection at the Port Elizabeth Museum, and con- + firmed by comparison with the literature (Clarke, + 1986). We estimated dorsal mantle length (DML) and mass of squid from regressions relating lower ros- + trum length (LRL) to these parameters (Adams and + Klages, 1987). Juvenile octopods were not identified. + samples at Marion Island. + Prey species Total % numbers % frequency number of prey class of occurrence + FISH + Notothiniidae + Notothenia squamifrons + Unident.juv. 
nototheniids + Notothenia acuta + Dissostichus eleginoides + Channichthyidae + Channichthys rhinoceratus + Myctophidae + Protomyctophum normani + Gymnoscopelus nicholsi + Protomyctophum tenisoni + Krefftichthys anderssoni + Protomyctophum bolini unid. myctophids + Electrona carlsbergi + Gymnoscopelus sp. + Muraenolepidae + Muraenolepts sp. + Unident. fish + Euphausiacea + Euphausia vallentini + Hippolytidae + Nauticaris marionis + Amphipoda + Themisto gaudichaudii + Hyperiella sp. + Unident. amphipods + Vibilia sp. + Primno sp. + Nematocarcinidae + Nematocarcinus longirostris + Octopoda + Unident.juv. octopods + Decapoda + Unident.juv. squid + Kondakovia longimana + Onychoteuthidae LRL >2mm + Onychoteuthidae LRL < 2mm + Margarella expansa + Ophiuroidea + 58.6 + 21.1 + 1.3 + 0.5 + 50.7 + 14.6 + 11.1 + 6.3 + 6.3 + 5.8 + 1.5 + 1.4 + 1.1 + 1.0 + 0.7 + 0.2 + <0.1 + 12.5 + 4.2 + 4.2 + 4.9 + 3.5 + 4.2 + 4.9 + 1.4 + 0.7 + 0.2 + 0.1 + 0.7 + 1.4 + 86.4 + 25.7 + 33.3 + 13.5 + 0.1 + <0.1 + <0.1 + <0.1 + <0.1 + <0.1 + 4.9 + 2.1 + 2.1 + 1.4 + 1.4 + 1.4 + 94.9 + 2.9 + 1.1 + 0.6 + 0.6 + 18.8 + 9.0 + 3.5 + 3.5 + 3.5 + 2.1 +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 4] + RESULTS + Mean mass of food samples was 139.2 + + 130.5 g (range: 8- 650 g, n= 144), similar to the 147 g recorded by La Cock et al. + (1984). Mean monthly sample mass was highest in December at 332 g. + General Composition of the Diet + Thirty species or species groups were identified (Table 1), with fish comprising + the largest single group (11 species). However, samples from individual penguins + were largely homogeneous consisting of a single species of crustacean or fish (see also + Jablonski 1985, Croxall et al. 1988). Fiftyeight percent of stomach samples con- + tained only a single prey taxon, 26% contained two taxa and 10% contained three + taxa (analysis based on taxa comprising at least 5% by mass of individual samples). 
+ Fish and crustaceans accounted for + 53.5 and 44.4% respectively of the total annual diet by mass (Table 2). However, con- + siderable changes in proportions of fish, crustaceans and cephalopods occurred + over the year (Table 3, Fig. 1). Crustaceans accounted for over 75% of the diet by mass + from March to June 1984 but decreased to + 0% by March 1985. Cephalopods accounted for more than 10% of the diet + only in February and March 1985. + Fish + The fish component of the diet was dominated by the family Nototheniidae, + particularly Notothenia squamifrons. It is likely that most of the unidentified + juvenile nototheniids were also this species + (Table 1). Nototheniids accounted for over + 90% of the fish component during June to + October 1984. Myctophids and the channichthyid Channichthys rhinoceratus ap- + peared more commonly in the diet from + October and accounted for most of the food items in March 1985 (Table 3). + Fish of 78.8 - 84.1 mm (5.9 - 7.3 were the most frequently taken size clas + When plotted on a monthly basis, the was no evidence of any progressive chan + in size-class distribution throughout t year. However, during January and Fe + ruary smaller fish between 28.6 - 41.8 m + (0.2 - 0.7 g) accounted for the most fr quently taken size class, whereas larger + dividuals (76.2 - 84.1 mm, 5.3 - 7.3 g) we more numerous during the remaini + months. Standard parameters for all fish species are given in Table 4. + Crustaceans + Euphausia vallentini and Nauticaris marionis made up nearly all of the crusta + cean component. During winter (March to + August 1984), N. marionis formed the largest part of the crustacean component, but E. + vallentini subsequently increased to nearly + 100% in January and February 1985 + (Table 3). Mean length of E. vallentini was + 22.5 mm ? 1.74 (n = 681), considerably smaller than that of N. marionis (35.1 mm + + 5.18, n = 1481). 
+ Cephalopods + Cephalopods accounted for just over + 10% of the diet during February and + March 1985 but seldom occurred in the remaining months (Table 3). Most + cephalopods were small octopods (DML < + 15 mm). The only squid identified were juveniles of the species Kondakovia lon- + gimana of estimated dorsal mantle length and mass of 58.4 ? 11.4 mm and 9.6 ? + 3.8 g (n = 4) respectively. + DISCUSSION + Fish (Ealey 1954, Volkman et al. 198 and, in particular, nototheniids may fo + a substantial portion of the diet of the G too Penguin throughout its range. T + most important prey species by mass in + Fish Crustaceans Cephalopods + Wet mass (%) Mean 53.5 44.4 2.1 + Range (0-100) (0-100) (0-51.8) +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 5] + ADAMS & KLAGES * GENTOO PENGUIN DIET by month. + Species Mar Apr May Jun Jul Aug Sep Oct Nov Dec Jan Feb Mar + Crustaceans + Euphausia va lentini 16.5 42.8 47.1 4.5 4.2 8.9 17.6 23.6 29.7 11.6 1.0 36.2 0 + Nauticaris marionis 65.8 42.8 51.0 70.8 23.9 26.7 38.1 4.7 4.2 7.0 0 0 0 + Themisto gaudichaudii 0 0 0 0 0 0 0 0 0 0 0 1.8 0 + Fish + Nototheniidae 0 0 0 23.5 66.4 62.5 44.3 68.4 45.1 79.1 71.4 35.2 13.5 + Myctophidae 0 14.4 0 0 0.5 1.9 0 1.6 20.8 1.9 4.8 10.0 48.2 + Channichythidae 0 0 0 0 0 0 0 0 0 0 21.9 6.1 23.7 + Muraenolpidae 0 0 0 0 4.3 0 0 0 0 0 0 0 0 + Unidentified 16.8 0 1.4 0 0 0 0 0 0 0 0 0 0 + Cephalopods 0 0 0.5 1.2 0.7 0 0 1.7 0.2 0.4 0.9 10.7 14.6 diet of Gentoo Penguins at Marion Island + was Notothenia squamifrons (incorrectly identified as Harpagifer georgianus in La + Cock et al. 1984, N. T. Klages pers. obs.) + whereas N. rossi and N. larseni were consumed at southerly sites (Croxall and Pr- + ince 1980a, Jablonski 1985). N. squamifrons is widely distributed occurring around the + islands of the southern Indian and, less commonly, the South Atlantic Oceans + MAMJJii A S N D J F M guin at Marion Island throughout a single year. 
Un- + shaded segment: crustaceans, stippled segment: + fish, shaded segment: cephalopods. + (Duhamel et al. 1983). N. squamifrons taken by Gentoo Penguins at Marion Island, all + in the range 28 - 134 mm, were larval or juvenile fish of 0 - 4 years (Duhamel and + Ozouf-Costaz 1985). The occurrence of large numbers of unidentified juvenile + nototheniids in the diet in November and + December preceded an increase in the relative abundance of small-size class N. + squamifrons in the diet in January and Feb ruary 1985. This may reflect growth o + these larval fish. + Juvenile Channichthys rhinoceratus, were the largest and second most abundant fish + in the diet. Adults were presumably too large for consumption. The species had + previously been considered a demersal species endemic to the colder waters + around Kerguelen (48?27'-50?S, 60?27'- + 70?35'E) and Heard Islands (53?01'S, + 73?23'E) (Kock et al. 1985). The presence of this species in the relatively warm waters + around Marion Island was surprising. + However, hydrographical evidence (Benon and Murail 1979) and sampling of + zooplankton (Grindley and Lane 1979, + Boden and Parker 1986) suggests that advection of foreign water masses past the + island with associated fauna may occur + (Boden & Parker 1986). + The appearance of myctophids in the diet of Gentoo Penguins coincided with + the increase in relative abundance of these fish in the diet of King Penguins Apteno- + dytes patagonicus during summer, suggesting increased availability (Adams and + Klages 1987). Small numbers of myctophids have previously been reported in + the diet of Gentoo Penguins of unknown status at Marion Island (La Cock et al. + 1984). + U9 + - + CO a + C, +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 6] + recovered from Gentoo Penguin stomach samples at Marion Island. 
+ Species OD (mm) Estimated SL (mm) Estimated mass (g) + N Mean SD Range Mean SD Range Mean SD Range + Nototheniidae + Notothenia squamifrons 1109 2.43 0.87 (0.8-5.8) 71.1 ? 18.7 (23.3-155.5) 5.6?4.1 (0.1-56.3) + Notothenia acuta 47 3.19?0.64 (1.38-4.7) 85.0+20.3 (38.6-126.4) 9.2?6.6 (0.6-28.3) + Dissostichus eleginoides 17 3.52?0.73 (2.19-5.27) 105.7?42.6 (42.4-215.1) 10.6+ 16.9 (0.3-65.0) + Channichthyidae + Channichthys rhinoceratus 170 1.48?0.38 (0.97-2.45) 170.3+36.8 (127.0-238.3) 21.1 ? 18.9 (6.8-61.9) + Myctophidae + Protomvctophum normani 61 2.38?0.10 (2.19-2.6) 77.7? 3.7 (70.0-85.2) 6.4? 0.9 (4.7-8.3) + Protomyctophum tenisoni 57 1.63?0.13 (1.3-1.78) 49.2 ?4.9 (37.0-54.8) 1.8 ?0.5 (0.8-2.3) + Krefftichthys anderssoni 35 1.51 0.25 (0.97-1.87) 49.6? 11.2 (25.2-66.0) 2.0? 1.1 (0.2-4.3) + Gymnoscopelus nicholsi 12 6.20?0.19 (5.91-6.55) 156.2+5.7 (147.4-166.7) 44.3?4.8 (37.4-53.3) + Protomvctophum bolini 4 2.00?0.38 (1.7-2.35) 57.7 12.2 (46.1-67.3) 1.5?0.8 (0.8-2.2) + Muraenolepidae + Muraenolepis sp. 7 2.7?0.58 (1.87-3.4) 141.1 46.6 (76.8-200.8) 24.9?20.8 (2.3-60.6) + Two species of crustacean Euphausia vallentini and Nauticaris marionis, were of + almost equal importance by mass at Marion Island. This is in contrast to the situa- + tion at sites south of the Antarctic Polar + Front, where the crustacean component of the diet of the Gentoo Penguin is domi- + nated by a single species, E. superba. E. vallentini was the most important prey con- + sumed by Macaroni and Rockhopper penguins during the 1984-1985 summer, and + remained abundant in the diet of Rockhopper Penguins in March 1985 (Brown + and Klages 1987) when it was absent from the diet of the Gentoo Penguin. Both + species of crested penguins probably forage farther offshore during late chick- + rearing than do Gentoo Penguins (Brown + 1987, Adams and Wilson 1987). This may indicate a movement of euphausiids out of + the immediate inshore waters during February and March 1985. 
+ Due to its benthic nature, adult N. + marionis may be available to Gentoo Penguins only within a few kilometers of the + shore (Adams and Wilson 1987). The size + (average length: 35.1 mm) of N. marionis taken by Gentoo Penguins at Marion Is- + land in 1984-1985 is clearly greater than juveniles (maximum length: 23 mm) taken + by crested penguins during December + 1983 to February 1984 (Brown and Klages + 1987) and by Gentoo Penguins in September 1982 (La Cock et al. 1984). The ab- + sence of juvenile N. marionis in the diet of + Gentoo and crested penguins at Marion Island during January to March 1985 is dif- + ficult to explain, since at least some adult individuals recovered during April to Sep- + tember 1984 were ovigerous (N. J. Adams pers. obs.). + The occurrence of octopods in the diet of Gentoo Penguins is apparently unique + to the Marion Island site. The appearance of juvenile octopods in the diet coincided + with their occurrence in the diet of Rockhopper Penguins late in the chick-rearing + period (Brown and Klages 1987). Adult octopods are generally benthic and solit- + ary. The appearance of large numbers of juveniles in the diet suggests highly sea- + sonal spawning, coupled with a tendency to form dense aggregations in shallower + water. In contrast, the small number of juvenile squid in the diet of the inshore + foraging Gentoo Penguin again emphasizes the generally pelagic nature of + squid (Adams and Klages 1987). +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC + +[PAGE 7] + ADAMS & KLAGES * GENTOO PENGUIN DIET + Gentoo Penguins apparently switched from a largely crustacean diet during + March to June 1984 to a largely fish diet during July 1984 to March 1985. This + change coincided with peak egg laying and could not be considered as a direct re- + sponse to the arrival, in October and + November, of the largely, euphausiid consuming and hence potentially competing + crested penguins (cf. Williams 1981). 
+ Moreover, the most important crustacean component by mass in the diet during + March 1984 to September 1984 was adult + N. marionis (not taken by Macaroni and + Rockhopper penguins) and not krill + (euphausiids) as intimated by Williams + (1981). The large variation in abundance and prey-species composition of crusta- + ceans in penguin diet from year-to-year, indicated by the differences in the diet of + the Macaroni and Rockhopper Penguins in two years at Marion Island (Brown and + Klages 1987) and Gentoo Penguins in + March 1984 and March 1985 may reflect a greater degree of unpredictability in + availability of crustacean prey than at higher latitudes. Switches in diet may + merely reflect local changes in availability of particular prey within the inshore area + exploited by Gentoo Penguins. + Average meal size of Gentoo Penguins at Marion Island is small (La Cock et al. + 1984, this study) compared to those recovered from breeding penguins at higher + latitudes (Croxall and Prince 1980a, Jablonski 1985, Volkman et al. 1980). Evalua- + tion of the magnitude of this difference is complicated by the unknown ratio of + breeders to non-breeders sampled at Marion Island. However, the difference ap- + pears real and may reflect reduced food availability compared to more southerly + sites. This view is supported by the small total breeding population (Adams and + Wilson 1987), low breeding success and long growth period (Williams 1980) of + Gentoo Penguins at Marion Island and suggests that the population is food limited + (La Cock et al. 1982). However, in contrast to the southern populations which breed + in summer (Croxall and Prince 1980b), + Gentoo Penguins at Marion Island begin breeding in the austral winter (Williams + 1980). The infrequent feeding of King penguin chicks (Cherel et al. 1987, Pers. + obs.) and dispersal of Eudyptes penguins away from the island during winter + suggest food availability is low. 
Consequently, having excluded direct competi- + tion with crested penguins for crustacean prey (Williams 1981), the reason for winter + breeding by Gentoo Penguins remains unclear.[PAGE 8] + Duhamel, G. 1981. Characteristiques biologiques des principales espeches de poissons du plateau conti- + nental des iles Kerguelen. Cybium 3e serie 5: + Duhamel, G., Hureau, J. C. and Ozouf-Costaz, C. + 1983. Ecological survey of notothenioid fishes in the Southern Ocean from Bouvet to Kerguelen + Islands. Memoirs of the National Institute of Polar + Research Special Issue 27: 176-182. + Duhamel, G. and Ozouf-Costaz, C. 1985. Age, growth and reproductive biology of Notothenia squamifrons + Gunther 1880 from the Indian sector of the + Southern Ocean. Polar Biology 4: 143-153. + Ealey, E. H. M. 1954. Analysis of stomach contents of some Heard Island birds. Emu 54: 204-210. + Grindley, J. R. and Lane, S. B. 1979. Zooplankton around Marion and Prince Edward Islands. + Comite National Francais des Rescherches Antartiques 44: 111-126. + Hecht, T. 1987. A guide to the otoliths of Southern + Ocean fishes. South African Journal of Antarctica + Research 17: 1-87. + Hecht, T. and Cooper, J. 1986. Length/mass relationships, energetic content and otoliths of Antarctic + cod Paranotothenia magellanica (Nototheniidae: + Pisces) at sub-Antarctic Marion Island. South African Journal of Zoology 21: 294-296. + Jablonski, B. 1985. The diet of penguins on King + George Island, South Shetland Islands. Acta + Zoologica Cracoviensia 29: 117-186. + Jackson, S., Duffy, D. C. and Jenkins, J. F. G. 1987. + Gastric digestion in marine vertebrate predators: + in vitro standards. Functional Ecology 1: 287-291. + Kirkwood, J. M. 1982. A guide to the Euphausiacea of the Southern Ocean. ANARE Research Notes + Kirkwood, J. M. 1984. A guide to the Decap'da of the Southern Ocean. ANARE Research Notes 11: + Kock, K. H., Duhamel, G. and Hureau, J. C. 1985. 
+ Biology and status of exploited Antarctic fish stocks: a review. BIOMASS Scientific Series 6: + La Cock, G. D., Hecht, T. and Klages, N. 1984.The winter diet of Gentoo Penguins at Marion Island. + Ostrich 55: 188-191. + Volkman, N. J., Presler, P. and Trivelpiece, W. 1980. + Diets of pygoscelid penguins at King George Island, Antarctica. Condor 82: 373-378. + Williams, A. J. 1980. Aspects of the breeding biology of the Gentoo Penguin Pygoscelis papua. Gerfaut + Williams, A. J. 1981. Factors affecting the time of breeding of Gentoo Penguins at Marion Island. In + Cooper J. (ed.) Proceedings of the symposium on birds of the sea and shore, 1979. 451-459. Cape + Town: African Seabird Group. + Wilson, R. P. 1984. An improved stomach pump for penguins and other seabirds. Journal of Field Or- + nithology 55: 109-112. +This content downloaded from +97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC \ No newline at end of file diff --git a/src/llm/llm_text.py b/src/llm/llm_text.py index 6573232..86329c4 100644 --- a/src/llm/llm_text.py +++ b/src/llm/llm_text.py @@ -122,6 +122,29 @@ def _section_priority(heading: str) -> int: ), 5, ), + # frequency of occurrence — standard diet-study metric + ( + re.compile( + r"(?i)(frequency\s+of\s+occurrence" + r"|occurrence\s+frequency" + r"|%\s*fo\b" + r"|\bfo\s*=\s*\d" + r"|index\s+of\s+relative\s+importance" + r"|\biri\b)" + ), + 3, + ), + # stomach content / diet composition — general relevance signal + ( + re.compile( + r"(?i)(stomach\s+content" + r"|gut\s+content" + r"|diet\s+composition" + r"|food\s+habits?" 
+ r"|dietary\s+(analysis|study|data))" + ), + 2, + ), # general percentage / fraction near gut/stomach context (re.compile(r"(?i)(\d+\.?\d*\s*%|\d+\s+percent" r"|\d+\s+of\s+\d+\s+(were|had|contained)" r"|proportion\s+of\s+\d+)"), 2), # study date — collection period @@ -148,6 +171,23 @@ def _section_priority(heading: str) -> int: _ABSTRACT_CAP: int = 2000 +def _truncate_at_sentence(text: str, limit: int) -> str: + """Truncate *text* to at most *limit* chars, preferring a sentence boundary. + + Searches backwards from *limit* for the last ``.``, ``!``, or ``?``. If + one is found in the second half of the allowed window it is used; otherwise + the hard character limit is applied. + """ + if len(text) <= limit: + return text + candidate = text[:limit] + for punct in (".", "!", "?"): + pos = candidate.rfind(punct) + if pos > limit // 2: + return candidate[: pos + 1] + return candidate + + def _score_paragraph(para: str) -> int: """Return a keyword-relevance score for a single paragraph of text. @@ -284,16 +324,24 @@ def extract_key_sections(text: str, max_chars: int) -> str: sections.append((current_start, current_heading, "\n".join(current_lines))) # ── Phase 1 result: pin the abstract/preamble ────────────────────────── - preamble_text = "" + # Sections whose content is always pinned (counted against _ABSTRACT_CAP): + # the implicit preamble and any explicitly-named Abstract/Summary section. 
+ _ABSTRACT_HEADING_RE = re.compile( + r"(?i)^\s*" + _NUM_PREFIX + r"(abstract|summary)\s*[:\.]?\s*$" + ) + + preamble_parts: List[str] = [] body_sections: List[Tuple[int, str, str]] = [] # (start, heading, content) for start, heading, content in sections: if _DROP_SECTION_RE.match(heading.strip()) if heading.strip() else False: continue # hard-drop references / acknowledgements / appendix - if heading == "[PREAMBLE]": - preamble_text = content.strip()[:_ABSTRACT_CAP] + if heading == "[PREAMBLE]" or _ABSTRACT_HEADING_RE.match(heading.strip()): + preamble_parts.append(content.strip()) else: body_sections.append((start, heading, content)) + preamble_text = "\n\n".join(p for p in preamble_parts if p)[:_ABSTRACT_CAP] + budget = max_chars - len(preamble_text) # ── Phase 2: keyword-scored paragraph mining ─────────────────────────── @@ -321,7 +369,7 @@ def extract_key_sections(text: str, max_chars: int) -> str: para_start = sec_start + j + 1 if para_lines: para_text = "\n".join(para_lines).strip() - raw_paragraphs.append((sec_start + len(block.split("\n")), para_text, _score_paragraph(para_text))) + raw_paragraphs.append((para_start, para_text, _score_paragraph(para_text))) # Sort by score descending; use original position as tiebreaker (earlier first) raw_paragraphs.sort(key=lambda t: (-t[2], t[0])) @@ -335,7 +383,7 @@ def extract_key_sections(text: str, max_chars: int) -> str: selected_paras.append((pos, para_text)) budget -= len(para_text) elif budget > 200: - selected_paras.append((pos, para_text[:budget])) + selected_paras.append((pos, _truncate_at_sentence(para_text, budget))) budget = 0 # Re-sort to original document order so the LLM reads coherent text diff --git a/src/preprocessing/text_cleaner.py b/src/preprocessing/text_cleaner.py index efe4662..7dbac0d 100644 --- a/src/preprocessing/text_cleaner.py +++ b/src/preprocessing/text_cleaner.py @@ -30,13 +30,19 @@ from pathlib import Path from typing import List +# Optional numeric section prefix shared with 
llm_text.py patterns. +# Matches e.g. "1.", "2.1.", "3.2.1 " so that "1. References" is also caught. +_NUM_PREFIX = r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" + # --------------------------------------------------------------------------- # Section-level drop patterns # When a line matches one of these, the entire remainder of that page/section # block is discarded (until the next [PAGE N] marker or end of text). # --------------------------------------------------------------------------- _SECTION_DROP_HEADERS: re.Pattern = re.compile( - r"(?i)^\s*(" + r"(?i)^\s*" + + _NUM_PREFIX + + r"(" r"acknowledge?ments?" r"|literature\s+cited" r"|references?\s+cited" @@ -132,8 +138,9 @@ r"|findings?" r"|discussion" r"|conclusions?" + r"|conclusions?\s+and\s+discussion" + r"|discussion\s+and\s+conclusions?" r"|summary\s+and\s+discussion" - r"|acknowledge?ments?" r")\s*[:\.\-]?\s*$" ) From f66d48d851f733425d401cdcaed7e8649e19eeaa Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Thu, 12 Mar 2026 12:55:24 -0700 Subject: [PATCH 19/30] added multi cpu processing option --- classify-extract.py | 249 ++++++++++++++++++++++++++------------------ 1 file changed, 147 insertions(+), 102 deletions(-) diff --git a/classify-extract.py b/classify-extract.py index be71e7c..a18b7fa 100644 --- a/classify-extract.py +++ b/classify-extract.py @@ -29,6 +29,7 @@ import argparse import csv import sys +from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path from src.preprocessing.pdf_text_extraction import extract_text_from_pdf @@ -42,6 +43,112 @@ # --------------------------------------------------------------------------- +def _process_single_pdf( + pdf_path: Path, + model_dir: str, + llm_model: str, + output_dir: Path, + confidence_threshold: float, + max_chars: int, + num_ctx: int, +) -> dict: + """Process a single PDF: classify and optionally extract metrics. + + Designed to run inside a worker process. 
All heavy resources (classifier, + LLM client) are created per-call so that the function is safe for + multiprocessing. + + Returns: + A summary-row dict for the CSV. + """ + row = { + "filename": pdf_path.name, + "classification": "", + "confidence": "", + "pred_prob": "", + "extraction_status": "", + "species_name": "", + "study_location": "", + "study_date": "", + "sample_size": "", + "num_empty_stomachs": "", + "num_nonempty_stomachs": "", + "fraction_feeding": "", + } + + # ── Step 1: Extract text ───────────────── + try: + original_text = extract_text_from_pdf(str(pdf_path)) + except Exception as e: + print(f" [ERROR] Text extraction failed ({pdf_path.name}): {e}", file=sys.stderr) + row["extraction_status"] = "text_extraction_failed" + return row + + if not original_text.strip(): + print(f" [WARN] No text extracted from {pdf_path.name}. Skipping.", file=sys.stderr) + row["extraction_status"] = "empty_text" + return row + + print(f" [INFO] {pdf_path.name}: {len(original_text)} chars", file=sys.stderr) + + # ── Step 2: Classify ────────────────────────── + clf_model, vectorizer, encoder = load_classifier(model_dir) + label, confidence, pred_prob = classify_text( + text=original_text, + model=clf_model, + vectorizer=vectorizer, + encoder=encoder, + threshold=confidence_threshold, + ) + print(f" [CLASSIFIER] {pdf_path.name} → {label} ({confidence:.2%})", file=sys.stderr) + + row["classification"] = label + row["confidence"] = f"{confidence:.4f}" + row["pred_prob"] = f"{pred_prob:.4f}" + + # ── Step 3: Extract ───────────────── + if label == "useful": + print(f" [INFO] {pdf_path.name}: Running LLM extraction...", file=sys.stderr) + + text_for_llm = original_text + if len(text_for_llm) > max_chars: + text_for_llm = extract_key_sections(text_for_llm, max_chars) + print(f" [INFO] {pdf_path.name}: trimmed to {len(text_for_llm)} chars (budget {max_chars})", file=sys.stderr) + + try: + metrics = extract_metrics_from_text( + text=text_for_llm, + model=llm_model, + 
num_ctx=num_ctx, + ) + result = save_extraction_result( + metrics=metrics, + source_file=pdf_path, + original_text=original_text, + output_dir=output_dir, + ) + + m = result["metrics"] + row["extraction_status"] = "success" + row["species_name"] = m.get("species_name") or "" + row["study_location"] = m.get("study_location") or "" + row["study_date"] = m.get("study_date") or "" + row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"] + row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"] + row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"] + row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"] + + except Exception as e: + print(f" [ERROR] LLM extraction failed ({pdf_path.name}): {e}", file=sys.stderr) + row["extraction_status"] = "extraction_failed" + + else: + print(f" [INFO] {pdf_path.name}: Not useful — skipping LLM extraction.", file=sys.stderr) + row["extraction_status"] = "skipped_not_useful" + + return row + + def run_pipeline( input_path: Path, model_dir: str, @@ -50,6 +157,7 @@ def run_pipeline( confidence_threshold: float, max_chars: int, num_ctx: int, + workers: int = 1, ): """Run classify → extract pipeline on one or more PDFs. @@ -68,6 +176,7 @@ def run_pipeline( confidence_threshold: Classifier probability threshold for 'useful'. max_chars: Max characters to send to the LLM. num_ctx: Context window size for Ollama. + workers: Number of parallel worker processes (default: 1 = sequential). 
""" # ── Collect PDF paths ───────────────────────────────────────────────── if input_path.is_dir(): @@ -82,112 +191,41 @@ def run_pipeline( print(f"[ERROR] Input must be a .pdf file or a directory of PDFs: {input_path}", file=sys.stderr) sys.exit(1) - # ── Load classifier once (avoid re-reading model artifacts per file) ── - print("[INFO] Loading classifier...", file=sys.stderr) - try: - clf_model, vectorizer, encoder = load_classifier(model_dir) - except FileNotFoundError as e: - print(f"[ERROR] {e}", file=sys.stderr) - sys.exit(1) - print("[INFO] Classifier loaded.", file=sys.stderr) - output_dir.mkdir(parents=True, exist_ok=True) summary_rows = [] - for idx, pdf_path in enumerate(pdf_paths, start=1): - print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr) - - row = { - "filename": pdf_path.name, - "classification": "", - "confidence": "", - "pred_prob": "", - "extraction_status": "", - "species_name": "", - "study_location": "", - "study_date": "", - "sample_size": "", - "num_empty_stomachs": "", - "num_nonempty_stomachs": "", - "fraction_feeding": "", - } - - # ── Step 1: Extract text ───────────────── - try: - original_text = extract_text_from_pdf(str(pdf_path)) - except Exception as e: - print(f" [ERROR] Text extraction failed: {e}", file=sys.stderr) - row["extraction_status"] = "text_extraction_failed" - summary_rows.append(row) - continue - - if not original_text.strip(): - print(f" [WARN] No text extracted from {pdf_path.name}. 
Skipping.", file=sys.stderr) - row["extraction_status"] = "empty_text" + if workers > 1 and len(pdf_paths) > 1: + print(f"[INFO] Using {workers} worker processes.", file=sys.stderr) + with ProcessPoolExecutor(max_workers=workers) as executor: + futures = { + executor.submit( + _process_single_pdf, + pdf_path, + model_dir, + llm_model, + output_dir, + confidence_threshold, + max_chars, + num_ctx, + ): pdf_path + for pdf_path in pdf_paths + } + for future in as_completed(futures): + pdf_path = futures[future] + try: + row = future.result() + except Exception as exc: + print(f" [ERROR] Worker failed for {pdf_path.name}: {exc}", file=sys.stderr) + row = {"filename": pdf_path.name, "extraction_status": "worker_failed"} + summary_rows.append(row) + else: + for idx, pdf_path in enumerate(pdf_paths, start=1): + print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr) + row = _process_single_pdf( + pdf_path, model_dir, llm_model, output_dir, + confidence_threshold, max_chars, num_ctx, + ) summary_rows.append(row) - continue - - print(f" [INFO] Text size: {len(original_text)} chars", file=sys.stderr) - - # ── Step 2: Classify ────────────────────────── - label, confidence, pred_prob = classify_text( - text=original_text, - model=clf_model, - vectorizer=vectorizer, - encoder=encoder, - threshold=confidence_threshold, - ) - print(f" [CLASSIFIER] → {label} ({confidence:.2%} confidence)", file=sys.stderr) - - row["classification"] = label - row["confidence"] = f"{confidence:.4f}" - row["pred_prob"] = f"{pred_prob:.4f}" - - # ── Step 3: Extract ───────────────── - if label == "useful": - print(f" [INFO] Running LLM extraction...", file=sys.stderr) - - # Trim text to budget using section-priority logic (llm_text.py) - text_for_llm = original_text - if len(text_for_llm) > max_chars: - text_for_llm = extract_key_sections(text_for_llm, max_chars) - print(f" [INFO] Text trimmed to {len(text_for_llm)} chars (budget {max_chars})", file=sys.stderr) - - try: - # 
LLM call (llm_client.py) - metrics = extract_metrics_from_text( - text=text_for_llm, - model=llm_model, - num_ctx=num_ctx, - ) - - # Resolve source pages + save JSON (llm_client.py) - result = save_extraction_result( - metrics=metrics, - source_file=pdf_path, - original_text=original_text, - output_dir=output_dir, - ) - - m = result["metrics"] - row["extraction_status"] = "success" - row["species_name"] = m.get("species_name") or "" - row["study_location"] = m.get("study_location") or "" - row["study_date"] = m.get("study_date") or "" - row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"] - row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"] - row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"] - row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"] - - except Exception as e: - print(f" [ERROR] LLM extraction failed: {e}", file=sys.stderr) - row["extraction_status"] = "extraction_failed" - - else: - print(f" [INFO] Not useful — skipping LLM extraction.", file=sys.stderr) - row["extraction_status"] = "skipped_not_useful" - - summary_rows.append(row) # ── Write summary CSV ───────────────────────────────────────────────── from datetime import datetime @@ -301,6 +339,12 @@ def main(): default=4096, help="Context window size for Ollama (default: 4096).", ) + parser.add_argument( + "--workers", + type=int, + default=1, + help="Number of parallel worker processes (default: 1 = sequential).", + ) args = parser.parse_args() @@ -317,6 +361,7 @@ def main(): confidence_threshold=args.confidence_threshold, max_chars=args.max_chars, num_ctx=args.num_ctx, + workers=args.workers, ) From 208029d6d4493cfb8bf9d5c3c7c247caeed9a8bb Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Thu, 12 Mar 2026 12:57:05 -0700 Subject: [PATCH 20/30] added --workers arg --- scripts/full_pipeline.py | 57 
++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/scripts/full_pipeline.py b/scripts/full_pipeline.py index f7ff28f..e8baac1 100644 --- a/scripts/full_pipeline.py +++ b/scripts/full_pipeline.py @@ -25,8 +25,9 @@ import os import json import argparse +from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path -from typing import Dict +from typing import Dict, Tuple import subprocess import sys @@ -49,6 +50,19 @@ from src.preprocessing.pdf_text_extraction import extract_text_from_pdf_bytes +def _extract_local_pdf(args: Tuple[Path, str]) -> Tuple[str, str, str | None]: + """Worker: read a local PDF and return (txt_name, label, text | None).""" + pdf_path, label = args + try: + with open(pdf_path, "rb") as f: + pdf_bytes = f.read() + text = extract_text_from_pdf_bytes(pdf_bytes) + return (f"{pdf_path.stem}.txt", label, text) + except Exception as e: + print(f"Error processing {pdf_path.name}: {e}") + return (f"{pdf_path.stem}.txt", label, None) + + def run(cmd): print(f"$ {' '.join(cmd)}") r = subprocess.run(cmd) @@ -97,7 +111,7 @@ def process_api_mode(): print(f"Wrote {len(labels)} labeled text files.") -def process_local_mode(data_path: Path): +def process_local_mode(data_path: Path, workers: int = 1): """Process PDFs from local directory.""" if not data_path.exists(): raise RuntimeError(f"Data path does not exist: {data_path}") @@ -114,23 +128,31 @@ def process_local_mode(data_path: Path): out_dir.mkdir(parents=True, exist_ok=True) labels: Dict[str, str] = {} + # Build work items: (pdf_path, label) + work_items = [] for folder, label in [(useful_dir, "useful"), (not_useful_dir, "not-useful")]: pdf_files = list(folder.glob("*.pdf")) print(f"Found {len(pdf_files)} PDFs in local folder '{label}'") - for pdf_path in pdf_files: - try: - with open(pdf_path, "rb") as f: - pdf_bytes = f.read() - text = extract_text_from_pdf_bytes(pdf_bytes) - stem = pdf_path.stem - txt_name = f"{stem}.txt" 
+ work_items.append((pdf_path, label)) + + if workers > 1 and len(work_items) > 1: + print(f"[INFO] Using {workers} worker processes for PDF extraction.") + with ProcessPoolExecutor(max_workers=workers) as executor: + futures = {executor.submit(_extract_local_pdf, item): item for item in work_items} + for future in as_completed(futures): + txt_name, label, text = future.result() + if text is not None: + (out_dir / txt_name).write_text(text, encoding="utf-8") + labels[txt_name] = label + print(f"Processed {txt_name}") + else: + for item in work_items: + txt_name, label, text = _extract_local_pdf(item) + if text is not None: (out_dir / txt_name).write_text(text, encoding="utf-8") labels[txt_name] = label - print(f"Processed {pdf_path.name}") - except Exception as e: - print(f"Error processing {pdf_path.name}: {e}") - continue + print(f"Processed {txt_name}") write_labels(labels, Path("data/labels.json")) print(f"Wrote {len(labels)} labeled text files.") @@ -152,11 +174,18 @@ def main(): group.add_argument("--api", action="store_true", help="Use API mode to download PDFs from Google Drive") group.add_argument("--local", type=Path, metavar="PATH", help="Use local mode with PDFs from specified directory (should contain 'useful' and 'not-useful' subfolders)") + parser.add_argument( + "--workers", + type=int, + default=1, + help="Number of parallel worker processes for PDF extraction (default: 1 = sequential).", + ) + args = parser.parse_args() if args.local: print(f"Running in LOCAL mode with data path: {args.local}") - process_local_mode(args.local) + process_local_mode(args.local, workers=args.workers) else: # args.api print("Running in API mode (Google Drive)") process_api_mode() From 4a01b9d63c1658b373c606bb186c9c9cc7dfc34b Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Thu, 12 Mar 2026 13:20:35 -0700 Subject: [PATCH 21/30] added sequential pdf processing --- scripts/full_pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git
a/scripts/full_pipeline.py b/scripts/full_pipeline.py index e8baac1..d7dd673 100644 --- a/scripts/full_pipeline.py +++ b/scripts/full_pipeline.py @@ -177,15 +177,16 @@ def main(): parser.add_argument( "--workers", type=int, - default=1, - help="Number of parallel worker processes for PDF extraction (default: 1 = sequential).", + default=0, + help="Number of parallel worker processes for PDF extraction (default: 0 = auto-detect CPU count).", ) args = parser.parse_args() + workers = args.workers if args.workers > 0 else os.cpu_count() or 4 if args.local: print(f"Running in LOCAL mode with data path: {args.local}") - process_local_mode(args.local, workers=args.workers) + process_local_mode(args.local, workers=workers) else: # args.api print("Running in API mode (Google Drive)") process_api_mode() From 33adda553a54a39308f7b9de37231e501adcaf78 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Thu, 12 Mar 2026 13:20:49 -0700 Subject: [PATCH 22/30] loads SpellChecker() only once --- src/preprocessing/pdf_text_extraction.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index 2ae6eae..9c922ec 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -29,6 +29,9 @@ # Maximum allowed ratio of misspelled words to total words in a pdf MAX_SPELLING_ERROR_RATE = 0.05 +# Module-level singleton — avoids reloading the dictionary for every PDF +_spell_checker = SpellChecker() + def check_spelling(text: str) -> float: """ @@ -36,11 +39,10 @@ def check_spelling(text: str) -> float: Returns 1 if no words detected in input string """ - spellChecker = SpellChecker() - words = spellChecker.split_words(text) + words = _spell_checker.split_words(text) if len(words) == 0: return 1 - misspelled = spellChecker.unknown(words) + misspelled = _spell_checker.unknown(words) return len(misspelled) / len(words) From 
823fd7f93cc33f8e4cdcd920eb2315cef9485d38 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Thu, 12 Mar 2026 13:21:09 -0700 Subject: [PATCH 23/30] xgboost training set to gpu if available --- src/model/train_model.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/model/train_model.py b/src/model/train_model.py index f2f617d..e8bf427 100644 --- a/src/model/train_model.py +++ b/src/model/train_model.py @@ -92,8 +92,20 @@ def train_pdf_classifier(texts, labels, output_dir="src/model/models"): dtrain = xgb.DMatrix(X_train_vec, label=y_train) dtest = xgb.DMatrix(X_test_vec, label=y_test) + # Use GPU if available (e.g. HPC gpu nodes), fall back to CPU + try: + _probe = xgb.DMatrix(X_train_vec[:1], label=y_train[:1]) + xgb.train({"device": "cuda", "tree_method": "hist"}, _probe, num_boost_round=1) + device, tree_method = "cuda", "hist" + print("[INFO] GPU detected — training with CUDA.") + except xgb.core.XGBoostError: + device, tree_method = "cpu", "hist" + print("[INFO] No GPU available — training on CPU.") + # XGBoost parameters params = { + "device": device, + "tree_method": tree_method, "objective": "binary:logistic", # binary classification "eval_metric": "logloss", # log loss metric "eta": 0.05, # learning rate From 72388c1ecb6020b6b3d6347fd850169860898435 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Thu, 12 Mar 2026 13:30:33 -0700 Subject: [PATCH 24/30] added OCR bypass because it froze workers --- scripts/full_pipeline.py | 93 +++++++++++++++++++++--- src/preprocessing/pdf_text_extraction.py | 10 ++- 2 files changed, 88 insertions(+), 15 deletions(-) diff --git a/scripts/full_pipeline.py b/scripts/full_pipeline.py index d7dd673..b03bb4c 100644 --- a/scripts/full_pipeline.py +++ b/scripts/full_pipeline.py @@ -25,7 +25,8 @@ import os import json import argparse -from concurrent.futures import ProcessPoolExecutor, as_completed +import time +from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError from pathlib import Path
from typing import Dict, Tuple import subprocess @@ -50,13 +51,24 @@ from src.preprocessing.pdf_text_extraction import extract_text_from_pdf_bytes +# Module-level flag set once per worker process via initializer +_worker_skip_ocr: bool = False + + +def _init_worker(skip_ocr: bool) -> None: + """Called once per worker process to set shared config.""" + global _worker_skip_ocr + _worker_skip_ocr = skip_ocr + + def _extract_local_pdf(args: Tuple[Path, str]) -> Tuple[str, str, str | None]: """Worker: read a local PDF and return (txt_name, label, text | None).""" pdf_path, label = args + print(f" [STARTED] {pdf_path.name}", flush=True) try: with open(pdf_path, "rb") as f: pdf_bytes = f.read() - text = extract_text_from_pdf_bytes(pdf_bytes) + text = extract_text_from_pdf_bytes(pdf_bytes, skip_ocr=_worker_skip_ocr) return (f"{pdf_path.stem}.txt", label, text) except Exception as e: print(f"Error processing {pdf_path.name}: {e}") @@ -111,7 +123,7 @@ def process_api_mode(): print(f"Wrote {len(labels)} labeled text files.") -def process_local_mode(data_path: Path, workers: int = 1): +def process_local_mode(data_path: Path, workers: int = 1, skip_ocr: bool = False, timeout: int = 120): """Process PDFs from local directory.""" if not data_path.exists(): raise RuntimeError(f"Data path does not exist: {data_path}") @@ -126,7 +138,17 @@ def process_local_mode(data_path: Path, workers: int = 1): out_dir = Path("data/processed-text") out_dir.mkdir(parents=True, exist_ok=True) - labels: Dict[str, str] = {} + + # Resume support: load existing labels and skip already-processed files + labels_file = Path("data/labels.json") + if labels_file.exists(): + with labels_file.open("r", encoding="utf-8") as f: + labels: Dict[str, str] = json.load(f) + already_done = {name for name in labels if (out_dir / name).exists()} + print(f"[INFO] Resuming — {len(already_done)} files already processed, skipping them.") + else: + labels = {} + already_done = set() # Build work items: (pdf_path, label) 
work_items = [] @@ -134,28 +156,66 @@ def process_local_mode(data_path: Path, workers: int = 1): pdf_files = list(folder.glob("*.pdf")) print(f"Found {len(pdf_files)} PDFs in local folder '{label}'") for pdf_path in pdf_files: + txt_name = f"{pdf_path.stem}.txt" + if txt_name in already_done: + continue work_items.append((pdf_path, label)) + if not work_items: + print("[INFO] All files already processed. Nothing to do.") + write_labels(labels, labels_file) + return + + print(f"[INFO] {len(work_items)} PDFs to process.") + if skip_ocr: + print("[INFO] OCR disabled — using embedded text only (fast mode).") + + total = len(work_items) + done = 0 + failed = 0 + t0 = time.time() + if workers > 1 and len(work_items) > 1: print(f"[INFO] Using {workers} worker processes for PDF extraction.") - with ProcessPoolExecutor(max_workers=workers) as executor: + with ProcessPoolExecutor( + max_workers=workers, + initializer=_init_worker, + initargs=(skip_ocr,), + ) as executor: futures = {executor.submit(_extract_local_pdf, item): item for item in work_items} for future in as_completed(futures): - txt_name, label, text = future.result() + pdf_path, label = futures[future] + try: + txt_name, label, text = future.result(timeout=timeout) + except TimeoutError: + print(f" [TIMEOUT] {pdf_path.name} exceeded {timeout}s — skipped") + failed += 1 + continue + except Exception as exc: + print(f" [ERROR] {pdf_path.name}: {exc}") + failed += 1 + continue + done += 1 if text is not None: (out_dir / txt_name).write_text(text, encoding="utf-8") labels[txt_name] = label - print(f"Processed {txt_name}") + elapsed = time.time() - t0 + print(f" [{done + failed}/{total}] Processed {txt_name} ({elapsed:.0f}s elapsed)") + # Checkpoint labels every 50 files + if done % 50 == 0: + write_labels(labels, labels_file) else: for item in work_items: txt_name, label, text = _extract_local_pdf(item) + done += 1 if text is not None: (out_dir / txt_name).write_text(text, encoding="utf-8") labels[txt_name] = label 
- print(f"Processed {txt_name}") + elapsed = time.time() - t0 + print(f" [{done}/{total}] Processed {txt_name} ({elapsed:.0f}s elapsed)") - write_labels(labels, Path("data/labels.json")) - print(f"Wrote {len(labels)} labeled text files.") + write_labels(labels, labels_file) + print(f"Wrote {len(labels)} labeled text files. ({failed} timed out / failed)") def main(): @@ -180,13 +240,24 @@ def main(): default=0, help="Number of parallel worker processes for PDF extraction (default: 0 = auto-detect CPU count).", ) + parser.add_argument( + "--skip-ocr", + action="store_true", + help="Skip Tesseract OCR fallback — use only embedded text (much faster, recommended for first pass).", + ) + parser.add_argument( + "--timeout", + type=int, + default=120, + help="Per-PDF timeout in seconds (default: 120). PDFs exceeding this are skipped.", + ) args = parser.parse_args() workers = args.workers if args.workers > 0 else os.cpu_count() or 4 if args.local: print(f"Running in LOCAL mode with data path: {args.local}") - process_local_mode(args.local, workers=workers) + process_local_mode(args.local, workers=workers, skip_ocr=args.skip_ocr, timeout=args.timeout) else: # args.api print("Running in API mode (Google Drive)") process_api_mode() diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index 9c922ec..822f865 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -179,7 +179,7 @@ def parse_page_embedded(page: fitz.Page, page_num: int) -> str: return f"[PAGE {page_num}]\n{page_text}" -def extract_text_from_pdf(pdf_path: str) -> str: +def extract_text_from_pdf(pdf_path: str, skip_ocr: bool = False) -> str: text = [] try: with fitz.open(pdf_path) as doc: @@ -190,7 +190,8 @@ def extract_text_from_pdf(pdf_path: str) -> str: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" - if check_spelling(text) > MAX_SPELLING_ERROR_RATE: + if not skip_ocr and 
check_spelling(text) > MAX_SPELLING_ERROR_RATE: + print(f"[INFO] High misspelling rate in {Path(pdf_path).name} — falling back to OCR", file=sys.stderr) text = [] try: with fitz.open(pdf_path) as doc: @@ -203,7 +204,7 @@ def extract_text_from_pdf(pdf_path: str) -> str: return text -def extract_text_from_pdf_bytes(data: bytes) -> str: +def extract_text_from_pdf_bytes(data: bytes, skip_ocr: bool = False) -> str: """Extract text from an in-memory PDF without writing the PDF to disk.""" text = [] try: @@ -215,7 +216,8 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr) return "" - if check_spelling(text) > MAX_SPELLING_ERROR_RATE: + if not skip_ocr and check_spelling(text) > MAX_SPELLING_ERROR_RATE: + print(f"[INFO] High misspelling rate — falling back to OCR", file=sys.stderr) text = [] try: with fitz.open(stream=data, filetype="pdf") as doc: From 22631167fce928a3bbb475694a6c223fb6434229 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Thu, 12 Mar 2026 15:54:43 -0700 Subject: [PATCH 25/30] --labels option to process all useful papers --- extract-from-txt.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/extract-from-txt.py b/extract-from-txt.py index 8b0be91..c8d7dfa 100644 --- a/extract-from-txt.py +++ b/extract-from-txt.py @@ -71,6 +71,7 @@ def run_txt_pipeline( max_chars: int, num_ctx: int, single_file: Path = None, + useful_stems: set = None, ) -> None: """Process every .txt file in *input_dir* through clean → filter → trim → extract. 
@@ -87,6 +88,8 @@ def run_txt_pipeline( txt_paths = [single_file] else: txt_paths = sorted(input_dir.glob("*.txt")) + if useful_stems is not None: + txt_paths = [p for p in txt_paths if p.stem in useful_stems] if not txt_paths: print(f"[ERROR] No .txt files found in: {input_dir}", file=sys.stderr) sys.exit(1) @@ -335,9 +338,28 @@ def main() -> None: default=8192, help="Ollama context window size (default: 8192).", ) + parser.add_argument( + "--labels", + type=str, + default=None, + help="Path to labels.json. When provided, only files labelled 'useful' are processed.", + ) args = parser.parse_args() + # ── Load label filter ─────────────────────────────────────────────── + useful_stems = None + if args.labels: + import json + labels_path = Path(args.labels) + if not labels_path.exists(): + print(f"[ERROR] Labels file not found: {labels_path}", file=sys.stderr) + sys.exit(1) + with open(labels_path, encoding="utf-8") as f: + labels = json.load(f) + useful_stems = {k for k, v in labels.items() if v == "useful"} + print(f"[INFO] Labels filter: {len(useful_stems)} useful papers", file=sys.stderr) + single_file = None if args.file: single_file = Path(args.file) @@ -364,6 +386,7 @@ def main() -> None: max_chars=args.max_chars, num_ctx=args.num_ctx, single_file=single_file, + useful_stems=useful_stems, ) From 3320e37d380060ae1b19f6537afdf679372104aa Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Thu, 12 Mar 2026 15:56:45 -0700 Subject: [PATCH 26/30] fixed name scanning in labels.json --- extract-from-txt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract-from-txt.py b/extract-from-txt.py index c8d7dfa..ccdcc41 100644 --- a/extract-from-txt.py +++ b/extract-from-txt.py @@ -89,7 +89,7 @@ def run_txt_pipeline( else: txt_paths = sorted(input_dir.glob("*.txt")) if useful_stems is not None: - txt_paths = [p for p in txt_paths if p.stem in useful_stems] + txt_paths = [p for p in txt_paths if p.stem in useful_stems or p.name in useful_stems] if not 
txt_paths: print(f"[ERROR] No .txt files found in: {input_dir}", file=sys.stderr) sys.exit(1) From 251d86c8ab558688c9da77a1ed6c36631b21659a Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sat, 14 Mar 2026 15:12:08 -0700 Subject: [PATCH 27/30] reformat --- classify-extract.py | 9 +++++++-- extract-from-txt.py | 1 + src/llm/llm_text.py | 21 +++------------------ src/preprocessing/text_cleaner.py | 4 +--- 4 files changed, 12 insertions(+), 23 deletions(-) diff --git a/classify-extract.py b/classify-extract.py index a18b7fa..9e612ab 100644 --- a/classify-extract.py +++ b/classify-extract.py @@ -222,8 +222,13 @@ def run_pipeline( for idx, pdf_path in enumerate(pdf_paths, start=1): print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr) row = _process_single_pdf( - pdf_path, model_dir, llm_model, output_dir, - confidence_threshold, max_chars, num_ctx, + pdf_path, + model_dir, + llm_model, + output_dir, + confidence_threshold, + max_chars, + num_ctx, ) summary_rows.append(row) diff --git a/extract-from-txt.py b/extract-from-txt.py index ccdcc41..0ac01d1 100644 --- a/extract-from-txt.py +++ b/extract-from-txt.py @@ -351,6 +351,7 @@ def main() -> None: useful_stems = None if args.labels: import json + labels_path = Path(args.labels) if not labels_path.exists(): print(f"[ERROR] Labels file not found: {labels_path}", file=sys.stderr) diff --git a/src/llm/llm_text.py b/src/llm/llm_text.py index 86329c4..b3dd846 100644 --- a/src/llm/llm_text.py +++ b/src/llm/llm_text.py @@ -124,25 +124,12 @@ def _section_priority(heading: str) -> int: ), # frequency of occurrence — standard diet-study metric ( - re.compile( - r"(?i)(frequency\s+of\s+occurrence" - r"|occurrence\s+frequency" - r"|%\s*fo\b" - r"|\bfo\s*=\s*\d" - r"|index\s+of\s+relative\s+importance" - r"|\biri\b)" - ), + re.compile(r"(?i)(frequency\s+of\s+occurrence" r"|occurrence\s+frequency" r"|%\s*fo\b" r"|\bfo\s*=\s*\d" r"|index\s+of\s+relative\s+importance" r"|\biri\b)"), 3, ), # stomach content / 
diet composition — general relevance signal ( - re.compile( - r"(?i)(stomach\s+content" - r"|gut\s+content" - r"|diet\s+composition" - r"|food\s+habits?" - r"|dietary\s+(analysis|study|data))" - ), + re.compile(r"(?i)(stomach\s+content" r"|gut\s+content" r"|diet\s+composition" r"|food\s+habits?" r"|dietary\s+(analysis|study|data))"), 2, ), # general percentage / fraction near gut/stomach context @@ -326,9 +313,7 @@ def extract_key_sections(text: str, max_chars: int) -> str: # ── Phase 1 result: pin the abstract/preamble ────────────────────────── # Sections whose content is always pinned (counted against _ABSTRACT_CAP): # the implicit preamble and any explicitly-named Abstract/Summary section. - _ABSTRACT_HEADING_RE = re.compile( - r"(?i)^\s*" + _NUM_PREFIX + r"(abstract|summary)\s*[:\.]?\s*$" - ) + _ABSTRACT_HEADING_RE = re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(abstract|summary)\s*[:\.]?\s*$") preamble_parts: List[str] = [] body_sections: List[Tuple[int, str, str]] = [] # (start, heading, content) diff --git a/src/preprocessing/text_cleaner.py b/src/preprocessing/text_cleaner.py index 7dbac0d..e41b067 100644 --- a/src/preprocessing/text_cleaner.py +++ b/src/preprocessing/text_cleaner.py @@ -40,9 +40,7 @@ # block is discarded (until the next [PAGE N] marker or end of text). # --------------------------------------------------------------------------- _SECTION_DROP_HEADERS: re.Pattern = re.compile( - r"(?i)^\s*" - + _NUM_PREFIX - + r"(" + r"(?i)^\s*" + _NUM_PREFIX + r"(" r"acknowledge?ments?" 
r"|literature\s+cited" r"|references?\s+cited" From 6920c958338a8988cfd57e37c3805751b2ed96d1 Mon Sep 17 00:00:00 2001 From: Raymond Cen <109717872+raymondcen@users.noreply.github.com> Date: Sun, 15 Mar 2026 15:29:06 -0700 Subject: [PATCH 28/30] Delete data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt --- .../llm_text/Adams_1989_20260312_115204.txt | 160 ------------------ 1 file changed, 160 deletions(-) delete mode 100644 data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt diff --git a/data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt b/data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt deleted file mode 100644 index 124e9ce..0000000 --- a/data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt +++ /dev/null @@ -1,160 +0,0 @@ -[PAGE 1] -Temporal Variation in the Diet of the Gentoo Penguin Pygoscelis papua at Sub-Antarctic -Marion Island - -Author(s): N. J. Adams and N. T. Klages -Source: Colonial Waterbirds , 1989, Vol. 12, No. 1 (1989), pp. 30-36 -Published by: Waterbird Society - [PAGE 2] - Temporal variation in the diet of the Gentoo Penguin - Pygoscelis papua at sub-Antarctic Marion Island - N. J. ADAMS & N. T. KLAGES' - Percy FitzPatrick Institute of African Ornithology, University of Cape Town, - Rondebosch 7700, South Africa - 'Port Elizabeth Museum, P. 0. Box 13147, Humewood 6013, South Africa - Abstract.-The diet of the Gentoo Penguin Pygoscelis papua at sub-Antarctic Marion Island was sampled by stomach pumping at monthly intervals from March 1984 to March 1985. Overall, fish accounted for 53% - of the diet by mass, crustaceans 44% and cephalopods 2%. Crustaceans predominated between March and - June 1984; a marked increase in the proportion of fish in July coincided with the start of egg laying by Gentoo - Penguins. Fish accounted for almost all of the diet in January and March 1985. 
Juvenile nototheniid fish, in particular Notothenia squamifrons, formed the bulk of the fish component; myctophids and Channichthys - rhinoceratus were less common. The pelagic euphausiid Euphausia vallentini accounted for about 50% by mass of the overall crustacean component lumped over the entire study period. The decapod shrimp Nauticaris - marionis was the most important crustacean species consumed during June to September 1984. Cephalopods, predominantly octopods, were taken mainly in February and March 1985. The hypothesis that seasonal - changes in diet occur in direct response to the arrival of crested penguins (Eudyptes spp) early in summer is not supported by the data. Changes in diet appear to reflect local changes in the availability of prey species - within the inshore waters exploited by Gentoo Penguins. Received 10 March 1988, accepted 25 November 1988. - Key words.-diet, Gentoo Penguin, Pygoscelis papua, - -METHODS - A total of 144 stomach samples was collected fr - Gentoo Penguins at three colonies distributed alo a nine kilometre stretch of the east coast of Marion - Island (46? 53'S, 37?54'E) from March 1984 to March - 1985 inclusive. Using the technique of Wilson (1984), we stomach pumped an average of 11 birds a month - as they returned to their landing beaches in the evening. We did not attempt to distinguish the breeding - status of the birds. Gentoo Penguins at Marion Island tend chicks from late July to December. Con- - sequently, during this period, we probably sampled both breeding and non-breeding birds. - Immediately after collection, samples were drained through a 0.5 mm sieve. Subsequently, - drained samples were weighed to the nearest gram and then stored at 4?C until sorting, generally com- - pleted within 24 h of collection. Soft identifiable material was sorted into fish, cephalopod and crustacean - components (further separated into shrimps, euphausiids and amphipods) and weighed. 
Since fish - muscle is digested more rapidly than squid, which is in turn digested more rapidly than crustacea muscle - (Jackson et al. 1987), we may have overestimated, in particular, the mass of crustaceans in mixed samples. - However, most stomach samples from Gentoo Penguins contained one prey type only (see below). - Most fish were recovered with heads separated from the remainder of the body. Consequently, - analyses were based on identification and measure- -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -RESULTS - Mean mass of food samples was 139.2 - + 130.5 g (range: 8- 650 g, n= 144), similar to the 147 g recorded by La Cock et al. - (1984). Mean monthly sample mass was highest in December at 332 g. - General Composition of the Diet - Thirty species or species groups were identified (Table 1), with fish comprising - the largest single group (11 species). However, samples from individual penguins - were largely homogeneous consisting of a single species of crustacean or fish (see also - Jablonski 1985, Croxall et al. 1988). Fiftyeight percent of stomach samples con- - tained only a single prey taxon, 26% contained two taxa and 10% contained three - taxa (analysis based on taxa comprising at least 5% by mass of individual samples). - Fish and crustaceans accounted for - 53.5 and 44.4% respectively of the total annual diet by mass (Table 2). However, con- - siderable changes in proportions of fish, crustaceans and cephalopods occurred - over the year (Table 3, Fig. 1). Crustaceans accounted for over 75% of the diet by mass - from March to June 1984 but decreased to - 0% by March 1985. Cephalopods accounted for more than 10% of the diet - only in February and March 1985. - Fish - The fish component of the diet was dominated by the family Nototheniidae, - particularly Notothenia squamifrons. It is likely that most of the unidentified - juvenile nototheniids were also this species - (Table 1). 
Nototheniids accounted for over - 90% of the fish component during June to - October 1984. Myctophids and the channichthyid Channichthys rhinoceratus ap- - peared more commonly in the diet from - October and accounted for most of the food items in March 1985 (Table 3). - Fish of 78.8 - 84.1 mm (5.9 - 7.3 were the most frequently taken size clas - When plotted on a monthly basis, the was no evidence of any progressive chan - in size-class distribution throughout t year. However, during January and Fe - ruary smaller fish between 28.6 - 41.8 m - (0.2 - 0.7 g) accounted for the most fr quently taken size class, whereas larger - dividuals (76.2 - 84.1 mm, 5.3 - 7.3 g) we more numerous during the remaini - months. Standard parameters for all fish species are given in Table 4. - Crustaceans - Euphausia vallentini and Nauticaris marionis made up nearly all of the crusta - cean component. During winter (March to - August 1984), N. marionis formed the largest part of the crustacean component, but E. - vallentini subsequently increased to nearly - 100% in January and February 1985 - (Table 3). Mean length of E. vallentini was - 22.5 mm ? 1.74 (n = 681), considerably smaller than that of N. marionis (35.1 mm - + 5.18, n = 1481). - Cephalopods - Cephalopods accounted for just over - 10% of the diet during February and - March 1985 but seldom occurred in the remaining months (Table 3). Most - cephalopods were small octopods (DML < - 15 mm). The only squid identified were juveniles of the species Kondakovia lon- - gimana of estimated dorsal mantle length and mass of 58.4 ? 11.4 mm and 9.6 ? - 3.8 g (n = 4) respectively. - -[PAGE 5] - ADAMS & KLAGES * GENTOO PENGUIN DIET by month. 
- Species Mar Apr May Jun Jul Aug Sep Oct Nov Dec Jan Feb Mar - Crustaceans - Euphausia va lentini 16.5 42.8 47.1 4.5 4.2 8.9 17.6 23.6 29.7 11.6 1.0 36.2 0 - Nauticaris marionis 65.8 42.8 51.0 70.8 23.9 26.7 38.1 4.7 4.2 7.0 0 0 0 - Themisto gaudichaudii 0 0 0 0 0 0 0 0 0 0 0 1.8 0 - Fish - Nototheniidae 0 0 0 23.5 66.4 62.5 44.3 68.4 45.1 79.1 71.4 35.2 13.5 - Myctophidae 0 14.4 0 0 0.5 1.9 0 1.6 20.8 1.9 4.8 10.0 48.2 - Channichythidae 0 0 0 0 0 0 0 0 0 0 21.9 6.1 23.7 - Muraenolpidae 0 0 0 0 4.3 0 0 0 0 0 0 0 0 - Unidentified 16.8 0 1.4 0 0 0 0 0 0 0 0 0 0 - Cephalopods 0 0 0.5 1.2 0.7 0 0 1.7 0.2 0.4 0.9 10.7 14.6 diet of Gentoo Penguins at Marion Island - was Notothenia squamifrons (incorrectly identified as Harpagifer georgianus in La - Cock et al. 1984, N. T. Klages pers. obs.) - whereas N. rossi and N. larseni were consumed at southerly sites (Croxall and Pr- - ince 1980a, Jablonski 1985). N. squamifrons is widely distributed occurring around the - islands of the southern Indian and, less commonly, the South Atlantic Oceans - MAMJJii A S N D J F M guin at Marion Island throughout a single year. Un- - shaded segment: crustaceans, stippled segment: - fish, shaded segment: cephalopods. - (Duhamel et al. 1983). N. squamifrons taken by Gentoo Penguins at Marion Island, all - in the range 28 - 134 mm, were larval or juvenile fish of 0 - 4 years (Duhamel and - Ozouf-Costaz 1985). The occurrence of large numbers of unidentified juvenile - nototheniids in the diet in November and - December preceded an increase in the relative abundance of small-size class N. - squamifrons in the diet in January and Feb ruary 1985. This may reflect growth o - these larval fish. - Juvenile Channichthys rhinoceratus, were the largest and second most abundant fish - in the diet. Adults were presumably too large for consumption. 
The species had - previously been considered a demersal species endemic to the colder waters - around Kerguelen (48?27'-50?S, 60?27'- - 70?35'E) and Heard Islands (53?01'S, - 73?23'E) (Kock et al. 1985). The presence of this species in the relatively warm waters - around Marion Island was surprising. - However, hydrographical evidence (Benon and Murail 1979) and sampling of - zooplankton (Grindley and Lane 1979, - Boden and Parker 1986) suggests that advection of foreign water masses past the - island with associated fauna may occur - (Boden & Parker 1986). - The appearance of myctophids in the diet of Gentoo Penguins coincided with - the increase in relative abundance of these fish in the diet of King Penguins Apteno- - dytes patagonicus during summer, suggesting increased availability (Adams and - Klages 1987). Small numbers of myctophids have previously been reported in - the diet of Gentoo Penguins of unknown status at Marion Island (La Cock et al. - 1984). - U9 - - - CO a - C, -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 6] - recovered from Gentoo Penguin stomach samples at Marion Island. - Species OD (mm) Estimated SL (mm) Estimated mass (g) - N Mean SD Range Mean SD Range Mean SD Range - Nototheniidae - Notothenia squamifrons 1109 2.43 0.87 (0.8-5.8) 71.1 ? 18.7 (23.3-155.5) 5.6?4.1 (0.1-56.3) - Notothenia acuta 47 3.19?0.64 (1.38-4.7) 85.0+20.3 (38.6-126.4) 9.2?6.6 (0.6-28.3) - Dissostichus eleginoides 17 3.52?0.73 (2.19-5.27) 105.7?42.6 (42.4-215.1) 10.6+ 16.9 (0.3-65.0) - Channichthyidae - Channichthys rhinoceratus 170 1.48?0.38 (0.97-2.45) 170.3+36. 
\ No newline at end of file From 41c49a49b70ccee430d6d633f8f5db2687c03847 Mon Sep 17 00:00:00 2001 From: Raymond Cen <109717872+raymondcen@users.noreply.github.com> Date: Sun, 15 Mar 2026 15:29:16 -0700 Subject: [PATCH 29/30] Delete data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt --- .../Adams_1989_20260312_115204.txt | 433 ------------------ 1 file changed, 433 deletions(-) delete mode 100644 data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt diff --git a/data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt b/data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt deleted file mode 100644 index 2282198..0000000 --- a/data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt +++ /dev/null @@ -1,433 +0,0 @@ -[PAGE 1] -Temporal Variation in the Diet of the Gentoo Penguin Pygoscelis papua at Sub-Antarctic -Marion Island - -Author(s): N. J. Adams and N. T. Klages -Source: Colonial Waterbirds , 1989, Vol. 12, No. 1 (1989), pp. 30-36 -Published by: Waterbird Society - [PAGE 2] - Temporal variation in the diet of the Gentoo Penguin - Pygoscelis papua at sub-Antarctic Marion Island - N. J. ADAMS & N. T. KLAGES' - Percy FitzPatrick Institute of African Ornithology, University of Cape Town, - Rondebosch 7700, South Africa - 'Port Elizabeth Museum, P. 0. Box 13147, Humewood 6013, South Africa - Abstract.-The diet of the Gentoo Penguin Pygoscelis papua at sub-Antarctic Marion Island was sampled by stomach pumping at monthly intervals from March 1984 to March 1985. Overall, fish accounted for 53% - of the diet by mass, crustaceans 44% and cephalopods 2%. Crustaceans predominated between March and - June 1984; a marked increase in the proportion of fish in July coincided with the start of egg laying by Gentoo - Penguins. Fish accounted for almost all of the diet in January and March 1985. 
Juvenile nototheniid fish, in particular Notothenia squamifrons, formed the bulk of the fish component; myctophids and Channichthys - rhinoceratus were less common. The pelagic euphausiid Euphausia vallentini accounted for about 50% by mass of the overall crustacean component lumped over the entire study period. The decapod shrimp Nauticaris - marionis was the most important crustacean species consumed during June to September 1984. Cephalopods, predominantly octopods, were taken mainly in February and March 1985. The hypothesis that seasonal - changes in diet occur in direct response to the arrival of crested penguins (Eudyptes spp) early in summer is not supported by the data. Changes in diet appear to reflect local changes in the availability of prey species - within the inshore waters exploited by Gentoo Penguins. Received 10 March 1988, accepted 25 November 1988. - Key words.-diet, Gentoo Penguin, Pygoscelis papua, sub-Antarctic. - Colonial Waterbirds 12(1): 30-36, 1989 - Studies of the diets of both migrant and resident penguins in the Southern - Ocean region have concentrated on birds during the chick-rearing period, with little - information for the non-breeding season - (see Croxall and Lishman 1987). The Gentoo Penguin Pygoscelis papua is a year- - round resident at sub-Antarctic Marion Island (van Zinderen Bakker 1971), and at - some other breeding sites in the Southern - Ocean (Watson 1975), affording opportunities for sampling its diet throughout - the year. - Williams (1981) suggested that seasonal changes in the diet and earlier, winter - breeding by Gentoo Penguins at Marion - Island, compared to populations at higher latitudes, ensured a crustacean diet during - incubation and early chick rearing. 
He suggested that a switch by Gentoo Pen- - guins to a fish diet later in the season, deduced from a change in color of the - excreta, was a response to the arrival of large numbers of two species of potentially - competing, summer-resident crested penguins, the Macaroni Eudyptes chrysolophus - and the Rockhopper E. chrysocome. However, food samples collected at Marion Is- - land in September 1982 showed that fish are common in the diet of Gentoo Pen- - guins before the arrival of the other penguin species (La Cock et al. 1984). We re- - port here on a more comprehensive set o samples collected throughout a single ye - at Marion Island. Our principal objecti was to investigate temporal changes i - diet. - METHODS - A total of 144 stomach samples was collected fr - Gentoo Penguins at three colonies distributed alo a nine kilometre stretch of the east coast of Marion - Island (46? 53'S, 37?54'E) from March 1984 to March - 1985 inclusive. Using the technique of Wilson (1984), we stomach pumped an average of 11 birds a month - as they returned to their landing beaches in the evening. We did not attempt to distinguish the breeding - status of the birds. Gentoo Penguins at Marion Island tend chicks from late July to December. Con- - sequently, during this period, we probably sampled both breeding and non-breeding birds. - Immediately after collection, samples were drained through a 0.5 mm sieve. Subsequently, - drained samples were weighed to the nearest gram and then stored at 4?C until sorting, generally com- - pleted within 24 h of collection. Soft identifiable material was sorted into fish, cephalopod and crustacean - components (further separated into shrimps, euphausiids and amphipods) and weighed. Since fish - muscle is digested more rapidly than squid, which is in turn digested more rapidly than crustacea muscle - (Jackson et al. 1987), we may have overestimated, in particular, the mass of crustaceans in mixed samples. 
- However, most stomach samples from Gentoo Penguins contained one prey type only (see below). - Most fish were recovered with heads separated from the remainder of the body. Consequently, - analyses were based on identification and measure- -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 3] - ADAMS & KLAGES * GENTOO PENGUIN DIET ment of otoliths (Adams and Klages, 1987). We esti- - mated standard length (SL) and mass of fish from regressions relating otolith diameter to these - parameters. Regressions not available in Adams and - Klages (1987) and Brown and Klages (1987) are given in Duhamel (1981), Hecht and Cooper (1986), and - Hecht (1987). Since otoliths were removed from intact fish crania, digestion and hence measurement - error were minimal. - Crustacean species were identified with the aid of published keys (Bowman and Gruner, 1973; - Kirkwood, 1982; 1984). Total length of intact individuals was measured from the anterior margin of an - eye to the tip of the telson. Numbers of crustaceans in each sample were estimated by dividing total mass - by the average mass of an individual crustacean of each respective species. Actual numbers and fresh - mass of prey ingested will be underestimated in welldigested samples. - Most of the small cephalopods were recovered intact, and numbers were counted directly. The iden- - tification of squid beaks was facilitated by a reference collection at the Port Elizabeth Museum, and con- - firmed by comparison with the literature (Clarke, - 1986). We estimated dorsal mantle length (DML) and mass of squid from regressions relating lower ros- - trum length (LRL) to these parameters (Adams and - Klages, 1987). Juvenile octopods were not identified. - samples at Marion Island. - Prey species Total % numbers % frequency number of prey class of occurrence - FISH - Notothiniidae - Notothenia squamifrons - Unident.juv. 
nototheniids - Notothenia acuta - Dissostichus eleginoides - Channichthyidae - Channichthys rhinoceratus - Myctophidae - Protomyctophum normani - Gymnoscopelus nicholsi - Protomyctophum tenisoni - Krefftichthys anderssoni - Protomyctophum bolini unid. myctophids - Electrona carlsbergi - Gymnoscopelus sp. - Muraenolepidae - Muraenolepts sp. - Unident. fish - Euphausiacea - Euphausia vallentini - Hippolytidae - Nauticaris marionis - Amphipoda - Themisto gaudichaudii - Hyperiella sp. - Unident. amphipods - Vibilia sp. - Primno sp. - Nematocarcinidae - Nematocarcinus longirostris - Octopoda - Unident.juv. octopods - Decapoda - Unident.juv. squid - Kondakovia longimana - Onychoteuthidae LRL >2mm - Onychoteuthidae LRL < 2mm - Margarella expansa - Ophiuroidea - 58.6 - 21.1 - 1.3 - 0.5 - 50.7 - 14.6 - 11.1 - 6.3 - 6.3 - 5.8 - 1.5 - 1.4 - 1.1 - 1.0 - 0.7 - 0.2 - <0.1 - 12.5 - 4.2 - 4.2 - 4.9 - 3.5 - 4.2 - 4.9 - 1.4 - 0.7 - 0.2 - 0.1 - 0.7 - 1.4 - 86.4 - 25.7 - 33.3 - 13.5 - 0.1 - <0.1 - <0.1 - <0.1 - <0.1 - <0.1 - 4.9 - 2.1 - 2.1 - 1.4 - 1.4 - 1.4 - 94.9 - 2.9 - 1.1 - 0.6 - 0.6 - 18.8 - 9.0 - 3.5 - 3.5 - 3.5 - 2.1 -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 4] - RESULTS - Mean mass of food samples was 139.2 - + 130.5 g (range: 8- 650 g, n= 144), similar to the 147 g recorded by La Cock et al. - (1984). Mean monthly sample mass was highest in December at 332 g. - General Composition of the Diet - Thirty species or species groups were identified (Table 1), with fish comprising - the largest single group (11 species). However, samples from individual penguins - were largely homogeneous consisting of a single species of crustacean or fish (see also - Jablonski 1985, Croxall et al. 1988). Fiftyeight percent of stomach samples con- - tained only a single prey taxon, 26% contained two taxa and 10% contained three - taxa (analysis based on taxa comprising at least 5% by mass of individual samples). 
- Fish and crustaceans accounted for - 53.5 and 44.4% respectively of the total annual diet by mass (Table 2). However, con- - siderable changes in proportions of fish, crustaceans and cephalopods occurred - over the year (Table 3, Fig. 1). Crustaceans accounted for over 75% of the diet by mass - from March to June 1984 but decreased to - 0% by March 1985. Cephalopods accounted for more than 10% of the diet - only in February and March 1985. - Fish - The fish component of the diet was dominated by the family Nototheniidae, - particularly Notothenia squamifrons. It is likely that most of the unidentified - juvenile nototheniids were also this species - (Table 1). Nototheniids accounted for over - 90% of the fish component during June to - October 1984. Myctophids and the channichthyid Channichthys rhinoceratus ap- - peared more commonly in the diet from - October and accounted for most of the food items in March 1985 (Table 3). - Fish of 78.8 - 84.1 mm (5.9 - 7.3 were the most frequently taken size clas - When plotted on a monthly basis, the was no evidence of any progressive chan - in size-class distribution throughout t year. However, during January and Fe - ruary smaller fish between 28.6 - 41.8 m - (0.2 - 0.7 g) accounted for the most fr quently taken size class, whereas larger - dividuals (76.2 - 84.1 mm, 5.3 - 7.3 g) we more numerous during the remaini - months. Standard parameters for all fish species are given in Table 4. - Crustaceans - Euphausia vallentini and Nauticaris marionis made up nearly all of the crusta - cean component. During winter (March to - August 1984), N. marionis formed the largest part of the crustacean component, but E. - vallentini subsequently increased to nearly - 100% in January and February 1985 - (Table 3). Mean length of E. vallentini was - 22.5 mm ? 1.74 (n = 681), considerably smaller than that of N. marionis (35.1 mm - + 5.18, n = 1481). 
- Cephalopods - Cephalopods accounted for just over - 10% of the diet during February and - March 1985 but seldom occurred in the remaining months (Table 3). Most - cephalopods were small octopods (DML < - 15 mm). The only squid identified were juveniles of the species Kondakovia lon- - gimana of estimated dorsal mantle length and mass of 58.4 ? 11.4 mm and 9.6 ? - 3.8 g (n = 4) respectively. - DISCUSSION - Fish (Ealey 1954, Volkman et al. 198 and, in particular, nototheniids may fo - a substantial portion of the diet of the G too Penguin throughout its range. T - most important prey species by mass in - Fish Crustaceans Cephalopods - Wet mass (%) Mean 53.5 44.4 2.1 - Range (0-100) (0-100) (0-51.8) -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 5] - ADAMS & KLAGES * GENTOO PENGUIN DIET by month. - Species Mar Apr May Jun Jul Aug Sep Oct Nov Dec Jan Feb Mar - Crustaceans - Euphausia va lentini 16.5 42.8 47.1 4.5 4.2 8.9 17.6 23.6 29.7 11.6 1.0 36.2 0 - Nauticaris marionis 65.8 42.8 51.0 70.8 23.9 26.7 38.1 4.7 4.2 7.0 0 0 0 - Themisto gaudichaudii 0 0 0 0 0 0 0 0 0 0 0 1.8 0 - Fish - Nototheniidae 0 0 0 23.5 66.4 62.5 44.3 68.4 45.1 79.1 71.4 35.2 13.5 - Myctophidae 0 14.4 0 0 0.5 1.9 0 1.6 20.8 1.9 4.8 10.0 48.2 - Channichythidae 0 0 0 0 0 0 0 0 0 0 21.9 6.1 23.7 - Muraenolpidae 0 0 0 0 4.3 0 0 0 0 0 0 0 0 - Unidentified 16.8 0 1.4 0 0 0 0 0 0 0 0 0 0 - Cephalopods 0 0 0.5 1.2 0.7 0 0 1.7 0.2 0.4 0.9 10.7 14.6 diet of Gentoo Penguins at Marion Island - was Notothenia squamifrons (incorrectly identified as Harpagifer georgianus in La - Cock et al. 1984, N. T. Klages pers. obs.) - whereas N. rossi and N. larseni were consumed at southerly sites (Croxall and Pr- - ince 1980a, Jablonski 1985). N. squamifrons is widely distributed occurring around the - islands of the southern Indian and, less commonly, the South Atlantic Oceans - MAMJJii A S N D J F M guin at Marion Island throughout a single year. 
Un- - shaded segment: crustaceans, stippled segment: - fish, shaded segment: cephalopods. - (Duhamel et al. 1983). N. squamifrons taken by Gentoo Penguins at Marion Island, all - in the range 28 - 134 mm, were larval or juvenile fish of 0 - 4 years (Duhamel and - Ozouf-Costaz 1985). The occurrence of large numbers of unidentified juvenile - nototheniids in the diet in November and - December preceded an increase in the relative abundance of small-size class N. - squamifrons in the diet in January and Feb ruary 1985. This may reflect growth o - these larval fish. - Juvenile Channichthys rhinoceratus, were the largest and second most abundant fish - in the diet. Adults were presumably too large for consumption. The species had - previously been considered a demersal species endemic to the colder waters - around Kerguelen (48?27'-50?S, 60?27'- - 70?35'E) and Heard Islands (53?01'S, - 73?23'E) (Kock et al. 1985). The presence of this species in the relatively warm waters - around Marion Island was surprising. - However, hydrographical evidence (Benon and Murail 1979) and sampling of - zooplankton (Grindley and Lane 1979, - Boden and Parker 1986) suggests that advection of foreign water masses past the - island with associated fauna may occur - (Boden & Parker 1986). - The appearance of myctophids in the diet of Gentoo Penguins coincided with - the increase in relative abundance of these fish in the diet of King Penguins Apteno- - dytes patagonicus during summer, suggesting increased availability (Adams and - Klages 1987). Small numbers of myctophids have previously been reported in - the diet of Gentoo Penguins of unknown status at Marion Island (La Cock et al. - 1984). - U9 - - - CO a - C, -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 6] - recovered from Gentoo Penguin stomach samples at Marion Island. 
- Species OD (mm) Estimated SL (mm) Estimated mass (g) - N Mean SD Range Mean SD Range Mean SD Range - Nototheniidae - Notothenia squamifrons 1109 2.43 0.87 (0.8-5.8) 71.1 ? 18.7 (23.3-155.5) 5.6?4.1 (0.1-56.3) - Notothenia acuta 47 3.19?0.64 (1.38-4.7) 85.0+20.3 (38.6-126.4) 9.2?6.6 (0.6-28.3) - Dissostichus eleginoides 17 3.52?0.73 (2.19-5.27) 105.7?42.6 (42.4-215.1) 10.6+ 16.9 (0.3-65.0) - Channichthyidae - Channichthys rhinoceratus 170 1.48?0.38 (0.97-2.45) 170.3+36.8 (127.0-238.3) 21.1 ? 18.9 (6.8-61.9) - Myctophidae - Protomvctophum normani 61 2.38?0.10 (2.19-2.6) 77.7? 3.7 (70.0-85.2) 6.4? 0.9 (4.7-8.3) - Protomyctophum tenisoni 57 1.63?0.13 (1.3-1.78) 49.2 ?4.9 (37.0-54.8) 1.8 ?0.5 (0.8-2.3) - Krefftichthys anderssoni 35 1.51 0.25 (0.97-1.87) 49.6? 11.2 (25.2-66.0) 2.0? 1.1 (0.2-4.3) - Gymnoscopelus nicholsi 12 6.20?0.19 (5.91-6.55) 156.2+5.7 (147.4-166.7) 44.3?4.8 (37.4-53.3) - Protomvctophum bolini 4 2.00?0.38 (1.7-2.35) 57.7 12.2 (46.1-67.3) 1.5?0.8 (0.8-2.2) - Muraenolepidae - Muraenolepis sp. 7 2.7?0.58 (1.87-3.4) 141.1 46.6 (76.8-200.8) 24.9?20.8 (2.3-60.6) - Two species of crustacean Euphausia vallentini and Nauticaris marionis, were of - almost equal importance by mass at Marion Island. This is in contrast to the situa- - tion at sites south of the Antarctic Polar - Front, where the crustacean component of the diet of the Gentoo Penguin is domi- - nated by a single species, E. superba. E. vallentini was the most important prey con- - sumed by Macaroni and Rockhopper penguins during the 1984-1985 summer, and - remained abundant in the diet of Rockhopper Penguins in March 1985 (Brown - and Klages 1987) when it was absent from the diet of the Gentoo Penguin. Both - species of crested penguins probably forage farther offshore during late chick- - rearing than do Gentoo Penguins (Brown - 1987, Adams and Wilson 1987). This may indicate a movement of euphausiids out of - the immediate inshore waters during February and March 1985. 
- Due to its benthic nature, adult N. - marionis may be available to Gentoo Penguins only within a few kilometers of the - shore (Adams and Wilson 1987). The size - (average length: 35.1 mm) of N. marionis taken by Gentoo Penguins at Marion Is- - land in 1984-1985 is clearly greater than juveniles (maximum length: 23 mm) taken - by crested penguins during December - 1983 to February 1984 (Brown and Klages - 1987) and by Gentoo Penguins in September 1982 (La Cock et al. 1984). The ab- - sence of juvenile N. marionis in the diet of - Gentoo and crested penguins at Marion Island during January to March 1985 is dif- - ficult to explain, since at least some adult individuals recovered during April to Sep- - tember 1984 were ovigerous (N. J. Adams pers. obs.). - The occurrence of octopods in the diet of Gentoo Penguins is apparently unique - to the Marion Island site. The appearance of juvenile octopods in the diet coincided - with their occurrence in the diet of Rockhopper Penguins late in the chick-rearing - period (Brown and Klages 1987). Adult octopods are generally benthic and solit- - ary. The appearance of large numbers of juveniles in the diet suggests highly sea- - sonal spawning, coupled with a tendency to form dense aggregations in shallower - water. In contrast, the small number of juvenile squid in the diet of the inshore - foraging Gentoo Penguin again emphasizes the generally pelagic nature of - squid (Adams and Klages 1987). -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 7] - ADAMS & KLAGES * GENTOO PENGUIN DIET - Gentoo Penguins apparently switched from a largely crustacean diet during - March to June 1984 to a largely fish diet during July 1984 to March 1985. This - change coincided with peak egg laying and could not be considered as a direct re- - sponse to the arrival, in October and - November, of the largely, euphausiid consuming and hence potentially competing - crested penguins (cf. Williams 1981). 
- Moreover, the most important crustacean component by mass in the diet during - March 1984 to September 1984 was adult - N. marionis (not taken by Macaroni and - Rockhopper penguins) and not krill - (euphausiids) as intimated by Williams - (1981). The large variation in abundance and prey-species composition of crusta- - ceans in penguin diet from year-to-year, indicated by the differences in the diet of - the Macaroni and Rockhopper Penguins in two years at Marion Island (Brown and - Klages 1987) and Gentoo Penguins in - March 1984 and March 1985 may reflect a greater degree of unpredictability in - availability of crustacean prey than at higher latitudes. Switches in diet may - merely reflect local changes in availability of particular prey within the inshore area - exploited by Gentoo Penguins. - Average meal size of Gentoo Penguins at Marion Island is small (La Cock et al. - 1984, this study) compared to those recovered from breeding penguins at higher - latitudes (Croxall and Prince 1980a, Jablonski 1985, Volkman et al. 1980). Evalua- - tion of the magnitude of this difference is complicated by the unknown ratio of - breeders to non-breeders sampled at Marion Island. However, the difference ap- - pears real and may reflect reduced food availability compared to more southerly - sites. This view is supported by the small total breeding population (Adams and - Wilson 1987), low breeding success and long growth period (Williams 1980) of - Gentoo Penguins at Marion Island and suggests that the population is food limited - (La Cock et al. 1982). However, in contrast to the southern populations which breed - in summer (Croxall and Prince 1980b), - Gentoo Penguins at Marion Island begin breeding in the austral winter (Williams - 1980). The infrequent feeding of King penguin chicks (Cherel et al. 1987, Pers. - obs.) and dispersal of Eudyptes penguins away from the island during winter - suggest food availability is low. 
Consequently, having excluded direct competi- - tion with crested penguins for crustacean prey (Williams 1981), the reason for winter - breeding by Gentoo Penguins remains unclear.[PAGE 8] - Duhamel, G. 1981. Characteristiques biologiques des principales espeches de poissons du plateau conti- - nental des iles Kerguelen. Cybium 3e serie 5: - Duhamel, G., Hureau, J. C. and Ozouf-Costaz, C. - 1983. Ecological survey of notothenioid fishes in the Southern Ocean from Bouvet to Kerguelen - Islands. Memoirs of the National Institute of Polar - Research Special Issue 27: 176-182. - Duhamel, G. and Ozouf-Costaz, C. 1985. Age, growth and reproductive biology of Notothenia squamifrons - Gunther 1880 from the Indian sector of the - Southern Ocean. Polar Biology 4: 143-153. - Ealey, E. H. M. 1954. Analysis of stomach contents of some Heard Island birds. Emu 54: 204-210. - Grindley, J. R. and Lane, S. B. 1979. Zooplankton around Marion and Prince Edward Islands. - Comite National Francais des Rescherches Antartiques 44: 111-126. - Hecht, T. 1987. A guide to the otoliths of Southern - Ocean fishes. South African Journal of Antarctica - Research 17: 1-87. - Hecht, T. and Cooper, J. 1986. Length/mass relationships, energetic content and otoliths of Antarctic - cod Paranotothenia magellanica (Nototheniidae: - Pisces) at sub-Antarctic Marion Island. South African Journal of Zoology 21: 294-296. - Jablonski, B. 1985. The diet of penguins on King - George Island, South Shetland Islands. Acta - Zoologica Cracoviensia 29: 117-186. - Jackson, S., Duffy, D. C. and Jenkins, J. F. G. 1987. - Gastric digestion in marine vertebrate predators: - in vitro standards. Functional Ecology 1: 287-291. - Kirkwood, J. M. 1982. A guide to the Euphausiacea of the Southern Ocean. ANARE Research Notes - Kirkwood, J. M. 1984. A guide to the Decap'da of the Southern Ocean. ANARE Research Notes 11: - Kock, K. H., Duhamel, G. and Hureau, J. C. 1985. 
- Biology and status of exploited Antarctic fish stocks: a review. BIOMASS Scientific Series 6: - La Cock, G. D., Hecht, T. and Klages, N. 1984.The winter diet of Gentoo Penguins at Marion Island. - Ostrich 55: 188-191. - Volkman, N. J., Presler, P. and Trivelpiece, W. 1980. - Diets of pygoscelid penguins at King George Island, Antarctica. Condor 82: 373-378. - Williams, A. J. 1980. Aspects of the breeding biology of the Gentoo Penguin Pygoscelis papua. Gerfaut - Williams, A. J. 1981. Factors affecting the time of breeding of Gentoo Penguins at Marion Island. In - Cooper J. (ed.) Proceedings of the symposium on birds of the sea and shore, 1979. 451-459. Cape - Town: African Seabird Group. - Wilson, R. P. 1984. An improved stomach pump for penguins and other seabirds. Journal of Field Or- - nithology 55: 109-112. -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC \ No newline at end of file From 46dabc1decacf83abd9d1580996a89f25194805b Mon Sep 17 00:00:00 2001 From: raymondcen Date: Sun, 15 Mar 2026 15:30:00 -0700 Subject: [PATCH 30/30] deleted .txt files --- .../Adams_1989_20260227_210332.txt | 75 --- .../Adams_1989_20260312_115204.txt | 433 ------------------ 2 files changed, 508 deletions(-) delete mode 100644 data/cleaned-text/Adams_1989_20260227_210332.txt delete mode 100644 data/cleaned-text/text_cleaner/Adams_1989_20260312_115204.txt diff --git a/data/cleaned-text/Adams_1989_20260227_210332.txt b/data/cleaned-text/Adams_1989_20260227_210332.txt deleted file mode 100644 index 61b7abc..0000000 --- a/data/cleaned-text/Adams_1989_20260227_210332.txt +++ /dev/null @@ -1,75 +0,0 @@ -[PAGE 1] -Temporal Variation in the Diet of the Gentoo Penguin Pygoscelis papua at Sub-Antarctic -Marion Island - -Author(s): N. J. Adams and N. T. Klages -Source: Colonial Waterbirds , 1989, Vol. 12, No. 1 (1989), pp. 
30-36 -Published by: Waterbird Society - [PAGE 2] - Temporal variation in the diet of the Gentoo Penguin - Pygoscelis papua at sub-Antarctic Marion Island - N. J. ADAMS & N. T. KLAGES' - Percy FitzPatrick Institute of African Ornithology, University of Cape Town, - Rondebosch 7700, South Africa - 'Port Elizabeth Museum, P. 0. Box 13147, Humewood 6013, South Africa - Abstract.-The diet of the Gentoo Penguin Pygoscelis papua at sub-Antarctic Marion Island was sampled by stomach pumping at monthly intervals from March 1984 to March 1985. Overall, fish accounted for 53% - of the diet by mass, crustaceans 44% and cephalopods 2%. Crustaceans predominated between March and - June 1984; a marked increase in the proportion of fish in July coincided with the start of egg laying by Gentoo - Penguins. Fish accounted for almost all of the diet in January and March 1985. Juvenile nototheniid fish, in particular Notothenia squamifrons, formed the bulk of the fish component; myctophids and Channichthys - rhinoceratus were less common. The pelagic euphausiid Euphausia vallentini accounted for about 50% by mass of the overall crustacean component lumped over the entire study period. The decapod shrimp Nauticaris - marionis was the most important crustacean species consumed during June to September 1984. Cephalopods, predominantly octopods, were taken mainly in February and March 1985. The hypothesis that seasonal - changes in diet occur in direct response to the arrival of crested penguins (Eudyptes spp) early in summer is not supported by the data. Changes in diet appear to reflect local changes in the availability of prey species - within the inshore waters exploited by Gentoo Penguins. Received 10 March 1988, accepted 25 November 1988. - Key words.-diet, Gentoo Penguin, Pygoscelis papua, - -RESULTS - Mean mass of food samples was 139.2 - + 130.5 g (range: 8- 650 g, n= 144), similar to the 147 g recorded by La Cock et al. - (1984). 
Mean monthly sample mass was highest in December at 332 g. - General Composition of the Diet - Thirty species or species groups were identified (Table 1), with fish comprising - the largest single group (11 species). However, samples from individual penguins - were largely homogeneous consisting of a single species of crustacean or fish (see also - Jablonski 1985, Croxall et al. 1988). Fiftyeight percent of stomach samples con- - tained only a single prey taxon, 26% contained two taxa and 10% contained three - taxa (analysis based on taxa comprising at least 5% by mass of individual samples). - Fish and crustaceans accounted for - 53.5 and 44.4% respectively of the total annual diet by mass (Table 2). However, con- - siderable changes in proportions of fish, crustaceans and cephalopods occurred - over the year (Table 3, Fig. 1). Crustaceans accounted for over 75% of the diet by mass - from March to June 1984 but decreased to - 0% by March 1985. Cephalopods accounted for more than 10% of the diet - only in February and March 1985. - Fish - The fish component of the diet was dominated by the family Nototheniidae, - particularly Notothenia squamifrons. It is likely that most of the unidentified - juvenile nototheniids were also this species - (Table 1). Nototheniids accounted for over - 90% of the fish component during June to - October 1984. Myctophids and the channichthyid Channichthys rhinoceratus ap- - peared more commonly in the diet from - October and accounted for most of the food items in March 1985 (Table 3). - Fish of 78.8 - 84.1 mm (5.9 - 7.3 were the most frequently taken size clas - When plotted on a monthly basis, the was no evidence of any progressive chan - in size-class distribution throughout t year. However, during January and Fe - ruary smaller fish between 28.6 - 41.8 m - (0.2 - 0.7 g) accounted for the most fr quently taken size class, whereas larger - dividuals (76.2 - 84.1 mm, 5.3 - 7.3 g) we more numerous during the remaini - months. 
Standard parameters for all fish species are given in Table 4. - Crustaceans - Euphausia vallentini and Nauticaris marionis made up nearly all of the crusta - cean component. During winter (March to - August 1984), N. marionis formed the largest part of the crustacean component, but E. - vallentini subsequently increased to nearly - 100% in January and February 1985 - (Table 3). Mean length of E. vallentini was - 22.5 mm ? 1.74 (n = 681), considerably smaller than that of N. marionis (35.1 mm - + 5.18, n = 1481). - Cephalopods - Cephalopods accounted for just over - 10% of the diet during February and - March 1985 but seldom occurred in the remaining months (Table 3). Most - cephalopods were small octopods (DML < - 15 mm). The only squid identified were juveniles of the species Kondakovia lon- - gimana of estimated dorsal mantle length and mass of 58.4 ? 11.4 mm and 9.6 ? - 3.8 g (n = 4) respectively. \ No newline at end of file diff --git a/data/cleaned-text/text_cleaner/Adams_1989_20260312_115204.txt b/data/cleaned-text/text_cleaner/Adams_1989_20260312_115204.txt deleted file mode 100644 index 2282198..0000000 --- a/data/cleaned-text/text_cleaner/Adams_1989_20260312_115204.txt +++ /dev/null @@ -1,433 +0,0 @@ -[PAGE 1] -Temporal Variation in the Diet of the Gentoo Penguin Pygoscelis papua at Sub-Antarctic -Marion Island - -Author(s): N. J. Adams and N. T. Klages -Source: Colonial Waterbirds , 1989, Vol. 12, No. 1 (1989), pp. 30-36 -Published by: Waterbird Society - [PAGE 2] - Temporal variation in the diet of the Gentoo Penguin - Pygoscelis papua at sub-Antarctic Marion Island - N. J. ADAMS & N. T. KLAGES' - Percy FitzPatrick Institute of African Ornithology, University of Cape Town, - Rondebosch 7700, South Africa - 'Port Elizabeth Museum, P. 0. Box 13147, Humewood 6013, South Africa - Abstract.-The diet of the Gentoo Penguin Pygoscelis papua at sub-Antarctic Marion Island was sampled by stomach pumping at monthly intervals from March 1984 to March 1985. 
Overall, fish accounted for 53% - of the diet by mass, crustaceans 44% and cephalopods 2%. Crustaceans predominated between March and - June 1984; a marked increase in the proportion of fish in July coincided with the start of egg laying by Gentoo - Penguins. Fish accounted for almost all of the diet in January and March 1985. Juvenile nototheniid fish, in particular Notothenia squamifrons, formed the bulk of the fish component; myctophids and Channichthys - rhinoceratus were less common. The pelagic euphausiid Euphausia vallentini accounted for about 50% by mass of the overall crustacean component lumped over the entire study period. The decapod shrimp Nauticaris - marionis was the most important crustacean species consumed during June to September 1984. Cephalopods, predominantly octopods, were taken mainly in February and March 1985. The hypothesis that seasonal - changes in diet occur in direct response to the arrival of crested penguins (Eudyptes spp) early in summer is not supported by the data. Changes in diet appear to reflect local changes in the availability of prey species - within the inshore waters exploited by Gentoo Penguins. Received 10 March 1988, accepted 25 November 1988. - Key words.-diet, Gentoo Penguin, Pygoscelis papua, sub-Antarctic. - Colonial Waterbirds 12(1): 30-36, 1989 - Studies of the diets of both migrant and resident penguins in the Southern - Ocean region have concentrated on birds during the chick-rearing period, with little - information for the non-breeding season - (see Croxall and Lishman 1987). The Gentoo Penguin Pygoscelis papua is a year- - round resident at sub-Antarctic Marion Island (van Zinderen Bakker 1971), and at - some other breeding sites in the Southern - Ocean (Watson 1975), affording opportunities for sampling its diet throughout - the year. 
- Williams (1981) suggested that seasonal changes in the diet and earlier, winter - breeding by Gentoo Penguins at Marion - Island, compared to populations at higher latitudes, ensured a crustacean diet during - incubation and early chick rearing. He suggested that a switch by Gentoo Pen- - guins to a fish diet later in the season, deduced from a change in color of the - excreta, was a response to the arrival of large numbers of two species of potentially - competing, summer-resident crested penguins, the Macaroni Eudyptes chrysolophus - and the Rockhopper E. chrysocome. However, food samples collected at Marion Is- - land in September 1982 showed that fish are common in the diet of Gentoo Pen- - guins before the arrival of the other penguin species (La Cock et al. 1984). We re- - port here on a more comprehensive set o samples collected throughout a single ye - at Marion Island. Our principal objecti was to investigate temporal changes i - diet. - METHODS - A total of 144 stomach samples was collected fr - Gentoo Penguins at three colonies distributed alo a nine kilometre stretch of the east coast of Marion - Island (46? 53'S, 37?54'E) from March 1984 to March - 1985 inclusive. Using the technique of Wilson (1984), we stomach pumped an average of 11 birds a month - as they returned to their landing beaches in the evening. We did not attempt to distinguish the breeding - status of the birds. Gentoo Penguins at Marion Island tend chicks from late July to December. Con- - sequently, during this period, we probably sampled both breeding and non-breeding birds. - Immediately after collection, samples were drained through a 0.5 mm sieve. Subsequently, - drained samples were weighed to the nearest gram and then stored at 4?C until sorting, generally com- - pleted within 24 h of collection. Soft identifiable material was sorted into fish, cephalopod and crustacean - components (further separated into shrimps, euphausiids and amphipods) and weighed. 
Since fish - muscle is digested more rapidly than squid, which is in turn digested more rapidly than crustacea muscle - (Jackson et al. 1987), we may have overestimated, in particular, the mass of crustaceans in mixed samples. - However, most stomach samples from Gentoo Penguins contained one prey type only (see below). - Most fish were recovered with heads separated from the remainder of the body. Consequently, - analyses were based on identification and measure- -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 3] - ADAMS & KLAGES * GENTOO PENGUIN DIET ment of otoliths (Adams and Klages, 1987). We esti- - mated standard length (SL) and mass of fish from regressions relating otolith diameter to these - parameters. Regressions not available in Adams and - Klages (1987) and Brown and Klages (1987) are given in Duhamel (1981), Hecht and Cooper (1986), and - Hecht (1987). Since otoliths were removed from intact fish crania, digestion and hence measurement - error were minimal. - Crustacean species were identified with the aid of published keys (Bowman and Gruner, 1973; - Kirkwood, 1982; 1984). Total length of intact individuals was measured from the anterior margin of an - eye to the tip of the telson. Numbers of crustaceans in each sample were estimated by dividing total mass - by the average mass of an individual crustacean of each respective species. Actual numbers and fresh - mass of prey ingested will be underestimated in welldigested samples. - Most of the small cephalopods were recovered intact, and numbers were counted directly. The iden- - tification of squid beaks was facilitated by a reference collection at the Port Elizabeth Museum, and con- - firmed by comparison with the literature (Clarke, - 1986). We estimated dorsal mantle length (DML) and mass of squid from regressions relating lower ros- - trum length (LRL) to these parameters (Adams and - Klages, 1987). Juvenile octopods were not identified. 
- samples at Marion Island. - Prey species Total % numbers % frequency number of prey class of occurrence - FISH - Notothiniidae - Notothenia squamifrons - Unident.juv. nototheniids - Notothenia acuta - Dissostichus eleginoides - Channichthyidae - Channichthys rhinoceratus - Myctophidae - Protomyctophum normani - Gymnoscopelus nicholsi - Protomyctophum tenisoni - Krefftichthys anderssoni - Protomyctophum bolini unid. myctophids - Electrona carlsbergi - Gymnoscopelus sp. - Muraenolepidae - Muraenolepts sp. - Unident. fish - Euphausiacea - Euphausia vallentini - Hippolytidae - Nauticaris marionis - Amphipoda - Themisto gaudichaudii - Hyperiella sp. - Unident. amphipods - Vibilia sp. - Primno sp. - Nematocarcinidae - Nematocarcinus longirostris - Octopoda - Unident.juv. octopods - Decapoda - Unident.juv. squid - Kondakovia longimana - Onychoteuthidae LRL >2mm - Onychoteuthidae LRL < 2mm - Margarella expansa - Ophiuroidea - 58.6 - 21.1 - 1.3 - 0.5 - 50.7 - 14.6 - 11.1 - 6.3 - 6.3 - 5.8 - 1.5 - 1.4 - 1.1 - 1.0 - 0.7 - 0.2 - <0.1 - 12.5 - 4.2 - 4.2 - 4.9 - 3.5 - 4.2 - 4.9 - 1.4 - 0.7 - 0.2 - 0.1 - 0.7 - 1.4 - 86.4 - 25.7 - 33.3 - 13.5 - 0.1 - <0.1 - <0.1 - <0.1 - <0.1 - <0.1 - 4.9 - 2.1 - 2.1 - 1.4 - 1.4 - 1.4 - 94.9 - 2.9 - 1.1 - 0.6 - 0.6 - 18.8 - 9.0 - 3.5 - 3.5 - 3.5 - 2.1 -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 4] - RESULTS - Mean mass of food samples was 139.2 - + 130.5 g (range: 8- 650 g, n= 144), similar to the 147 g recorded by La Cock et al. - (1984). Mean monthly sample mass was highest in December at 332 g. - General Composition of the Diet - Thirty species or species groups were identified (Table 1), with fish comprising - the largest single group (11 species). However, samples from individual penguins - were largely homogeneous consisting of a single species of crustacean or fish (see also - Jablonski 1985, Croxall et al. 1988). 
Fiftyeight percent of stomach samples con- - tained only a single prey taxon, 26% contained two taxa and 10% contained three - taxa (analysis based on taxa comprising at least 5% by mass of individual samples). - Fish and crustaceans accounted for - 53.5 and 44.4% respectively of the total annual diet by mass (Table 2). However, con- - siderable changes in proportions of fish, crustaceans and cephalopods occurred - over the year (Table 3, Fig. 1). Crustaceans accounted for over 75% of the diet by mass - from March to June 1984 but decreased to - 0% by March 1985. Cephalopods accounted for more than 10% of the diet - only in February and March 1985. - Fish - The fish component of the diet was dominated by the family Nototheniidae, - particularly Notothenia squamifrons. It is likely that most of the unidentified - juvenile nototheniids were also this species - (Table 1). Nototheniids accounted for over - 90% of the fish component during June to - October 1984. Myctophids and the channichthyid Channichthys rhinoceratus ap- - peared more commonly in the diet from - October and accounted for most of the food items in March 1985 (Table 3). - Fish of 78.8 - 84.1 mm (5.9 - 7.3 were the most frequently taken size clas - When plotted on a monthly basis, the was no evidence of any progressive chan - in size-class distribution throughout t year. However, during January and Fe - ruary smaller fish between 28.6 - 41.8 m - (0.2 - 0.7 g) accounted for the most fr quently taken size class, whereas larger - dividuals (76.2 - 84.1 mm, 5.3 - 7.3 g) we more numerous during the remaini - months. Standard parameters for all fish species are given in Table 4. - Crustaceans - Euphausia vallentini and Nauticaris marionis made up nearly all of the crusta - cean component. During winter (March to - August 1984), N. marionis formed the largest part of the crustacean component, but E. - vallentini subsequently increased to nearly - 100% in January and February 1985 - (Table 3). 
Mean length of E. vallentini was - 22.5 mm ? 1.74 (n = 681), considerably smaller than that of N. marionis (35.1 mm - + 5.18, n = 1481). - Cephalopods - Cephalopods accounted for just over - 10% of the diet during February and - March 1985 but seldom occurred in the remaining months (Table 3). Most - cephalopods were small octopods (DML < - 15 mm). The only squid identified were juveniles of the species Kondakovia lon- - gimana of estimated dorsal mantle length and mass of 58.4 ? 11.4 mm and 9.6 ? - 3.8 g (n = 4) respectively. - DISCUSSION - Fish (Ealey 1954, Volkman et al. 198 and, in particular, nototheniids may fo - a substantial portion of the diet of the G too Penguin throughout its range. T - most important prey species by mass in - Fish Crustaceans Cephalopods - Wet mass (%) Mean 53.5 44.4 2.1 - Range (0-100) (0-100) (0-51.8) -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 5] - ADAMS & KLAGES * GENTOO PENGUIN DIET by month. - Species Mar Apr May Jun Jul Aug Sep Oct Nov Dec Jan Feb Mar - Crustaceans - Euphausia va lentini 16.5 42.8 47.1 4.5 4.2 8.9 17.6 23.6 29.7 11.6 1.0 36.2 0 - Nauticaris marionis 65.8 42.8 51.0 70.8 23.9 26.7 38.1 4.7 4.2 7.0 0 0 0 - Themisto gaudichaudii 0 0 0 0 0 0 0 0 0 0 0 1.8 0 - Fish - Nototheniidae 0 0 0 23.5 66.4 62.5 44.3 68.4 45.1 79.1 71.4 35.2 13.5 - Myctophidae 0 14.4 0 0 0.5 1.9 0 1.6 20.8 1.9 4.8 10.0 48.2 - Channichythidae 0 0 0 0 0 0 0 0 0 0 21.9 6.1 23.7 - Muraenolpidae 0 0 0 0 4.3 0 0 0 0 0 0 0 0 - Unidentified 16.8 0 1.4 0 0 0 0 0 0 0 0 0 0 - Cephalopods 0 0 0.5 1.2 0.7 0 0 1.7 0.2 0.4 0.9 10.7 14.6 diet of Gentoo Penguins at Marion Island - was Notothenia squamifrons (incorrectly identified as Harpagifer georgianus in La - Cock et al. 1984, N. T. Klages pers. obs.) - whereas N. rossi and N. larseni were consumed at southerly sites (Croxall and Pr- - ince 1980a, Jablonski 1985). N. 
squamifrons is widely distributed occurring around the - islands of the southern Indian and, less commonly, the South Atlantic Oceans - MAMJJii A S N D J F M guin at Marion Island throughout a single year. Un- - shaded segment: crustaceans, stippled segment: - fish, shaded segment: cephalopods. - (Duhamel et al. 1983). N. squamifrons taken by Gentoo Penguins at Marion Island, all - in the range 28 - 134 mm, were larval or juvenile fish of 0 - 4 years (Duhamel and - Ozouf-Costaz 1985). The occurrence of large numbers of unidentified juvenile - nototheniids in the diet in November and - December preceded an increase in the relative abundance of small-size class N. - squamifrons in the diet in January and Feb ruary 1985. This may reflect growth o - these larval fish. - Juvenile Channichthys rhinoceratus, were the largest and second most abundant fish - in the diet. Adults were presumably too large for consumption. The species had - previously been considered a demersal species endemic to the colder waters - around Kerguelen (48?27'-50?S, 60?27'- - 70?35'E) and Heard Islands (53?01'S, - 73?23'E) (Kock et al. 1985). The presence of this species in the relatively warm waters - around Marion Island was surprising. - However, hydrographical evidence (Benon and Murail 1979) and sampling of - zooplankton (Grindley and Lane 1979, - Boden and Parker 1986) suggests that advection of foreign water masses past the - island with associated fauna may occur - (Boden & Parker 1986). - The appearance of myctophids in the diet of Gentoo Penguins coincided with - the increase in relative abundance of these fish in the diet of King Penguins Apteno- - dytes patagonicus during summer, suggesting increased availability (Adams and - Klages 1987). Small numbers of myctophids have previously been reported in - the diet of Gentoo Penguins of unknown status at Marion Island (La Cock et al. - 1984). 
- U9 - - - CO a - C, -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 6] - recovered from Gentoo Penguin stomach samples at Marion Island. - Species OD (mm) Estimated SL (mm) Estimated mass (g) - N Mean SD Range Mean SD Range Mean SD Range - Nototheniidae - Notothenia squamifrons 1109 2.43 0.87 (0.8-5.8) 71.1 ? 18.7 (23.3-155.5) 5.6?4.1 (0.1-56.3) - Notothenia acuta 47 3.19?0.64 (1.38-4.7) 85.0+20.3 (38.6-126.4) 9.2?6.6 (0.6-28.3) - Dissostichus eleginoides 17 3.52?0.73 (2.19-5.27) 105.7?42.6 (42.4-215.1) 10.6+ 16.9 (0.3-65.0) - Channichthyidae - Channichthys rhinoceratus 170 1.48?0.38 (0.97-2.45) 170.3+36.8 (127.0-238.3) 21.1 ? 18.9 (6.8-61.9) - Myctophidae - Protomvctophum normani 61 2.38?0.10 (2.19-2.6) 77.7? 3.7 (70.0-85.2) 6.4? 0.9 (4.7-8.3) - Protomyctophum tenisoni 57 1.63?0.13 (1.3-1.78) 49.2 ?4.9 (37.0-54.8) 1.8 ?0.5 (0.8-2.3) - Krefftichthys anderssoni 35 1.51 0.25 (0.97-1.87) 49.6? 11.2 (25.2-66.0) 2.0? 1.1 (0.2-4.3) - Gymnoscopelus nicholsi 12 6.20?0.19 (5.91-6.55) 156.2+5.7 (147.4-166.7) 44.3?4.8 (37.4-53.3) - Protomvctophum bolini 4 2.00?0.38 (1.7-2.35) 57.7 12.2 (46.1-67.3) 1.5?0.8 (0.8-2.2) - Muraenolepidae - Muraenolepis sp. 7 2.7?0.58 (1.87-3.4) 141.1 46.6 (76.8-200.8) 24.9?20.8 (2.3-60.6) - Two species of crustacean Euphausia vallentini and Nauticaris marionis, were of - almost equal importance by mass at Marion Island. This is in contrast to the situa- - tion at sites south of the Antarctic Polar - Front, where the crustacean component of the diet of the Gentoo Penguin is domi- - nated by a single species, E. superba. E. vallentini was the most important prey con- - sumed by Macaroni and Rockhopper penguins during the 1984-1985 summer, and - remained abundant in the diet of Rockhopper Penguins in March 1985 (Brown - and Klages 1987) when it was absent from the diet of the Gentoo Penguin. 
Both - species of crested penguins probably forage farther offshore during late chick- - rearing than do Gentoo Penguins (Brown - 1987, Adams and Wilson 1987). This may indicate a movement of euphausiids out of - the immediate inshore waters during February and March 1985. - Due to its benthic nature, adult N. - marionis may be available to Gentoo Penguins only within a few kilometers of the - shore (Adams and Wilson 1987). The size - (average length: 35.1 mm) of N. marionis taken by Gentoo Penguins at Marion Is- - land in 1984-1985 is clearly greater than juveniles (maximum length: 23 mm) taken - by crested penguins during December - 1983 to February 1984 (Brown and Klages - 1987) and by Gentoo Penguins in September 1982 (La Cock et al. 1984). The ab- - sence of juvenile N. marionis in the diet of - Gentoo and crested penguins at Marion Island during January to March 1985 is dif- - ficult to explain, since at least some adult individuals recovered during April to Sep- - tember 1984 were ovigerous (N. J. Adams pers. obs.). - The occurrence of octopods in the diet of Gentoo Penguins is apparently unique - to the Marion Island site. The appearance of juvenile octopods in the diet coincided - with their occurrence in the diet of Rockhopper Penguins late in the chick-rearing - period (Brown and Klages 1987). Adult octopods are generally benthic and solit- - ary. The appearance of large numbers of juveniles in the diet suggests highly sea- - sonal spawning, coupled with a tendency to form dense aggregations in shallower - water. In contrast, the small number of juvenile squid in the diet of the inshore - foraging Gentoo Penguin again emphasizes the generally pelagic nature of - squid (Adams and Klages 1987). 
-This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC - -[PAGE 7] - ADAMS & KLAGES * GENTOO PENGUIN DIET - Gentoo Penguins apparently switched from a largely crustacean diet during - March to June 1984 to a largely fish diet during July 1984 to March 1985. This - change coincided with peak egg laying and could not be considered as a direct re- - sponse to the arrival, in October and - November, of the largely, euphausiid consuming and hence potentially competing - crested penguins (cf. Williams 1981). - Moreover, the most important crustacean component by mass in the diet during - March 1984 to September 1984 was adult - N. marionis (not taken by Macaroni and - Rockhopper penguins) and not krill - (euphausiids) as intimated by Williams - (1981). The large variation in abundance and prey-species composition of crusta- - ceans in penguin diet from year-to-year, indicated by the differences in the diet of - the Macaroni and Rockhopper Penguins in two years at Marion Island (Brown and - Klages 1987) and Gentoo Penguins in - March 1984 and March 1985 may reflect a greater degree of unpredictability in - availability of crustacean prey than at higher latitudes. Switches in diet may - merely reflect local changes in availability of particular prey within the inshore area - exploited by Gentoo Penguins. - Average meal size of Gentoo Penguins at Marion Island is small (La Cock et al. - 1984, this study) compared to those recovered from breeding penguins at higher - latitudes (Croxall and Prince 1980a, Jablonski 1985, Volkman et al. 1980). Evalua- - tion of the magnitude of this difference is complicated by the unknown ratio of - breeders to non-breeders sampled at Marion Island. However, the difference ap- - pears real and may reflect reduced food availability compared to more southerly - sites. 
This view is supported by the small total breeding population (Adams and - Wilson 1987), low breeding success and long growth period (Williams 1980) of - Gentoo Penguins at Marion Island and suggests that the population is food limited - (La Cock et al. 1982). However, in contrast to the southern populations which breed - in summer (Croxall and Prince 1980b), - Gentoo Penguins at Marion Island begin breeding in the austral winter (Williams - 1980). The infrequent feeding of King penguin chicks (Cherel et al. 1987, Pers. - obs.) and dispersal of Eudyptes penguins away from the island during winter - suggest food availability is low. Consequently, having excluded direct competi- - tion with crested penguins for crustacean prey (Williams 1981), the reason for winter - breeding by Gentoo Penguins remains unclear.[PAGE 8] - Duhamel, G. 1981. Characteristiques biologiques des principales espeches de poissons du plateau conti- - nental des iles Kerguelen. Cybium 3e serie 5: - Duhamel, G., Hureau, J. C. and Ozouf-Costaz, C. - 1983. Ecological survey of notothenioid fishes in the Southern Ocean from Bouvet to Kerguelen - Islands. Memoirs of the National Institute of Polar - Research Special Issue 27: 176-182. - Duhamel, G. and Ozouf-Costaz, C. 1985. Age, growth and reproductive biology of Notothenia squamifrons - Gunther 1880 from the Indian sector of the - Southern Ocean. Polar Biology 4: 143-153. - Ealey, E. H. M. 1954. Analysis of stomach contents of some Heard Island birds. Emu 54: 204-210. - Grindley, J. R. and Lane, S. B. 1979. Zooplankton around Marion and Prince Edward Islands. - Comite National Francais des Rescherches Antartiques 44: 111-126. - Hecht, T. 1987. A guide to the otoliths of Southern - Ocean fishes. South African Journal of Antarctica - Research 17: 1-87. - Hecht, T. and Cooper, J. 1986. Length/mass relationships, energetic content and otoliths of Antarctic - cod Paranotothenia magellanica (Nototheniidae: - Pisces) at sub-Antarctic Marion Island. 
South African Journal of Zoology 21: 294-296. - Jablonski, B. 1985. The diet of penguins on King - George Island, South Shetland Islands. Acta - Zoologica Cracoviensia 29: 117-186. - Jackson, S., Duffy, D. C. and Jenkins, J. F. G. 1987. - Gastric digestion in marine vertebrate predators: - in vitro standards. Functional Ecology 1: 287-291. - Kirkwood, J. M. 1982. A guide to the Euphausiacea of the Southern Ocean. ANARE Research Notes - Kirkwood, J. M. 1984. A guide to the Decap'da of the Southern Ocean. ANARE Research Notes 11: - Kock, K. H., Duhamel, G. and Hureau, J. C. 1985. - Biology and status of exploited Antarctic fish stocks: a review. BIOMASS Scientific Series 6: - La Cock, G. D., Hecht, T. and Klages, N. 1984.The winter diet of Gentoo Penguins at Marion Island. - Ostrich 55: 188-191. - Volkman, N. J., Presler, P. and Trivelpiece, W. 1980. - Diets of pygoscelid penguins at King George Island, Antarctica. Condor 82: 373-378. - Williams, A. J. 1980. Aspects of the breeding biology of the Gentoo Penguin Pygoscelis papua. Gerfaut - Williams, A. J. 1981. Factors affecting the time of breeding of Gentoo Penguins at Marion Island. In - Cooper J. (ed.) Proceedings of the symposium on birds of the sea and shore, 1979. 451-459. Cape - Town: African Seabird Group. - Wilson, R. P. 1984. An improved stomach pump for penguins and other seabirds. Journal of Field Or- - nithology 55: 109-112. -This content downloaded from -97.113.13.157 on Tue, 29 Mar 2022 18:03:12 UTC \ No newline at end of file