Skip to content

Commit cf3740b

Browse files
committed
fixed pdf parsing issue on some filings
1 parent 52f0111 commit cf3740b

4 files changed

Lines changed: 43 additions & 5 deletions

File tree

datamule/datamule/datamule/tar_downloader.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from threading import Lock
1515
from os import cpu_count
1616
from secsgml.utils import calculate_documents_locations_in_tar
17+
from secsgml.parse_sgml import decode_uuencoded_content
1718
from ..utils.format_accession import format_accession
1819
from ..providers.providers import SEC_FILINGS_TAR_BUCKET_ENDPOINT
1920
from .datamule_lookup import datamule_lookup
@@ -457,6 +458,9 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
457458
extraction_pool,
458459
partial(self._extract_documents_from_probe_by_list, probe_bytes, docs_in_probe)
459460
)
461+
462+
for doc in probe_documents:
463+
doc['content'] = should_decode_file_from_content(doc['content'])
460464
documents.extend(probe_documents)
461465

462466
# Download each document beyond probe individually
@@ -488,6 +492,8 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
488492
extraction_pool,
489493
partial(self._decompress_zstd, doc_content)
490494
)
495+
496+
decompressed = should_decode_file_from_content(decompressed)
491497

492498
documents.append({
493499
'name': doc_name,
@@ -648,4 +654,26 @@ def download_tar(cik=None, ticker=None, submission_type=None, filing_date=None,
648654
keep_document_types=keep_document_types,
649655
max_batch_size=max_batch_size,
650656
keep_filtered_metadata=keep_filtered_metadata
651-
)
657+
)
658+
659+
660+
# Band-aid fix for a bug in the initial SGML processing of tar archives.
661+
# PDFs were not always detected because their tags could appear on a new line.
662+
def should_decode_file_from_content(content):
    """
    Band-aid fix: decode ``content`` when it looks UU-encoded.

    This catches documents that slipped through earlier processing without
    being decoded. The function is best-effort and never raises for
    unexpected input: anything that cannot be decoded is returned unchanged.

    Args:
        content: Raw document payload. Only ``bytes``/``bytearray`` values
            are inspected; any other value (e.g. ``str`` or ``None``) is
            returned as-is.

    Returns:
        The decoded payload when a ``b'begin '`` marker appears within the
        first 200 bytes and decoding yields a non-empty result that differs
        from the input; otherwise the original ``content``.
    """
    # A bytes pattern cannot be searched for in str/None (TypeError), and
    # such payloads cannot be UU-encoded binary anyway, so pass them through.
    if not isinstance(content, (bytes, bytearray)):
        return content

    # Cheap pre-check: the marker only has to appear somewhere in the first
    # 200 bytes, not necessarily at offset 0.
    if b'begin ' not in content[:200]:
        return content

    try:
        decoded = decode_uuencoded_content(content)
    except Exception as e:
        # Deliberate best-effort: a failed decode keeps the original bytes.
        logger.debug(f"Post-decode attempted but failed: {str(e)}")
        return content

    # Accept the result only if decoding actually produced something new.
    if decoded and decoded != content:
        logger.debug(f"Post-decode applied: {len(content)} -> {len(decoded)} bytes")
        return decoded

    return content

datamule/datamule/document/document.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import json
22
import csv
33
import re
4-
from doc2dict import xml2dict, txt2dict
5-
from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
4+
from doc2dict import xml2dict, txt2dict, html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict, convert_dict_to_columnar
65
from ..mapping_dicts.xml_mapping_dicts import dict_345
76
from ..mapping_dicts.html_mapping_dicts import MAPPING_DICTS_BY_TYPE, STANDARD_CONFIG
87
from pathlib import Path
@@ -271,6 +270,8 @@ def __init__(self, type, content, extension,accession,filing_date,path=None):
271270
self._tables = None
272271
self._text = None
273272
self._markdown = None
273+
self._data_tuples_columnar = None
274+
274275

275276
# booleans
276277
self._data_bool = self.extension in ('.htm', '.html','.txt')
@@ -342,13 +343,22 @@ def data(self):
342343

343344
return self._data
344345

346+
345347
@property
def data_tuples(self):
    """Return ``self.data`` flattened by ``unnest_dict``, cached after first use.

    The flattened form is only built when ``self._data_bool`` is truthy and
    nothing has been cached yet; otherwise the cached attribute is returned
    unchanged.
    """
    needs_build = self._data_bool and self._data_tuples is None
    if needs_build:
        self._data_tuples = unnest_dict(self.data)
    return self._data_tuples
351353

354+
@property
def data_tuples_columnar(self):
    """Return ``self.data`` converted by ``convert_dict_to_columnar``, cached.

    The columnar form is only built when ``self._data_bool`` is truthy and
    nothing has been cached yet; otherwise the cached attribute is returned
    unchanged.
    """
    needs_build = self._data_bool and self._data_tuples_columnar is None
    if needs_build:
        self._data_tuples_columnar = convert_dict_to_columnar(self.data)
    return self._data_tuples_columnar
360+
361+
352362
@property
353363
def text(self):
354364
if self._text_bool:

datamule/docs-rewrite/docs/datamule-python/portfolio/document.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Document in either string or bytes format.
1717
Available for html, text and some pdf files. Document content parsed into dictionary form.
1818

1919
### `document.data_tuples`
20-
Available for html, text and some pdf files. `document.data` flattened into form: (id,type,content,level).
20+
Available for html, text and some pdf files. `document.data` flattened into tuples of the form: (id, type, content, level, class).
2121

2222
### `document.text`
2323
Available for html, text and some pdf files. Returns the document's text.

datamule/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
setup(
3333
name="datamule",
3434
author="John Friedman",
35-
version="3.2.8",
35+
version="3.2.9",
3636
description="Work with SEC submissions at scale.",
3737
packages=find_packages(include=['datamule', 'datamule.*']),
3838
url="https://github.com/john-friedman/datamule-python",

0 commit comments

Comments
 (0)