fixed pdf parsing issue on some filings

john-friedman · john-friedman · commit cf3740b4c124 · 2026-02-01T23:20:54.000-08:00
diff --git a/datamule/datamule/datamule/tar_downloader.py b/datamule/datamule/datamule/tar_downloader.py
@@ -14,6 +14,7 @@
 from threading import Lock
 from os import cpu_count
 from secsgml.utils import calculate_documents_locations_in_tar
+from secsgml.parse_sgml import decode_uuencoded_content
 from ..utils.format_accession import format_accession
 from ..providers.providers import SEC_FILINGS_TAR_BUCKET_ENDPOINT
 from .datamule_lookup import datamule_lookup
@@ -457,6 +458,9 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
                             extraction_pool,
                             partial(self._extract_documents_from_probe_by_list, probe_bytes, docs_in_probe)
                         )
+
+                        for doc in probe_documents:
+                            doc['content'] = should_decode_file_from_content(doc['content'])
                         documents.extend(probe_documents)
                     
                     # Download each document beyond probe individually
@@ -488,6 +492,8 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
                                     extraction_pool,
                                     partial(self._decompress_zstd, doc_content)
                                 )
+
+                                decompressed = should_decode_file_from_content(decompressed)
                                 
                                 documents.append({
                                     'name': doc_name,
@@ -648,4 +654,26 @@ def download_tar(cik=None, ticker=None, submission_type=None, filing_date=None,
         keep_document_types=keep_document_types,
         max_batch_size=max_batch_size,
         keep_filtered_metadata=keep_filtered_metadata
-    )
+    )
+
+
+# Band-aid fix for documents that the initial SGML processing of the tar archive left UU-encoded.
+# PDFs were not always detected because their tags were on a new line.
+def should_decode_file_from_content(content):
+    """
+    Bandaid fix: decode documents that are still UU-encoded after extraction.
+
+    Only bytes-like input with a uuencode header ("begin <octal mode> <name>") in its
+    first 200 bytes is decoded; otherwise, or if decoding fails, it is returned as is.
+    """
+    import re  # function-scope import so the file's import block stays untouched
+    # Require the octal mode after 'begin': a bare b'begin ' also matches ordinary prose.
+    if isinstance(content, (bytes, bytearray)) and re.search(rb'begin [0-7]{3,4} ', content[:200]):
+        try:
+            decoded = decode_uuencoded_content(content)
+            if decoded and decoded != content:
+                logger.debug(f"Post-decode applied: {len(content)} -> {len(decoded)} bytes")
+                return decoded
+        except Exception as e:  # best-effort: a failed decode keeps the original content
+            logger.debug(f"Post-decode attempted but failed: {str(e)}")
+    return content
diff --git a/datamule/datamule/document/document.py b/datamule/datamule/document/document.py
@@ -1,8 +1,7 @@
 import json
 import csv
 import re
-from doc2dict import xml2dict, txt2dict
-from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
+from doc2dict import xml2dict, txt2dict, html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict, convert_dict_to_columnar
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import MAPPING_DICTS_BY_TYPE, STANDARD_CONFIG
 from pathlib import Path
@@ -271,6 +270,8 @@ def __init__(self, type, content, extension,accession,filing_date,path=None):
         self._tables = None
         self._text = None
         self._markdown = None
+        self._data_tuples_columnar = None 
+
 
         # booleans
         self._data_bool = self.extension in ('.htm', '.html','.txt')
@@ -342,13 +343,22 @@ def data(self):
             
         return self._data
     
+    
     @property
     def data_tuples(self):
         if self._data_bool:
             if self._data_tuples is None:
                 self._data_tuples = unnest_dict(self.data)
         return self._data_tuples
     
+    @property
+    def data_tuples_columnar(self):
+        """Cached result of `convert_dict_to_columnar(self.data)`; only built when `_data_bool` is set."""
+        if self._data_bool and self._data_tuples_columnar is None:
+            self._data_tuples_columnar = convert_dict_to_columnar(self.data)
+        return self._data_tuples_columnar
+    
+
     @property
     def text(self):
         if self._text_bool:
diff --git a/datamule/docs-rewrite/docs/datamule-python/portfolio/document.md b/datamule/docs-rewrite/docs/datamule-python/portfolio/document.md
@@ -17,7 +17,7 @@ Document in either string or bytes format.
 Available for html, text and some pdf files. Document content parsed into dictionary form. 
 
 ### `document.data_tuples`
-Available for html, text and some pdf files. `document.data` flattened into form: (id,type,content,level).
+Available for html, text and some pdf files. `document.data` flattened into form: (id, type, content, level, class).
 
 ### `document.text`
 Available for html, text and some pdf files. Returns the document's text.
diff --git a/datamule/setup.py b/datamule/setup.py
@@ -32,7 +32,7 @@
 setup(
     name="datamule",
     author="John Friedman",
-    version="3.2.8",
+    version="3.2.9",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",