pts · T-3B · Mar 3, 2026 · Mar 4, 2026 · Mar 5, 2026 · Mar 14, 2026
diff --git a/lib/pdfsizeopt/main.py b/lib/pdfsizeopt/main.py
@@ -72,9 +72,17 @@
   contain at least one space, and %(targetfnq)s, to which the target filename
   would be substituted. Additionally, it may contain %(sourcefnqs) for the
   source filename. Both of them will be autogenerated temporary filenames.
-  (There are some other sobstitutions as well in CMD_PATTERN.) The CMD_PATTERN
+  (There are some other substitutions as well in CMD_PATTERN.) The CMD_PATTERN
   can be used to run an image optimizer whose command-line syntax is not
   built in to pdfsizeopt.
+--use-zlib-optimizer=CMD_PATTERN
+  Run the specified zlib optimizer program command-line for optimizing
+  zlib embedded in PDF. Can be slow as pdfsizeopt decompresses & compresses the
+  same object 2~3 times. CMD_PATTERN is an os.system command-line pattern, which
+  must contain at least one space, and %(targetfnq)s, to which the target
+  filename would be substituted. Additionally, it may contain %(sourcefnq)s for
+  the source filename. Both of them will be autogenerated temporary filenames.
+  (There are some other substitutions as well in CMD_PATTERN.)
 --do-fast-bilevel-images=YES_NO; default: no
   Disable some slow image optimizers for bilevel images in favor of jbig2.
   This flag is smart, and it disables optimizers only if jbig2 is enabled,
@@ -112,6 +120,15 @@
   After Type1C font serialization, parse it again, and check that the glyphs
   and most other fields are still there? It is slow, but it can reveal some
   bugs in Ghostscript.
+--do-remove-core-fonts=YES_NO; default: no
+  Enable unembedding the Base14 PDF fonts. These fonts were guaranteed to be
+  present with all PDF readers, but while virtually all systems provide
+  reasonable metric-compatible fonts with the original Base14 fonts, they
+  may have different shapes than the original fonts and, as a result, the
+  processed document may not look exactly the same as the original document.
+  Needs external optimizer tool Multivalent.
+  Only use this flag if you are willing to have processed documents that may
+  not look exactly the same as the original document.
 --do-optimize-streams=YES_NO; default: yes
   Recompress all non-image streams, keep the smallest value. To optimize image
   streams, please use --do-optimize-images=yes.
@@ -225,9 +242,50 @@
 from pdfsizeopt import cff
 from pdfsizeopt import psproc
 
+compat_compress = zlib.compress  # main() may rebind it to call ZlibCmd.
+
+def ZlibCmd(data, level, cmd_pattern):
+  """Return data compressed by external command cmd_pattern; ignores level."""
+  compressed_data = zlib.compress(data, 1)
+  del data  # Save memory.
+  hex_tag = compressed_data[-4:].encode('hex')  # Adler-32 checksum bytes.
+  sourcefn = TMP_PREFIX + 'zlib-%s.zlib' % hex_tag
+  targetfn = TMP_PREFIX + 'zlib-%s-new.zlib' % hex_tag
+  assert '%(targetfnq)s' in cmd_pattern, cmd_pattern
+  has_sourcefn = '%(sourcefnq)s' in cmd_pattern
+  cmd = cmd_pattern % {'sourcefnq': ShellQuoteFileName(sourcefn),
+                       'targetfnq': ShellQuoteFileName(targetfn)}
+
+  # Without %(sourcefnq)s the command finds its input in the target file.
+  f = open(sourcefn if has_sourcefn else targetfn, 'wb')
+  try:
+    f.write(compressed_data)
+  finally:
+    f.close()
+
+  LogProportionalInfo('executing zlib optimizer: %s' % cmd)
+  sys.stdout.flush()
+  romode = (None, False)[NeedToolLogOutput()]
+  status = os.system(RedirectOutput(cmd, mode=romode))
+  if status:
+    LogFatal('zlib optimizer has failed (status=0x%x): %s' % (status, cmd))
+  if not os.path.exists(targetfn):  # Not an assert: must work with -O too.
+    LogFatal('zlib optimizer has not created the output file %r: %s' %
+             (targetfn, cmd))
+
+  f = open(targetfn, 'rb')
+  try:
+    compressed_data = f.read()
+  finally:
+    f.close()
+  os.remove(targetfn)
+  if has_sourcefn:
+    os.remove(sourcefn)
+  return compressed_data
+
 
 class Error(Exception):
-  """Comon base class for exceptions defined in this module."""
+  """Common base class for exceptions defined in this module."""
 
 try:
   bytearray_tostring = bytearray.__str__  # Python 2.6 and 2.7.
@@ -733,7 +791,7 @@ def PermissiveZlibDecompress(data):
     if not (8 <= wbits <= 15):
       raise zlib.error('Bad zlib wbits: %d' % wbits)
     if flg & 32:
-      raise zlib.error('Unexpected zlib preset diectionary.')
+      raise zlib.error('Unexpected zlib preset dictionary.')
     # This won't work data = zlib.decompress(buffer(data, 2), -wbits)
     # It may raise: zlib.error: Error -5 while decompressing data: incomplete or truncated stream
     zd = zlib.decompressobj(-wbits)
@@ -1935,7 +1993,7 @@ def SetStreamAndCompress(self, data, may_keep_old=False, is_flate_ok=True,
     if data:
       if is_flate_ok:
         items.append([None, 'zip', PdfObj(self)])
-        items[-1][2].stream = zlib.compress(data, 9)
+        items[-1][2].stream = compat_compress(data, 9)
         items[-1][2].Set('Length', len(items[-1][2].stream))
         items[-1][2].Set('Filter', '/FlateDecode')
         items[-1][2].Set('DecodeParms', None)
@@ -1958,7 +2016,7 @@ def SetStreamAndCompress(self, data, may_keep_old=False, is_flate_ok=True,
           output.append(bytearray_tostring(b))
           i += predictor_width
         items.append([None, 'zip-pred10', PdfObj(self)])
-        items[-1][2].stream = zlib.compress(''.join(output), 9)
+        items[-1][2].stream = compat_compress(''.join(output), 9)
         items[-1][2].Set('Length', len(items[-1][2].stream))
         items[-1][2].Set('Filter', '/FlateDecode')
         # Oddly enough, Multivalent fails if /Predictor 10 or /Predictor 11
@@ -1980,7 +2038,7 @@ def SetStreamAndCompress(self, data, may_keep_old=False, is_flate_ok=True,
           output.append(bytearray_tostring(b))
           i += predictor_width
         items.append([None, 'zip-pred2', PdfObj(self)])
-        items[-1][2].stream = zlib.compress(''.join(output), 9)
+        items[-1][2].stream = compat_compress(''.join(output), 9)
         items[-1][2].Set('Length', len(items[-1][2].stream))
         items[-1][2].Set('Filter', '/FlateDecode')
         items[-1][2].Set('DecodeParms',
@@ -3819,7 +3877,7 @@ def FixFontNameInType1C(self, new_font_name='F', objs=None,
     # TODO(pts): Add generic recompression of all /FlateDecode filters
     #            (because Ghostscript is suboptimal everywhere).
     if self.Get('Filter') != '/FlateDecode' or new_data != data:
-      self.stream = zlib.compress(new_data, 9)
+      self.stream = compat_compress(new_data, 9)
       self.Set('Filter', '/FlateDecode')
       self.Set('DecodeParms', None)
       self.Set('Length', len(self.stream))
@@ -4228,7 +4286,7 @@ def CompressToZipPng(
         output.append(idat[i : i + bytes_per_row])
 
     # TODO(pts): Maybe use a smaller effort? We're not optimizing anyway.
-    self.idat = zlib.compress(''.join(output), effort)
+    self.idat = compat_compress(''.join(output), effort)
     self.compression = 'zip-png'
     if do_try_invert:
       self.is_inverted = not self.is_inverted
@@ -4434,7 +4492,7 @@ def LoadPdfImageObj(self, obj, do_zip, decode_kind=None):
       if do_zip:
         compression = 'zip'
         # TODO(pts): Would a smaller effort (compression level) suffice here?
-        idat = zlib.compress(idat, 9)
+        idat = compat_compress(idat, 9)
     elif predictor in (1, None):
       compression = 'zip'
     elif predictor == 2:
@@ -7566,7 +7624,7 @@ def OptimizeImages(self, img_cmd_patterns, do_fast_bilevel_images):
       else:
         assert image1 is not image2
         if image1.compression == 'none':
-          image1.idat = zlib.compress(image1.idat, 9)
+          image1.idat = compat_compress(image1.idat, 9)
           image1.compression = 'zip'
         if len(image1.idat) < len(image2.idat):
           # For testing: ./pdfsizeopt.py --use-pngout=false PLRM.pdf
@@ -8249,7 +8307,7 @@ def OptimizeStreams(self, do_decompress_only=False):
       else:
         # Try flate with maximum effort.
         obj2 = PdfObj(obj)
-        obj2.stream = zlib.compress(data, 9)
+        obj2.stream = compat_compress(data, 9)
         obj2.Set('Length', len(obj2.stream))
         obj2.Set('Filter', '/FlateDecode')
         obj2.Set('DecodeParms', None)
@@ -9190,7 +9248,8 @@ def FixPdfFromMultivalent(cls, data, output,
   PDFDATA_MULTIVALENT_EXT_SUB_RE = re.compile(r'[.][^.]+\Z')
 
   def _RunMultivalent(self, do_escape_images,
-                      multivalent_compress_command):
+                      multivalent_compress_command,
+                      do_remove_core_fonts):
     """Run Multivalent, and read its output.
 
    Args:
@@ -9236,7 +9295,10 @@ def _RunMultivalent(self, do_escape_images,
     # * Don't add -jpeg, it introduces lossy compression.
     # * Don't add -subset, it's expreimental.
     # * FYI http://code.google.com/p/pdfsizeopt/issues/detail?id=30 .
-    multivalent_flags = '-nopagepiece -noalt -mon'
+    multivalent_flags = '-nopagepiece -noalt -mon -nowebcap -nostruct'
+
+    if do_remove_core_fonts:
+      multivalent_flags += ' -nocore'
 
     # TODO(pts): Work around exception for emptypage.pdf:
     # psotmp.PID.conv.mi.tmp.pdf: java.lang.ClassCastException:
@@ -9275,6 +9337,7 @@ def Save(self, file_name, display_file_name, multivalent_compress_command,
            do_escape_images_from_multivalent,
            do_generate_xref_stream,
            do_generate_object_stream,
+           do_remove_core_fonts,
            is_flate_ok):
     """Save this PDF to a file, with or without Multivalent.
 
@@ -9285,6 +9348,8 @@ def Save(self, file_name, display_file_name, multivalent_compress_command,
         prefix for running Multivalent tool.pdf.Compress.
       do_update_file_meta: bool indicating whether self.file_name and
         self.file_size should be updated after a successful save.
+      do_remove_core_fonts: bool indicating whether Multivalent should unembed
+        the Base14 (core) fonts; ignored without multivalent_compress_command.
       is_flate_ok: bool indicating if it's OK to generate xref and object
         streams with /Filter/FlateDecode.
     """
@@ -9327,7 +9392,8 @@ def Save(self, file_name, display_file_name, multivalent_compress_command,
     if multivalent_compress_command:
       multivalent_output_data, tmp_files_to_remove = self._RunMultivalent(
           do_escape_images=do_escape_images_from_multivalent,
-          multivalent_compress_command=multivalent_compress_command)
+          multivalent_compress_command=multivalent_compress_command,
+          do_remove_core_fonts=do_remove_core_fonts)
     else:
       tmp_files_to_remove = ()
       multivalent_output_data = None
@@ -9547,6 +9613,7 @@ def __init__(self):
     f.use_pngout = f.use_jbig2 = f.use_sam2p_pr = None
     f.mode = 'optimize'
     f.img_cmds = []
+    f.zlib_cmd = ''
     f.args = []
     f.verbosity = 190
     f.tmp_dir = None
@@ -9599,8 +9666,13 @@ def LongHasArgs(opt, longopts):
         if len(value.split()) < 2:
           f.img_cmds.extend(filter(None, value.split(',')))
         else:
-          # Special value 'none' and 'none' are also OK.
+          # Special values 'no' and 'none' are also OK.
           f.img_cmds.append(value)
+      elif flag_name == 'use_zlib_optimizer':
+        value = value.strip()
+        if not value:
+          raise getopt.GetoptError('Empty zlib optimizer command.')
+        f.zlib_cmd = value
       elif flag_name == 'do_double_check_missing_glyphs':  # Legacy flag name.
         f.do_double_check_type1c_output = ParseBoolFlag(key, value)
       elif flag_name == 'v':
@@ -9687,6 +9759,19 @@ def main(argv, script_dir=None, zip_file=None):
         raise getopt.GetoptError('--do-generate-object-stream=yes requires '
                                  '--do-generate-xref-stream=yes')
 
+      zlib_cmd_pattern = f.zlib_cmd  # '' means: no zlib optimizer.
+      if zlib_cmd_pattern:
+        # Validate the zlib pattern itself; errors become command-line errors.
+        cmd_name = GetCmdName(zlib_cmd_pattern)
+        if not cmd_name:
+          raise getopt.GetoptError(
+              'command name missing from zlib optimizer command: %s' %
+              zlib_cmd_pattern)
+        if '%(targetfnq)s' not in zlib_cmd_pattern:
+          raise getopt.GetoptError(
+              'targetfnq missing from zlib optimizer command: %s' %
+              zlib_cmd_pattern)
+
   except getopt.GetoptError, exc:
     LogFatal(
         '%s\nfatal: error in command line: %s' % (welcome_msg, exc), 1)
@@ -9755,11 +9840,38 @@ def main(argv, script_dir=None, zip_file=None):
       has_not_found = True
     else:
       img_cmd_patterns_good.append(cmd_pattern)
+
+  # Locate the zlib optimizer program; mark the pattern good only if found.
+  zlib_cmd_pattern_is_good = False
+  if zlib_cmd_pattern:
+    cmd_pattern = zlib_cmd_pattern
+    # TODO(pts): Use shlex on Linux etc. for parsing the command name.
+    if cmd_pattern.startswith('"'):  # and sys.platform.startswith('win'):
+      # TODO(pts): How is it possible to specify this on the command-line?
+      cmd_prog = cmd_pattern[1 : cmd_pattern.find('"', 1)]
+    else:
+      cmd_prog = (cmd_pattern.split() or ('',))[0]
+    if not cmd_prog:
+      LogFatal('empty zlib optimizer program: %s' % cmd_pattern, 1)
+    if os.path.isabs(cmd_prog):
+      zlib_cmd_pattern_is_good = os.path.isfile(cmd_prog)
+      if not zlib_cmd_pattern_is_good:
+        LogError('zlib optimizer not found: %s' % cmd_prog)
+        has_not_found = True
+    elif FindExeOnPath(cmd_prog):
+      zlib_cmd_pattern_is_good = True
+    else:
+      LogError('zlib optimizer not found on PATH: %s' % cmd_prog)
+      has_not_found = True
+
   if has_not_found and f.do_require_image_optimizers:
     LogFatal(
         'not all image optimizers found (see above), '
         'ignore with --do-require-image-optimizers=no', 3)
   img_cmd_patterns = img_cmd_patterns_good
+  if zlib_cmd_pattern_is_good:
+    global compat_compress
+    compat_compress = lambda data, level: ZlibCmd(data, level, zlib_cmd_pattern)
 
   if output_file_name is None:  # Just --do-debug-gs=yes.
     return
@@ -9817,6 +9929,7 @@ def main(argv, script_dir=None, zip_file=None):
       do_escape_images_from_multivalent=f.do_escape_images_from_multivalent,
       do_generate_xref_stream=f.do_generate_xref_stream,
       do_generate_object_stream=f.do_generate_object_stream,
+      do_remove_core_fonts=f.do_remove_core_fonts,
       is_flate_ok=(f.do_compress_uncompressed_streams and
                    not f.do_decompress_most_streams))
   Rename(output_file_name + '.tmp', output_file_name)