Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 128 additions & 15 deletions lib/pdfsizeopt/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,17 @@
contain at least one space, and %(targetfnq)s, to which the target filename
would be substituted. Additionally, it may contain %(sourcefnq)s for the
source filename. Both of them will be autogenerated temporary filenames.
(There are some other sobstitutions as well in CMD_PATTERN.) The CMD_PATTERN
(There are some other substitutions as well in CMD_PATTERN.) The CMD_PATTERN
can be used to run an image optimizer whose command-line syntax is not
built in to pdfsizeopt.
--use-zlib-optimizer=CMD_PATTERN
Run the specified zlib optimizer program command-line for recompressing
zlib (Flate) streams embedded in the PDF. Can be slow, because pdfsizeopt
decompresses and compresses the same object 2 to 3 times. CMD_PATTERN is an
os.system command-line pattern, which must contain at least one space, and
%(targetfnq)s, to which the target filename would be substituted.
Additionally, it may contain %(sourcefnq)s for the source filename. Both of
them will be autogenerated temporary filenames. (These two are the only
substitutions available in this CMD_PATTERN.)
--do-fast-bilevel-images=YES_NO; default: no
Disable some slow image optimizers for bilevel images in favor of jbig2.
This flag is smart, and it disables optimizers only if jbig2 is enabled,
Expand Down Expand Up @@ -112,6 +120,15 @@
After Type1C font serialization, parse it again, and check that the glyphs
and most other fields are still there? It is slow, but it can reveal some
bugs in Ghostscript.
--do-remove-core-fonts=YES_NO; default: no
Enable unembedding the Base14 PDF fonts. These fonts were guaranteed to be
present with all PDF readers, but while virtually all systems provide
reasonable metric-compatible substitutes for the original Base14 fonts, the
substitutes may have different shapes than the original fonts and, as a
result, the processed document may not look exactly the same as the original
document. Needs the external optimizer tool Multivalent.
Only use this flag if you are willing to have processed documents that may
not look exactly the same as the original document.
--do-optimize-streams=YES_NO; default: yes
Recompress all non-image streams, keep the smallest value. To optimize image
streams, please use --do-optimize-images=yes.
Expand Down Expand Up @@ -225,9 +242,50 @@
from pdfsizeopt import cff
from pdfsizeopt import psproc

# Compression entry point used by the stream and image compression call sites
# in this file instead of calling zlib.compress directly. It defaults to
# zlib.compress; main() rebinds it to a ZlibCmd wrapper when a usable
# --use-zlib-optimizer command has been configured.
compat_compress = zlib.compress

def ZlibCmd(data, level, cmd_pattern):
  """Compress data to a zlib stream by running an external optimizer command.

  The data is first compressed quickly with zlib (level 1) and written to a
  temporary file. Then cmd_pattern is filled in and executed with os.system,
  and the contents of the target temporary file are returned.

  Args:
    data: String containing the uncompressed data.
    level: Not used by this function; it is accepted so that the function can
      be called like zlib.compress(data, level).
    cmd_pattern: Shell command pattern. It must contain %(targetfnq)s, and it
      may contain %(sourcefnq)s. When %(sourcefnq)s is absent, the input is
      written to the target file instead (presumably for optimizers which
      rewrite their file in place -- TODO confirm).
  Returns:
    String containing whatever the command has left in the target file.
  """
  quick_zdata = zlib.compress(data, 1)
  del data  # Save memory.
  # The trailing 4 bytes of the stream (its Adler-32 checksum) are used as a
  # per-input tag in the temporary filenames.
  tag = quick_zdata[-4:].encode('hex')
  sourcefn = TMP_PREFIX + 'zlib-%s.zlib' % tag
  targetfn = TMP_PREFIX + 'zlib-%s-new.zlib' % tag
  cmd_values_dict = {
      'sourcefnq': ShellQuoteFileName(sourcefn),
      'targetfnq': ShellQuoteFileName(targetfn),
  }
  assert '%(targetfnq)s' in cmd_pattern, cmd_pattern
  cmd = cmd_pattern % cmd_values_dict
  has_source = '%(sourcefnq)s' in cmd_pattern

  # Without a source placeholder the command sees only the target file, so
  # the input has to be put there.
  if has_source:
    inputfn = sourcefn
  else:
    inputfn = targetfn
  f = open(inputfn, 'wb')
  try:
    f.write(quick_zdata)
  finally:
    f.close()
  del quick_zdata  # Save memory.

  LogProportionalInfo('executing zlib optimizer: %s' % cmd)
  sys.stdout.flush()
  romode = (None, False)[NeedToolLogOutput()]
  status = os.system(RedirectOutput(cmd, mode=romode))
  if status:
    LogFatal('zlib optimizer has failed (status=0x%x): %s' % (status, cmd))
  assert os.path.exists(targetfn), (
      'zlib optimizer has not created the output file %r: %s' % (targetfn, cmd))

  f = open(targetfn, 'rb')
  try:
    optimized_zdata = f.read()
  finally:
    f.close()

  # Clean up the temporary files. The source file exists only if the command
  # pattern has referenced it.
  os.remove(targetfn)
  if has_source:
    os.remove(sourcefn)
  return optimized_zdata


class Error(Exception):
  """Common base class for exceptions defined in this module."""

try:
bytearray_tostring = bytearray.__str__ # Python 2.6 and 2.7.
Expand Down Expand Up @@ -733,7 +791,7 @@ def PermissiveZlibDecompress(data):
if not (8 <= wbits <= 15):
raise zlib.error('Bad zlib wbits: %d' % wbits)
if flg & 32:
raise zlib.error('Unexpected zlib preset diectionary.')
raise zlib.error('Unexpected zlib preset dictionary.')
# This won't work data = zlib.decompress(buffer(data, 2), -wbits)
# It may raise: zlib.error: Error -5 while decompressing data: incomplete or truncated stream
zd = zlib.decompressobj(-wbits)
Expand Down Expand Up @@ -1935,7 +1993,7 @@ def SetStreamAndCompress(self, data, may_keep_old=False, is_flate_ok=True,
if data:
if is_flate_ok:
items.append([None, 'zip', PdfObj(self)])
items[-1][2].stream = zlib.compress(data, 9)
items[-1][2].stream = compat_compress(data, 9)
items[-1][2].Set('Length', len(items[-1][2].stream))
items[-1][2].Set('Filter', '/FlateDecode')
items[-1][2].Set('DecodeParms', None)
Expand All @@ -1958,7 +2016,7 @@ def SetStreamAndCompress(self, data, may_keep_old=False, is_flate_ok=True,
output.append(bytearray_tostring(b))
i += predictor_width
items.append([None, 'zip-pred10', PdfObj(self)])
items[-1][2].stream = zlib.compress(''.join(output), 9)
items[-1][2].stream = compat_compress(''.join(output), 9)
items[-1][2].Set('Length', len(items[-1][2].stream))
items[-1][2].Set('Filter', '/FlateDecode')
# Oddly enough, Multivalent fails if /Predictor 10 or /Predictor 11
Expand All @@ -1980,7 +2038,7 @@ def SetStreamAndCompress(self, data, may_keep_old=False, is_flate_ok=True,
output.append(bytearray_tostring(b))
i += predictor_width
items.append([None, 'zip-pred2', PdfObj(self)])
items[-1][2].stream = zlib.compress(''.join(output), 9)
items[-1][2].stream = compat_compress(''.join(output), 9)
items[-1][2].Set('Length', len(items[-1][2].stream))
items[-1][2].Set('Filter', '/FlateDecode')
items[-1][2].Set('DecodeParms',
Expand Down Expand Up @@ -3819,7 +3877,7 @@ def FixFontNameInType1C(self, new_font_name='F', objs=None,
# TODO(pts): Add generic recompression of all /FlateDecode filters
# (because Ghostscript is suboptimal everywhere).
if self.Get('Filter') != '/FlateDecode' or new_data != data:
self.stream = zlib.compress(new_data, 9)
self.stream = compat_compress(new_data, 9)
self.Set('Filter', '/FlateDecode')
self.Set('DecodeParms', None)
self.Set('Length', len(self.stream))
Expand Down Expand Up @@ -4228,7 +4286,7 @@ def CompressToZipPng(
output.append(idat[i : i + bytes_per_row])

# TODO(pts): Maybe use a smaller effort? We're not optimizing anyway.
self.idat = zlib.compress(''.join(output), effort)
self.idat = compat_compress(''.join(output), effort)
self.compression = 'zip-png'
if do_try_invert:
self.is_inverted = not self.is_inverted
Expand Down Expand Up @@ -4434,7 +4492,7 @@ def LoadPdfImageObj(self, obj, do_zip, decode_kind=None):
if do_zip:
compression = 'zip'
# TODO(pts): Would a smaller effort (compression level) suffice here?
idat = zlib.compress(idat, 9)
idat = compat_compress(idat, 9)
elif predictor in (1, None):
compression = 'zip'
elif predictor == 2:
Expand Down Expand Up @@ -7566,7 +7624,7 @@ def OptimizeImages(self, img_cmd_patterns, do_fast_bilevel_images):
else:
assert image1 is not image2
if image1.compression == 'none':
image1.idat = zlib.compress(image1.idat, 9)
image1.idat = compat_compress(image1.idat, 9)
image1.compression = 'zip'
if len(image1.idat) < len(image2.idat):
# For testing: ./pdfsizeopt.py --use-pngout=false PLRM.pdf
Expand Down Expand Up @@ -8249,7 +8307,7 @@ def OptimizeStreams(self, do_decompress_only=False):
else:
# Try flate with maximum effort.
obj2 = PdfObj(obj)
obj2.stream = zlib.compress(data, 9)
obj2.stream = compat_compress(data, 9)
obj2.Set('Length', len(obj2.stream))
obj2.Set('Filter', '/FlateDecode')
obj2.Set('DecodeParms', None)
Expand Down Expand Up @@ -9190,7 +9248,8 @@ def FixPdfFromMultivalent(cls, data, output,
PDFDATA_MULTIVALENT_EXT_SUB_RE = re.compile(r'[.][^.]+\Z')

def _RunMultivalent(self, do_escape_images,
multivalent_compress_command):
multivalent_compress_command,
do_remove_core_fonts):
"""Run Multivalent, and read its output.

Args:
Expand Down Expand Up @@ -9236,7 +9295,10 @@ def _RunMultivalent(self, do_escape_images,
# * Don't add -jpeg, it introduces lossy compression.
# * Don't add -subset, it's experimental.
# * FYI http://code.google.com/p/pdfsizeopt/issues/detail?id=30 .
multivalent_flags = '-nopagepiece -noalt -mon'
multivalent_flags = '-nopagepiece -noalt -mon -nowebcap -nostruct'

if do_remove_core_fonts:
multivalent_flags += ' -nocore'

# TODO(pts): Work around exception for emptypage.pdf:
# psotmp.PID.conv.mi.tmp.pdf: java.lang.ClassCastException:
Expand Down Expand Up @@ -9275,6 +9337,7 @@ def Save(self, file_name, display_file_name, multivalent_compress_command,
do_escape_images_from_multivalent,
do_generate_xref_stream,
do_generate_object_stream,
do_remove_core_fonts,
is_flate_ok):
"""Save this PDF to a file, with or without Multivalent.

Expand All @@ -9285,6 +9348,8 @@ def Save(self, file_name, display_file_name, multivalent_compress_command,
prefix for running Multivalent tool.pdf.Compress.
do_update_file_meta: bool indicating whether self.file_name and
self.file_size should be updated after a successful save.
do_remove_core_fonts: bool indicating whether we should unembed the
core14 fonts.
is_flate_ok: bool indicating if it's OK to generate xref and object
streams with /Filter/FlateDecode.
"""
Expand Down Expand Up @@ -9327,7 +9392,8 @@ def Save(self, file_name, display_file_name, multivalent_compress_command,
if multivalent_compress_command:
multivalent_output_data, tmp_files_to_remove = self._RunMultivalent(
do_escape_images=do_escape_images_from_multivalent,
multivalent_compress_command=multivalent_compress_command)
multivalent_compress_command=multivalent_compress_command,
do_remove_core_fonts=do_remove_core_fonts)
else:
tmp_files_to_remove = ()
multivalent_output_data = None
Expand Down Expand Up @@ -9547,6 +9613,7 @@ def __init__(self):
f.use_pngout = f.use_jbig2 = f.use_sam2p_pr = None
f.mode = 'optimize'
f.img_cmds = []
f.zlib_cmd = ''
f.args = []
f.verbosity = 190
f.tmp_dir = None
Expand Down Expand Up @@ -9599,8 +9666,13 @@ def LongHasArgs(opt, longopts):
if len(value.split()) < 2:
f.img_cmds.extend(filter(None, value.split(',')))
else:
# Special value 'none' and 'none' are also OK.
# Special value 'no' and 'none' are also OK.
f.img_cmds.append(value)
elif flag_name == 'use_zlib_optimizer':
value = value.strip()
if not value:
raise getopt.GetoptError('Empty zlib optimizer command.')
f.zlib_cmd = value
elif flag_name == 'do_double_check_missing_glyphs': # Legacy flag name.
f.do_double_check_type1c_output = ParseBoolFlag(key, value)
elif flag_name == 'v':
Expand Down Expand Up @@ -9687,6 +9759,19 @@ def main(argv, script_dir=None, zip_file=None):
raise getopt.GetoptError('--do-generate-object-stream=yes requires '
'--do-generate-xref-stream=yes')

zlib_cmd_pattern = ''
if f.zlib_cmd:
  # Validate the zlib optimizer pattern itself (f.zlib_cmd). No variable named
  # cmd_pattern holds this value here, so all checks and error messages must
  # refer to f.zlib_cmd.
  cmd_name = GetCmdName(f.zlib_cmd)
  if not cmd_name:
    raise getopt.GetoptError(
        'command name missing from zlib optimizer command: %s' %
        f.zlib_cmd)
  # ZlibCmd reads the optimizer output from the target file, so the pattern
  # has to mention it.
  if '%(targetfnq)s' not in f.zlib_cmd:
    raise getopt.GetoptError(
        'targetfnq missing from zlib optimizer command: %s' %
        f.zlib_cmd)
  zlib_cmd_pattern = f.zlib_cmd

except getopt.GetoptError, exc:
LogFatal(
'%s\nfatal: error in command line: %s' % (welcome_msg, exc), 1)
Expand Down Expand Up @@ -9755,11 +9840,38 @@ def main(argv, script_dir=None, zip_file=None):
has_not_found = True
else:
img_cmd_patterns_good.append(cmd_pattern)

zlib_cmd_pattern_is_good = False
if zlib_cmd_pattern:
cmd_pattern = zlib_cmd_pattern
# TODO(pts): Use shlex on Linux etc. for parsing the command name.
if cmd_pattern.startswith('"'): # and sys.platform.startswith('win'):
# TODO(pts): How is it possible to specify this on the command-line?
cmd_prog = cmd_pattern[1 : cmd_pattern.find('"', 1)]
else:
cmd_prog = (cmd_pattern.split() or ('',))[0]
if not cmd_prog:
LogFatal('empty zlib optimizer program: %s' % cmd_prog, 1)
if os.path.isabs(cmd_prog):
if not os.path.isfile(cmd_prog):
LogError('zlib optimizer not found: %s' % cmd_prog)
has_not_found = True
else:
zlib_cmd_pattern_is_good = True
elif not FindExeOnPath(cmd_prog):
LogError('zlib optimizer not found on PATH: %s' % cmd_prog)
has_not_found = True
else:
zlib_cmd_pattern_is_good = True

if has_not_found and f.do_require_image_optimizers:
LogFatal(
'not all image optimizers found (see above), '
'ignore with --do-require-image-optimizers=no', 3)
img_cmd_patterns = img_cmd_patterns_good
if zlib_cmd_pattern_is_good:
global compat_compress
compat_compress = lambda data, level: ZlibCmd(data, level, zlib_cmd_pattern)

if output_file_name is None: # Just --do-debug-gs=yes.
return
Expand Down Expand Up @@ -9817,6 +9929,7 @@ def main(argv, script_dir=None, zip_file=None):
do_escape_images_from_multivalent=f.do_escape_images_from_multivalent,
do_generate_xref_stream=f.do_generate_xref_stream,
do_generate_object_stream=f.do_generate_object_stream,
do_remove_core_fonts=f.do_remove_core_fonts,
is_flate_ok=(f.do_compress_uncompressed_streams and
not f.do_decompress_most_streams))
Rename(output_file_name + '.tmp', output_file_name)