issue #2959: changes based on @hugovk's review

This commit is contained in:
Dvořák Václav 2018-01-31 00:25:04 +01:00
parent ede57b91e0
commit 9be8d669f9
6 changed files with 54 additions and 56 deletions

View File

@ -107,19 +107,23 @@ class TestFilePdf(PillowTestCase):
def test_pdf_open(self):
# fail on a buffer full of null bytes
self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536))
# make an empty PDF object
with pdfParser.PdfParser() as empty_pdf:
self.assertEqual(len(empty_pdf.pages), 0)
self.assertEqual(len(empty_pdf.info), 0)
self.assertFalse(empty_pdf.should_close_buf)
self.assertFalse(empty_pdf.should_close_file)
# make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB")
# open the PDF file
with pdfParser.PdfParser(filename=pdf_filename) as hopper_pdf:
self.assertEqual(len(hopper_pdf.pages), 1)
self.assertTrue(hopper_pdf.should_close_buf)
self.assertTrue(hopper_pdf.should_close_file)
# read a PDF file from a buffer with a non-zero offset
with open(pdf_filename, "rb") as f:
content = b"xyzzy" + f.read()
@ -127,6 +131,7 @@ class TestFilePdf(PillowTestCase):
self.assertEqual(len(hopper_pdf.pages), 1)
self.assertFalse(hopper_pdf.should_close_buf)
self.assertFalse(hopper_pdf.should_close_file)
# read a PDF file from an already open file
with open(pdf_filename, "rb") as f:
with pdfParser.PdfParser(f=f) as hopper_pdf:
@ -145,11 +150,13 @@ class TestFilePdf(PillowTestCase):
def test_pdf_append(self):
# make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB", producer="pdfParser")
# open it, check pages and info
with pdfParser.PdfParser(pdf_filename, mode="r+b") as pdf:
self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 1)
self.assertEqual(pdf.info.Producer, "pdfParser")
# append some info
pdf.info.Title = "abc"
pdf.info.Author = "def"
@ -157,16 +164,19 @@ class TestFilePdf(PillowTestCase):
pdf.info.Keywords = "qw)e\\r(ty"
pdf.info.Creator = "hopper()"
pdf.start_writing()
pdf.write_xref_and_trailer(f)
pdf.write_xref_and_trailer()
# open it again, check pages and info again
with pdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 6)
self.assertEqual(pdf.info.Title, "abc")
# append two images
mode_CMYK = hopper("CMYK")
mode_P = hopper("P")
mode_CMYK.save(pdf_filename, append=True, save_all=True, append_images=[mode_P])
# open the PDF again, check pages and info again
with pdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.pages), 3)
@ -177,9 +187,10 @@ class TestFilePdf(PillowTestCase):
self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty")
self.assertEqual(pdf.info.Subject, u"ghi\uABCD")
def test_pdf_append(self):
def test_pdf_info(self):
# make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB", title="title", author="author", subject="subject", keywords="keywords", creator="creator", producer="producer")
# open it, check pages and info
with pdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.info), 6)

View File

@ -250,7 +250,7 @@ class TestCffi(AccessTest):
class TestEmbeddable(unittest.TestCase):
@unittest.skipIf(not sys.platform.startswith('win32') or
@unittest.skipIf(sys.platform.startswith('win32') or
sys.version_info[:2] == (3, 4) or
on_appveyor(), # failing on appveyor when run from
# subprocess, not from shell

View File

@ -1,6 +1,6 @@
from helper import unittest, PillowTestCase
from PIL.pdfParser import *
from PIL.pdfParser import IndirectObjectDef, IndirectReference, PdfBinary, PdfDict, PdfFormatError, PdfName, PdfParser, PdfStream, decode_text, encode_text, pdf_repr
class TestPdfParser(PillowTestCase):
@ -12,14 +12,14 @@ class TestPdfParser(PillowTestCase):
self.assertEqual(decode_text(b"\x1B a \x1C"), u"\u02D9 a \u02DD")
def test_indirect_refs(self):
    # Equality must depend on both numbers passed to the constructor and on
    # the concrete class; a plain tuple with the same numbers is never equal.
    # Each assertion is stated once (the same checks were previously listed
    # twice, differing only in whitespace after the commas).
    self.assertEqual(IndirectReference(1, 2), IndirectReference(1, 2))
    self.assertNotEqual(IndirectReference(1, 2), IndirectReference(1, 3))
    self.assertNotEqual(IndirectReference(1, 2), IndirectObjectDef(1, 2))
    self.assertNotEqual(IndirectReference(1, 2), (1, 2))
    self.assertEqual(IndirectObjectDef(1, 2), IndirectObjectDef(1, 2))
    self.assertNotEqual(IndirectObjectDef(1, 2), IndirectObjectDef(1, 3))
    self.assertNotEqual(IndirectObjectDef(1, 2), IndirectReference(1, 2))
    self.assertNotEqual(IndirectObjectDef(1, 2), (1, 2))
def test_parsing(self):
self.assertEqual(PdfParser.interpret_name(b"Name#23Hash"), b"Name#Hash")
@ -64,18 +64,18 @@ class TestPdfParser(PillowTestCase):
self.assertEqual(s.decode(), b"abcde")
def test_pdf_repr(self):
self.assertEqual(bytes(IndirectReference(1,2)), b"1 2 R")
self.assertEqual(bytes(IndirectObjectDef(*IndirectReference(1,2))), b"1 2 obj")
self.assertEqual(bytes(IndirectReference(1, 2)), b"1 2 R")
self.assertEqual(bytes(IndirectObjectDef(*IndirectReference(1, 2))), b"1 2 obj")
self.assertEqual(bytes(PdfName(b"Name#Hash")), b"/Name#23Hash")
self.assertEqual(bytes(PdfName("Name#Hash")), b"/Name#23Hash")
self.assertEqual(bytes(PdfDict({b"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(bytes(PdfDict({"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(pdf_repr(IndirectReference(1,2)), b"1 2 R")
self.assertEqual(pdf_repr(IndirectObjectDef(*IndirectReference(1,2))), b"1 2 obj")
self.assertEqual(bytes(PdfDict({b"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(bytes(PdfDict({"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(pdf_repr(IndirectReference(1, 2)), b"1 2 R")
self.assertEqual(pdf_repr(IndirectObjectDef(*IndirectReference(1, 2))), b"1 2 obj")
self.assertEqual(pdf_repr(PdfName(b"Name#Hash")), b"/Name#23Hash")
self.assertEqual(pdf_repr(PdfName("Name#Hash")), b"/Name#23Hash")
self.assertEqual(pdf_repr(PdfDict({b"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(pdf_repr(PdfDict({"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(pdf_repr(PdfDict({b"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(pdf_repr(PdfDict({"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(pdf_repr(123), b"123")
self.assertEqual(pdf_repr(True), b"true")
self.assertEqual(pdf_repr(False), b"false")

View File

@ -616,7 +616,7 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum
A list of images to append as additional frames. Each of the
images in the list can be single or multiframe images. Note however, that for
correct results, all the appended images should have the same
encoderinfo and encoderconfig properties.
``encoderinfo`` and ``encoderconfig`` properties.
.. versionadded:: 4.2.0
@ -973,7 +973,7 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum
**append**
Set to True to append pages to an existing PDF file. If the file doesn't
exist, an IOError will be raised.
exist, an :py:exc:`IOError` will be raised.
.. versionadded:: 5.1.0

View File

@ -21,7 +21,6 @@
##
from . import Image, ImageFile, ImageSequence, pdfParser
from ._binary import i8
import io
__version__ = "0.5"

View File

@ -1,20 +1,19 @@
import codecs
import collections
import io
import mmap
import os
import re
import sys
import zlib
try:
from UserDict import UserDict
from UserDict import UserDict # Python 2.x
except ImportError:
UserDict = collections.UserDict
UserDict = collections.UserDict # Python 3.x
if str == bytes:  # Python 2.x
    def make_bytes(s):  # pragma: no cover
        """Return *s* unchanged: on Python 2 ``str`` is already ``bytes``."""
        return s  # pragma: no cover
else:  # Python 3.x
    def make_bytes(s):
        """Encode the text string *s* to ``bytes`` using the US-ASCII codec.

        Raises UnicodeEncodeError if *s* contains non-ASCII characters.
        """
        return s.encode("us-ascii")
@ -68,6 +67,8 @@ PDFDocEncoding = {
0x9E: u"\u017E",
0xA0: u"\u20AC",
}
def decode_text(b):
if b[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
return b[len(codecs.BOM_UTF16_BE):].decode("utf_16_be")
@ -181,7 +182,7 @@ class XrefTable:
return startxref
class PdfName():
class PdfName:
def __init__(self, name):
if isinstance(name, PdfName):
self.name = name.name
@ -203,7 +204,8 @@ class PdfName():
def from_pdf_stream(klass, data):
return klass(PdfParser.interpret_name(data))
allowed_chars = set(range(33,127)) - set((ord(c) for c in "#%/()<>[]{}"))
allowed_chars = set(range(33,127)) - set(ord(c) for c in "#%/()<>[]{}")
def __bytes__(self):
if str == bytes: # Python 2.x
result = bytearray(b"/")
@ -495,16 +497,13 @@ class PdfParser:
self.info = PdfDict()
else:
self.info = PdfDict(self.read_indirect(self.info_ref))
#print(repr(self.root))
check_format_condition(b"Type" in self.root, "/Type missing in Root")
check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog")
check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference")
self.pages_ref = self.root[b"Pages"]
self.page_tree_root = self.read_indirect(self.pages_ref)
#print("page_tree_root: " + str(self.page_tree_root))
self.pages = self.linearize_page_tree(self.page_tree_root)
#print("pages: " + str(self.pages))
def next_object_id(self, offset=None):
try:
@ -524,16 +523,15 @@ class PdfParser:
whitespace_mandatory = whitespace + b"+"
newline_only = br"[\r\n]+"
newline = whitespace_optional + newline_only + whitespace_optional
re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_optional + br"\<\<(.*\>\>)" + newline \
re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_optional + br"\<\<(.*\>\>)" + newline
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL)
re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_optional + br"\<\<(.*?\>\>)" + newline \
re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_optional + br"\<\<(.*?\>\>)" + newline
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL)
def read_trailer(self):
search_start_offset = len(self.buf) - 16384
if search_start_offset < self.start_offset:
search_start_offset = self.start_offset
#data_at_end = self.buf[search_start_offset:]
#m = self.re_trailer_end.search(data_at_end)
m = self.re_trailer_end.search(self.buf, search_start_offset)
check_format_condition(m, "trailer end not found")
# make sure we found the LAST trailer
@ -544,12 +542,10 @@ class PdfParser:
if not m:
m = last_match
trailer_data = m.group(1)
#print(trailer_data)
self.last_xref_section_offset = int(m.group(2))
self.trailer_dict = self.interpret_trailer(trailer_data)
self.xref_table = XrefTable()
self.read_xref_table(xref_section_offset=self.last_xref_section_offset)
#print(self.xref_table)
if b"Prev" in self.trailer_dict:
self.read_prev_trailer(self.trailer_dict[b"Prev"])
@ -558,7 +554,6 @@ class PdfParser:
m = self.re_trailer_prev.search(self.buf[trailer_offset:trailer_offset+16384])
check_format_condition(m, "previous trailer not found")
trailer_data = m.group(1)
#print(trailer_data)
check_format_condition(int(m.group(2)) == xref_section_offset, "xref section offset in previous trailer doesn't match what was expected")
trailer_dict = self.interpret_trailer(trailer_data)
if b"Prev" in trailer_dict:
@ -568,6 +563,7 @@ class PdfParser:
re_name = re.compile(whitespace_optional + br"/([!-$&'*-.0-;=?-Z\\^-z|~]+)(?=" + delimiter_or_ws + br")")
re_dict_start = re.compile(whitespace_optional + br"\<\<")
re_dict_end = re.compile(whitespace_optional + br"\>\>" + whitespace_optional)
@classmethod
def interpret_trailer(klass, trailer_data):
trailer = {}
@ -579,15 +575,14 @@ class PdfParser:
check_format_condition(m and m.end() == len(trailer_data), "name not found in trailer, remaining data: " + repr(trailer_data[offset:]))
break
key = klass.interpret_name(m.group(1))
#print(key)
value, offset = klass.get_value(trailer_data, m.end())
#print(value)
trailer[key] = value
check_format_condition(b"Size" in trailer and isinstance(trailer[b"Size"], int), "/Size not in trailer or not an integer")
check_format_condition(b"Root" in trailer and isinstance(trailer[b"Root"], IndirectReference), "/Root not in trailer or not an indirect reference")
return trailer
re_hashes_in_name = re.compile(br"([^#]*)(#([0-9a-fA-F]{2}))?")
@classmethod
def interpret_name(klass, raw, as_text=False):
name = b""
@ -616,10 +611,11 @@ class PdfParser:
re_comment = re.compile(br"(" + whitespace_optional + br"%[^\r\n]*" + newline + br")*")
re_stream_start = re.compile(whitespace_optional + br"stream\r?\n")
re_stream_end = re.compile(whitespace_optional + br"endstream(?=" + delimiter_or_ws + br")")
@classmethod
def get_value(klass, data, offset, expect_indirect=None, max_nesting=-1):
#if max_nesting == 0:
# return None, None
if max_nesting == 0:
return None, None
m = klass.re_comment.match(data, offset)
if m:
offset = m.end()
@ -645,26 +641,22 @@ class PdfParser:
if m:
offset = m.end()
result = {}
#print("<<")
m = klass.re_dict_end.match(data, offset)
while not m:
key, offset = klass.get_value(data, offset, max_nesting=max_nesting-1)
#print ("key " + str(key))
if offset is None:
return result, None
value, offset = klass.get_value(data, offset, max_nesting=max_nesting-1)
result[key] = value
#print ("value " + str(value))
if offset is None:
return result, None
m = klass.re_dict_end.match(data, offset)
#print(">>")
offset = m.end()
m = klass.re_stream_start.match(data, offset)
if m:
try:
stream_len = int(result[b"Length"])
except:
except (TypeError, KeyError, ValueError):
raise PdfFormatError("bad or missing Length in stream dict (%r)" % result.get(b"Length", None))
stream_data = data[m.end():m.end() + stream_len]
m = klass.re_stream_end.match(data, m.end() + stream_len)
@ -682,7 +674,6 @@ class PdfParser:
while not m:
value, offset = klass.get_value(data, offset, max_nesting=max_nesting-1)
result.append(value)
#print ("item " + str(value))
if offset is None:
return result, None
m = klass.re_array_end.match(data, offset)
@ -717,7 +708,6 @@ class PdfParser:
#return None, offset # fallback (only for debugging)
raise PdfFormatError("unrecognized object: " + repr(data[offset:offset+32]))
re_lit_str_token = re.compile(br"(\\[nrtbf()\\])|(\\[0-9]{1,3})|(\\(\r\n|\r|\n))|(\r\n|\r|\n)|(\()|(\))")
escaped_chars = {
b"n": b"\n",
@ -737,6 +727,7 @@ class PdfParser:
ord(b")"): b")",
ord(b"\\"): b"\\",
}
@classmethod
def get_literal_string(klass, data, offset):
nesting_depth = 0
@ -746,7 +737,6 @@ class PdfParser:
if m.group(1):
result.extend(klass.escaped_chars[m.group(1)[1]])
elif m.group(2):
#result.append(eval(m.group(1)))
result.append(int(m.group(2)[1:], 8))
elif m.group(3):
pass
@ -763,10 +753,10 @@ class PdfParser:
offset = m.end()
raise PdfFormatError("unfinished literal string")
re_xref_section_start = re.compile(whitespace_optional + br"xref" + newline)
re_xref_subsection_start = re.compile(whitespace_optional + br"([0-9]+)" + whitespace_mandatory + br"([0-9]+)" + whitespace_optional + newline_only)
re_xref_entry = re.compile(br"([0-9]{10}) ([0-9]{5}) ([fn])( \r| \n|\r\n)")
def read_xref_table(self, xref_section_offset):
subsection_found = False
m = self.re_xref_section_start.match(self.buf, xref_section_offset + self.start_offset)
@ -793,20 +783,18 @@ class PdfParser:
self.xref_table[i] = new_entry
return offset
def read_indirect(self, ref, max_nesting=-1):
    """Read and parse the indirect object that *ref* points to.

    *ref* is indexed as ``(object_id, generation)``. The object's offset and
    generation are looked up in ``self.xref_table`` by object ID, and the
    value is parsed from ``self.buf`` at that offset plus
    ``self.start_offset``. *max_nesting* is forwarded to ``get_value``.

    Returns the parsed value (the first element of ``get_value``'s result).

    A generation mismatch between the xref table and *ref* is reported
    through ``check_format_condition`` (presumably raising PdfFormatError,
    as for the other format checks in this parser -- confirm against its
    definition).
    """
    offset, generation = self.xref_table[ref[0]]
    # Use an explicit format check instead of ``assert``: assertions are
    # stripped under ``python -O``, which would silently skip validation.
    check_format_condition(
        generation == ref[1],
        "expected to find generation %r for object ID %r in xref table, "
        "instead found generation %r at offset %r"
        % (ref[1], ref[0], generation, offset))
    value, _ = self.get_value(
        self.buf, offset + self.start_offset,
        expect_indirect=IndirectReference(*ref), max_nesting=max_nesting)
    return value
def linearize_page_tree(self, node=None):
if node is None:
node = self.page_tree_root
check_format_condition(node[b"Type"] == b"Pages", "/Type of page tree node is not /Pages")
pages = []
for kid in node[b"Kids"]:
kid_object = self.read_indirect(kid, max_nesting=3)
kid_object = self.read_indirect(kid)
if kid_object[b"Type"] == b"Page":
pages.append(kid)
else: