diff --git a/Tests/test_file_pdf.py b/Tests/test_file_pdf.py index 824f6149a..173e4d5c7 100644 --- a/Tests/test_file_pdf.py +++ b/Tests/test_file_pdf.py @@ -107,19 +107,23 @@ class TestFilePdf(PillowTestCase): def test_pdf_open(self): # fail on a buffer full of null bytes self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536)) + # make an empty PDF object with pdfParser.PdfParser() as empty_pdf: self.assertEqual(len(empty_pdf.pages), 0) self.assertEqual(len(empty_pdf.info), 0) self.assertFalse(empty_pdf.should_close_buf) self.assertFalse(empty_pdf.should_close_file) + # make a PDF file pdf_filename = self.helper_save_as_pdf("RGB") + # open the PDF file with pdfParser.PdfParser(filename=pdf_filename) as hopper_pdf: self.assertEqual(len(hopper_pdf.pages), 1) self.assertTrue(hopper_pdf.should_close_buf) self.assertTrue(hopper_pdf.should_close_file) + # read a PDF file from a buffer with a non-zero offset with open(pdf_filename, "rb") as f: content = b"xyzzy" + f.read() @@ -127,6 +131,7 @@ class TestFilePdf(PillowTestCase): self.assertEqual(len(hopper_pdf.pages), 1) self.assertFalse(hopper_pdf.should_close_buf) self.assertFalse(hopper_pdf.should_close_file) + # read a PDF file from an already open file with open(pdf_filename, "rb") as f: with pdfParser.PdfParser(f=f) as hopper_pdf: @@ -145,11 +150,13 @@ class TestFilePdf(PillowTestCase): def test_pdf_append(self): # make a PDF file pdf_filename = self.helper_save_as_pdf("RGB", producer="pdfParser") + # open it, check pages and info with pdfParser.PdfParser(pdf_filename, mode="r+b") as pdf: self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.info), 1) self.assertEqual(pdf.info.Producer, "pdfParser") + # append some info pdf.info.Title = "abc" pdf.info.Author = "def" @@ -157,16 +164,19 @@ class TestFilePdf(PillowTestCase): pdf.info.Keywords = "qw)e\\r(ty" pdf.info.Creator = "hopper()" pdf.start_writing() - pdf.write_xref_and_trailer(f) + pdf.write_xref_and_trailer() + # open it 
again, check pages and info again with pdfParser.PdfParser(pdf_filename) as pdf: self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.info), 6) self.assertEqual(pdf.info.Title, "abc") + # append two images mode_CMYK = hopper("CMYK") mode_P = hopper("P") mode_CMYK.save(pdf_filename, append=True, save_all=True, append_images=[mode_P]) + # open the PDF again, check pages and info again with pdfParser.PdfParser(pdf_filename) as pdf: self.assertEqual(len(pdf.pages), 3) @@ -177,9 +187,10 @@ class TestFilePdf(PillowTestCase): self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty") self.assertEqual(pdf.info.Subject, u"ghi\uABCD") - def test_pdf_append(self): + def test_pdf_info(self): # make a PDF file pdf_filename = self.helper_save_as_pdf("RGB", title="title", author="author", subject="subject", keywords="keywords", creator="creator", producer="producer") + # open it, check pages and info with pdfParser.PdfParser(pdf_filename) as pdf: self.assertEqual(len(pdf.info), 6) diff --git a/Tests/test_image_access.py b/Tests/test_image_access.py index 464e0a208..0fcc5a689 100644 --- a/Tests/test_image_access.py +++ b/Tests/test_image_access.py @@ -250,7 +250,7 @@ class TestCffi(AccessTest): class TestEmbeddable(unittest.TestCase): - @unittest.skipIf(not sys.platform.startswith('win32') or + @unittest.skipIf(not sys.platform.startswith('win32') or sys.version_info[:2] == (3, 4) or on_appveyor(), # failing on appveyor when run from # subprocess, not from shell diff --git a/Tests/test_pdfparser.py b/Tests/test_pdfparser.py index bf1066ff1..af6170008 100644 --- a/Tests/test_pdfparser.py +++ b/Tests/test_pdfparser.py @@ -1,6 +1,6 @@ from helper import unittest, PillowTestCase -from PIL.pdfParser import * +from PIL.pdfParser import IndirectObjectDef, IndirectReference, PdfBinary, PdfDict, PdfFormatError, PdfName, PdfParser, PdfStream, decode_text, encode_text, pdf_repr class TestPdfParser(PillowTestCase): @@ -12,14 +12,14 @@ class TestPdfParser(PillowTestCase): 
self.assertEqual(decode_text(b"\x1B a \x1C"), u"\u02D9 a \u02DD") def test_indirect_refs(self): - self.assertEqual(IndirectReference(1,2), IndirectReference(1,2)) - self.assertNotEqual(IndirectReference(1,2), IndirectReference(1,3)) - self.assertNotEqual(IndirectReference(1,2), IndirectObjectDef(1,2)) - self.assertNotEqual(IndirectReference(1,2), (1,2)) - self.assertEqual(IndirectObjectDef(1,2), IndirectObjectDef(1,2)) - self.assertNotEqual(IndirectObjectDef(1,2), IndirectObjectDef(1,3)) - self.assertNotEqual(IndirectObjectDef(1,2), IndirectReference(1,2)) - self.assertNotEqual(IndirectObjectDef(1,2), (1,2)) + self.assertEqual(IndirectReference(1, 2), IndirectReference(1, 2)) + self.assertNotEqual(IndirectReference(1, 2), IndirectReference(1, 3)) + self.assertNotEqual(IndirectReference(1, 2), IndirectObjectDef(1, 2)) + self.assertNotEqual(IndirectReference(1, 2), (1, 2)) + self.assertEqual(IndirectObjectDef(1, 2), IndirectObjectDef(1, 2)) + self.assertNotEqual(IndirectObjectDef(1, 2), IndirectObjectDef(1, 3)) + self.assertNotEqual(IndirectObjectDef(1, 2), IndirectReference(1, 2)) + self.assertNotEqual(IndirectObjectDef(1, 2), (1, 2)) def test_parsing(self): self.assertEqual(PdfParser.interpret_name(b"Name#23Hash"), b"Name#Hash") @@ -64,18 +64,18 @@ class TestPdfParser(PillowTestCase): self.assertEqual(s.decode(), b"abcde") def test_pdf_repr(self): - self.assertEqual(bytes(IndirectReference(1,2)), b"1 2 R") - self.assertEqual(bytes(IndirectObjectDef(*IndirectReference(1,2))), b"1 2 obj") + self.assertEqual(bytes(IndirectReference(1, 2)), b"1 2 R") + self.assertEqual(bytes(IndirectObjectDef(*IndirectReference(1, 2))), b"1 2 obj") self.assertEqual(bytes(PdfName(b"Name#Hash")), b"/Name#23Hash") self.assertEqual(bytes(PdfName("Name#Hash")), b"/Name#23Hash") - self.assertEqual(bytes(PdfDict({b"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>") - self.assertEqual(bytes(PdfDict({"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>") - 
self.assertEqual(pdf_repr(IndirectReference(1,2)), b"1 2 R") - self.assertEqual(pdf_repr(IndirectObjectDef(*IndirectReference(1,2))), b"1 2 obj") + self.assertEqual(bytes(PdfDict({b"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>") + self.assertEqual(bytes(PdfDict({"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>") + self.assertEqual(pdf_repr(IndirectReference(1, 2)), b"1 2 R") + self.assertEqual(pdf_repr(IndirectObjectDef(*IndirectReference(1, 2))), b"1 2 obj") self.assertEqual(pdf_repr(PdfName(b"Name#Hash")), b"/Name#23Hash") self.assertEqual(pdf_repr(PdfName("Name#Hash")), b"/Name#23Hash") - self.assertEqual(pdf_repr(PdfDict({b"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>") - self.assertEqual(pdf_repr(PdfDict({"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>") + self.assertEqual(pdf_repr(PdfDict({b"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>") + self.assertEqual(pdf_repr(PdfDict({"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>") self.assertEqual(pdf_repr(123), b"123") self.assertEqual(pdf_repr(True), b"true") self.assertEqual(pdf_repr(False), b"false") diff --git a/docs/handbook/image-file-formats.rst b/docs/handbook/image-file-formats.rst index fb6b3f2a8..d265561de 100644 --- a/docs/handbook/image-file-formats.rst +++ b/docs/handbook/image-file-formats.rst @@ -616,7 +616,7 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum A list of images to append as additional frames. Each of the images in the list can be single or multiframe images. Note however, that for correct results, all the appended images should have the same - encoderinfo and encoderconfig properties. + ``encoderinfo`` and ``encoderconfig`` properties. .. versionadded:: 4.2.0 @@ -973,7 +973,7 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum **append** Set to True to append pages to an existing PDF file. If the file doesn't - exist, an IOError will be raised. 
+ exist, an :py:exc:`IOError` will be raised. .. versionadded:: 5.1.0 diff --git a/src/PIL/PdfImagePlugin.py b/src/PIL/PdfImagePlugin.py index 6b90db12e..33a22b9d0 100644 --- a/src/PIL/PdfImagePlugin.py +++ b/src/PIL/PdfImagePlugin.py @@ -21,7 +21,6 @@ ## from . import Image, ImageFile, ImageSequence, pdfParser -from ._binary import i8 import io __version__ = "0.5" diff --git a/src/PIL/pdfParser.py b/src/PIL/pdfParser.py index be4f2ccf1..6c4942112 100644 --- a/src/PIL/pdfParser.py +++ b/src/PIL/pdfParser.py @@ -1,20 +1,19 @@ import codecs import collections -import io import mmap import os import re -import sys import zlib try: - from UserDict import UserDict + from UserDict import UserDict # Python 2.x except ImportError: - UserDict = collections.UserDict + UserDict = collections.UserDict # Python 3.x if str == bytes: # Python 2.x - make_bytes = lambda s: s # pragma: no cover + def make_bytes(s): # pragma: no cover + return s # pragma: no cover else: # Python 3.x def make_bytes(s): return s.encode("us-ascii") @@ -68,6 +67,8 @@ PDFDocEncoding = { 0x9E: u"\u017E", 0xA0: u"\u20AC", } + + def decode_text(b): if b[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE: return b[len(codecs.BOM_UTF16_BE):].decode("utf_16_be") @@ -181,7 +182,7 @@ class XrefTable: return startxref -class PdfName(): +class PdfName: def __init__(self, name): if isinstance(name, PdfName): self.name = name.name @@ -203,7 +204,8 @@ class PdfName(): def from_pdf_stream(klass, data): return klass(PdfParser.interpret_name(data)) - allowed_chars = set(range(33,127)) - set((ord(c) for c in "#%/()<>[]{}")) + allowed_chars = set(range(33,127)) - set(ord(c) for c in "#%/()<>[]{}") + def __bytes__(self): if str == bytes: # Python 2.x result = bytearray(b"/") @@ -495,16 +497,13 @@ class PdfParser: self.info = PdfDict() else: self.info = PdfDict(self.read_indirect(self.info_ref)) - #print(repr(self.root)) check_format_condition(b"Type" in self.root, "/Type missing in Root") 
check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog") check_format_condition(b"Pages" in self.root, "/Pages missing in Root") check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference") self.pages_ref = self.root[b"Pages"] self.page_tree_root = self.read_indirect(self.pages_ref) - #print("page_tree_root: " + str(self.page_tree_root)) self.pages = self.linearize_page_tree(self.page_tree_root) - #print("pages: " + str(self.pages)) def next_object_id(self, offset=None): try: @@ -524,16 +523,15 @@ class PdfParser: whitespace_mandatory = whitespace + b"+" newline_only = br"[\r\n]+" newline = whitespace_optional + newline_only + whitespace_optional - re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_optional + br"\<\<(.*\>\>)" + newline \ + re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_optional + br"\<\<(.*\>\>)" + newline + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL) - re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_optional + br"\<\<(.*?\>\>)" + newline \ + re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_optional + br"\<\<(.*?\>\>)" + newline + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL) + def read_trailer(self): search_start_offset = len(self.buf) - 16384 if search_start_offset < self.start_offset: search_start_offset = self.start_offset - #data_at_end = self.buf[search_start_offset:] - #m = self.re_trailer_end.search(data_at_end) m = self.re_trailer_end.search(self.buf, search_start_offset) check_format_condition(m, "trailer end not found") # make sure we found the LAST trailer @@ -544,12 +542,10 @@ class PdfParser: if not m: m = last_match trailer_data = m.group(1) - #print(trailer_data) self.last_xref_section_offset = int(m.group(2)) self.trailer_dict 
= self.interpret_trailer(trailer_data) self.xref_table = XrefTable() self.read_xref_table(xref_section_offset=self.last_xref_section_offset) - #print(self.xref_table) if b"Prev" in self.trailer_dict: self.read_prev_trailer(self.trailer_dict[b"Prev"]) @@ -558,7 +554,6 @@ class PdfParser: m = self.re_trailer_prev.search(self.buf[trailer_offset:trailer_offset+16384]) check_format_condition(m, "previous trailer not found") trailer_data = m.group(1) - #print(trailer_data) check_format_condition(int(m.group(2)) == xref_section_offset, "xref section offset in previous trailer doesn't match what was expected") trailer_dict = self.interpret_trailer(trailer_data) if b"Prev" in trailer_dict: @@ -568,6 +563,7 @@ class PdfParser: re_name = re.compile(whitespace_optional + br"/([!-$&'*-.0-;=?-Z\\^-z|~]+)(?=" + delimiter_or_ws + br")") re_dict_start = re.compile(whitespace_optional + br"\<\<") re_dict_end = re.compile(whitespace_optional + br"\>\>" + whitespace_optional) + @classmethod def interpret_trailer(klass, trailer_data): trailer = {} @@ -579,15 +575,14 @@ class PdfParser: check_format_condition(m and m.end() == len(trailer_data), "name not found in trailer, remaining data: " + repr(trailer_data[offset:])) break key = klass.interpret_name(m.group(1)) - #print(key) value, offset = klass.get_value(trailer_data, m.end()) - #print(value) trailer[key] = value check_format_condition(b"Size" in trailer and isinstance(trailer[b"Size"], int), "/Size not in trailer or not an integer") check_format_condition(b"Root" in trailer and isinstance(trailer[b"Root"], IndirectReference), "/Root not in trailer or not an indirect reference") return trailer re_hashes_in_name = re.compile(br"([^#]*)(#([0-9a-fA-F]{2}))?") + @classmethod def interpret_name(klass, raw, as_text=False): name = b"" @@ -616,10 +611,11 @@ class PdfParser: re_comment = re.compile(br"(" + whitespace_optional + br"%[^\r\n]*" + newline + br")*") re_stream_start = re.compile(whitespace_optional + br"stream\r?\n") 
re_stream_end = re.compile(whitespace_optional + br"endstream(?=" + delimiter_or_ws + br")") + @classmethod def get_value(klass, data, offset, expect_indirect=None, max_nesting=-1): - #if max_nesting == 0: - # return None, None + if max_nesting == 0: + return None, None m = klass.re_comment.match(data, offset) if m: offset = m.end() @@ -645,26 +641,22 @@ class PdfParser: if m: offset = m.end() result = {} - #print("<<") m = klass.re_dict_end.match(data, offset) while not m: key, offset = klass.get_value(data, offset, max_nesting=max_nesting-1) - #print ("key " + str(key)) if offset is None: return result, None value, offset = klass.get_value(data, offset, max_nesting=max_nesting-1) result[key] = value - #print ("value " + str(value)) if offset is None: return result, None m = klass.re_dict_end.match(data, offset) - #print(">>") offset = m.end() m = klass.re_stream_start.match(data, offset) if m: try: stream_len = int(result[b"Length"]) - except: + except (TypeError, KeyError, ValueError): raise PdfFormatError("bad or missing Length in stream dict (%r)" % result.get(b"Length", None)) stream_data = data[m.end():m.end() + stream_len] m = klass.re_stream_end.match(data, m.end() + stream_len) @@ -682,7 +674,6 @@ class PdfParser: while not m: value, offset = klass.get_value(data, offset, max_nesting=max_nesting-1) result.append(value) - #print ("item " + str(value)) if offset is None: return result, None m = klass.re_array_end.match(data, offset) @@ -717,7 +708,6 @@ class PdfParser: #return None, offset # fallback (only for debugging) raise PdfFormatError("unrecognized object: " + repr(data[offset:offset+32])) - re_lit_str_token = re.compile(br"(\\[nrtbf()\\])|(\\[0-9]{1,3})|(\\(\r\n|\r|\n))|(\r\n|\r|\n)|(\()|(\))") escaped_chars = { b"n": b"\n", @@ -737,6 +727,7 @@ class PdfParser: ord(b")"): b")", ord(b"\\"): b"\\", } + @classmethod def get_literal_string(klass, data, offset): nesting_depth = 0 @@ -746,7 +737,6 @@ class PdfParser: if m.group(1): 
result.extend(klass.escaped_chars[m.group(1)[1]]) elif m.group(2): - #result.append(eval(m.group(1))) result.append(int(m.group(2)[1:], 8)) elif m.group(3): pass @@ -763,10 +753,10 @@ class PdfParser: offset = m.end() raise PdfFormatError("unfinished literal string") - re_xref_section_start = re.compile(whitespace_optional + br"xref" + newline) re_xref_subsection_start = re.compile(whitespace_optional + br"([0-9]+)" + whitespace_mandatory + br"([0-9]+)" + whitespace_optional + newline_only) re_xref_entry = re.compile(br"([0-9]{10}) ([0-9]{5}) ([fn])( \r| \n|\r\n)") + def read_xref_table(self, xref_section_offset): subsection_found = False m = self.re_xref_section_start.match(self.buf, xref_section_offset + self.start_offset) @@ -793,20 +783,18 @@ class PdfParser: self.xref_table[i] = new_entry return offset - def read_indirect(self, ref, max_nesting=-1): offset, generation = self.xref_table[ref[0]] assert generation == ref[1] return self.get_value(self.buf, offset + self.start_offset, expect_indirect=IndirectReference(*ref), max_nesting=max_nesting)[0] - def linearize_page_tree(self, node=None): if node is None: node = self.page_tree_root check_format_condition(node[b"Type"] == b"Pages", "/Type of page tree node is not /Pages") pages = [] for kid in node[b"Kids"]: - kid_object = self.read_indirect(kid, max_nesting=3) + kid_object = self.read_indirect(kid) if kid_object[b"Type"] == b"Page": pages.append(kid) else: