From 24ecfe315ab5dbeeb3c532eca0bdf57907ddd6b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dvo=C5=99=C3=A1k=20V=C3=A1clav?= Date: Sat, 3 Mar 2018 23:32:47 +0100 Subject: [PATCH] issue #2959: fix wrong Parent of pre-existing Page objects when appending --- Tests/test_file_pdf.py | 20 +++++++++++++++++ src/PIL/PdfParser.py | 49 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/Tests/test_file_pdf.py b/Tests/test_file_pdf.py index 9bfacc247..8b22df325 100644 --- a/Tests/test_file_pdf.py +++ b/Tests/test_file_pdf.py @@ -147,6 +147,23 @@ class TestFilePdf(PillowTestCase): finally: os.rmdir(temp_dir) + def check_pdf_pages_consistency(self, pdf): + pages_info = pdf.read_indirect(pdf.pages_ref) + self.assertNotIn(b"Parent", pages_info) + self.assertIn(b"Kids", pages_info) + kids_not_used = pages_info[b"Kids"] + for page_ref in pdf.pages: + while True: + if page_ref in kids_not_used: + kids_not_used.remove(page_ref) + page_info = pdf.read_indirect(page_ref) + self.assertIn(b"Parent", page_info) + page_ref = page_info[b"Parent"] + if page_ref == pdf.pages_ref: + break + self.assertEqual(pdf.pages_ref, page_info[b"Parent"]) + self.assertEqual(kids_not_used, []) + def test_pdf_append(self): # make a PDF file pdf_filename = self.helper_save_as_pdf("RGB", producer="PdfParser") @@ -156,6 +173,7 @@ class TestFilePdf(PillowTestCase): self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.info), 1) self.assertEqual(pdf.info.Producer, "PdfParser") + self.check_pdf_pages_consistency(pdf) # append some info pdf.info.Title = "abc" @@ -171,6 +189,7 @@ class TestFilePdf(PillowTestCase): self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.info), 6) self.assertEqual(pdf.info.Title, "abc") + self.check_pdf_pages_consistency(pdf) # append two images mode_CMYK = hopper("CMYK") @@ -186,6 +205,7 @@ class TestFilePdf(PillowTestCase): self.assertEqual(pdf.info.Producer, "PdfParser") self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty") self.assertEqual(pdf.info.Subject, u"ghi\uABCD") + self.check_pdf_pages_consistency(pdf) def test_pdf_info(self): # make a PDF file diff --git a/src/PIL/PdfParser.py b/src/PIL/PdfParser.py index a5fb18a1a..b6938fdb7 100644 --- a/src/PIL/PdfParser.py +++ b/src/PIL/PdfParser.py @@ -101,6 +101,9 @@ class IndirectReference(collections.namedtuple("IndirectReferenceTuple", ["objec def __ne__(self, other): return not (self == other) + def __hash__(self): + return hash((self.object_id, self.generation)) + class IndirectObjectDef(IndirectReference): def __str__(self): @@ -192,6 +195,9 @@ class PdfName: else: self.name = name.encode("us-ascii") + def name_as_str(self): + return self.name.decode("us-ascii") + def __eq__(self, other): return (isinstance(other, PdfName) and other.name == self.name) or other == self.name @@ -358,6 +364,7 @@ class PdfParser: self.should_close_buf = True if not filename and hasattr(f, "name"): self.filename = f.name + self.cached_objects = {} if buf: self.read_pdf_info() else: @@ -368,6 +375,7 @@ class PdfParser: self.info_ref = None self.page_tree_root = {} self.pages = [] + self.orig_pages = [] self.pages_ref = None self.last_xref_section_offset = None self.trailer_dict = {} @@ -414,15 +422,45 @@ class PdfParser: self.del_root() self.root_ref = self.next_object_id(self.f.tell()) self.pages_ref = self.next_object_id(0) + self.rewrite_pages() self.write_obj(self.root_ref, Type=PdfName(b"Catalog"), Pages=self.pages_ref) self.write_obj(self.pages_ref, - Type=PdfName("Pages"), + Type=PdfName(b"Pages"), Count=len(self.pages), Kids=self.pages) return self.root_ref + def rewrite_pages(self): + pages_tree_nodes_to_delete = [] + for i, page_ref in enumerate(self.orig_pages): + page_info = self.cached_objects[page_ref] + del self.xref_table[page_ref.object_id] + pages_tree_nodes_to_delete.append(page_info[PdfName(b"Parent")]) + if page_ref not in self.pages: + # the page has been deleted + continue + # make dict keys into strings for passing to write_page + stringified_page_info = {} + for key, value in page_info.items(): + # key should be a PdfName + stringified_page_info[key.name_as_str()] = value + stringified_page_info["Parent"] = self.pages_ref + new_page_ref = self.write_page(None, **stringified_page_info) + for j, cur_page_ref in enumerate(self.pages): + if cur_page_ref == page_ref: + # replace the page reference with the new one + self.pages[j] = new_page_ref + # delete redundant Pages tree nodes from xref table + for pages_tree_node_ref in pages_tree_nodes_to_delete: + while pages_tree_node_ref: + pages_tree_node = self.cached_objects[pages_tree_node_ref] + if pages_tree_node_ref.object_id in self.xref_table: + del self.xref_table[pages_tree_node_ref.object_id] + pages_tree_node_ref = pages_tree_node.get(b"Parent", None) + self.orig_pages = [] + def write_xref_and_trailer(self, new_root_ref=None): if new_root_ref: self.del_root() @@ -443,7 +481,7 @@ class PdfParser: if isinstance(ref, int): ref = self.pages[ref] if "Type" not in dict_obj: - dict_obj["Type"] = PdfName("Page") + dict_obj["Type"] = PdfName(b"Page") if "Parent" not in dict_obj: dict_obj["Parent"] = self.pages_ref return self.write_obj(ref, *objs, **dict_obj) @@ -474,7 +512,6 @@ class PdfParser: return del self.xref_table[self.root_ref.object_id] del self.xref_table[self.root[b"Pages"].object_id] - # XXX TODO delete Pages tree recursively @staticmethod def get_buf_from_file(f): @@ -506,6 +543,8 @@ class PdfParser: self.pages_ref = self.root[b"Pages"] self.page_tree_root = self.read_indirect(self.pages_ref) self.pages = self.linearize_page_tree(self.page_tree_root) + # save the original list of page references in case the user modifies, adds or deletes some pages and we need to rewrite the pages and their list + self.orig_pages = self.pages[:] def next_object_id(self, offset=None): try: @@ -789,7 +828,9 @@ class PdfParser: offset, generation = self.xref_table[ref[0]] check_format_condition(generation == ref[1], "expected to find generation %s for object ID %s in xref table, instead found generation %s at offset %s" \ % (ref[1], ref[0], generation, offset)) - return self.get_value(self.buf, offset + self.start_offset, expect_indirect=IndirectReference(*ref), max_nesting=max_nesting)[0] + value = self.get_value(self.buf, offset + self.start_offset, expect_indirect=IndirectReference(*ref), max_nesting=max_nesting)[0] + self.cached_objects[ref] = value + return value def linearize_page_tree(self, node=None): if node is None: