issue #2959: fix wrong Parent of pre-existing Page objects when appending

This commit is contained in:
Dvořák Václav 2018-03-03 23:32:47 +01:00
parent 113d67214c
commit 24ecfe315a
2 changed files with 65 additions and 4 deletions

View File

@ -147,6 +147,23 @@ class TestFilePdf(PillowTestCase):
finally:
os.rmdir(temp_dir)
def check_pdf_pages_consistency(self, pdf):
pages_info = pdf.read_indirect(pdf.pages_ref)
self.assertNotIn(b"Parent", pages_info)
self.assertIn(b"Kids", pages_info)
kids_not_used = pages_info[b"Kids"]
for page_ref in pdf.pages:
while True:
if page_ref in kids_not_used:
kids_not_used.remove(page_ref)
page_info = pdf.read_indirect(page_ref)
self.assertIn(b"Parent", page_info)
page_ref = page_info[b"Parent"]
if page_ref == pdf.pages_ref:
break
self.assertEqual(pdf.pages_ref, page_info[b"Parent"])
self.assertEqual(kids_not_used, [])
def test_pdf_append(self):
# make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB", producer="PdfParser")
@ -156,6 +173,7 @@ class TestFilePdf(PillowTestCase):
self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 1)
self.assertEqual(pdf.info.Producer, "PdfParser")
self.check_pdf_pages_consistency(pdf)
# append some info
pdf.info.Title = "abc"
@ -171,6 +189,7 @@ class TestFilePdf(PillowTestCase):
self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 6)
self.assertEqual(pdf.info.Title, "abc")
self.check_pdf_pages_consistency(pdf)
# append two images
mode_CMYK = hopper("CMYK")
@ -186,6 +205,7 @@ class TestFilePdf(PillowTestCase):
self.assertEqual(pdf.info.Producer, "PdfParser")
self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty")
self.assertEqual(pdf.info.Subject, u"ghi\uABCD")
self.check_pdf_pages_consistency(pdf)
def test_pdf_info(self):
# make a PDF file

View File

@ -101,6 +101,9 @@ class IndirectReference(collections.namedtuple("IndirectReferenceTuple", ["objec
def __ne__(self, other):
return not (self == other)
def __hash__(self):
return hash((self.object_id, self.generation))
class IndirectObjectDef(IndirectReference):
def __str__(self):
@ -192,6 +195,9 @@ class PdfName:
else:
self.name = name.encode("us-ascii")
def name_as_str(self):
return self.name.decode("us-ascii")
def __eq__(self, other):
return (isinstance(other, PdfName) and other.name == self.name) or other == self.name
@ -358,6 +364,7 @@ class PdfParser:
self.should_close_buf = True
if not filename and hasattr(f, "name"):
self.filename = f.name
self.cached_objects = {}
if buf:
self.read_pdf_info()
else:
@ -368,6 +375,7 @@ class PdfParser:
self.info_ref = None
self.page_tree_root = {}
self.pages = []
self.orig_pages = []
self.pages_ref = None
self.last_xref_section_offset = None
self.trailer_dict = {}
@ -414,15 +422,45 @@ class PdfParser:
self.del_root()
self.root_ref = self.next_object_id(self.f.tell())
self.pages_ref = self.next_object_id(0)
self.rewrite_pages()
self.write_obj(self.root_ref,
Type=PdfName(b"Catalog"),
Pages=self.pages_ref)
self.write_obj(self.pages_ref,
Type=PdfName("Pages"),
Type=PdfName(b"Pages"),
Count=len(self.pages),
Kids=self.pages)
return self.root_ref
def rewrite_pages(self):
pages_tree_nodes_to_delete = []
for i, page_ref in enumerate(self.orig_pages):
page_info = self.cached_objects[page_ref]
del self.xref_table[page_ref.object_id]
pages_tree_nodes_to_delete.append(page_info[PdfName(b"Parent")])
if page_ref not in self.pages:
# the page has been deleted
continue
# make dict keys into strings for passing to write_page
stringified_page_info = {}
for key, value in page_info.items():
# key should be a PdfName
stringified_page_info[key.name_as_str()] = value
stringified_page_info["Parent"] = self.pages_ref
new_page_ref = self.write_page(None, **stringified_page_info)
for j, cur_page_ref in enumerate(self.pages):
if cur_page_ref == page_ref:
# replace the page reference with the new one
self.pages[j] = new_page_ref
# delete redundant Pages tree nodes from xref table
for pages_tree_node_ref in pages_tree_nodes_to_delete:
while pages_tree_node_ref:
pages_tree_node = self.cached_objects[pages_tree_node_ref]
if pages_tree_node_ref.object_id in self.xref_table:
del self.xref_table[pages_tree_node_ref.object_id]
pages_tree_node_ref = pages_tree_node.get(b"Parent", None)
self.orig_pages = []
def write_xref_and_trailer(self, new_root_ref=None):
if new_root_ref:
self.del_root()
@ -443,7 +481,7 @@ class PdfParser:
if isinstance(ref, int):
ref = self.pages[ref]
if "Type" not in dict_obj:
dict_obj["Type"] = PdfName("Page")
dict_obj["Type"] = PdfName(b"Page")
if "Parent" not in dict_obj:
dict_obj["Parent"] = self.pages_ref
return self.write_obj(ref, *objs, **dict_obj)
@ -474,7 +512,6 @@ class PdfParser:
return
del self.xref_table[self.root_ref.object_id]
del self.xref_table[self.root[b"Pages"].object_id]
# XXX TODO delete Pages tree recursively
@staticmethod
def get_buf_from_file(f):
@ -506,6 +543,8 @@ class PdfParser:
self.pages_ref = self.root[b"Pages"]
self.page_tree_root = self.read_indirect(self.pages_ref)
self.pages = self.linearize_page_tree(self.page_tree_root)
# save the original list of page references in case the user modifies, adds or deletes some pages and we need to rewrite the pages and their list
self.orig_pages = self.pages[:]
def next_object_id(self, offset=None):
try:
@ -789,7 +828,9 @@ class PdfParser:
offset, generation = self.xref_table[ref[0]]
check_format_condition(generation == ref[1], "expected to find generation %s for object ID %s in xref table, instead found generation %s at offset %s" \
% (ref[1], ref[0], generation, offset))
return self.get_value(self.buf, offset + self.start_offset, expect_indirect=IndirectReference(*ref), max_nesting=max_nesting)[0]
value = self.get_value(self.buf, offset + self.start_offset, expect_indirect=IndirectReference(*ref), max_nesting=max_nesting)[0]
self.cached_objects[ref] = value
return value
def linearize_page_tree(self, node=None):
if node is None: