issue #2959: add tests and fixes, text encoding, remove remnants of text writing from PdfImagePlugin

This commit is contained in:
Dvořák Václav 2018-01-24 02:28:39 +01:00
parent ba211ff549
commit a187a361cb
3 changed files with 143 additions and 49 deletions

View File

@ -1,6 +1,8 @@
from helper import unittest, PillowTestCase, hopper
from PIL import Image, pdfParser
import os
import os.path
import tempfile
class TestFilePdf(PillowTestCase):
@ -20,6 +22,8 @@ class TestFilePdf(PillowTestCase):
self.assertTrue(os.path.isfile(outfile))
self.assertGreater(os.path.getsize(outfile), 0)
return outfile
def test_monochrome(self):
# Arrange
mode = "1"
@ -97,6 +101,69 @@ class TestFilePdf(PillowTestCase):
self.assertTrue(os.path.isfile(outfile))
self.assertGreater(os.path.getsize(outfile), 0)
def test_pdf_open(self):
    # Parsing must fail on an empty buffer ...
    with self.assertRaises(pdfParser.PdfFormatError):
        pdfParser.PdfParser(buf=bytearray())
    # ... and on a buffer that holds nothing but null bytes.
    with self.assertRaises(pdfParser.PdfFormatError):
        pdfParser.PdfParser(buf=bytearray(65536))

    # A parser created without any source is an empty PDF object.
    blank_pdf = pdfParser.PdfParser()
    self.assertEqual(len(blank_pdf.pages), 0)

    # Save an image as a PDF file, then parse that file by name.
    pdf_path = self.helper_save_as_pdf("RGB")
    parsed = pdfParser.PdfParser(filename=pdf_path)
    self.assertEqual(len(parsed.pages), 1)

    # Parse the same data from a buffer in which the PDF starts at a
    # non-zero offset (five junk bytes are put in front of it).
    with open(pdf_path, "rb") as pdf_file:
        padded = b"xyzzy" + pdf_file.read()
    parsed = pdfParser.PdfParser(buf=padded, start_offset=5)
    self.assertEqual(len(parsed.pages), 1)

    # Parse from a file object that is already open.
    with open(pdf_path, "rb") as pdf_file:
        parsed = pdfParser.PdfParser(f=pdf_file)
    self.assertEqual(len(parsed.pages), 1)
def test_pdf_append_fails_on_nonexistent_file(self):
    # Appending to a PDF that does not exist must raise instead of
    # silently creating a new file.
    im = hopper("RGB")
    temp_dir = tempfile.mkdtemp()
    target = os.path.join(temp_dir, "nonexistent.pdf")
    try:
        # On Python 2 a failed open() raises IOError, which is not a
        # subclass of OSError there; on Python 3 the two are the same class.
        self.assertRaises((IOError, OSError), im.save, target, append=True)
    finally:
        # If save() unexpectedly created the file, remove it first so that
        # a failing rmdir() cannot hide the assertion failure above.
        if os.path.exists(target):
            os.remove(target)
        os.rmdir(temp_dir)
def test_pdf_append(self):
    # Start from a freshly saved PDF: one page, no info entries.
    pdf_path = self.helper_save_as_pdf("RGB")
    doc = pdfParser.PdfParser(pdf_path)
    self.assertEqual(len(doc.pages), 1)
    self.assertEqual(len(doc.info), 0)

    # Add six info entries; only Subject goes through encode_text().
    info_entries = [
        (b"Title", b"abc"),
        (b"Author", b"def"),
        (b"Subject", pdfParser.encode_text("ghi")),
        (b"Keywords", b"jkl"),
        (b"Creator", b"hopper()"),
        (b"Producer", b"pdfParser"),
    ]
    for key, value in info_entries:
        doc.info[key] = value

    # Write the updated xref section and trailer at the end of the file.
    with open(pdf_path, "r+b") as pdf_file:
        pdf_file.seek(0, os.SEEK_END)
        doc.write_xref_and_trailer(pdf_file)

    # Re-parse: the page count is unchanged and the info is now present.
    doc = pdfParser.PdfParser(pdf_path)
    self.assertEqual(len(doc.pages), 1)
    self.assertEqual(len(doc.info), 6)
    self.assertEqual(doc.info[b"Title"], b"abc")

    # Append two more images (CMYK and P mode) through Image.save().
    cmyk_image = hopper("CMYK")
    palette_image = hopper("P")
    cmyk_image.save(pdf_path, append=True, save_all=True, append_images=[palette_image])

    # Re-parse: three pages now, and the info entries survived the append.
    doc = pdfParser.PdfParser(pdf_path)
    self.assertEqual(len(doc.pages), 3)
    self.assertEqual(len(doc.info), 6)
    self.assertEqual(doc.info[b"Title"], b"abc")
def test_pdf_parser(self):
    # Run the assertion-based self-test that ships with the pdfParser module.
    run_selftest = pdfParser.selftest
    run_selftest()

View File

@ -58,29 +58,8 @@ def _save(im, fp, filename, save_all=False):
# make sure image data is available
im.load()
class TextWriter(object):
    """Wrap a binary file object so that text written to it is stored as latin-1 bytes.

    Every attribute other than ``fp`` and ``write`` is looked up on the
    wrapped file object.
    """

    def __init__(self, fp):
        # The wrapped binary file-like object.
        self.fp = fp

    def __getattr__(self, name):
        # Only reached when normal lookup fails, i.e. for everything that
        # this wrapper does not define itself.
        wrapped = self.fp
        return getattr(wrapped, name)

    def write(self, value):
        # Encode first, then hand the bytes to the wrapped object.
        encoded = value.encode('latin-1')
        self.fp.write(encoded)
#fp = TextWriter(fp)
fp.write(b"%PDF-1.2\n")
fp.write(b"% created by PIL PDF driver " + __version__.encode("us-ascii") + b"\n")
#
# catalogue
catalog_ref = existing_pdf.next_object_id(fp.tell())
pages_ref = existing_pdf.next_object_id(0)
existing_pdf.write_obj(fp, catalog_ref,
Type=pdfParser.PdfName(b"Catalog"),
Pages=pages_ref)
existing_pdf.write_header(fp)
existing_pdf.write_comment(fp, "created by PIL PDF driver " + __version__)
#
# pages
@ -109,10 +88,9 @@ def _save(im, fp, filename, save_all=False):
contents_refs.append(existing_pdf.next_object_id(0))
existing_pdf.pages.append(page_refs[-1])
existing_pdf.write_obj(fp, pages_ref,
Type=pdfParser.PdfName("Pages"),
Count=len(existing_pdf.pages),
Kids=existing_pdf.pages)
#
# catalog and list of pages
existing_pdf.write_catalog(fp)
pageNumber = 0
for imSequence in ims:
@ -190,9 +168,7 @@ def _save(im, fp, filename, save_all=False):
#
# page
existing_pdf.write_obj(fp, page_refs[pageNumber],
Type=pdfParser.PdfName("Page"),
Parent=pages_ref,
existing_pdf.write_page(fp, page_refs[pageNumber],
Resources=pdfParser.PdfDict(
ProcSet=[pdfParser.PdfName("PDF"), pdfParser.PdfName(procset)],
XObject=pdfParser.PdfDict(image=image_refs[pageNumber])),
@ -203,20 +179,18 @@ def _save(im, fp, filename, save_all=False):
#
# page contents
op = TextWriter(io.BytesIO())
op.write(
page_contents = pdfParser.make_bytes(
"q %d 0 0 %d 0 0 cm /image Do Q\n" % (
int(width * 72.0 / resolution),
int(height * 72.0 / resolution)))
existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=op.fp.getvalue())
existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=page_contents)
pageNumber += 1
#
# trailer
existing_pdf.write_xref_and_trailer(fp, catalog_ref)
existing_pdf.write_xref_and_trailer(fp)
if hasattr(fp, "flush"):
fp.flush()

View File

@ -1,3 +1,4 @@
import codecs
import collections
import io
import mmap
@ -14,7 +15,11 @@ if sys.version_info.major >= 3:
def make_bytes(s):
return s.encode("us-ascii")
else:
make_bytes = lambda s: s
make_bytes = lambda s: s # pragma: no cover
def encode_text(s):
    """Return *s* encoded as big-endian UTF-16, prefixed with the UTF-16BE byte order mark."""
    payload = s.encode("utf_16_be")
    return codecs.BOM_UTF16_BE + payload
class PdfFormatError(RuntimeError):
@ -34,16 +39,16 @@ class IndirectReference(collections.namedtuple("IndirectReferenceTuple", ["objec
return self.__str__().encode("us-ascii")
def __eq__(self, other):
return isinstance(other, IndirectReference) and other.object_id == self.object_id and other.generation == self.generation
return other.__class__ is self.__class__ and other.object_id == self.object_id and other.generation == self.generation
def __ne__(self, other):
return not (self == other)
class IndirectObjectDef(IndirectReference):
    """An indirect reference whose text form is an object definition header ("<id> <generation> obj")."""

    def __str__(self):
        # The instance itself is the tuple that supplies both %s placeholders.
        return "%s %s obj" % self

    def __eq__(self, other):
        # Only another IndirectObjectDef with the same id and generation is
        # equal; plain tuples and other reference types never are.
        return isinstance(other, IndirectObjectDef) and other.object_id == self.object_id and other.generation == self.generation

    def __hash__(self):
        # A class that defines __eq__ without __hash__ becomes unhashable on
        # Python 3. Hash on the same fields that __eq__ compares so that
        # instances stay usable in sets and as dict keys on both Python 2 and 3.
        return hash((self.object_id, self.generation))
class XrefTable:
def __init__(self):
@ -251,11 +256,11 @@ class PdfParser:
self.filename = filename
self.buf = buf
self.start_offset = start_offset
if buf:
if buf is not None:
self.read_pdf_info()
elif f:
elif f is not None:
self.read_pdf_info_from_file(f)
elif filename:
elif filename is not None:
with open(filename, "rb") as f:
self.read_pdf_info_from_file(f)
else:
@ -266,18 +271,40 @@ class PdfParser:
self.info_ref = None
self.page_tree_root = {}
self.pages = []
self.pages_ref = None
self.last_xref_section_offset = None
self.trailer_dict = {}
self.xref_table = XrefTable()
self.xref_table.reading_finished = True
def write_xref_and_trailer(self, f, new_root_ref):
def write_header(self, f):
    """Write the ``%PDF-1.4`` header line to the binary file object *f*."""
    header_line = b"%PDF-1.4\n"
    f.write(header_line)
def write_comment(self, f, s):
    """Write *s* to the binary file object *f* as a single ``% ...`` comment line, UTF-8 encoded."""
    # The one-element tuple keeps the formatting correct even if s is itself a tuple.
    comment_line = "%% %s\n" % (s,)
    f.write(comment_line.encode("utf-8"))
def write_catalog(self, f):
    """Write a new catalog object and a new page tree root object to *f*.

    Drops the current root via ``del_root()``, allocates fresh references
    for the catalog (stored in ``self.root_ref``) and for the page tree root
    (stored in ``self.pages_ref``), writes both objects and returns
    ``self.root_ref``.
    """
    self.del_root()

    # The catalog reference is allocated with the current file position.
    # The page tree reference is allocated with 0 here.
    # NOTE(review): presumably write_obj() records the real offset when the
    # object is actually written -- confirm against write_obj().
    catalog_ref = self.next_object_id(f.tell())
    self.root_ref = catalog_ref
    page_tree_ref = self.next_object_id(0)
    self.pages_ref = page_tree_ref

    catalog_fields = {"Type": PdfName(b"Catalog"), "Pages": page_tree_ref}
    self.write_obj(f, catalog_ref, **catalog_fields)

    # Built only after the catalog has been written, as in the original
    # statement order.
    page_tree_fields = {
        "Type": PdfName("Pages"),
        "Count": len(self.pages),
        "Kids": self.pages,
    }
    self.write_obj(f, page_tree_ref, **page_tree_fields)
    return self.root_ref
def write_xref_and_trailer(self, f, new_root_ref=None):
if new_root_ref:
self.del_root()
self.root_ref = new_root_ref
if self.info:
self.info_ref = self.write_obj(f, None, self.info)
start_xref = self.xref_table.write(f)
num_entries = len(self.xref_table)
trailer_dict = {b"Root": new_root_ref, b"Size": num_entries}
trailer_dict = {b"Root": self.root_ref, b"Size": num_entries}
if self.last_xref_section_offset is not None:
trailer_dict[b"Prev"] = self.last_xref_section_offset
if self.info:
@ -285,6 +312,15 @@ class PdfParser:
self.last_xref_section_offset = start_xref
f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) + make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref))
def write_page(self, f, ref, *objs, **dict_obj):
    """Write a page object to *f* and return whatever ``write_obj()`` returns.

    *ref* is either a reference passed straight through to ``write_obj()``,
    or an ``int``, in which case it is used as an index into ``self.pages``.
    ``Type`` defaults to the name ``Page`` and ``Parent`` to
    ``self.pages_ref`` unless the caller supplies them as keyword arguments.
    """
    page_ref = self.pages[ref] if isinstance(ref, int) else ref
    # Only build the default /Page name when the caller gave no Type.
    if "Type" not in dict_obj:
        dict_obj["Type"] = PdfName("Page")
    dict_obj.setdefault("Parent", self.pages_ref)
    return self.write_obj(f, page_ref, *objs, **dict_obj)
def write_obj(self, f, ref, *objs, **dict_obj):
if ref is None:
ref = self.next_object_id(f.tell())
@ -336,7 +372,8 @@ class PdfParser:
check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog")
check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference")
self.page_tree_root = self.read_indirect(self.root[b"Pages"])
self.pages_ref = self.root[b"Pages"]
self.page_tree_root = self.read_indirect(self.pages_ref)
#print("page_tree_root: " + str(self.page_tree_root))
self.pages = self.linearize_page_tree(self.page_tree_root)
#print("pages: " + str(self.pages))
@ -361,15 +398,23 @@ class PdfParser:
newline = whitespace_optional + newline_only + whitespace_optional
re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL)
re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \
re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*?\>\>)" + newline \
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL)
def read_trailer(self):
search_start_offset = len(self.buf) - 16384
if search_start_offset < self.start_offset:
search_start_offset = self.start_offset
data_at_end = self.buf[search_start_offset:]
m = self.re_trailer_end.search(data_at_end)
#data_at_end = self.buf[search_start_offset:]
#m = self.re_trailer_end.search(data_at_end)
m = self.re_trailer_end.search(self.buf, search_start_offset)
check_format_condition(m, "trailer end not found")
# make sure we found the LAST trailer
last_match = m
while m:
last_match = m
m = self.re_trailer_end.search(self.buf, m.start()+16)
if not m:
m = last_match
trailer_data = m.group(1)
#print(trailer_data)
self.last_xref_section_offset = int(m.group(2))
@ -627,6 +672,14 @@ class PdfParser:
def selftest():
assert PdfParser.interpret_name(b"Name#23Hash") == b"Name#Hash"
assert PdfParser.interpret_name(b"Name#23Hash", as_text=True) == "Name#Hash"
assert IndirectReference(1,2) == IndirectReference(1,2)
assert IndirectReference(1,2) != IndirectReference(1,3)
assert IndirectReference(1,2) != IndirectObjectDef(1,2)
assert IndirectReference(1,2) != (1,2)
assert IndirectObjectDef(1,2) == IndirectObjectDef(1,2)
assert IndirectObjectDef(1,2) != IndirectObjectDef(1,3)
assert IndirectObjectDef(1,2) != IndirectReference(1,2)
assert IndirectObjectDef(1,2) != (1,2)
assert bytes(IndirectReference(1,2)) == b"1 2 R"
assert bytes(IndirectObjectDef(*IndirectReference(1,2))) == b"1 2 obj"
assert bytes(PdfName(b"Name#Hash")) == b"/Name#23Hash"