issue #2959: add tests and fixes, text encoding, remove remnants of text writing from PdfImagePlugin

This commit is contained in:
Dvořák Václav 2018-01-24 02:28:39 +01:00
parent ba211ff549
commit a187a361cb
3 changed files with 143 additions and 49 deletions

View File

@ -1,6 +1,8 @@
from helper import unittest, PillowTestCase, hopper from helper import unittest, PillowTestCase, hopper
from PIL import Image, pdfParser from PIL import Image, pdfParser
import os
import os.path import os.path
import tempfile
class TestFilePdf(PillowTestCase): class TestFilePdf(PillowTestCase):
@ -20,6 +22,8 @@ class TestFilePdf(PillowTestCase):
self.assertTrue(os.path.isfile(outfile)) self.assertTrue(os.path.isfile(outfile))
self.assertGreater(os.path.getsize(outfile), 0) self.assertGreater(os.path.getsize(outfile), 0)
return outfile
def test_monochrome(self): def test_monochrome(self):
# Arrange # Arrange
mode = "1" mode = "1"
@ -97,6 +101,69 @@ class TestFilePdf(PillowTestCase):
self.assertTrue(os.path.isfile(outfile)) self.assertTrue(os.path.isfile(outfile))
self.assertGreater(os.path.getsize(outfile), 0) self.assertGreater(os.path.getsize(outfile), 0)
def test_pdf_open(self):
    """Check each way of constructing a ``pdfParser.PdfParser``.

    Covers: an empty buffer and an all-zero buffer (both must raise
    ``PdfFormatError``), no arguments (an empty document with no pages),
    a file name, an in-memory buffer that starts at a non-zero offset,
    and an already open file object.
    """
    # fail on empty buffer
    self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray())
    # fail on a buffer full of null bytes
    self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536))
    # make an empty PDF object
    empty_pdf = pdfParser.PdfParser()
    self.assertEqual(len(empty_pdf.pages), 0)
    # make a PDF file
    pdf_filename = self.helper_save_as_pdf("RGB")
    # open the PDF file
    hopper_pdf = pdfParser.PdfParser(filename=pdf_filename)
    self.assertEqual(len(hopper_pdf.pages), 1)
    # read a PDF file from a buffer with a non-zero offset
    # (the 5 junk bytes prepended here are what start_offset=5 skips)
    with open(pdf_filename, "rb") as f:
        content = b"xyzzy" + f.read()
    hopper_pdf = pdfParser.PdfParser(buf=content, start_offset=5)
    self.assertEqual(len(hopper_pdf.pages), 1)
    # read a PDF file from an already open file
    with open(pdf_filename, "rb") as f:
        hopper_pdf = pdfParser.PdfParser(f=f)
    self.assertEqual(len(hopper_pdf.pages), 1)
def test_pdf_append_fails_on_nonexistent_file(self):
    """Saving with ``append=True`` to a path that does not exist raises OSError."""
    image = hopper("RGB")
    scratch_dir = tempfile.mkdtemp()
    try:
        missing_pdf = os.path.join(scratch_dir, "nonexistent.pdf")
        self.assertRaises(OSError, image.save, missing_pdf, append=True)
    finally:
        # os.rmdir only removes an empty directory
        os.rmdir(scratch_dir)
def test_pdf_append(self):
    """Append metadata, then extra pages, to an existing PDF file.

    Starts from a one-page PDF, appends an info dictionary through
    ``PdfParser.write_xref_and_trailer``, then appends two more images with
    ``Image.save(..., append=True)``. After each step the file is parsed
    again to check the page count and the info entries.
    """
    # make a PDF file
    pdf_filename = self.helper_save_as_pdf("RGB")
    # open it, check pages and info
    pdf = pdfParser.PdfParser(pdf_filename)
    self.assertEqual(len(pdf.pages), 1)
    self.assertEqual(len(pdf.info), 0)
    # append some info
    pdf.info[b"Title"] = b"abc"
    pdf.info[b"Author"] = b"def"
    pdf.info[b"Subject"] = pdfParser.encode_text("ghi")
    pdf.info[b"Keywords"] = b"jkl"
    pdf.info[b"Creator"] = b"hopper()"
    pdf.info[b"Producer"] = b"pdfParser"
    with open(pdf_filename, "r+b") as f:
        # position at the end so the new xref section and trailer are
        # written after the existing content
        f.seek(0, os.SEEK_END)
        pdf.write_xref_and_trailer(f)
    # open it again, check pages and info again
    pdf = pdfParser.PdfParser(pdf_filename)
    self.assertEqual(len(pdf.pages), 1)
    self.assertEqual(len(pdf.info), 6)
    self.assertEqual(pdf.info[b"Title"], b"abc")
    # append two images
    mode_CMYK = hopper("CMYK")
    mode_P = hopper("P")
    mode_CMYK.save(pdf_filename, append=True, save_all=True, append_images=[mode_P])
    # open the PDF again, check pages and info again
    # (1 original page + 2 appended images = 3 pages)
    pdf = pdfParser.PdfParser(pdf_filename)
    self.assertEqual(len(pdf.pages), 3)
    self.assertEqual(len(pdf.info), 6)
    self.assertEqual(pdf.info[b"Title"], b"abc")
def test_pdf_parser(self): def test_pdf_parser(self):
pdfParser.selftest() pdfParser.selftest()

View File

@ -58,29 +58,8 @@ def _save(im, fp, filename, save_all=False):
# make sure image data is available # make sure image data is available
im.load() im.load()
class TextWriter(object): existing_pdf.write_header(fp)
def __init__(self, fp): existing_pdf.write_comment(fp, "created by PIL PDF driver " + __version__)
self.fp = fp
def __getattr__(self, name):
return getattr(self.fp, name)
def write(self, value):
self.fp.write(value.encode('latin-1'))
#fp = TextWriter(fp)
fp.write(b"%PDF-1.2\n")
fp.write(b"% created by PIL PDF driver " + __version__.encode("us-ascii") + b"\n")
#
# catalogue
catalog_ref = existing_pdf.next_object_id(fp.tell())
pages_ref = existing_pdf.next_object_id(0)
existing_pdf.write_obj(fp, catalog_ref,
Type=pdfParser.PdfName(b"Catalog"),
Pages=pages_ref)
# #
# pages # pages
@ -109,10 +88,9 @@ def _save(im, fp, filename, save_all=False):
contents_refs.append(existing_pdf.next_object_id(0)) contents_refs.append(existing_pdf.next_object_id(0))
existing_pdf.pages.append(page_refs[-1]) existing_pdf.pages.append(page_refs[-1])
existing_pdf.write_obj(fp, pages_ref, #
Type=pdfParser.PdfName("Pages"), # catalog and list of pages
Count=len(existing_pdf.pages), existing_pdf.write_catalog(fp)
Kids=existing_pdf.pages)
pageNumber = 0 pageNumber = 0
for imSequence in ims: for imSequence in ims:
@ -190,9 +168,7 @@ def _save(im, fp, filename, save_all=False):
# #
# page # page
existing_pdf.write_obj(fp, page_refs[pageNumber], existing_pdf.write_page(fp, page_refs[pageNumber],
Type=pdfParser.PdfName("Page"),
Parent=pages_ref,
Resources=pdfParser.PdfDict( Resources=pdfParser.PdfDict(
ProcSet=[pdfParser.PdfName("PDF"), pdfParser.PdfName(procset)], ProcSet=[pdfParser.PdfName("PDF"), pdfParser.PdfName(procset)],
XObject=pdfParser.PdfDict(image=image_refs[pageNumber])), XObject=pdfParser.PdfDict(image=image_refs[pageNumber])),
@ -203,20 +179,18 @@ def _save(im, fp, filename, save_all=False):
# #
# page contents # page contents
op = TextWriter(io.BytesIO()) page_contents = pdfParser.make_bytes(
op.write(
"q %d 0 0 %d 0 0 cm /image Do Q\n" % ( "q %d 0 0 %d 0 0 cm /image Do Q\n" % (
int(width * 72.0 / resolution), int(width * 72.0 / resolution),
int(height * 72.0 / resolution))) int(height * 72.0 / resolution)))
existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=op.fp.getvalue()) existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=page_contents)
pageNumber += 1 pageNumber += 1
# #
# trailer # trailer
existing_pdf.write_xref_and_trailer(fp, catalog_ref) existing_pdf.write_xref_and_trailer(fp)
if hasattr(fp, "flush"): if hasattr(fp, "flush"):
fp.flush() fp.flush()

View File

@ -1,3 +1,4 @@
import codecs
import collections import collections
import io import io
import mmap import mmap
@ -14,7 +15,11 @@ if sys.version_info.major >= 3:
def make_bytes(s): def make_bytes(s):
return s.encode("us-ascii") return s.encode("us-ascii")
else: else:
make_bytes = lambda s: s make_bytes = lambda s: s # pragma: no cover
def encode_text(s):
    """Return ``s`` encoded as big-endian UTF-16 with a leading byte order mark."""
    utf16_payload = s.encode("utf_16_be")
    return b"".join((codecs.BOM_UTF16_BE, utf16_payload))
class PdfFormatError(RuntimeError): class PdfFormatError(RuntimeError):
@ -34,16 +39,16 @@ class IndirectReference(collections.namedtuple("IndirectReferenceTuple", ["objec
return self.__str__().encode("us-ascii") return self.__str__().encode("us-ascii")
def __eq__(self, other): def __eq__(self, other):
return isinstance(other, IndirectReference) and other.object_id == self.object_id and other.generation == self.generation return other.__class__ is self.__class__ and other.object_id == self.object_id and other.generation == self.generation
def __ne__(self, other):
    # Defined explicitly as the negation of ==; Python 2 does not derive
    # != from __eq__ on its own.
    return not (self == other)
class IndirectObjectDef(IndirectReference): class IndirectObjectDef(IndirectReference):
def __str__(self): def __str__(self):
return "%s %s obj" % self return "%s %s obj" % self
def __eq__(self, other):
return isinstance(other, IndirectObjectDef) and other.object_id == self.object_id and other.generation == self.generation
class XrefTable: class XrefTable:
def __init__(self): def __init__(self):
@ -251,11 +256,11 @@ class PdfParser:
self.filename = filename self.filename = filename
self.buf = buf self.buf = buf
self.start_offset = start_offset self.start_offset = start_offset
if buf: if buf is not None:
self.read_pdf_info() self.read_pdf_info()
elif f: elif f is not None:
self.read_pdf_info_from_file(f) self.read_pdf_info_from_file(f)
elif filename: elif filename is not None:
with open(filename, "rb") as f: with open(filename, "rb") as f:
self.read_pdf_info_from_file(f) self.read_pdf_info_from_file(f)
else: else:
@ -266,18 +271,40 @@ class PdfParser:
self.info_ref = None self.info_ref = None
self.page_tree_root = {} self.page_tree_root = {}
self.pages = [] self.pages = []
self.pages_ref = None
self.last_xref_section_offset = None self.last_xref_section_offset = None
self.trailer_dict = {} self.trailer_dict = {}
self.xref_table = XrefTable() self.xref_table = XrefTable()
self.xref_table.reading_finished = True self.xref_table.reading_finished = True
def write_xref_and_trailer(self, f, new_root_ref): def write_header(self, f):
f.write(b"%PDF-1.4\n")
def write_comment(self, f, s):
    """Write ``s`` to ``f`` as PDF comment text, encoded as UTF-8.

    Every line of ``s`` is written as its own comment line: a percent
    sign, a space, the text, and a newline. An empty ``s`` still produces
    one (empty) comment line.

    :param f: binary file-like object with a ``write`` method.
    :param s: comment text; formatted with ``%s`` before writing.
    """
    # A PDF comment ends at the first end-of-line marker, so an embedded
    # line break would leave the rest of the text as uncommented file
    # content. Prefixing every line keeps the whole text inside comments.
    comment_lines = ("%s" % (s,)).splitlines() or [""]
    payload = "".join("%% %s\n" % (line,) for line in comment_lines)
    f.write(payload.encode("utf-8"))
def write_catalog(self, f):
self.del_root() self.del_root()
self.root_ref = self.next_object_id(f.tell())
self.pages_ref = self.next_object_id(0)
self.write_obj(f, self.root_ref,
Type=PdfName(b"Catalog"),
Pages=self.pages_ref)
self.write_obj(f, self.pages_ref,
Type=PdfName("Pages"),
Count=len(self.pages),
Kids=self.pages)
return self.root_ref
def write_xref_and_trailer(self, f, new_root_ref=None):
if new_root_ref:
self.del_root()
self.root_ref = new_root_ref
if self.info: if self.info:
self.info_ref = self.write_obj(f, None, self.info) self.info_ref = self.write_obj(f, None, self.info)
start_xref = self.xref_table.write(f) start_xref = self.xref_table.write(f)
num_entries = len(self.xref_table) num_entries = len(self.xref_table)
trailer_dict = {b"Root": new_root_ref, b"Size": num_entries} trailer_dict = {b"Root": self.root_ref, b"Size": num_entries}
if self.last_xref_section_offset is not None: if self.last_xref_section_offset is not None:
trailer_dict[b"Prev"] = self.last_xref_section_offset trailer_dict[b"Prev"] = self.last_xref_section_offset
if self.info: if self.info:
@ -285,6 +312,15 @@ class PdfParser:
self.last_xref_section_offset = start_xref self.last_xref_section_offset = start_xref
f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) + make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref)) f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) + make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref))
def write_page(self, f, ref, *objs, **dict_obj):
    """Write a page object to ``f`` through ``write_obj``.

    :param f: file object passed on to ``write_obj``.
    :param ref: reference of the page object; an ``int`` is treated as an
        index into ``self.pages``.
    :param objs: positional objects passed on to ``write_obj``.
    :param dict_obj: dictionary entries of the page. ``Type`` defaults to
        ``PdfName("Page")`` and ``Parent`` to ``self.pages_ref`` when the
        caller does not supply them.
    :returns: whatever ``write_obj`` returns.
    """
    page_ref = self.pages[ref] if isinstance(ref, int) else ref
    # fill in the standard entries only where the caller left them out
    if "Type" not in dict_obj:
        dict_obj["Type"] = PdfName("Page")
    if "Parent" not in dict_obj:
        dict_obj["Parent"] = self.pages_ref
    return self.write_obj(f, page_ref, *objs, **dict_obj)
def write_obj(self, f, ref, *objs, **dict_obj): def write_obj(self, f, ref, *objs, **dict_obj):
if ref is None: if ref is None:
ref = self.next_object_id(f.tell()) ref = self.next_object_id(f.tell())
@ -336,7 +372,8 @@ class PdfParser:
check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog") check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog")
check_format_condition(b"Pages" in self.root, "/Pages missing in Root") check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference") check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference")
self.page_tree_root = self.read_indirect(self.root[b"Pages"]) self.pages_ref = self.root[b"Pages"]
self.page_tree_root = self.read_indirect(self.pages_ref)
#print("page_tree_root: " + str(self.page_tree_root)) #print("page_tree_root: " + str(self.page_tree_root))
self.pages = self.linearize_page_tree(self.page_tree_root) self.pages = self.linearize_page_tree(self.page_tree_root)
#print("pages: " + str(self.pages)) #print("pages: " + str(self.pages))
@ -361,15 +398,23 @@ class PdfParser:
newline = whitespace_optional + newline_only + whitespace_optional newline = whitespace_optional + newline_only + whitespace_optional
re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \ re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL) + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL)
re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \ re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*?\>\>)" + newline \
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL) + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL)
def read_trailer(self): def read_trailer(self):
search_start_offset = len(self.buf) - 16384 search_start_offset = len(self.buf) - 16384
if search_start_offset < self.start_offset: if search_start_offset < self.start_offset:
search_start_offset = self.start_offset search_start_offset = self.start_offset
data_at_end = self.buf[search_start_offset:] #data_at_end = self.buf[search_start_offset:]
m = self.re_trailer_end.search(data_at_end) #m = self.re_trailer_end.search(data_at_end)
m = self.re_trailer_end.search(self.buf, search_start_offset)
check_format_condition(m, "trailer end not found") check_format_condition(m, "trailer end not found")
# make sure we found the LAST trailer
last_match = m
while m:
last_match = m
m = self.re_trailer_end.search(self.buf, m.start()+16)
if not m:
m = last_match
trailer_data = m.group(1) trailer_data = m.group(1)
#print(trailer_data) #print(trailer_data)
self.last_xref_section_offset = int(m.group(2)) self.last_xref_section_offset = int(m.group(2))
@ -627,6 +672,14 @@ class PdfParser:
def selftest(): def selftest():
assert PdfParser.interpret_name(b"Name#23Hash") == b"Name#Hash" assert PdfParser.interpret_name(b"Name#23Hash") == b"Name#Hash"
assert PdfParser.interpret_name(b"Name#23Hash", as_text=True) == "Name#Hash" assert PdfParser.interpret_name(b"Name#23Hash", as_text=True) == "Name#Hash"
assert IndirectReference(1,2) == IndirectReference(1,2)
assert IndirectReference(1,2) != IndirectReference(1,3)
assert IndirectReference(1,2) != IndirectObjectDef(1,2)
assert IndirectReference(1,2) != (1,2)
assert IndirectObjectDef(1,2) == IndirectObjectDef(1,2)
assert IndirectObjectDef(1,2) != IndirectObjectDef(1,3)
assert IndirectObjectDef(1,2) != IndirectReference(1,2)
assert IndirectObjectDef(1,2) != (1,2)
assert bytes(IndirectReference(1,2)) == b"1 2 R" assert bytes(IndirectReference(1,2)) == b"1 2 R"
assert bytes(IndirectObjectDef(*IndirectReference(1,2))) == b"1 2 obj" assert bytes(IndirectObjectDef(*IndirectReference(1,2))) == b"1 2 obj"
assert bytes(PdfName(b"Name#Hash")) == b"/Name#23Hash" assert bytes(PdfName(b"Name#Hash")) == b"/Name#23Hash"