diff --git a/Tests/test_file_pdf.py b/Tests/test_file_pdf.py index 3518e3b3d..d60905430 100644 --- a/Tests/test_file_pdf.py +++ b/Tests/test_file_pdf.py @@ -1,6 +1,8 @@ from helper import unittest, PillowTestCase, hopper from PIL import Image, pdfParser +import os import os.path +import tempfile class TestFilePdf(PillowTestCase): @@ -20,6 +22,8 @@ class TestFilePdf(PillowTestCase): self.assertTrue(os.path.isfile(outfile)) self.assertGreater(os.path.getsize(outfile), 0) + return outfile + def test_monochrome(self): # Arrange mode = "1" @@ -97,6 +101,69 @@ class TestFilePdf(PillowTestCase): self.assertTrue(os.path.isfile(outfile)) self.assertGreater(os.path.getsize(outfile), 0) + def test_pdf_open(self): + # fail on empty buffer + self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray()) + # fail on a buffer full of null bytes + self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536)) + # make an empty PDF object + empty_pdf = pdfParser.PdfParser() + self.assertEqual(len(empty_pdf.pages), 0) + # make a PDF file + pdf_filename = self.helper_save_as_pdf("RGB") + # open the PDF file + hopper_pdf = pdfParser.PdfParser(filename=pdf_filename) + self.assertEqual(len(hopper_pdf.pages), 1) + # read a PDF file from a buffer with a non-zero offset + with open(pdf_filename, "rb") as f: + content = b"xyzzy" + f.read() + hopper_pdf = pdfParser.PdfParser(buf=content, start_offset=5) + self.assertEqual(len(hopper_pdf.pages), 1) + # read a PDF file from an already open file + with open(pdf_filename, "rb") as f: + hopper_pdf = pdfParser.PdfParser(f=f) + self.assertEqual(len(hopper_pdf.pages), 1) + + def test_pdf_append_fails_on_nonexistent_file(self): + im = hopper("RGB") + temp_dir = tempfile.mkdtemp() + try: + self.assertRaises(OSError, im.save, os.path.join(temp_dir, "nonexistent.pdf"), append=True) + finally: + os.rmdir(temp_dir) + + def test_pdf_append(self): + # make a PDF file + pdf_filename = self.helper_save_as_pdf("RGB") + # open it, check pages and info + pdf = pdfParser.PdfParser(pdf_filename) + self.assertEqual(len(pdf.pages), 1) + self.assertEqual(len(pdf.info), 0) + # append some info + pdf.info[b"Title"] = b"abc" + pdf.info[b"Author"] = b"def" + pdf.info[b"Subject"] = pdfParser.encode_text("ghi") + pdf.info[b"Keywords"] = b"jkl" + pdf.info[b"Creator"] = b"hopper()" + pdf.info[b"Producer"] = b"pdfParser" + with open(pdf_filename, "r+b") as f: + f.seek(0, os.SEEK_END) + pdf.write_xref_and_trailer(f) + # open it again, check pages and info again + pdf = pdfParser.PdfParser(pdf_filename) + self.assertEqual(len(pdf.pages), 1) + self.assertEqual(len(pdf.info), 6) + self.assertEqual(pdf.info[b"Title"], b"abc") + # append two images + mode_CMYK = hopper("CMYK") + mode_P = hopper("P") + mode_CMYK.save(pdf_filename, append=True, save_all=True, append_images=[mode_P]) + # open the PDF again, check pages and info again + pdf = pdfParser.PdfParser(pdf_filename) + self.assertEqual(len(pdf.pages), 3) + self.assertEqual(len(pdf.info), 6) + self.assertEqual(pdf.info[b"Title"], b"abc") + def test_pdf_parser(self): pdfParser.selftest() diff --git a/src/PIL/PdfImagePlugin.py b/src/PIL/PdfImagePlugin.py index 35b9b5cee..be467d014 100644 --- a/src/PIL/PdfImagePlugin.py +++ b/src/PIL/PdfImagePlugin.py @@ -58,29 +58,8 @@ def _save(im, fp, filename, save_all=False): # make sure image data is available im.load() - class TextWriter(object): - def __init__(self, fp): - self.fp = fp - - def __getattr__(self, name): - return getattr(self.fp, name) - - def write(self, value): - self.fp.write(value.encode('latin-1')) - - #fp = TextWriter(fp) - - fp.write(b"%PDF-1.2\n") - fp.write(b"% created by PIL PDF driver " + __version__.encode("us-ascii") + b"\n") - - # - # catalogue - - catalog_ref = existing_pdf.next_object_id(fp.tell()) - pages_ref = existing_pdf.next_object_id(0) - existing_pdf.write_obj(fp, catalog_ref, - Type=pdfParser.PdfName(b"Catalog"), - Pages=pages_ref) + existing_pdf.write_header(fp) + existing_pdf.write_comment(fp, "created by PIL PDF driver " + __version__) # # pages @@ -109,10 +88,9 @@ def _save(im, fp, filename, save_all=False): contents_refs.append(existing_pdf.next_object_id(0)) existing_pdf.pages.append(page_refs[-1]) - existing_pdf.write_obj(fp, pages_ref, - Type=pdfParser.PdfName("Pages"), - Count=len(existing_pdf.pages), - Kids=existing_pdf.pages) + # + # catalog and list of pages + existing_pdf.write_catalog(fp) pageNumber = 0 for imSequence in ims: @@ -190,9 +168,7 @@ def _save(im, fp, filename, save_all=False): # # page - existing_pdf.write_obj(fp, page_refs[pageNumber], - Type=pdfParser.PdfName("Page"), - Parent=pages_ref, + existing_pdf.write_page(fp, page_refs[pageNumber], Resources=pdfParser.PdfDict( ProcSet=[pdfParser.PdfName("PDF"), pdfParser.PdfName(procset)], XObject=pdfParser.PdfDict(image=image_refs[pageNumber])), @@ -203,20 +179,18 @@ def _save(im, fp, filename, save_all=False): # # page contents - op = TextWriter(io.BytesIO()) - - op.write( + page_contents = pdfParser.make_bytes( "q %d 0 0 %d 0 0 cm /image Do Q\n" % ( int(width * 72.0 / resolution), int(height * 72.0 / resolution))) - existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=op.fp.getvalue()) + existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=page_contents) pageNumber += 1 # # trailer - existing_pdf.write_xref_and_trailer(fp, catalog_ref) + existing_pdf.write_xref_and_trailer(fp) if hasattr(fp, "flush"): fp.flush() diff --git a/src/PIL/pdfParser.py b/src/PIL/pdfParser.py index bdf55dc93..ade599d05 100644 --- a/src/PIL/pdfParser.py +++ b/src/PIL/pdfParser.py @@ -1,3 +1,4 @@ +import codecs import collections import io import mmap @@ -14,7 +15,11 @@ if sys.version_info.major >= 3: def make_bytes(s): return s.encode("us-ascii") else: - make_bytes = lambda s: s + make_bytes = lambda s: s # pragma: no cover + + +def encode_text(s): + return codecs.BOM_UTF16_BE + s.encode("utf_16_be") class PdfFormatError(RuntimeError): @@ -34,16 +39,16 @@ class IndirectReference(collections.namedtuple("IndirectReferenceTuple", ["objec return self.__str__().encode("us-ascii") def __eq__(self, other): - return isinstance(other, IndirectReference) and other.object_id == self.object_id and other.generation == self.generation + return other.__class__ is self.__class__ and other.object_id == self.object_id and other.generation == self.generation + + def __ne__(self, other): + return not (self == other) class IndirectObjectDef(IndirectReference): def __str__(self): return "%s %s obj" % self - def __eq__(self, other): - return isinstance(other, IndirectObjectDef) and other.object_id == self.object_id and other.generation == self.generation - class XrefTable: def __init__(self): @@ -251,11 +256,11 @@ class PdfParser: self.filename = filename self.buf = buf self.start_offset = start_offset - if buf: + if buf is not None: self.read_pdf_info() - elif f: + elif f is not None: self.read_pdf_info_from_file(f) - elif filename: + elif filename is not None: with open(filename, "rb") as f: self.read_pdf_info_from_file(f) else: @@ -266,18 +271,40 @@ class PdfParser: self.info_ref = None self.page_tree_root = {} self.pages = [] + self.pages_ref = None self.last_xref_section_offset = None self.trailer_dict = {} self.xref_table = XrefTable() self.xref_table.reading_finished = True - def write_xref_and_trailer(self, f, new_root_ref): + def write_header(self, f): + f.write(b"%PDF-1.4\n") + + def write_comment(self, f, s): + f.write(("%% %s\n" % (s,)).encode("utf-8")) + + def write_catalog(self, f): self.del_root() + self.root_ref = self.next_object_id(f.tell()) + self.pages_ref = self.next_object_id(0) + self.write_obj(f, self.root_ref, + Type=PdfName(b"Catalog"), + Pages=self.pages_ref) + self.write_obj(f, self.pages_ref, + Type=PdfName("Pages"), + Count=len(self.pages), + Kids=self.pages) + return self.root_ref + + def write_xref_and_trailer(self, f, new_root_ref=None): + if new_root_ref: + self.del_root() + self.root_ref = new_root_ref if self.info: self.info_ref = self.write_obj(f, None, self.info) start_xref = self.xref_table.write(f) num_entries = len(self.xref_table) - trailer_dict = {b"Root": new_root_ref, b"Size": num_entries} + trailer_dict = {b"Root": self.root_ref, b"Size": num_entries} if self.last_xref_section_offset is not None: trailer_dict[b"Prev"] = self.last_xref_section_offset if self.info: @@ -285,6 +312,15 @@ class PdfParser: self.last_xref_section_offset = start_xref f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) + make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref)) + def write_page(self, f, ref, *objs, **dict_obj): + if isinstance(ref, int): + ref = self.pages[ref] + if "Type" not in dict_obj: + dict_obj["Type"] = PdfName("Page") + if "Parent" not in dict_obj: + dict_obj["Parent"] = self.pages_ref + return self.write_obj(f, ref, *objs, **dict_obj) + def write_obj(self, f, ref, *objs, **dict_obj): if ref is None: ref = self.next_object_id(f.tell()) @@ -336,7 +372,8 @@ class PdfParser: check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog") check_format_condition(b"Pages" in self.root, "/Pages missing in Root") check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference") - self.page_tree_root = self.read_indirect(self.root[b"Pages"]) + self.pages_ref = self.root[b"Pages"] + self.page_tree_root = self.read_indirect(self.pages_ref) #print("page_tree_root: " + str(self.page_tree_root)) self.pages = self.linearize_page_tree(self.page_tree_root) #print("pages: " + str(self.pages)) @@ -361,15 +398,23 @@ class PdfParser: newline = whitespace_optional + newline_only + whitespace_optional re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \ + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL) - re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \ + re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*?\>\>)" + newline \ + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL) def read_trailer(self): search_start_offset = len(self.buf) - 16384 if search_start_offset < self.start_offset: search_start_offset = self.start_offset - data_at_end = self.buf[search_start_offset:] - m = self.re_trailer_end.search(data_at_end) + #data_at_end = self.buf[search_start_offset:] + #m = self.re_trailer_end.search(data_at_end) + m = self.re_trailer_end.search(self.buf, search_start_offset) check_format_condition(m, "trailer end not found") + # make sure we found the LAST trailer + last_match = m + while m: + last_match = m + m = self.re_trailer_end.search(self.buf, m.start()+16) + if not m: + m = last_match trailer_data = m.group(1) #print(trailer_data) self.last_xref_section_offset = int(m.group(2)) @@ -627,6 +672,14 @@ class PdfParser: def selftest(): assert PdfParser.interpret_name(b"Name#23Hash") == b"Name#Hash" assert PdfParser.interpret_name(b"Name#23Hash", as_text=True) == "Name#Hash" + assert IndirectReference(1,2) == IndirectReference(1,2) + assert IndirectReference(1,2) != IndirectReference(1,3) + assert IndirectReference(1,2) != IndirectObjectDef(1,2) + assert IndirectReference(1,2) != (1,2) + assert IndirectObjectDef(1,2) == IndirectObjectDef(1,2) + assert IndirectObjectDef(1,2) != IndirectObjectDef(1,3) + assert IndirectObjectDef(1,2) != IndirectReference(1,2) + assert IndirectObjectDef(1,2) != (1,2) assert bytes(IndirectReference(1,2)) == b"1 2 R" assert bytes(IndirectObjectDef(*IndirectReference(1,2))) == b"1 2 obj" assert bytes(PdfName(b"Name#Hash")) == b"/Name#23Hash"