issue #2959: keep the file open, add a context manager, add methods to support writing, and stop passing the file or buffer to each write method

This commit is contained in:
Dvořák Václav 2018-01-26 19:07:43 +01:00
parent 78fe32a431
commit ede57b91e0
3 changed files with 134 additions and 87 deletions

View File

@ -19,6 +19,11 @@ class TestFilePdf(PillowTestCase):
# Assert # Assert
self.assertTrue(os.path.isfile(outfile)) self.assertTrue(os.path.isfile(outfile))
self.assertGreater(os.path.getsize(outfile), 0) self.assertGreater(os.path.getsize(outfile), 0)
with pdfParser.PdfParser(outfile) as pdf:
if kwargs.get("append_images", False) or kwargs.get("append", False):
self.assertGreater(len(pdf.pages), 1)
else:
self.assertGreater(len(pdf.pages), 0)
return outfile return outfile
@ -100,27 +105,34 @@ class TestFilePdf(PillowTestCase):
self.assertGreater(os.path.getsize(outfile), 0) self.assertGreater(os.path.getsize(outfile), 0)
def test_pdf_open(self): def test_pdf_open(self):
# fail on empty buffer
self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray())
# fail on a buffer full of null bytes # fail on a buffer full of null bytes
self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536)) self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536))
# make an empty PDF object # make an empty PDF object
empty_pdf = pdfParser.PdfParser() with pdfParser.PdfParser() as empty_pdf:
self.assertEqual(len(empty_pdf.pages), 0) self.assertEqual(len(empty_pdf.pages), 0)
self.assertEqual(len(empty_pdf.info), 0)
self.assertFalse(empty_pdf.should_close_buf)
self.assertFalse(empty_pdf.should_close_file)
# make a PDF file # make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB") pdf_filename = self.helper_save_as_pdf("RGB")
# open the PDF file # open the PDF file
hopper_pdf = pdfParser.PdfParser(filename=pdf_filename) with pdfParser.PdfParser(filename=pdf_filename) as hopper_pdf:
self.assertEqual(len(hopper_pdf.pages), 1) self.assertEqual(len(hopper_pdf.pages), 1)
self.assertTrue(hopper_pdf.should_close_buf)
self.assertTrue(hopper_pdf.should_close_file)
# read a PDF file from a buffer with a non-zero offset # read a PDF file from a buffer with a non-zero offset
with open(pdf_filename, "rb") as f: with open(pdf_filename, "rb") as f:
content = b"xyzzy" + f.read() content = b"xyzzy" + f.read()
hopper_pdf = pdfParser.PdfParser(buf=content, start_offset=5) with pdfParser.PdfParser(buf=content, start_offset=5) as hopper_pdf:
self.assertEqual(len(hopper_pdf.pages), 1) self.assertEqual(len(hopper_pdf.pages), 1)
self.assertFalse(hopper_pdf.should_close_buf)
self.assertFalse(hopper_pdf.should_close_file)
# read a PDF file from an already open file # read a PDF file from an already open file
with open(pdf_filename, "rb") as f: with open(pdf_filename, "rb") as f:
hopper_pdf = pdfParser.PdfParser(f=f) with pdfParser.PdfParser(f=f) as hopper_pdf:
self.assertEqual(len(hopper_pdf.pages), 1) self.assertEqual(len(hopper_pdf.pages), 1)
self.assertTrue(hopper_pdf.should_close_buf)
self.assertFalse(hopper_pdf.should_close_file)
def test_pdf_append_fails_on_nonexistent_file(self): def test_pdf_append_fails_on_nonexistent_file(self):
im = hopper("RGB") im = hopper("RGB")
@ -134,50 +146,49 @@ class TestFilePdf(PillowTestCase):
# make a PDF file # make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB", producer="pdfParser") pdf_filename = self.helper_save_as_pdf("RGB", producer="pdfParser")
# open it, check pages and info # open it, check pages and info
pdf = pdfParser.PdfParser(pdf_filename) with pdfParser.PdfParser(pdf_filename, mode="r+b") as pdf:
self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 1) self.assertEqual(len(pdf.info), 1)
self.assertEqual(pdf.info.Producer, "pdfParser") self.assertEqual(pdf.info.Producer, "pdfParser")
# append some info # append some info
pdf.info.Title = "abc" pdf.info.Title = "abc"
pdf.info.Author = "def" pdf.info.Author = "def"
pdf.info.Subject = u"ghi\uABCD" pdf.info.Subject = u"ghi\uABCD"
pdf.info.Keywords = "qw)e\\r(ty" pdf.info.Keywords = "qw)e\\r(ty"
pdf.info.Creator = "hopper()" pdf.info.Creator = "hopper()"
with open(pdf_filename, "r+b") as f: pdf.start_writing()
f.seek(0, os.SEEK_END)
pdf.write_xref_and_trailer(f) pdf.write_xref_and_trailer(f)
# open it again, check pages and info again # open it again, check pages and info again
pdf = pdfParser.PdfParser(pdf_filename) with pdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 6) self.assertEqual(len(pdf.info), 6)
self.assertEqual(pdf.info.Title, "abc") self.assertEqual(pdf.info.Title, "abc")
# append two images # append two images
mode_CMYK = hopper("CMYK") mode_CMYK = hopper("CMYK")
mode_P = hopper("P") mode_P = hopper("P")
mode_CMYK.save(pdf_filename, append=True, save_all=True, append_images=[mode_P]) mode_CMYK.save(pdf_filename, append=True, save_all=True, append_images=[mode_P])
# open the PDF again, check pages and info again # open the PDF again, check pages and info again
pdf = pdfParser.PdfParser(pdf_filename) with pdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.pages), 3) self.assertEqual(len(pdf.pages), 3)
self.assertEqual(len(pdf.info), 6) self.assertEqual(len(pdf.info), 6)
self.assertEqual(pdfParser.decode_text(pdf.info[b"Title"]), "abc") self.assertEqual(pdfParser.decode_text(pdf.info[b"Title"]), "abc")
self.assertEqual(pdf.info.Title, "abc") self.assertEqual(pdf.info.Title, "abc")
self.assertEqual(pdf.info.Producer, "pdfParser") self.assertEqual(pdf.info.Producer, "pdfParser")
self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty") self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty")
self.assertEqual(pdf.info.Subject, u"ghi\uABCD") self.assertEqual(pdf.info.Subject, u"ghi\uABCD")
def test_pdf_append(self): def test_pdf_append(self):
# make a PDF file # make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB", title="title", author="author", subject="subject", keywords="keywords", creator="creator", producer="producer") pdf_filename = self.helper_save_as_pdf("RGB", title="title", author="author", subject="subject", keywords="keywords", creator="creator", producer="producer")
# open it, check pages and info # open it, check pages and info
pdf = pdfParser.PdfParser(pdf_filename) with pdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.info), 6) self.assertEqual(len(pdf.info), 6)
self.assertEqual(pdf.info.Title, "title") self.assertEqual(pdf.info.Title, "title")
self.assertEqual(pdf.info.Author, "author") self.assertEqual(pdf.info.Author, "author")
self.assertEqual(pdf.info.Subject, "subject") self.assertEqual(pdf.info.Subject, "subject")
self.assertEqual(pdf.info.Keywords, "keywords") self.assertEqual(pdf.info.Keywords, "keywords")
self.assertEqual(pdf.info.Creator, "creator") self.assertEqual(pdf.info.Creator, "creator")
self.assertEqual(pdf.info.Producer, "producer") self.assertEqual(pdf.info.Producer, "producer")
def test_pdf_append_to_bytesio(self): def test_pdf_append_to_bytesio(self):
im = hopper("RGB") im = hopper("RGB")

View File

@ -56,10 +56,9 @@ def _save(im, fp, filename, save_all=False):
producer = im.encoderinfo.get("producer", None) producer = im.encoderinfo.get("producer", None)
if is_appending: if is_appending:
existing_pdf = pdfParser.PdfParser(f=fp, filename=filename) existing_pdf = pdfParser.PdfParser(f=fp, filename=filename, mode="r+b")
fp.seek(0, io.SEEK_END)
else: else:
existing_pdf = pdfParser.PdfParser() existing_pdf = pdfParser.PdfParser(f=fp, filename=filename, mode="w+b")
if title: if title:
existing_pdf.info.Title = title existing_pdf.info.Title = title
@ -78,8 +77,9 @@ def _save(im, fp, filename, save_all=False):
# make sure image data is available # make sure image data is available
im.load() im.load()
existing_pdf.write_header(fp) existing_pdf.start_writing()
existing_pdf.write_comment(fp, "created by PIL PDF driver " + __version__) existing_pdf.write_header()
existing_pdf.write_comment("created by PIL PDF driver " + __version__)
# #
# pages # pages
@ -110,7 +110,7 @@ def _save(im, fp, filename, save_all=False):
# #
# catalog and list of pages # catalog and list of pages
existing_pdf.write_catalog(fp) existing_pdf.write_catalog()
pageNumber = 0 pageNumber = 0
for imSequence in ims: for imSequence in ims:
@ -175,7 +175,7 @@ def _save(im, fp, filename, save_all=False):
width, height = im.size width, height = im.size
existing_pdf.write_obj(fp, image_refs[pageNumber], stream=op.getvalue(), existing_pdf.write_obj(image_refs[pageNumber], stream=op.getvalue(),
Type=pdfParser.PdfName("XObject"), Type=pdfParser.PdfName("XObject"),
Subtype=pdfParser.PdfName("Image"), Subtype=pdfParser.PdfName("Image"),
Width=width, # * 72.0 / resolution, Width=width, # * 72.0 / resolution,
@ -188,7 +188,7 @@ def _save(im, fp, filename, save_all=False):
# #
# page # page
existing_pdf.write_page(fp, page_refs[pageNumber], existing_pdf.write_page(page_refs[pageNumber],
Resources=pdfParser.PdfDict( Resources=pdfParser.PdfDict(
ProcSet=[pdfParser.PdfName("PDF"), pdfParser.PdfName(procset)], ProcSet=[pdfParser.PdfName("PDF"), pdfParser.PdfName(procset)],
XObject=pdfParser.PdfDict(image=image_refs[pageNumber])), XObject=pdfParser.PdfDict(image=image_refs[pageNumber])),
@ -204,15 +204,16 @@ def _save(im, fp, filename, save_all=False):
int(width * 72.0 / resolution), int(width * 72.0 / resolution),
int(height * 72.0 / resolution))) int(height * 72.0 / resolution)))
existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=page_contents) existing_pdf.write_obj(contents_refs[pageNumber], stream=page_contents)
pageNumber += 1 pageNumber += 1
# #
# trailer # trailer
existing_pdf.write_xref_and_trailer(fp) existing_pdf.write_xref_and_trailer()
if hasattr(fp, "flush"): if hasattr(fp, "flush"):
fp.flush() fp.flush()
existing_pdf.close()
# #
# -------------------------------------------------------------------- # --------------------------------------------------------------------

View File

@ -2,6 +2,7 @@ import codecs
import collections import collections
import io import io
import mmap import mmap
import os
import re import re
import sys import sys
import zlib import zlib
@ -336,17 +337,25 @@ class PdfParser:
Supports PDF up to 1.4 Supports PDF up to 1.4
""" """
def __init__(self, filename=None, f=None, buf=None, start_offset=0): def __init__(self, filename=None, f=None, buf=None, start_offset=0, mode="rb"):
# type: (PdfParser, str, file, Union[bytes, bytearray], int, str) -> None
assert not (buf and f)
self.filename = filename self.filename = filename
self.buf = buf self.buf = buf
self.f = f
self.start_offset = start_offset self.start_offset = start_offset
if buf is not None: self.should_close_buf = False
self.should_close_file = False
if filename is not None and f is None:
self.f = f = open(filename, mode)
self.should_close_file = True
if f is not None:
self.buf = buf = self.get_buf_from_file(f)
self.should_close_buf = True
if not filename and hasattr(f, "name"):
self.filename = f.name
if buf:
self.read_pdf_info() self.read_pdf_info()
elif f is not None:
self.read_pdf_info_from_file(f)
elif filename is not None:
with open(filename, "rb") as f:
self.read_pdf_info_from_file(f)
else: else:
self.file_size_total = self.file_size_this = 0 self.file_size_total = self.file_size_this = 0
self.root = PdfDict() self.root = PdfDict()
@ -360,33 +369,63 @@ class PdfParser:
self.trailer_dict = {} self.trailer_dict = {}
self.xref_table = XrefTable() self.xref_table = XrefTable()
self.xref_table.reading_finished = True self.xref_table.reading_finished = True
if f:
self.seek_end()
def write_header(self, f): def __enter__(self):
f.write(b"%PDF-1.4\n") return self
def write_comment(self, f, s): def __exit__(self, exc_type, exc_value, traceback):
f.write(("%% %s\n" % (s,)).encode("utf-8")) self.close()
return False # do not suppress exceptions
def write_catalog(self, f): def start_writing(self):
self.close_buf()
self.seek_end()
def close_buf(self):
try:
self.buf.close()
except AttributeError:
pass
self.buf = None
def close(self):
if self.should_close_buf:
self.close_buf()
if self.f is not None and self.should_close_file:
self.f.close()
self.f = None
def seek_end(self):
self.f.seek(0, os.SEEK_END)
def write_header(self):
self.f.write(b"%PDF-1.4\n")
def write_comment(self, s):
self.f.write(("%% %s\n" % (s,)).encode("utf-8"))
def write_catalog(self):
self.del_root() self.del_root()
self.root_ref = self.next_object_id(f.tell()) self.root_ref = self.next_object_id(self.f.tell())
self.pages_ref = self.next_object_id(0) self.pages_ref = self.next_object_id(0)
self.write_obj(f, self.root_ref, self.write_obj(self.root_ref,
Type=PdfName(b"Catalog"), Type=PdfName(b"Catalog"),
Pages=self.pages_ref) Pages=self.pages_ref)
self.write_obj(f, self.pages_ref, self.write_obj(self.pages_ref,
Type=PdfName("Pages"), Type=PdfName("Pages"),
Count=len(self.pages), Count=len(self.pages),
Kids=self.pages) Kids=self.pages)
return self.root_ref return self.root_ref
def write_xref_and_trailer(self, f, new_root_ref=None): def write_xref_and_trailer(self, new_root_ref=None):
if new_root_ref: if new_root_ref:
self.del_root() self.del_root()
self.root_ref = new_root_ref self.root_ref = new_root_ref
if self.info: if self.info:
self.info_ref = self.write_obj(f, None, self.info) self.info_ref = self.write_obj(None, self.info)
start_xref = self.xref_table.write(f) start_xref = self.xref_table.write(self.f)
num_entries = len(self.xref_table) num_entries = len(self.xref_table)
trailer_dict = {b"Root": self.root_ref, b"Size": num_entries} trailer_dict = {b"Root": self.root_ref, b"Size": num_entries}
if self.last_xref_section_offset is not None: if self.last_xref_section_offset is not None:
@ -394,18 +433,19 @@ class PdfParser:
if self.info: if self.info:
trailer_dict[b"Info"] = self.info_ref trailer_dict[b"Info"] = self.info_ref
self.last_xref_section_offset = start_xref self.last_xref_section_offset = start_xref
f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) + make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref)) self.f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) + make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref))
def write_page(self, f, ref, *objs, **dict_obj): def write_page(self, ref, *objs, **dict_obj):
if isinstance(ref, int): if isinstance(ref, int):
ref = self.pages[ref] ref = self.pages[ref]
if "Type" not in dict_obj: if "Type" not in dict_obj:
dict_obj["Type"] = PdfName("Page") dict_obj["Type"] = PdfName("Page")
if "Parent" not in dict_obj: if "Parent" not in dict_obj:
dict_obj["Parent"] = self.pages_ref dict_obj["Parent"] = self.pages_ref
return self.write_obj(f, ref, *objs, **dict_obj) return self.write_obj(ref, *objs, **dict_obj)
def write_obj(self, f, ref, *objs, **dict_obj): def write_obj(self, ref, *objs, **dict_obj):
f = self.f
if ref is None: if ref is None:
ref = self.next_object_id(f.tell()) ref = self.next_object_id(f.tell())
else: else:
@ -432,22 +472,17 @@ class PdfParser:
del self.xref_table[self.root[b"Pages"].object_id] del self.xref_table[self.root[b"Pages"].object_id]
# XXX TODO delete Pages tree recursively # XXX TODO delete Pages tree recursively
def read_pdf_info_from_file(self, f): @staticmethod
def get_buf_from_file(f):
if hasattr(f, "getbuffer"): if hasattr(f, "getbuffer"):
self.buf = f.getbuffer() return f.getbuffer()
need_close = False
elif hasattr(f, "getvalue"): elif hasattr(f, "getvalue"):
self.buf = f.getvalue() return f.getvalue()
need_close = False
else: else:
self.buf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) try:
need_close = True return mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
try: except ValueError: # cannot mmap an empty file
self.read_pdf_info() return b""
finally:
if need_close:
self.buf.close()
self.buf = None
def read_pdf_info(self): def read_pdf_info(self):
self.file_size_total = len(self.buf) self.file_size_total = len(self.buf)