mirror of
https://github.com/python-pillow/Pillow.git
synced 2025-01-26 17:24:31 +03:00
issue #2959: add tests and fixes, text encoding, remove remnants of text writing from PdfImagePlugin
This commit is contained in:
parent
ba211ff549
commit
a187a361cb
|
@ -1,6 +1,8 @@
|
|||
from helper import unittest, PillowTestCase, hopper
|
||||
from PIL import Image, pdfParser
|
||||
import os
|
||||
import os.path
|
||||
import tempfile
|
||||
|
||||
|
||||
class TestFilePdf(PillowTestCase):
|
||||
|
@ -20,6 +22,8 @@ class TestFilePdf(PillowTestCase):
|
|||
self.assertTrue(os.path.isfile(outfile))
|
||||
self.assertGreater(os.path.getsize(outfile), 0)
|
||||
|
||||
return outfile
|
||||
|
||||
def test_monochrome(self):
|
||||
# Arrange
|
||||
mode = "1"
|
||||
|
@ -97,6 +101,69 @@ class TestFilePdf(PillowTestCase):
|
|||
self.assertTrue(os.path.isfile(outfile))
|
||||
self.assertGreater(os.path.getsize(outfile), 0)
|
||||
|
||||
def test_pdf_open(self):
    # Exercises each way of constructing a pdfParser.PdfParser: rejected
    # buffers, an empty parser, and the same saved PDF read by filename,
    # from a buffer with a start offset, and from an open file object.

    # fail on empty buffer
    self.assertRaises(
        pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray())
    # fail on a buffer full of null bytes
    self.assertRaises(
        pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536))

    # make an empty PDF object
    empty_pdf = pdfParser.PdfParser()
    self.assertEqual(len(empty_pdf.pages), 0)

    # make a PDF file
    pdf_filename = self.helper_save_as_pdf("RGB")

    # open the PDF file
    hopper_pdf = pdfParser.PdfParser(filename=pdf_filename)
    self.assertEqual(len(hopper_pdf.pages), 1)

    # read a PDF file from a buffer with a non-zero offset
    with open(pdf_filename, "rb") as f:
        # five junk bytes in front of the real data, skipped via start_offset
        content = b"xyzzy" + f.read()
    hopper_pdf = pdfParser.PdfParser(buf=content, start_offset=5)
    self.assertEqual(len(hopper_pdf.pages), 1)

    # read a PDF file from an already open file
    with open(pdf_filename, "rb") as f:
        hopper_pdf = pdfParser.PdfParser(f=f)
        self.assertEqual(len(hopper_pdf.pages), 1)
|
||||
|
||||
def test_pdf_append_fails_on_nonexistent_file(self):
    # Saving with append=True to a path that does not exist must raise
    # OSError.
    im = hopper("RGB")
    temp_dir = tempfile.mkdtemp()
    try:
        missing_pdf = os.path.join(temp_dir, "nonexistent.pdf")
        self.assertRaises(OSError, im.save, missing_pdf, append=True)
    finally:
        # os.rmdir only removes an empty directory, so this also fails
        # loudly if the save attempt left a file behind.
        os.rmdir(temp_dir)
|
||||
|
||||
def test_pdf_append(self):
    # Appends first an info dictionary and then two more images to a saved
    # PDF, re-reading the file after each step to check the page count and
    # that the info entries are still there.

    # make a PDF file
    pdf_filename = self.helper_save_as_pdf("RGB")

    # open it, check pages and info
    pdf = pdfParser.PdfParser(pdf_filename)
    self.assertEqual(len(pdf.pages), 1)
    self.assertEqual(len(pdf.info), 0)

    # append some info
    pdf.info[b"Title"] = b"abc"
    pdf.info[b"Author"] = b"def"
    pdf.info[b"Subject"] = pdfParser.encode_text("ghi")
    pdf.info[b"Keywords"] = b"jkl"
    pdf.info[b"Creator"] = b"hopper()"
    pdf.info[b"Producer"] = b"pdfParser"
    with open(pdf_filename, "r+b") as f:
        # position at the end so the new xref section and trailer are
        # written after the existing file contents
        f.seek(0, os.SEEK_END)
        pdf.write_xref_and_trailer(f)

    # open it again, check pages and info again
    pdf = pdfParser.PdfParser(pdf_filename)
    self.assertEqual(len(pdf.pages), 1)
    self.assertEqual(len(pdf.info), 6)
    self.assertEqual(pdf.info[b"Title"], b"abc")

    # append two images
    mode_CMYK = hopper("CMYK")
    mode_P = hopper("P")
    mode_CMYK.save(
        pdf_filename, append=True, save_all=True, append_images=[mode_P])

    # open the PDF again, check pages and info again
    pdf = pdfParser.PdfParser(pdf_filename)
    self.assertEqual(len(pdf.pages), 3)
    self.assertEqual(len(pdf.info), 6)
    self.assertEqual(pdf.info[b"Title"], b"abc")
|
||||
|
||||
def test_pdf_parser(self):
    # Run the assertion-based self test that ships inside the pdfParser
    # module; any failing assert there fails this test.
    pdfParser.selftest()
|
||||
|
||||
|
|
|
@ -58,29 +58,8 @@ def _save(im, fp, filename, save_all=False):
|
|||
# make sure image data is available
|
||||
im.load()
|
||||
|
||||
class TextWriter(object):
|
||||
def __init__(self, fp):
|
||||
self.fp = fp
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.fp, name)
|
||||
|
||||
def write(self, value):
|
||||
self.fp.write(value.encode('latin-1'))
|
||||
|
||||
#fp = TextWriter(fp)
|
||||
|
||||
fp.write(b"%PDF-1.2\n")
|
||||
fp.write(b"% created by PIL PDF driver " + __version__.encode("us-ascii") + b"\n")
|
||||
|
||||
#
|
||||
# catalogue
|
||||
|
||||
catalog_ref = existing_pdf.next_object_id(fp.tell())
|
||||
pages_ref = existing_pdf.next_object_id(0)
|
||||
existing_pdf.write_obj(fp, catalog_ref,
|
||||
Type=pdfParser.PdfName(b"Catalog"),
|
||||
Pages=pages_ref)
|
||||
existing_pdf.write_header(fp)
|
||||
existing_pdf.write_comment(fp, "created by PIL PDF driver " + __version__)
|
||||
|
||||
#
|
||||
# pages
|
||||
|
@ -109,10 +88,9 @@ def _save(im, fp, filename, save_all=False):
|
|||
contents_refs.append(existing_pdf.next_object_id(0))
|
||||
existing_pdf.pages.append(page_refs[-1])
|
||||
|
||||
existing_pdf.write_obj(fp, pages_ref,
|
||||
Type=pdfParser.PdfName("Pages"),
|
||||
Count=len(existing_pdf.pages),
|
||||
Kids=existing_pdf.pages)
|
||||
#
|
||||
# catalog and list of pages
|
||||
existing_pdf.write_catalog(fp)
|
||||
|
||||
pageNumber = 0
|
||||
for imSequence in ims:
|
||||
|
@ -190,9 +168,7 @@ def _save(im, fp, filename, save_all=False):
|
|||
#
|
||||
# page
|
||||
|
||||
existing_pdf.write_obj(fp, page_refs[pageNumber],
|
||||
Type=pdfParser.PdfName("Page"),
|
||||
Parent=pages_ref,
|
||||
existing_pdf.write_page(fp, page_refs[pageNumber],
|
||||
Resources=pdfParser.PdfDict(
|
||||
ProcSet=[pdfParser.PdfName("PDF"), pdfParser.PdfName(procset)],
|
||||
XObject=pdfParser.PdfDict(image=image_refs[pageNumber])),
|
||||
|
@ -203,20 +179,18 @@ def _save(im, fp, filename, save_all=False):
|
|||
#
|
||||
# page contents
|
||||
|
||||
op = TextWriter(io.BytesIO())
|
||||
|
||||
op.write(
|
||||
page_contents = pdfParser.make_bytes(
|
||||
"q %d 0 0 %d 0 0 cm /image Do Q\n" % (
|
||||
int(width * 72.0 / resolution),
|
||||
int(height * 72.0 / resolution)))
|
||||
|
||||
existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=op.fp.getvalue())
|
||||
existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=page_contents)
|
||||
|
||||
pageNumber += 1
|
||||
|
||||
#
|
||||
# trailer
|
||||
existing_pdf.write_xref_and_trailer(fp, catalog_ref)
|
||||
existing_pdf.write_xref_and_trailer(fp)
|
||||
if hasattr(fp, "flush"):
|
||||
fp.flush()
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import codecs
|
||||
import collections
|
||||
import io
|
||||
import mmap
|
||||
|
@ -14,7 +15,11 @@ if sys.version_info.major >= 3:
|
|||
def make_bytes(s):
|
||||
return s.encode("us-ascii")
|
||||
else:
|
||||
make_bytes = lambda s: s
|
||||
make_bytes = lambda s: s # pragma: no cover
|
||||
|
||||
|
||||
def encode_text(s):
    """Encode text *s* as big-endian UTF-16 prefixed with a byte order mark.

    :param s: the text string to encode
    :returns: ``codecs.BOM_UTF16_BE`` followed by *s* encoded as UTF-16BE
    """
    return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
|
||||
|
||||
|
||||
class PdfFormatError(RuntimeError):
|
||||
|
@ -34,16 +39,16 @@ class IndirectReference(collections.namedtuple("IndirectReferenceTuple", ["objec
|
|||
return self.__str__().encode("us-ascii")
|
||||
|
||||
def __eq__(self, other):
|
||||
return isinstance(other, IndirectReference) and other.object_id == self.object_id and other.generation == self.generation
|
||||
return other.__class__ is self.__class__ and other.object_id == self.object_id and other.generation == self.generation
|
||||
|
||||
def __ne__(self, other):
|
||||
return not (self == other)
|
||||
|
||||
|
||||
class IndirectObjectDef(IndirectReference):
|
||||
def __str__(self):
|
||||
return "%s %s obj" % self
|
||||
|
||||
def __eq__(self, other):
|
||||
return isinstance(other, IndirectObjectDef) and other.object_id == self.object_id and other.generation == self.generation
|
||||
|
||||
|
||||
class XrefTable:
|
||||
def __init__(self):
|
||||
|
@ -251,11 +256,11 @@ class PdfParser:
|
|||
self.filename = filename
|
||||
self.buf = buf
|
||||
self.start_offset = start_offset
|
||||
if buf:
|
||||
if buf is not None:
|
||||
self.read_pdf_info()
|
||||
elif f:
|
||||
elif f is not None:
|
||||
self.read_pdf_info_from_file(f)
|
||||
elif filename:
|
||||
elif filename is not None:
|
||||
with open(filename, "rb") as f:
|
||||
self.read_pdf_info_from_file(f)
|
||||
else:
|
||||
|
@ -266,18 +271,40 @@ class PdfParser:
|
|||
self.info_ref = None
|
||||
self.page_tree_root = {}
|
||||
self.pages = []
|
||||
self.pages_ref = None
|
||||
self.last_xref_section_offset = None
|
||||
self.trailer_dict = {}
|
||||
self.xref_table = XrefTable()
|
||||
self.xref_table.reading_finished = True
|
||||
|
||||
def write_xref_and_trailer(self, f, new_root_ref):
|
||||
def write_header(self, f):
    """Write the ``%PDF-1.4`` header line to *f*.

    :param f: a binary file-like object with a ``write`` method
    """
    f.write(b"%PDF-1.4\n")
|
||||
|
||||
def write_comment(self, f, s):
    """Write *s* to *f* as one or more PDF comment lines.

    A PDF comment runs from ``%`` to the end of the line, so a line break
    inside *s* would end the comment early and let the remaining text leak
    into the file body. Each line of *s* is therefore written as its own
    ``% ...`` comment line.

    :param f: a binary file-like object with a ``write`` method
    :param s: the comment text; formatted with ``%s`` and encoded as UTF-8
    """
    text = "%s" % (s,)
    # splitlines() returns [] for empty text; still emit one (empty) comment
    # line in that case, as a single-line comment always did.
    for line in text.splitlines() or [""]:
        f.write(("%% %s\n" % (line,)).encode("utf-8"))
|
||||
|
||||
def write_catalog(self, f):
    """Write a new catalog object and a new page-tree object to *f*.

    Calls ``self.del_root()`` first, then allocates fresh object ids for
    the catalog (stored in ``self.root_ref``) and the page tree (stored in
    ``self.pages_ref``). The page-tree object lists ``self.pages`` as its
    ``Kids`` and their number as ``Count``.

    :param f: the binary file object being written; ``f.tell()`` is passed
        to ``next_object_id`` when allocating the catalog's id
    :returns: the reference of the new catalog (``self.root_ref``)
    """
    self.del_root()
    self.root_ref = self.next_object_id(f.tell())
    # NOTE(review): 0 looks like a placeholder offset that write_obj is
    # expected to fill in when the object is written - confirm in
    # next_object_id / write_obj.
    self.pages_ref = self.next_object_id(0)
    self.write_obj(
        f, self.root_ref,
        Type=PdfName(b"Catalog"),
        Pages=self.pages_ref)
    self.write_obj(
        f, self.pages_ref,
        Type=PdfName("Pages"),
        Count=len(self.pages),
        Kids=self.pages)
    return self.root_ref
|
||||
|
||||
def write_xref_and_trailer(self, f, new_root_ref=None):
|
||||
if new_root_ref:
|
||||
self.del_root()
|
||||
self.root_ref = new_root_ref
|
||||
if self.info:
|
||||
self.info_ref = self.write_obj(f, None, self.info)
|
||||
start_xref = self.xref_table.write(f)
|
||||
num_entries = len(self.xref_table)
|
||||
trailer_dict = {b"Root": new_root_ref, b"Size": num_entries}
|
||||
trailer_dict = {b"Root": self.root_ref, b"Size": num_entries}
|
||||
if self.last_xref_section_offset is not None:
|
||||
trailer_dict[b"Prev"] = self.last_xref_section_offset
|
||||
if self.info:
|
||||
|
@ -285,6 +312,15 @@ class PdfParser:
|
|||
self.last_xref_section_offset = start_xref
|
||||
f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) + make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref))
|
||||
|
||||
def write_page(self, f, ref, *objs, **dict_obj):
    """Write a page object, filling in ``Type`` and ``Parent`` if absent.

    :param f: the binary file object, passed through to ``write_obj``
    :param ref: the page's reference, or an ``int`` index into
        ``self.pages`` from which the reference is looked up
    :param objs: extra positional objects passed through to ``write_obj``
    :param dict_obj: dictionary entries for the page; ``Type`` defaults to
        ``PdfName("Page")`` and ``Parent`` to ``self.pages_ref``
    :returns: whatever ``write_obj`` returns
    """
    if isinstance(ref, int):
        # an integer is a page index, not a reference
        ref = self.pages[ref]
    if "Type" not in dict_obj:
        dict_obj["Type"] = PdfName("Page")
    if "Parent" not in dict_obj:
        dict_obj["Parent"] = self.pages_ref
    return self.write_obj(f, ref, *objs, **dict_obj)
|
||||
|
||||
def write_obj(self, f, ref, *objs, **dict_obj):
|
||||
if ref is None:
|
||||
ref = self.next_object_id(f.tell())
|
||||
|
@ -336,7 +372,8 @@ class PdfParser:
|
|||
check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog")
|
||||
check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
|
||||
check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference")
|
||||
self.page_tree_root = self.read_indirect(self.root[b"Pages"])
|
||||
self.pages_ref = self.root[b"Pages"]
|
||||
self.page_tree_root = self.read_indirect(self.pages_ref)
|
||||
#print("page_tree_root: " + str(self.page_tree_root))
|
||||
self.pages = self.linearize_page_tree(self.page_tree_root)
|
||||
#print("pages: " + str(self.pages))
|
||||
|
@ -361,15 +398,23 @@ class PdfParser:
|
|||
newline = whitespace_optional + newline_only + whitespace_optional
|
||||
re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \
|
||||
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL)
|
||||
re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \
|
||||
re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*?\>\>)" + newline \
|
||||
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL)
|
||||
def read_trailer(self):
|
||||
search_start_offset = len(self.buf) - 16384
|
||||
if search_start_offset < self.start_offset:
|
||||
search_start_offset = self.start_offset
|
||||
data_at_end = self.buf[search_start_offset:]
|
||||
m = self.re_trailer_end.search(data_at_end)
|
||||
#data_at_end = self.buf[search_start_offset:]
|
||||
#m = self.re_trailer_end.search(data_at_end)
|
||||
m = self.re_trailer_end.search(self.buf, search_start_offset)
|
||||
check_format_condition(m, "trailer end not found")
|
||||
# make sure we found the LAST trailer
|
||||
last_match = m
|
||||
while m:
|
||||
last_match = m
|
||||
m = self.re_trailer_end.search(self.buf, m.start()+16)
|
||||
if not m:
|
||||
m = last_match
|
||||
trailer_data = m.group(1)
|
||||
#print(trailer_data)
|
||||
self.last_xref_section_offset = int(m.group(2))
|
||||
|
@ -627,6 +672,14 @@ class PdfParser:
|
|||
def selftest():
|
||||
assert PdfParser.interpret_name(b"Name#23Hash") == b"Name#Hash"
|
||||
assert PdfParser.interpret_name(b"Name#23Hash", as_text=True) == "Name#Hash"
|
||||
assert IndirectReference(1,2) == IndirectReference(1,2)
|
||||
assert IndirectReference(1,2) != IndirectReference(1,3)
|
||||
assert IndirectReference(1,2) != IndirectObjectDef(1,2)
|
||||
assert IndirectReference(1,2) != (1,2)
|
||||
assert IndirectObjectDef(1,2) == IndirectObjectDef(1,2)
|
||||
assert IndirectObjectDef(1,2) != IndirectObjectDef(1,3)
|
||||
assert IndirectObjectDef(1,2) != IndirectReference(1,2)
|
||||
assert IndirectObjectDef(1,2) != (1,2)
|
||||
assert bytes(IndirectReference(1,2)) == b"1 2 R"
|
||||
assert bytes(IndirectObjectDef(*IndirectReference(1,2))) == b"1 2 obj"
|
||||
assert bytes(PdfName(b"Name#Hash")) == b"/Name#23Hash"
|
||||
|
|
Loading…
Reference in New Issue
Block a user