Mirror of https://github.com/python-pillow/Pillow.git
issue #2959: add tests and fixes, text encoding, remove remnants of text writing from PdfImagePlugin

Commit a187a361cb, parent ba211ff549
Tests/test_file_pdf.py

@@ -1,6 +1,8 @@
 from helper import unittest, PillowTestCase, hopper
 from PIL import Image, pdfParser
+import os
 import os.path
+import tempfile


 class TestFilePdf(PillowTestCase):
@@ -20,6 +22,8 @@ class TestFilePdf(PillowTestCase):
         self.assertTrue(os.path.isfile(outfile))
         self.assertGreater(os.path.getsize(outfile), 0)

+        return outfile
+
     def test_monochrome(self):
         # Arrange
         mode = "1"
@@ -97,6 +101,69 @@ class TestFilePdf(PillowTestCase):
         self.assertTrue(os.path.isfile(outfile))
         self.assertGreater(os.path.getsize(outfile), 0)

+    def test_pdf_open(self):
+        # fail on empty buffer
+        self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray())
+        # fail on a buffer full of null bytes
+        self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536))
+        # make an empty PDF object
+        empty_pdf = pdfParser.PdfParser()
+        self.assertEqual(len(empty_pdf.pages), 0)
+        # make a PDF file
+        pdf_filename = self.helper_save_as_pdf("RGB")
+        # open the PDF file
+        hopper_pdf = pdfParser.PdfParser(filename=pdf_filename)
+        self.assertEqual(len(hopper_pdf.pages), 1)
+        # read a PDF file from a buffer with a non-zero offset
+        with open(pdf_filename, "rb") as f:
+            content = b"xyzzy" + f.read()
+        hopper_pdf = pdfParser.PdfParser(buf=content, start_offset=5)
+        self.assertEqual(len(hopper_pdf.pages), 1)
+        # read a PDF file from an already open file
+        with open(pdf_filename, "rb") as f:
+            hopper_pdf = pdfParser.PdfParser(f=f)
+        self.assertEqual(len(hopper_pdf.pages), 1)
+
+    def test_pdf_append_fails_on_nonexistent_file(self):
+        im = hopper("RGB")
+        temp_dir = tempfile.mkdtemp()
+        try:
+            self.assertRaises(OSError, im.save, os.path.join(temp_dir, "nonexistent.pdf"), append=True)
+        finally:
+            os.rmdir(temp_dir)
+
+    def test_pdf_append(self):
+        # make a PDF file
+        pdf_filename = self.helper_save_as_pdf("RGB")
+        # open it, check pages and info
+        pdf = pdfParser.PdfParser(pdf_filename)
+        self.assertEqual(len(pdf.pages), 1)
+        self.assertEqual(len(pdf.info), 0)
+        # append some info
+        pdf.info[b"Title"] = b"abc"
+        pdf.info[b"Author"] = b"def"
+        pdf.info[b"Subject"] = pdfParser.encode_text("ghi")
+        pdf.info[b"Keywords"] = b"jkl"
+        pdf.info[b"Creator"] = b"hopper()"
+        pdf.info[b"Producer"] = b"pdfParser"
+        with open(pdf_filename, "r+b") as f:
+            f.seek(0, os.SEEK_END)
+            pdf.write_xref_and_trailer(f)
+        # open it again, check pages and info again
+        pdf = pdfParser.PdfParser(pdf_filename)
+        self.assertEqual(len(pdf.pages), 1)
+        self.assertEqual(len(pdf.info), 6)
+        self.assertEqual(pdf.info[b"Title"], b"abc")
+        # append two images
+        mode_CMYK = hopper("CMYK")
+        mode_P = hopper("P")
+        mode_CMYK.save(pdf_filename, append=True, save_all=True, append_images=[mode_P])
+        # open the PDF again, check pages and info again
+        pdf = pdfParser.PdfParser(pdf_filename)
+        self.assertEqual(len(pdf.pages), 3)
+        self.assertEqual(len(pdf.info), 6)
+        self.assertEqual(pdf.info[b"Title"], b"abc")
+
     def test_pdf_parser(self):
         pdfParser.selftest()
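For reference, the metadata-append flow that the new test_pdf_append exercises, pulled out of the unittest harness into a standalone sketch (the file name and image here are placeholders, not from the test suite):

    import os
    from PIL import Image, pdfParser

    Image.new("RGB", (64, 64)).save("out.pdf")         # any PDF written by Pillow will do

    pdf = pdfParser.PdfParser("out.pdf")               # read pages, info and xref table
    pdf.info[b"Title"] = pdfParser.encode_text("example")
    with open("out.pdf", "r+b") as f:
        f.seek(0, os.SEEK_END)                         # new objects and xref section go at the end
        pdf.write_xref_and_trailer(f)                  # appends an incremental update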
PIL/PdfImagePlugin.py

@@ -58,29 +58,8 @@ def _save(im, fp, filename, save_all=False):
     # make sure image data is available
     im.load()

-    class TextWriter(object):
-        def __init__(self, fp):
-            self.fp = fp
-
-        def __getattr__(self, name):
-            return getattr(self.fp, name)
-
-        def write(self, value):
-            self.fp.write(value.encode('latin-1'))
-
-    #fp = TextWriter(fp)
-
-    fp.write(b"%PDF-1.2\n")
-    fp.write(b"% created by PIL PDF driver " + __version__.encode("us-ascii") + b"\n")
-
-    #
-    # catalogue
-
-    catalog_ref = existing_pdf.next_object_id(fp.tell())
-    pages_ref = existing_pdf.next_object_id(0)
-    existing_pdf.write_obj(fp, catalog_ref,
-                           Type=pdfParser.PdfName(b"Catalog"),
-                           Pages=pages_ref)
+    existing_pdf.write_header(fp)
+    existing_pdf.write_comment(fp, "created by PIL PDF driver " + __version__)

     #
     # pages
@@ -109,10 +88,9 @@ def _save(im, fp, filename, save_all=False):
         contents_refs.append(existing_pdf.next_object_id(0))
         existing_pdf.pages.append(page_refs[-1])

-    existing_pdf.write_obj(fp, pages_ref,
-                           Type=pdfParser.PdfName("Pages"),
-                           Count=len(existing_pdf.pages),
-                           Kids=existing_pdf.pages)
+    #
+    # catalog and list of pages
+    existing_pdf.write_catalog(fp)

     pageNumber = 0
     for imSequence in ims:
@@ -190,9 +168,7 @@ def _save(im, fp, filename, save_all=False):
         #
         # page

-        existing_pdf.write_obj(fp, page_refs[pageNumber],
-                               Type=pdfParser.PdfName("Page"),
-                               Parent=pages_ref,
+        existing_pdf.write_page(fp, page_refs[pageNumber],
                                Resources=pdfParser.PdfDict(
                                    ProcSet=[pdfParser.PdfName("PDF"), pdfParser.PdfName(procset)],
                                    XObject=pdfParser.PdfDict(image=image_refs[pageNumber])),
@@ -203,20 +179,18 @@ def _save(im, fp, filename, save_all=False):
         #
         # page contents

-        op = TextWriter(io.BytesIO())
-
-        op.write(
+        page_contents = pdfParser.make_bytes(
             "q %d 0 0 %d 0 0 cm /image Do Q\n" % (
                 int(width * 72.0 / resolution),
                 int(height * 72.0 / resolution)))

-        existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=op.fp.getvalue())
+        existing_pdf.write_obj(fp, contents_refs[pageNumber], stream=page_contents)

         pageNumber += 1

     #
     # trailer
-    existing_pdf.write_xref_and_trailer(fp, catalog_ref)
+    existing_pdf.write_xref_and_trailer(fp)
     if hasattr(fp, "flush"):
         fp.flush()
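At the Image.save level, the path through _save that these changes (together with the appending tests above) target looks roughly like this; the image sources and the output name are placeholders:

    from PIL import Image

    im = Image.open("page1.png")
    im.save("doc.pdf")                                   # header, pages, catalog, xref and trailer

    im2 = Image.open("page2.png")
    im2.save("doc.pdf", append=True, save_all=True,
             append_images=[Image.open("page3.png")])    # adds pages to the existing document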
PIL/pdfParser.py

@@ -1,3 +1,4 @@
+import codecs
 import collections
 import io
 import mmap
@@ -14,7 +15,11 @@ if sys.version_info.major >= 3:
     def make_bytes(s):
         return s.encode("us-ascii")
 else:
-    make_bytes = lambda s: s
+    make_bytes = lambda s: s  # pragma: no cover


+def encode_text(s):
+    return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
+
+
 class PdfFormatError(RuntimeError):
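As a quick worked example of the new encode_text helper (computed from the definition above, shown with Python 3 bytes literals): codecs.BOM_UTF16_BE is b"\xfe\xff", so

    >>> encode_text("abc")
    b'\xfe\xff\x00a\x00b\x00c'

which is the UTF-16BE-with-BOM form of a PDF text string, as used for the /Info values set in the tests.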
@@ -34,16 +39,16 @@ class IndirectReference(collections.namedtuple("IndirectReferenceTuple", ["object_id", "generation"])):
         return self.__str__().encode("us-ascii")

     def __eq__(self, other):
-        return isinstance(other, IndirectReference) and other.object_id == self.object_id and other.generation == self.generation
+        return other.__class__ is self.__class__ and other.object_id == self.object_id and other.generation == self.generation
+
+    def __ne__(self, other):
+        return not (self == other)


 class IndirectObjectDef(IndirectReference):
     def __str__(self):
         return "%s %s obj" % self

-    def __eq__(self, other):
-        return isinstance(other, IndirectObjectDef) and other.object_id == self.object_id and other.generation == self.generation
-

 class XrefTable:
     def __init__(self):
@@ -251,11 +256,11 @@ class PdfParser:
         self.filename = filename
         self.buf = buf
         self.start_offset = start_offset
-        if buf:
+        if buf is not None:
             self.read_pdf_info()
-        elif f:
+        elif f is not None:
             self.read_pdf_info_from_file(f)
-        elif filename:
+        elif filename is not None:
             with open(filename, "rb") as f:
                 self.read_pdf_info_from_file(f)
         else:
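The move from truthiness checks to "is not None" matters for the empty-buffer case in test_pdf_open: an explicitly supplied but empty buffer is now actually parsed (and rejected) rather than being treated as if no buffer had been passed. Roughly:

    pdfParser.PdfParser(buf=bytearray())     # parsed, raises pdfParser.PdfFormatError
    pdfParser.PdfParser()                    # no source at all: empty in-memory document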
@@ -266,18 +271,40 @@ class PdfParser:
         self.info_ref = None
         self.page_tree_root = {}
         self.pages = []
+        self.pages_ref = None
         self.last_xref_section_offset = None
         self.trailer_dict = {}
         self.xref_table = XrefTable()
         self.xref_table.reading_finished = True

-    def write_xref_and_trailer(self, f, new_root_ref):
+    def write_header(self, f):
+        f.write(b"%PDF-1.4\n")
+
+    def write_comment(self, f, s):
+        f.write(("%% %s\n" % (s,)).encode("utf-8"))
+
+    def write_catalog(self, f):
         self.del_root()
+        self.root_ref = self.next_object_id(f.tell())
+        self.pages_ref = self.next_object_id(0)
+        self.write_obj(f, self.root_ref,
+                       Type=PdfName(b"Catalog"),
+                       Pages=self.pages_ref)
+        self.write_obj(f, self.pages_ref,
+                       Type=PdfName("Pages"),
+                       Count=len(self.pages),
+                       Kids=self.pages)
+        return self.root_ref
+
+    def write_xref_and_trailer(self, f, new_root_ref=None):
+        if new_root_ref:
+            self.del_root()
+            self.root_ref = new_root_ref
         if self.info:
             self.info_ref = self.write_obj(f, None, self.info)
         start_xref = self.xref_table.write(f)
         num_entries = len(self.xref_table)
-        trailer_dict = {b"Root": new_root_ref, b"Size": num_entries}
+        trailer_dict = {b"Root": self.root_ref, b"Size": num_entries}
         if self.last_xref_section_offset is not None:
             trailer_dict[b"Prev"] = self.last_xref_section_offset
         if self.info:
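Taken together these helpers replace the header/catalog/trailer boilerplate that PdfImagePlugin previously wrote by hand. A condensed sketch of the call sequence _save now relies on (the file name is a placeholder, and the page, contents and image objects a real document needs are omitted, so this produces a skeleton rather than a viewable PDF):

    pdf = pdfParser.PdfParser()                  # empty in-memory document
    with open("skeleton.pdf", "wb") as f:
        pdf.write_header(f)                      # %PDF-1.4
        pdf.write_comment(f, "created by PIL PDF driver")
        pdf.write_catalog(f)                     # /Catalog and /Pages objects; sets root_ref and pages_ref
        pdf.write_xref_and_trailer(f)            # xref section, trailer with /Root and /Size, startxref, %%EOF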
@@ -285,6 +312,15 @@ class PdfParser:
         self.last_xref_section_offset = start_xref
         f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) + make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref))

+    def write_page(self, f, ref, *objs, **dict_obj):
+        if isinstance(ref, int):
+            ref = self.pages[ref]
+        if "Type" not in dict_obj:
+            dict_obj["Type"] = PdfName("Page")
+        if "Parent" not in dict_obj:
+            dict_obj["Parent"] = self.pages_ref
+        return self.write_obj(f, ref, *objs, **dict_obj)
+
     def write_obj(self, f, ref, *objs, **dict_obj):
         if ref is None:
             ref = self.next_object_id(f.tell())
@@ -336,7 +372,8 @@ class PdfParser:
         check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog")
         check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
         check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference")
-        self.page_tree_root = self.read_indirect(self.root[b"Pages"])
+        self.pages_ref = self.root[b"Pages"]
+        self.page_tree_root = self.read_indirect(self.pages_ref)
         #print("page_tree_root: " + str(self.page_tree_root))
         self.pages = self.linearize_page_tree(self.page_tree_root)
         #print("pages: " + str(self.pages))
@@ -361,15 +398,23 @@ class PdfParser:
     newline = whitespace_optional + newline_only + whitespace_optional
     re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \
         + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL)
-    re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*\>\>)" + newline \
+    re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_mandatory + br"\<\<(.*?\>\>)" + newline \
         + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL)
     def read_trailer(self):
         search_start_offset = len(self.buf) - 16384
         if search_start_offset < self.start_offset:
             search_start_offset = self.start_offset
-        data_at_end = self.buf[search_start_offset:]
-        m = self.re_trailer_end.search(data_at_end)
+        #data_at_end = self.buf[search_start_offset:]
+        #m = self.re_trailer_end.search(data_at_end)
+        m = self.re_trailer_end.search(self.buf, search_start_offset)
         check_format_condition(m, "trailer end not found")
+        # make sure we found the LAST trailer
+        last_match = m
+        while m:
+            last_match = m
+            m = self.re_trailer_end.search(self.buf, m.start()+16)
+        if not m:
+            m = last_match
         trailer_data = m.group(1)
         #print(trailer_data)
         self.last_xref_section_offset = int(m.group(2))
@@ -627,6 +672,14 @@
 def selftest():
     assert PdfParser.interpret_name(b"Name#23Hash") == b"Name#Hash"
     assert PdfParser.interpret_name(b"Name#23Hash", as_text=True) == "Name#Hash"
+    assert IndirectReference(1,2) == IndirectReference(1,2)
+    assert IndirectReference(1,2) != IndirectReference(1,3)
+    assert IndirectReference(1,2) != IndirectObjectDef(1,2)
+    assert IndirectReference(1,2) != (1,2)
+    assert IndirectObjectDef(1,2) == IndirectObjectDef(1,2)
+    assert IndirectObjectDef(1,2) != IndirectObjectDef(1,3)
+    assert IndirectObjectDef(1,2) != IndirectReference(1,2)
+    assert IndirectObjectDef(1,2) != (1,2)
     assert bytes(IndirectReference(1,2)) == b"1 2 R"
     assert bytes(IndirectObjectDef(*IndirectReference(1,2))) == b"1 2 obj"
     assert bytes(PdfName(b"Name#Hash")) == b"/Name#23Hash"