issue #2959: support streams, add some tests

This commit is contained in:
Dvořák Václav 2018-01-26 17:15:38 +01:00
parent 971837cc00
commit 78fe32a431
3 changed files with 79 additions and 4 deletions

View File

@ -166,6 +166,19 @@ class TestFilePdf(PillowTestCase):
self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty")
self.assertEqual(pdf.info.Subject, u"ghi\uABCD")
def test_pdf_append(self):
    # document-info fields written into the PDF and expected back from the
    # parser, listed in the order they are checked
    expected_info = (
        ("Title", "title"),
        ("Author", "author"),
        ("Subject", "subject"),
        ("Keywords", "keywords"),
        ("Creator", "creator"),
        ("Producer", "producer"),
    )

    # make a PDF file carrying all six info fields
    pdf_filename = self.helper_save_as_pdf(
        "RGB",
        title="title",
        author="author",
        subject="subject",
        keywords="keywords",
        creator="creator",
        producer="producer",
    )

    # open it again and compare the parsed info dictionary field by field
    pdf = pdfParser.PdfParser(pdf_filename)
    self.assertEqual(len(pdf.info), len(expected_info))
    for field_name, field_value in expected_info:
        self.assertEqual(getattr(pdf.info, field_name), field_value)
def test_pdf_append_to_bytesio(self):
im = hopper("RGB")
f = io.BytesIO()

View File

@ -44,6 +44,24 @@ class TestPdfParser(PillowTestCase):
self.assertEqual(PdfParser.get_value(b"(\\53)", 0), (b"\x2B", 5))
self.assertEqual(PdfParser.get_value(b"(\\53a)", 0), (b"\x2Ba", 6))
self.assertEqual(PdfParser.get_value(b"(\\1111)", 0), (b"\x491", 7))
self.assertEqual(PdfParser.get_value(b" 123 (", 0), (123, 4))
self.assertAlmostEqual(PdfParser.get_value(b" 123.4 %", 0)[0], 123.4)
self.assertEqual(PdfParser.get_value(b" 123.4 %", 0)[1], 6)
self.assertRaises(PdfFormatError, PdfParser.get_value, b"]", 0)
d = PdfParser.get_value(b"<</Name (value) /N /V>>", 0)[0]
self.assertIsInstance(d, PdfDict)
self.assertEqual(len(d), 2)
self.assertEqual(d.Name, "value")
self.assertEqual(d[b"Name"], b"value")
self.assertEqual(d.N, PdfName("V"))
a = PdfParser.get_value(b"[/Name (value) /N /V]", 0)[0]
self.assertIsInstance(a, list)
self.assertEqual(len(a), 4)
self.assertEqual(a[0], PdfName("Name"))
s = PdfParser.get_value(b"<</Name (value) /Length 5>>\nstream\nabcde\nendstream<<...", 0)[0]
self.assertIsInstance(s, PdfStream)
self.assertEqual(s.dictionary.Name, "value")
self.assertEqual(s.decode(), b"abcde")
def test_pdf_repr(self):
self.assertEqual(bytes(IndirectReference(1,2)), b"1 2 R")

View File

@ -4,6 +4,7 @@ import io
import mmap
import re
import sys
import zlib
try:
from UserDict import UserDict
@ -188,6 +189,15 @@ class PdfName():
else:
self.name = name.encode("us-ascii")
def __eq__(self, other):
    """Compare against another PdfName or against a raw name value.

    Two PdfName objects are equal when their ``name`` attributes are equal;
    any other operand is compared directly with ``self.name``.
    """
    if isinstance(other, PdfName) and other.name == self.name:
        return True
    # Fall back to comparing the operand itself with the stored name, which
    # lets a PdfName compare equal to the plain value it wraps.
    return other == self.name
def __hash__(self):
    """Hash on the stored name so it matches the hash of that plain value."""
    stored_name = self.name
    return hash(stored_name)
def __repr__(self):
    """Return ``PdfName(<repr of the stored name>)`` for debugging output."""
    shown_name = repr(self.name)
    return "PdfName(%s)" % (shown_name,)
@classmethod
def from_pdf_stream(klass, data):
    """Alternate constructor: build an instance from ``data``.

    ``data`` is first passed through ``PdfParser.interpret_name`` and the
    result is handed to the class constructor.
    """
    interpreted = PdfParser.interpret_name(data)
    return klass(interpreted)
@ -260,7 +270,6 @@ class PdfDict(UserDict):
out.extend(value)
out.extend(b"\n>>")
return bytes(out)
#return out + b"\n>>"
if str == bytes:
__str__ = __bytes__
@ -279,6 +288,26 @@ class PdfBinary:
return make_bytes("<%s>" % "".join("%02X" % b for b in self.data))
class PdfStream:
    """A PDF stream object: its stream dictionary plus the raw stream bytes.

    ``dictionary`` is read by attribute access (``Filter``, ``DL``,
    ``Length``); ``buf`` holds the stream data exactly as stored, i.e. still
    encoded if a filter is set.
    """

    def __init__(self, dictionary, buf):
        self.dictionary = dictionary
        self.buf = buf

    def decode(self):
        """Return the stream data with its filter undone.

        Returns ``buf`` unchanged when the dictionary has no ``Filter``
        attribute.  ``FlateDecode`` (given either as a bare name or as a
        one-element array) is inflated with zlib, using ``DL`` -- or
        ``Length`` when ``DL`` is absent -- as the initial output buffer
        size.

        Raises:
            NotImplementedError: for any other filter or filter chain.
        """
        try:
            stream_filter = self.dictionary.Filter
        except AttributeError:
            # No Filter entry: the data is stored verbatim.
            return self.buf

        # PDF allows /Filter to be a single name or an array of names; an
        # array holding exactly one name is equivalent to that bare name.
        if isinstance(stream_filter, (list, tuple)) and len(stream_filter) == 1:
            stream_filter = stream_filter[0]

        if stream_filter == b"FlateDecode":
            try:
                expected_length = self.dictionary.DL
            except AttributeError:
                expected_length = self.dictionary.Length
            # wbits/bufsize are passed positionally because zlib.decompress
            # only accepts them as keyword arguments from Python 3.6 on;
            # MAX_WBITS is the documented default for wbits.
            return zlib.decompress(self.buf, zlib.MAX_WBITS, int(expected_length))

        raise NotImplementedError("stream filter %s unknown/unsupported" % repr(self.dictionary.Filter))
def pdf_repr(x):
if x is True:
return b"true"
@ -550,6 +579,8 @@ class PdfParser:
# "<int> <int> obj": start of an indirect object definition.  Both optionally
# signed integers are captured; "obj" must be followed by something matching
# delimiter_or_ws, which is checked with a lookahead and not consumed.
re_indirect_def_start = re.compile(
    whitespace_optional
    + br"([-+]?[0-9]+)"
    + whitespace_mandatory
    + br"([-+]?[0-9]+)"
    + whitespace_mandatory
    + br"obj(?="
    + delimiter_or_ws
    + br")"
)
# "endobj" keyword, again with a non-consuming delimiter_or_ws lookahead.
re_indirect_def_end = re.compile(
    whitespace_optional + br"endobj(?=" + delimiter_or_ws + br")"
)
# Zero or more "%..." comments, each running up to and including its newline.
re_comment = re.compile(
    br"("
    + whitespace_optional
    + br"%[^\r\n]*"
    + newline
    + br")*"
)
# "stream" keyword terminated by LF or CR LF; the stream data starts right
# after this match.
re_stream_start = re.compile(
    whitespace_optional + br"stream\r?\n"
)
# "endstream" keyword with a non-consuming delimiter_or_ws lookahead.
re_stream_end = re.compile(
    whitespace_optional + br"endstream(?=" + delimiter_or_ws + br")"
)
@classmethod
def get_value(klass, data, offset, expect_indirect=None, max_nesting=-1):
#if max_nesting == 0:
@ -593,7 +624,21 @@ class PdfParser:
return result, None
m = klass.re_dict_end.match(data, offset)
#print(">>")
return result, m.end()
offset = m.end()
m = klass.re_stream_start.match(data, offset)
if m:
try:
stream_len = int(result[b"Length"])
except:
raise PdfFormatError("bad or missing Length in stream dict (%r)" % result.get(b"Length", None))
stream_data = data[m.end():m.end() + stream_len]
m = klass.re_stream_end.match(data, m.end() + stream_len)
check_format_condition(m, "stream end not found")
offset = m.end()
result = PdfStream(PdfDict(result), stream_data)
else:
result = PdfDict(result)
return result, offset
m = klass.re_array_start.match(data, offset)
if m:
offset = m.end()
@ -618,7 +663,7 @@ class PdfParser:
return False, m.end()
m = klass.re_name.match(data, offset)
if m:
return klass.interpret_name(m.group(1)), m.end()
return PdfName(klass.interpret_name(m.group(1))), m.end()
m = klass.re_int.match(data, offset)
if m:
return int(m.group(1)), m.end()
@ -634,7 +679,6 @@ class PdfParser:
m = klass.re_string_lit.match(data, offset)
if m:
return klass.get_literal_string(data, m.end())
# XXX TODO: stream
#return None, offset # fallback (only for debugging)
raise PdfFormatError("unrecognized object: " + repr(data[offset:offset+32]))