issue #2959: changes based on @hugovk's review

This commit is contained in:
Dvořák Václav 2018-01-31 00:25:04 +01:00
parent ede57b91e0
commit 9be8d669f9
6 changed files with 54 additions and 56 deletions

View File

@ -107,19 +107,23 @@ class TestFilePdf(PillowTestCase):
def test_pdf_open(self): def test_pdf_open(self):
# fail on a buffer full of null bytes # fail on a buffer full of null bytes
self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536)) self.assertRaises(pdfParser.PdfFormatError, pdfParser.PdfParser, buf=bytearray(65536))
# make an empty PDF object # make an empty PDF object
with pdfParser.PdfParser() as empty_pdf: with pdfParser.PdfParser() as empty_pdf:
self.assertEqual(len(empty_pdf.pages), 0) self.assertEqual(len(empty_pdf.pages), 0)
self.assertEqual(len(empty_pdf.info), 0) self.assertEqual(len(empty_pdf.info), 0)
self.assertFalse(empty_pdf.should_close_buf) self.assertFalse(empty_pdf.should_close_buf)
self.assertFalse(empty_pdf.should_close_file) self.assertFalse(empty_pdf.should_close_file)
# make a PDF file # make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB") pdf_filename = self.helper_save_as_pdf("RGB")
# open the PDF file # open the PDF file
with pdfParser.PdfParser(filename=pdf_filename) as hopper_pdf: with pdfParser.PdfParser(filename=pdf_filename) as hopper_pdf:
self.assertEqual(len(hopper_pdf.pages), 1) self.assertEqual(len(hopper_pdf.pages), 1)
self.assertTrue(hopper_pdf.should_close_buf) self.assertTrue(hopper_pdf.should_close_buf)
self.assertTrue(hopper_pdf.should_close_file) self.assertTrue(hopper_pdf.should_close_file)
# read a PDF file from a buffer with a non-zero offset # read a PDF file from a buffer with a non-zero offset
with open(pdf_filename, "rb") as f: with open(pdf_filename, "rb") as f:
content = b"xyzzy" + f.read() content = b"xyzzy" + f.read()
@ -127,6 +131,7 @@ class TestFilePdf(PillowTestCase):
self.assertEqual(len(hopper_pdf.pages), 1) self.assertEqual(len(hopper_pdf.pages), 1)
self.assertFalse(hopper_pdf.should_close_buf) self.assertFalse(hopper_pdf.should_close_buf)
self.assertFalse(hopper_pdf.should_close_file) self.assertFalse(hopper_pdf.should_close_file)
# read a PDF file from an already open file # read a PDF file from an already open file
with open(pdf_filename, "rb") as f: with open(pdf_filename, "rb") as f:
with pdfParser.PdfParser(f=f) as hopper_pdf: with pdfParser.PdfParser(f=f) as hopper_pdf:
@ -145,11 +150,13 @@ class TestFilePdf(PillowTestCase):
def test_pdf_append(self): def test_pdf_append(self):
# make a PDF file # make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB", producer="pdfParser") pdf_filename = self.helper_save_as_pdf("RGB", producer="pdfParser")
# open it, check pages and info # open it, check pages and info
with pdfParser.PdfParser(pdf_filename, mode="r+b") as pdf: with pdfParser.PdfParser(pdf_filename, mode="r+b") as pdf:
self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 1) self.assertEqual(len(pdf.info), 1)
self.assertEqual(pdf.info.Producer, "pdfParser") self.assertEqual(pdf.info.Producer, "pdfParser")
# append some info # append some info
pdf.info.Title = "abc" pdf.info.Title = "abc"
pdf.info.Author = "def" pdf.info.Author = "def"
@ -157,16 +164,19 @@ class TestFilePdf(PillowTestCase):
pdf.info.Keywords = "qw)e\\r(ty" pdf.info.Keywords = "qw)e\\r(ty"
pdf.info.Creator = "hopper()" pdf.info.Creator = "hopper()"
pdf.start_writing() pdf.start_writing()
pdf.write_xref_and_trailer(f) pdf.write_xref_and_trailer()
# open it again, check pages and info again # open it again, check pages and info again
with pdfParser.PdfParser(pdf_filename) as pdf: with pdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 6) self.assertEqual(len(pdf.info), 6)
self.assertEqual(pdf.info.Title, "abc") self.assertEqual(pdf.info.Title, "abc")
# append two images # append two images
mode_CMYK = hopper("CMYK") mode_CMYK = hopper("CMYK")
mode_P = hopper("P") mode_P = hopper("P")
mode_CMYK.save(pdf_filename, append=True, save_all=True, append_images=[mode_P]) mode_CMYK.save(pdf_filename, append=True, save_all=True, append_images=[mode_P])
# open the PDF again, check pages and info again # open the PDF again, check pages and info again
with pdfParser.PdfParser(pdf_filename) as pdf: with pdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.pages), 3) self.assertEqual(len(pdf.pages), 3)
@ -177,9 +187,10 @@ class TestFilePdf(PillowTestCase):
self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty") self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty")
self.assertEqual(pdf.info.Subject, u"ghi\uABCD") self.assertEqual(pdf.info.Subject, u"ghi\uABCD")
def test_pdf_append(self): def test_pdf_info(self):
# make a PDF file # make a PDF file
pdf_filename = self.helper_save_as_pdf("RGB", title="title", author="author", subject="subject", keywords="keywords", creator="creator", producer="producer") pdf_filename = self.helper_save_as_pdf("RGB", title="title", author="author", subject="subject", keywords="keywords", creator="creator", producer="producer")
# open it, check pages and info # open it, check pages and info
with pdfParser.PdfParser(pdf_filename) as pdf: with pdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.info), 6) self.assertEqual(len(pdf.info), 6)

View File

@ -250,7 +250,7 @@ class TestCffi(AccessTest):
class TestEmbeddable(unittest.TestCase): class TestEmbeddable(unittest.TestCase):
@unittest.skipIf(not sys.platform.startswith('win32') or @unittest.skipIf(sys.platform.startswith('win32') or
sys.version_info[:2] == (3, 4) or sys.version_info[:2] == (3, 4) or
on_appveyor(), # failing on appveyor when run from on_appveyor(), # failing on appveyor when run from
# subprocess, not from shell # subprocess, not from shell

View File

@ -1,6 +1,6 @@
from helper import unittest, PillowTestCase from helper import unittest, PillowTestCase
from PIL.pdfParser import * from PIL.pdfParser import IndirectObjectDef, IndirectReference, PdfBinary, PdfDict, PdfFormatError, PdfName, PdfParser, PdfStream, decode_text, encode_text, pdf_repr
class TestPdfParser(PillowTestCase): class TestPdfParser(PillowTestCase):
@ -12,14 +12,14 @@ class TestPdfParser(PillowTestCase):
self.assertEqual(decode_text(b"\x1B a \x1C"), u"\u02D9 a \u02DD") self.assertEqual(decode_text(b"\x1B a \x1C"), u"\u02D9 a \u02DD")
def test_indirect_refs(self): def test_indirect_refs(self):
self.assertEqual(IndirectReference(1,2), IndirectReference(1,2)) self.assertEqual(IndirectReference(1, 2), IndirectReference(1, 2))
self.assertNotEqual(IndirectReference(1,2), IndirectReference(1,3)) self.assertNotEqual(IndirectReference(1, 2), IndirectReference(1, 3))
self.assertNotEqual(IndirectReference(1,2), IndirectObjectDef(1,2)) self.assertNotEqual(IndirectReference(1, 2), IndirectObjectDef(1, 2))
self.assertNotEqual(IndirectReference(1,2), (1,2)) self.assertNotEqual(IndirectReference(1, 2), (1, 2))
self.assertEqual(IndirectObjectDef(1,2), IndirectObjectDef(1,2)) self.assertEqual(IndirectObjectDef(1, 2), IndirectObjectDef(1, 2))
self.assertNotEqual(IndirectObjectDef(1,2), IndirectObjectDef(1,3)) self.assertNotEqual(IndirectObjectDef(1, 2), IndirectObjectDef(1, 3))
self.assertNotEqual(IndirectObjectDef(1,2), IndirectReference(1,2)) self.assertNotEqual(IndirectObjectDef(1, 2), IndirectReference(1, 2))
self.assertNotEqual(IndirectObjectDef(1,2), (1,2)) self.assertNotEqual(IndirectObjectDef(1, 2), (1, 2))
def test_parsing(self): def test_parsing(self):
self.assertEqual(PdfParser.interpret_name(b"Name#23Hash"), b"Name#Hash") self.assertEqual(PdfParser.interpret_name(b"Name#23Hash"), b"Name#Hash")
@ -64,18 +64,18 @@ class TestPdfParser(PillowTestCase):
self.assertEqual(s.decode(), b"abcde") self.assertEqual(s.decode(), b"abcde")
def test_pdf_repr(self): def test_pdf_repr(self):
self.assertEqual(bytes(IndirectReference(1,2)), b"1 2 R") self.assertEqual(bytes(IndirectReference(1, 2)), b"1 2 R")
self.assertEqual(bytes(IndirectObjectDef(*IndirectReference(1,2))), b"1 2 obj") self.assertEqual(bytes(IndirectObjectDef(*IndirectReference(1, 2))), b"1 2 obj")
self.assertEqual(bytes(PdfName(b"Name#Hash")), b"/Name#23Hash") self.assertEqual(bytes(PdfName(b"Name#Hash")), b"/Name#23Hash")
self.assertEqual(bytes(PdfName("Name#Hash")), b"/Name#23Hash") self.assertEqual(bytes(PdfName("Name#Hash")), b"/Name#23Hash")
self.assertEqual(bytes(PdfDict({b"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>") self.assertEqual(bytes(PdfDict({b"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(bytes(PdfDict({"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>") self.assertEqual(bytes(PdfDict({"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(pdf_repr(IndirectReference(1,2)), b"1 2 R") self.assertEqual(pdf_repr(IndirectReference(1, 2)), b"1 2 R")
self.assertEqual(pdf_repr(IndirectObjectDef(*IndirectReference(1,2))), b"1 2 obj") self.assertEqual(pdf_repr(IndirectObjectDef(*IndirectReference(1, 2))), b"1 2 obj")
self.assertEqual(pdf_repr(PdfName(b"Name#Hash")), b"/Name#23Hash") self.assertEqual(pdf_repr(PdfName(b"Name#Hash")), b"/Name#23Hash")
self.assertEqual(pdf_repr(PdfName("Name#Hash")), b"/Name#23Hash") self.assertEqual(pdf_repr(PdfName("Name#Hash")), b"/Name#23Hash")
self.assertEqual(pdf_repr(PdfDict({b"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>") self.assertEqual(pdf_repr(PdfDict({b"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(pdf_repr(PdfDict({"Name": IndirectReference(1,2)})), b"<<\n/Name 1 2 R\n>>") self.assertEqual(pdf_repr(PdfDict({"Name": IndirectReference(1, 2)})), b"<<\n/Name 1 2 R\n>>")
self.assertEqual(pdf_repr(123), b"123") self.assertEqual(pdf_repr(123), b"123")
self.assertEqual(pdf_repr(True), b"true") self.assertEqual(pdf_repr(True), b"true")
self.assertEqual(pdf_repr(False), b"false") self.assertEqual(pdf_repr(False), b"false")

View File

@ -616,7 +616,7 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum
A list of images to append as additional frames. Each of the A list of images to append as additional frames. Each of the
images in the list can be single or multiframe images. Note however, that for images in the list can be single or multiframe images. Note however, that for
correct results, all the appended images should have the same correct results, all the appended images should have the same
encoderinfo and encoderconfig properties. ``encoderinfo`` and ``encoderconfig`` properties.
.. versionadded:: 4.2.0 .. versionadded:: 4.2.0
@ -973,7 +973,7 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum
**append** **append**
Set to True to append pages to an existing PDF file. If the file doesn't Set to True to append pages to an existing PDF file. If the file doesn't
exist, an IOError will be raised. exist, an :py:exc:`IOError` will be raised.
.. versionadded:: 5.1.0 .. versionadded:: 5.1.0

View File

@ -21,7 +21,6 @@
## ##
from . import Image, ImageFile, ImageSequence, pdfParser from . import Image, ImageFile, ImageSequence, pdfParser
from ._binary import i8
import io import io
__version__ = "0.5" __version__ = "0.5"

View File

@ -1,20 +1,19 @@
import codecs import codecs
import collections import collections
import io
import mmap import mmap
import os import os
import re import re
import sys
import zlib import zlib
try: try:
from UserDict import UserDict from UserDict import UserDict # Python 2.x
except ImportError: except ImportError:
UserDict = collections.UserDict UserDict = collections.UserDict # Python 3.x
if str == bytes: # Python 2.x if str == bytes: # Python 2.x
make_bytes = lambda s: s # pragma: no cover def make_bytes(s): # pragma: no cover
return s # pragma: no cover
else: # Python 3.x else: # Python 3.x
def make_bytes(s): def make_bytes(s):
return s.encode("us-ascii") return s.encode("us-ascii")
@ -68,6 +67,8 @@ PDFDocEncoding = {
0x9E: u"\u017E", 0x9E: u"\u017E",
0xA0: u"\u20AC", 0xA0: u"\u20AC",
} }
def decode_text(b): def decode_text(b):
if b[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE: if b[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
return b[len(codecs.BOM_UTF16_BE):].decode("utf_16_be") return b[len(codecs.BOM_UTF16_BE):].decode("utf_16_be")
@ -181,7 +182,7 @@ class XrefTable:
return startxref return startxref
class PdfName(): class PdfName:
def __init__(self, name): def __init__(self, name):
if isinstance(name, PdfName): if isinstance(name, PdfName):
self.name = name.name self.name = name.name
@ -203,7 +204,8 @@ class PdfName():
def from_pdf_stream(klass, data): def from_pdf_stream(klass, data):
return klass(PdfParser.interpret_name(data)) return klass(PdfParser.interpret_name(data))
allowed_chars = set(range(33,127)) - set((ord(c) for c in "#%/()<>[]{}")) allowed_chars = set(range(33,127)) - set(ord(c) for c in "#%/()<>[]{}")
def __bytes__(self): def __bytes__(self):
if str == bytes: # Python 2.x if str == bytes: # Python 2.x
result = bytearray(b"/") result = bytearray(b"/")
@ -495,16 +497,13 @@ class PdfParser:
self.info = PdfDict() self.info = PdfDict()
else: else:
self.info = PdfDict(self.read_indirect(self.info_ref)) self.info = PdfDict(self.read_indirect(self.info_ref))
#print(repr(self.root))
check_format_condition(b"Type" in self.root, "/Type missing in Root") check_format_condition(b"Type" in self.root, "/Type missing in Root")
check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog") check_format_condition(self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog")
check_format_condition(b"Pages" in self.root, "/Pages missing in Root") check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference") check_format_condition(isinstance(self.root[b"Pages"], IndirectReference), "/Pages in Root is not an indirect reference")
self.pages_ref = self.root[b"Pages"] self.pages_ref = self.root[b"Pages"]
self.page_tree_root = self.read_indirect(self.pages_ref) self.page_tree_root = self.read_indirect(self.pages_ref)
#print("page_tree_root: " + str(self.page_tree_root))
self.pages = self.linearize_page_tree(self.page_tree_root) self.pages = self.linearize_page_tree(self.page_tree_root)
#print("pages: " + str(self.pages))
def next_object_id(self, offset=None): def next_object_id(self, offset=None):
try: try:
@ -524,16 +523,15 @@ class PdfParser:
whitespace_mandatory = whitespace + b"+" whitespace_mandatory = whitespace + b"+"
newline_only = br"[\r\n]+" newline_only = br"[\r\n]+"
newline = whitespace_optional + newline_only + whitespace_optional newline = whitespace_optional + newline_only + whitespace_optional
re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_optional + br"\<\<(.*\>\>)" + newline \ re_trailer_end = re.compile(whitespace_mandatory + br"trailer" + whitespace_optional + br"\<\<(.*\>\>)" + newline
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL) + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL)
re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_optional + br"\<\<(.*?\>\>)" + newline \ re_trailer_prev = re.compile(whitespace_optional + br"trailer" + whitespace_optional + br"\<\<(.*?\>\>)" + newline
+ br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL) + br"startxref" + newline + br"([0-9]+)" + newline + br"%%EOF" + whitespace_optional, re.DOTALL)
def read_trailer(self): def read_trailer(self):
search_start_offset = len(self.buf) - 16384 search_start_offset = len(self.buf) - 16384
if search_start_offset < self.start_offset: if search_start_offset < self.start_offset:
search_start_offset = self.start_offset search_start_offset = self.start_offset
#data_at_end = self.buf[search_start_offset:]
#m = self.re_trailer_end.search(data_at_end)
m = self.re_trailer_end.search(self.buf, search_start_offset) m = self.re_trailer_end.search(self.buf, search_start_offset)
check_format_condition(m, "trailer end not found") check_format_condition(m, "trailer end not found")
# make sure we found the LAST trailer # make sure we found the LAST trailer
@ -544,12 +542,10 @@ class PdfParser:
if not m: if not m:
m = last_match m = last_match
trailer_data = m.group(1) trailer_data = m.group(1)
#print(trailer_data)
self.last_xref_section_offset = int(m.group(2)) self.last_xref_section_offset = int(m.group(2))
self.trailer_dict = self.interpret_trailer(trailer_data) self.trailer_dict = self.interpret_trailer(trailer_data)
self.xref_table = XrefTable() self.xref_table = XrefTable()
self.read_xref_table(xref_section_offset=self.last_xref_section_offset) self.read_xref_table(xref_section_offset=self.last_xref_section_offset)
#print(self.xref_table)
if b"Prev" in self.trailer_dict: if b"Prev" in self.trailer_dict:
self.read_prev_trailer(self.trailer_dict[b"Prev"]) self.read_prev_trailer(self.trailer_dict[b"Prev"])
@ -558,7 +554,6 @@ class PdfParser:
m = self.re_trailer_prev.search(self.buf[trailer_offset:trailer_offset+16384]) m = self.re_trailer_prev.search(self.buf[trailer_offset:trailer_offset+16384])
check_format_condition(m, "previous trailer not found") check_format_condition(m, "previous trailer not found")
trailer_data = m.group(1) trailer_data = m.group(1)
#print(trailer_data)
check_format_condition(int(m.group(2)) == xref_section_offset, "xref section offset in previous trailer doesn't match what was expected") check_format_condition(int(m.group(2)) == xref_section_offset, "xref section offset in previous trailer doesn't match what was expected")
trailer_dict = self.interpret_trailer(trailer_data) trailer_dict = self.interpret_trailer(trailer_data)
if b"Prev" in trailer_dict: if b"Prev" in trailer_dict:
@ -568,6 +563,7 @@ class PdfParser:
re_name = re.compile(whitespace_optional + br"/([!-$&'*-.0-;=?-Z\\^-z|~]+)(?=" + delimiter_or_ws + br")") re_name = re.compile(whitespace_optional + br"/([!-$&'*-.0-;=?-Z\\^-z|~]+)(?=" + delimiter_or_ws + br")")
re_dict_start = re.compile(whitespace_optional + br"\<\<") re_dict_start = re.compile(whitespace_optional + br"\<\<")
re_dict_end = re.compile(whitespace_optional + br"\>\>" + whitespace_optional) re_dict_end = re.compile(whitespace_optional + br"\>\>" + whitespace_optional)
@classmethod @classmethod
def interpret_trailer(klass, trailer_data): def interpret_trailer(klass, trailer_data):
trailer = {} trailer = {}
@ -579,15 +575,14 @@ class PdfParser:
check_format_condition(m and m.end() == len(trailer_data), "name not found in trailer, remaining data: " + repr(trailer_data[offset:])) check_format_condition(m and m.end() == len(trailer_data), "name not found in trailer, remaining data: " + repr(trailer_data[offset:]))
break break
key = klass.interpret_name(m.group(1)) key = klass.interpret_name(m.group(1))
#print(key)
value, offset = klass.get_value(trailer_data, m.end()) value, offset = klass.get_value(trailer_data, m.end())
#print(value)
trailer[key] = value trailer[key] = value
check_format_condition(b"Size" in trailer and isinstance(trailer[b"Size"], int), "/Size not in trailer or not an integer") check_format_condition(b"Size" in trailer and isinstance(trailer[b"Size"], int), "/Size not in trailer or not an integer")
check_format_condition(b"Root" in trailer and isinstance(trailer[b"Root"], IndirectReference), "/Root not in trailer or not an indirect reference") check_format_condition(b"Root" in trailer and isinstance(trailer[b"Root"], IndirectReference), "/Root not in trailer or not an indirect reference")
return trailer return trailer
re_hashes_in_name = re.compile(br"([^#]*)(#([0-9a-fA-F]{2}))?") re_hashes_in_name = re.compile(br"([^#]*)(#([0-9a-fA-F]{2}))?")
@classmethod @classmethod
def interpret_name(klass, raw, as_text=False): def interpret_name(klass, raw, as_text=False):
name = b"" name = b""
@ -616,10 +611,11 @@ class PdfParser:
re_comment = re.compile(br"(" + whitespace_optional + br"%[^\r\n]*" + newline + br")*") re_comment = re.compile(br"(" + whitespace_optional + br"%[^\r\n]*" + newline + br")*")
re_stream_start = re.compile(whitespace_optional + br"stream\r?\n") re_stream_start = re.compile(whitespace_optional + br"stream\r?\n")
re_stream_end = re.compile(whitespace_optional + br"endstream(?=" + delimiter_or_ws + br")") re_stream_end = re.compile(whitespace_optional + br"endstream(?=" + delimiter_or_ws + br")")
@classmethod @classmethod
def get_value(klass, data, offset, expect_indirect=None, max_nesting=-1): def get_value(klass, data, offset, expect_indirect=None, max_nesting=-1):
#if max_nesting == 0: if max_nesting == 0:
# return None, None return None, None
m = klass.re_comment.match(data, offset) m = klass.re_comment.match(data, offset)
if m: if m:
offset = m.end() offset = m.end()
@ -645,26 +641,22 @@ class PdfParser:
if m: if m:
offset = m.end() offset = m.end()
result = {} result = {}
#print("<<")
m = klass.re_dict_end.match(data, offset) m = klass.re_dict_end.match(data, offset)
while not m: while not m:
key, offset = klass.get_value(data, offset, max_nesting=max_nesting-1) key, offset = klass.get_value(data, offset, max_nesting=max_nesting-1)
#print ("key " + str(key))
if offset is None: if offset is None:
return result, None return result, None
value, offset = klass.get_value(data, offset, max_nesting=max_nesting-1) value, offset = klass.get_value(data, offset, max_nesting=max_nesting-1)
result[key] = value result[key] = value
#print ("value " + str(value))
if offset is None: if offset is None:
return result, None return result, None
m = klass.re_dict_end.match(data, offset) m = klass.re_dict_end.match(data, offset)
#print(">>")
offset = m.end() offset = m.end()
m = klass.re_stream_start.match(data, offset) m = klass.re_stream_start.match(data, offset)
if m: if m:
try: try:
stream_len = int(result[b"Length"]) stream_len = int(result[b"Length"])
except: except (TypeError, KeyError, ValueError):
raise PdfFormatError("bad or missing Length in stream dict (%r)" % result.get(b"Length", None)) raise PdfFormatError("bad or missing Length in stream dict (%r)" % result.get(b"Length", None))
stream_data = data[m.end():m.end() + stream_len] stream_data = data[m.end():m.end() + stream_len]
m = klass.re_stream_end.match(data, m.end() + stream_len) m = klass.re_stream_end.match(data, m.end() + stream_len)
@ -682,7 +674,6 @@ class PdfParser:
while not m: while not m:
value, offset = klass.get_value(data, offset, max_nesting=max_nesting-1) value, offset = klass.get_value(data, offset, max_nesting=max_nesting-1)
result.append(value) result.append(value)
#print ("item " + str(value))
if offset is None: if offset is None:
return result, None return result, None
m = klass.re_array_end.match(data, offset) m = klass.re_array_end.match(data, offset)
@ -717,7 +708,6 @@ class PdfParser:
#return None, offset # fallback (only for debugging) #return None, offset # fallback (only for debugging)
raise PdfFormatError("unrecognized object: " + repr(data[offset:offset+32])) raise PdfFormatError("unrecognized object: " + repr(data[offset:offset+32]))
re_lit_str_token = re.compile(br"(\\[nrtbf()\\])|(\\[0-9]{1,3})|(\\(\r\n|\r|\n))|(\r\n|\r|\n)|(\()|(\))") re_lit_str_token = re.compile(br"(\\[nrtbf()\\])|(\\[0-9]{1,3})|(\\(\r\n|\r|\n))|(\r\n|\r|\n)|(\()|(\))")
escaped_chars = { escaped_chars = {
b"n": b"\n", b"n": b"\n",
@ -737,6 +727,7 @@ class PdfParser:
ord(b")"): b")", ord(b")"): b")",
ord(b"\\"): b"\\", ord(b"\\"): b"\\",
} }
@classmethod @classmethod
def get_literal_string(klass, data, offset): def get_literal_string(klass, data, offset):
nesting_depth = 0 nesting_depth = 0
@ -746,7 +737,6 @@ class PdfParser:
if m.group(1): if m.group(1):
result.extend(klass.escaped_chars[m.group(1)[1]]) result.extend(klass.escaped_chars[m.group(1)[1]])
elif m.group(2): elif m.group(2):
#result.append(eval(m.group(1)))
result.append(int(m.group(2)[1:], 8)) result.append(int(m.group(2)[1:], 8))
elif m.group(3): elif m.group(3):
pass pass
@ -763,10 +753,10 @@ class PdfParser:
offset = m.end() offset = m.end()
raise PdfFormatError("unfinished literal string") raise PdfFormatError("unfinished literal string")
re_xref_section_start = re.compile(whitespace_optional + br"xref" + newline) re_xref_section_start = re.compile(whitespace_optional + br"xref" + newline)
re_xref_subsection_start = re.compile(whitespace_optional + br"([0-9]+)" + whitespace_mandatory + br"([0-9]+)" + whitespace_optional + newline_only) re_xref_subsection_start = re.compile(whitespace_optional + br"([0-9]+)" + whitespace_mandatory + br"([0-9]+)" + whitespace_optional + newline_only)
re_xref_entry = re.compile(br"([0-9]{10}) ([0-9]{5}) ([fn])( \r| \n|\r\n)") re_xref_entry = re.compile(br"([0-9]{10}) ([0-9]{5}) ([fn])( \r| \n|\r\n)")
def read_xref_table(self, xref_section_offset): def read_xref_table(self, xref_section_offset):
subsection_found = False subsection_found = False
m = self.re_xref_section_start.match(self.buf, xref_section_offset + self.start_offset) m = self.re_xref_section_start.match(self.buf, xref_section_offset + self.start_offset)
@ -793,20 +783,18 @@ class PdfParser:
self.xref_table[i] = new_entry self.xref_table[i] = new_entry
return offset return offset
def read_indirect(self, ref, max_nesting=-1): def read_indirect(self, ref, max_nesting=-1):
offset, generation = self.xref_table[ref[0]] offset, generation = self.xref_table[ref[0]]
assert generation == ref[1] assert generation == ref[1]
return self.get_value(self.buf, offset + self.start_offset, expect_indirect=IndirectReference(*ref), max_nesting=max_nesting)[0] return self.get_value(self.buf, offset + self.start_offset, expect_indirect=IndirectReference(*ref), max_nesting=max_nesting)[0]
def linearize_page_tree(self, node=None): def linearize_page_tree(self, node=None):
if node is None: if node is None:
node = self.page_tree_root node = self.page_tree_root
check_format_condition(node[b"Type"] == b"Pages", "/Type of page tree node is not /Pages") check_format_condition(node[b"Type"] == b"Pages", "/Type of page tree node is not /Pages")
pages = [] pages = []
for kid in node[b"Kids"]: for kid in node[b"Kids"]:
kid_object = self.read_indirect(kid, max_nesting=3) kid_object = self.read_indirect(kid)
if kid_object[b"Type"] == b"Page": if kid_object[b"Type"] == b"Page":
pages.append(kid) pages.append(kid)
else: else: