Merge pull request #3274 from radarhere/pdf

Improve PDF document info
This commit is contained in:
Hugo 2018-09-29 17:43:05 +03:00 committed by GitHub
commit e2deb07608
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 92 additions and 29 deletions

View File

@ -4,6 +4,7 @@ import io
import os import os
import os.path import os.path
import tempfile import tempfile
import time
class TestFilePdf(PillowTestCase): class TestFilePdf(PillowTestCase):
@ -187,8 +188,13 @@ class TestFilePdf(PillowTestCase):
# open it, check pages and info # open it, check pages and info
with PdfParser.PdfParser(pdf_filename, mode="r+b") as pdf: with PdfParser.PdfParser(pdf_filename, mode="r+b") as pdf:
self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 1) self.assertEqual(len(pdf.info), 4)
self.assertEqual(pdf.info.Title, os.path.splitext(
os.path.basename(pdf_filename)
)[0])
self.assertEqual(pdf.info.Producer, "PdfParser") self.assertEqual(pdf.info.Producer, "PdfParser")
self.assertIn(b"CreationDate", pdf.info)
self.assertIn(b"ModDate", pdf.info)
self.check_pdf_pages_consistency(pdf) self.check_pdf_pages_consistency(pdf)
# append some info # append some info
@ -203,8 +209,10 @@ class TestFilePdf(PillowTestCase):
# open it again, check pages and info again # open it again, check pages and info again
with PdfParser.PdfParser(pdf_filename) as pdf: with PdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.pages), 1) self.assertEqual(len(pdf.pages), 1)
self.assertEqual(len(pdf.info), 6) self.assertEqual(len(pdf.info), 8)
self.assertEqual(pdf.info.Title, "abc") self.assertEqual(pdf.info.Title, "abc")
self.assertIn(b"CreationDate", pdf.info)
self.assertIn(b"ModDate", pdf.info)
self.check_pdf_pages_consistency(pdf) self.check_pdf_pages_consistency(pdf)
# append two images # append two images
@ -216,29 +224,36 @@ class TestFilePdf(PillowTestCase):
# open the PDF again, check pages and info again # open the PDF again, check pages and info again
with PdfParser.PdfParser(pdf_filename) as pdf: with PdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.pages), 3) self.assertEqual(len(pdf.pages), 3)
self.assertEqual(len(pdf.info), 6) self.assertEqual(len(pdf.info), 8)
self.assertEqual(PdfParser.decode_text(pdf.info[b"Title"]), "abc") self.assertEqual(PdfParser.decode_text(pdf.info[b"Title"]), "abc")
self.assertEqual(pdf.info.Title, "abc") self.assertEqual(pdf.info.Title, "abc")
self.assertEqual(pdf.info.Producer, "PdfParser") self.assertEqual(pdf.info.Producer, "PdfParser")
self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty") self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty")
self.assertEqual(pdf.info.Subject, u"ghi\uABCD") self.assertEqual(pdf.info.Subject, u"ghi\uABCD")
self.assertIn(b"CreationDate", pdf.info)
self.assertIn(b"ModDate", pdf.info)
self.check_pdf_pages_consistency(pdf) self.check_pdf_pages_consistency(pdf)
def test_pdf_info(self): def test_pdf_info(self):
# make a PDF file # make a PDF file
pdf_filename = self.helper_save_as_pdf( pdf_filename = self.helper_save_as_pdf(
"RGB", title="title", author="author", subject="subject", "RGB", title="title", author="author", subject="subject",
keywords="keywords", creator="creator", producer="producer") keywords="keywords", creator="creator", producer="producer",
creationDate=time.strptime("2000", "%Y"),
modDate=time.strptime("2001", "%Y"))
# open it, check pages and info # open it, check pages and info
with PdfParser.PdfParser(pdf_filename) as pdf: with PdfParser.PdfParser(pdf_filename) as pdf:
self.assertEqual(len(pdf.info), 6) self.assertEqual(len(pdf.info), 8)
self.assertEqual(pdf.info.Title, "title") self.assertEqual(pdf.info.Title, "title")
self.assertEqual(pdf.info.Author, "author") self.assertEqual(pdf.info.Author, "author")
self.assertEqual(pdf.info.Subject, "subject") self.assertEqual(pdf.info.Subject, "subject")
self.assertEqual(pdf.info.Keywords, "keywords") self.assertEqual(pdf.info.Keywords, "keywords")
self.assertEqual(pdf.info.Creator, "creator") self.assertEqual(pdf.info.Creator, "creator")
self.assertEqual(pdf.info.Producer, "producer") self.assertEqual(pdf.info.Producer, "producer")
self.assertEqual(pdf.info.CreationDate,
time.strptime("2000", "%Y"))
self.assertEqual(pdf.info.ModDate, time.strptime("2001", "%Y"))
self.check_pdf_pages_consistency(pdf) self.check_pdf_pages_consistency(pdf)
def test_pdf_append_to_bytesio(self): def test_pdf_append_to_bytesio(self):

View File

@ -3,6 +3,7 @@ from helper import unittest, PillowTestCase
from PIL.PdfParser import IndirectObjectDef, IndirectReference, PdfBinary, \ from PIL.PdfParser import IndirectObjectDef, IndirectReference, PdfBinary, \
PdfDict, PdfFormatError, PdfName, PdfParser, \ PdfDict, PdfFormatError, PdfName, PdfParser, \
PdfStream, decode_text, encode_text, pdf_repr PdfStream, decode_text, encode_text, pdf_repr
import time
class TestPdfParser(PillowTestCase): class TestPdfParser(PillowTestCase):
@ -80,6 +81,19 @@ class TestPdfParser(PillowTestCase):
self.assertIsInstance(s, PdfStream) self.assertIsInstance(s, PdfStream)
self.assertEqual(s.dictionary.Name, "value") self.assertEqual(s.dictionary.Name, "value")
self.assertEqual(s.decode(), b"abcde") self.assertEqual(s.decode(), b"abcde")
for name in ["CreationDate", "ModDate"]:
for date, value in {
b"20180729214124": "20180729214124",
b"D:20180729214124": "20180729214124",
b"D:2018072921": "20180729210000",
b"D:20180729214124Z": "20180729214124",
b"D:20180729214124+08'00'": "20180729134124",
b"D:20180729214124-05'00'": "20180730024124"
}.items():
d = PdfParser.get_value(
b"<</"+name.encode()+b" ("+date+b")>>", 0)[0]
self.assertEqual(
time.strftime("%Y%m%d%H%M%S", getattr(d, name)), value)
def test_pdf_repr(self): def test_pdf_repr(self):
self.assertEqual(bytes(IndirectReference(1, 2)), b"1 2 R") self.assertEqual(bytes(IndirectReference(1, 2)), b"1 2 R")

View File

@ -1029,7 +1029,8 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum
saved in the PDF. saved in the PDF.
**title** **title**
The document's title. The document's title. If not appending to an existing PDF file, this will
default to the filename.
.. versionadded:: 5.1.0 .. versionadded:: 5.1.0
@ -1061,6 +1062,18 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum
.. versionadded:: 5.1.0 .. versionadded:: 5.1.0
**creationDate**
The creation date of the document. If not appending to an existing PDF
file, this will default to the current time.
.. versionadded:: 5.3.0
**modDate**
The modification date of the document. If not appending to an existing PDF
file, this will default to the current time.
.. versionadded:: 5.3.0
XV Thumbnails XV Thumbnails
^^^^^^^^^^^^^ ^^^^^^^^^^^^^

View File

@ -22,6 +22,8 @@
from . import Image, ImageFile, ImageSequence, PdfParser from . import Image, ImageFile, ImageSequence, PdfParser
import io import io
import os
import time
__version__ = "0.5" __version__ = "0.5"
@ -45,32 +47,30 @@ def _save_all(im, fp, filename):
# (Internal) Image save plugin for the PDF format. # (Internal) Image save plugin for the PDF format.
def _save(im, fp, filename, save_all=False): def _save(im, fp, filename, save_all=False):
resolution = im.encoderinfo.get("resolution", 72.0)
is_appending = im.encoderinfo.get("append", False) is_appending = im.encoderinfo.get("append", False)
title = im.encoderinfo.get("title", None)
author = im.encoderinfo.get("author", None)
subject = im.encoderinfo.get("subject", None)
keywords = im.encoderinfo.get("keywords", None)
creator = im.encoderinfo.get("creator", None)
producer = im.encoderinfo.get("producer", None)
if is_appending: if is_appending:
existing_pdf = PdfParser.PdfParser(f=fp, filename=filename, mode="r+b") existing_pdf = PdfParser.PdfParser(f=fp, filename=filename, mode="r+b")
else: else:
existing_pdf = PdfParser.PdfParser(f=fp, filename=filename, mode="w+b") existing_pdf = PdfParser.PdfParser(f=fp, filename=filename, mode="w+b")
if title: resolution = im.encoderinfo.get("resolution", 72.0)
existing_pdf.info.Title = title
if author: info = {
existing_pdf.info.Author = author "title": None if is_appending else os.path.splitext(
if subject: os.path.basename(filename)
existing_pdf.info.Subject = subject )[0],
if keywords: "author": None,
existing_pdf.info.Keywords = keywords "subject": None,
if creator: "keywords": None,
existing_pdf.info.Creator = creator "creator": None,
if producer: "producer": None,
existing_pdf.info.Producer = producer "creationDate": None if is_appending else time.gmtime(),
"modDate": None if is_appending else time.gmtime()
}
for k, default in info.items():
v = im.encoderinfo.get(k) if k in im.encoderinfo else default
if v:
existing_pdf.info[k[0].upper() + k[1:]] = v
# #
# make sure image data is available # make sure image data is available

View File

@ -1,8 +1,10 @@
import calendar
import codecs import codecs
import collections import collections
import mmap import mmap
import os import os
import re import re
import time
import zlib import zlib
from ._util import py3 from ._util import py3
@ -280,9 +282,26 @@ class PdfDict(UserDict):
except KeyError: except KeyError:
raise AttributeError(key) raise AttributeError(key)
if isinstance(value, bytes): if isinstance(value, bytes):
return decode_text(value) value = decode_text(value)
else: if key.endswith("Date"):
return value if value.startswith("D:"):
value = value[2:]
relationship = 'Z'
if len(value) > 17:
relationship = value[14]
offset = int(value[15:17]) * 60
if len(value) > 20:
offset += int(value[18:20])
format = '%Y%m%d%H%M%S'[:len(value) - 2]
value = time.strptime(value[:len(format)+2], format)
if relationship in ['+', '-']:
offset *= 60
if relationship == '+':
offset *= -1
value = time.gmtime(calendar.timegm(value) + offset)
return value
def __bytes__(self): def __bytes__(self):
out = bytearray(b"<<") out = bytearray(b"<<")
@ -347,6 +366,8 @@ def pdf_repr(x):
return bytes(x) return bytes(x)
elif isinstance(x, int): elif isinstance(x, int):
return str(x).encode("us-ascii") return str(x).encode("us-ascii")
elif isinstance(x, time.struct_time):
return b'(D:'+time.strftime('%Y%m%d%H%M%SZ', x).encode("us-ascii")+b')'
elif isinstance(x, dict): elif isinstance(x, dict):
return bytes(PdfDict(x)) return bytes(PdfDict(x))
elif isinstance(x, list): elif isinstance(x, list):