From fc57658635cae5cb40c72025a950b6287450b8b8 Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Sun, 29 Jul 2018 22:33:59 +1000 Subject: [PATCH] Added PDF creation and modification date info --- Tests/test_file_pdf.py | 25 ++++++++++++++---- Tests/test_pdfparser.py | 14 ++++++++++ docs/handbook/image-file-formats.rst | 15 ++++++++++- src/PIL/PdfImagePlugin.py | 39 ++++++++++++++-------------- src/PIL/PdfParser.py | 27 ++++++++++++++++--- 5 files changed, 91 insertions(+), 29 deletions(-) diff --git a/Tests/test_file_pdf.py b/Tests/test_file_pdf.py index f012fb9d8..57f1c2118 100644 --- a/Tests/test_file_pdf.py +++ b/Tests/test_file_pdf.py @@ -4,6 +4,7 @@ import io import os import os.path import tempfile +import time class TestFilePdf(PillowTestCase): @@ -187,8 +188,13 @@ class TestFilePdf(PillowTestCase): # open it, check pages and info with PdfParser.PdfParser(pdf_filename, mode="r+b") as pdf: self.assertEqual(len(pdf.pages), 1) - self.assertEqual(len(pdf.info), 1) + self.assertEqual(len(pdf.info), 4) + self.assertEqual(pdf.info.Title, os.path.splitext( + os.path.basename(pdf_filename) + )[0]) self.assertEqual(pdf.info.Producer, "PdfParser") + self.assertIn(b"CreationDate", pdf.info) + self.assertIn(b"ModDate", pdf.info) self.check_pdf_pages_consistency(pdf) # append some info @@ -203,8 +209,10 @@ class TestFilePdf(PillowTestCase): # open it again, check pages and info again with PdfParser.PdfParser(pdf_filename) as pdf: self.assertEqual(len(pdf.pages), 1) - self.assertEqual(len(pdf.info), 6) + self.assertEqual(len(pdf.info), 8) self.assertEqual(pdf.info.Title, "abc") + self.assertIn(b"CreationDate", pdf.info) + self.assertIn(b"ModDate", pdf.info) self.check_pdf_pages_consistency(pdf) # append two images @@ -216,29 +224,36 @@ class TestFilePdf(PillowTestCase): # open the PDF again, check pages and info again with PdfParser.PdfParser(pdf_filename) as pdf: self.assertEqual(len(pdf.pages), 3) - self.assertEqual(len(pdf.info), 6) + self.assertEqual(len(pdf.info), 8) 
self.assertEqual(PdfParser.decode_text(pdf.info[b"Title"]), "abc") self.assertEqual(pdf.info.Title, "abc") self.assertEqual(pdf.info.Producer, "PdfParser") self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty") self.assertEqual(pdf.info.Subject, u"ghi\uABCD") + self.assertIn(b"CreationDate", pdf.info) + self.assertIn(b"ModDate", pdf.info) self.check_pdf_pages_consistency(pdf) def test_pdf_info(self): # make a PDF file pdf_filename = self.helper_save_as_pdf( "RGB", title="title", author="author", subject="subject", - keywords="keywords", creator="creator", producer="producer") + keywords="keywords", creator="creator", producer="producer", + creationDate=time.strptime("2000", "%Y"), + modDate=time.strptime("2001", "%Y")) # open it, check pages and info with PdfParser.PdfParser(pdf_filename) as pdf: - self.assertEqual(len(pdf.info), 6) + self.assertEqual(len(pdf.info), 8) self.assertEqual(pdf.info.Title, "title") self.assertEqual(pdf.info.Author, "author") self.assertEqual(pdf.info.Subject, "subject") self.assertEqual(pdf.info.Keywords, "keywords") self.assertEqual(pdf.info.Creator, "creator") self.assertEqual(pdf.info.Producer, "producer") + self.assertEqual(pdf.info.CreationDate, + time.strptime("2000", "%Y")) + self.assertEqual(pdf.info.ModDate, time.strptime("2001", "%Y")) self.check_pdf_pages_consistency(pdf) def test_pdf_append_to_bytesio(self): diff --git a/Tests/test_pdfparser.py b/Tests/test_pdfparser.py index 42c813520..660405635 100644 --- a/Tests/test_pdfparser.py +++ b/Tests/test_pdfparser.py @@ -3,6 +3,7 @@ from helper import unittest, PillowTestCase from PIL.PdfParser import IndirectObjectDef, IndirectReference, PdfBinary, \ PdfDict, PdfFormatError, PdfName, PdfParser, \ PdfStream, decode_text, encode_text, pdf_repr +import time class TestPdfParser(PillowTestCase): @@ -76,6 +77,19 @@ class TestPdfParser(PillowTestCase): self.assertIsInstance(s, PdfStream) self.assertEqual(s.dictionary.Name, "value") self.assertEqual(s.decode(), b"abcde") + for name in 
["CreationDate", "ModDate"]: + for date, value in { + b"20180729214124": "20180729214124", + b"D:20180729214124": "20180729214124", + b"D:2018072921": "20180729210000", + b"D:20180729214124Z": "20180729214124", + b"D:20180729214124+08'00'": "20180729134124", + b"D:20180729214124-05'00'": "20180730024124" + }.items(): + d = PdfParser.get_value( + b"<</"+name.encode()+b" ("+date+b")>>", 0)[0] + self.assertEqual( + time.strftime("%Y%m%d%H%M%S", getattr(d, name)), value) def test_pdf_repr(self): self.assertEqual(bytes(IndirectReference(1, 2)), b"1 2 R") diff --git a/docs/handbook/image-file-formats.rst b/docs/handbook/image-file-formats.rst index eb50ff23d..e1138726f 100644 --- a/docs/handbook/image-file-formats.rst +++ b/docs/handbook/image-file-formats.rst @@ -1029,7 +1029,8 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum saved in the PDF. **title** - The document’s title. + The document’s title. If not appending to an existing PDF file, this will + default to the filename. .. versionadded:: 5.1.0 @@ -1061,6 +1062,18 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum .. versionadded:: 5.1.0 +**creationDate** + The creation date of the document. If not appending to an existing PDF + file, this will default to the current time. + + .. versionadded:: 5.3.0 + +**modDate** + The modification date of the document. If not appending to an existing PDF + file, this will default to the current time. + + .. versionadded:: 5.3.0 + XV Thumbnails ^^^^^^^^^^^^^ diff --git a/src/PIL/PdfImagePlugin.py b/src/PIL/PdfImagePlugin.py index d90e06a72..b42502762 100644 --- a/src/PIL/PdfImagePlugin.py +++ b/src/PIL/PdfImagePlugin.py @@ -23,6 +23,7 @@ from . import Image, ImageFile, ImageSequence, PdfParser import io import os +import time __version__ = "0.5" @@ -46,32 +47,30 @@ def _save_all(im, fp, filename): # (Internal) Image save plugin for the PDF format.
def _save(im, fp, filename, save_all=False): - resolution = im.encoderinfo.get("resolution", 72.0) is_appending = im.encoderinfo.get("append", False) - title = None if is_appending else im.encoderinfo.get("title", os.path.splitext(filename)[0]) - author = im.encoderinfo.get("author", None) - subject = im.encoderinfo.get("subject", None) - keywords = im.encoderinfo.get("keywords", None) - creator = im.encoderinfo.get("creator", None) - producer = im.encoderinfo.get("producer", None) - if is_appending: existing_pdf = PdfParser.PdfParser(f=fp, filename=filename, mode="r+b") else: existing_pdf = PdfParser.PdfParser(f=fp, filename=filename, mode="w+b") - if title: - existing_pdf.info.Title = title - if author: - existing_pdf.info.Author = author - if subject: - existing_pdf.info.Subject = subject - if keywords: - existing_pdf.info.Keywords = keywords - if creator: - existing_pdf.info.Creator = creator - if producer: - existing_pdf.info.Producer = producer + resolution = im.encoderinfo.get("resolution", 72.0) + + info = { + "title": None if is_appending else os.path.splitext( + os.path.basename(filename) + )[0], + "author": None, + "subject": None, + "keywords": None, + "creator": None, + "producer": None, + "creationDate": None if is_appending else time.gmtime(), + "modDate": None if is_appending else time.gmtime() + } + for k, default in info.items(): + v = im.encoderinfo.get(k) if k in im.encoderinfo else default + if v: + existing_pdf.info[k[0].upper() + k[1:]] = v # # make sure image data is available diff --git a/src/PIL/PdfParser.py b/src/PIL/PdfParser.py index 971f44514..ad6d9f3fe 100644 --- a/src/PIL/PdfParser.py +++ b/src/PIL/PdfParser.py @@ -1,8 +1,10 @@ +import calendar import codecs import collections import mmap import os import re +import time import zlib from ._util import py3 @@ -280,9 +282,26 @@ class PdfDict(UserDict): except KeyError: raise AttributeError(key) if isinstance(value, bytes): - return decode_text(value) - else: - return value + value = 
decode_text(value) + if key.endswith("Date"): + if value.startswith("D:"): + value = value[2:] + + relationship = 'Z' + if len(value) > 17: + relationship = value[14] + offset = int(value[15:17]) * 60 + if len(value) > 20: + offset += int(value[18:20]) + + format = '%Y%m%d%H%M%S'[:len(value) - 2] + value = time.strptime(value[:len(format)+2], format) + if relationship in ['+', '-']: + offset *= 60 + if relationship == '+': + offset *= -1 + value = time.gmtime(calendar.timegm(value) + offset) + return value def __bytes__(self): out = bytearray(b"<<") @@ -347,6 +366,8 @@ def pdf_repr(x): return bytes(x) elif isinstance(x, int): return str(x).encode("us-ascii") + elif isinstance(x, time.struct_time): + return b'(D:'+time.strftime('%Y%m%d%H%M%SZ', x).encode("us-ascii")+b')' elif isinstance(x, dict): return bytes(PdfDict(x)) elif isinstance(x, list):