Merge pull request #3274 from radarhere/pdf

Improve PDF document info
2026-02-20 22:20:26 +03:00 · 2018-09-29 17:43:05 +03:00 · 2018-09-29 17:43:05 +03:00 · e2deb07608
commit e2deb07608
parent 0c71792818 fc57658635
5 changed files with 92 additions and 29 deletions
--- a/Tests/test_file_pdf.py
+++ b/Tests/test_file_pdf.py
@ -4,6 +4,7 @@ import io
 import os
 import os.path
 import tempfile
+import time


 class TestFilePdf(PillowTestCase):
@ -187,8 +188,13 @@ class TestFilePdf(PillowTestCase):
        # open it, check pages and info
        with PdfParser.PdfParser(pdf_filename, mode="r+b") as pdf:
            self.assertEqual(len(pdf.pages), 1)
-            self.assertEqual(len(pdf.info), 1)
+            self.assertEqual(len(pdf.info), 4)
+            self.assertEqual(pdf.info.Title, os.path.splitext(
+                                                os.path.basename(pdf_filename)
+                                             )[0])
            self.assertEqual(pdf.info.Producer, "PdfParser")
+            self.assertIn(b"CreationDate", pdf.info)
+            self.assertIn(b"ModDate", pdf.info)
            self.check_pdf_pages_consistency(pdf)

            # append some info
@ -203,8 +209,10 @@ class TestFilePdf(PillowTestCase):
        # open it again, check pages and info again
        with PdfParser.PdfParser(pdf_filename) as pdf:
            self.assertEqual(len(pdf.pages), 1)
-            self.assertEqual(len(pdf.info), 6)
+            self.assertEqual(len(pdf.info), 8)
            self.assertEqual(pdf.info.Title, "abc")
+            self.assertIn(b"CreationDate", pdf.info)
+            self.assertIn(b"ModDate", pdf.info)
            self.check_pdf_pages_consistency(pdf)

        # append two images
@ -216,29 +224,36 @@ class TestFilePdf(PillowTestCase):
        # open the PDF again, check pages and info again
        with PdfParser.PdfParser(pdf_filename) as pdf:
            self.assertEqual(len(pdf.pages), 3)
-            self.assertEqual(len(pdf.info), 6)
+            self.assertEqual(len(pdf.info), 8)
            self.assertEqual(PdfParser.decode_text(pdf.info[b"Title"]), "abc")
            self.assertEqual(pdf.info.Title, "abc")
            self.assertEqual(pdf.info.Producer, "PdfParser")
            self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty")
            self.assertEqual(pdf.info.Subject, u"ghi\uABCD")
+            self.assertIn(b"CreationDate", pdf.info)
+            self.assertIn(b"ModDate", pdf.info)
            self.check_pdf_pages_consistency(pdf)

    def test_pdf_info(self):
        # make a PDF file
        pdf_filename = self.helper_save_as_pdf(
            "RGB", title="title", author="author", subject="subject",
-            keywords="keywords", creator="creator", producer="producer")
+            keywords="keywords", creator="creator", producer="producer",
+            creationDate=time.strptime("2000", "%Y"),
+            modDate=time.strptime("2001", "%Y"))

        # open it, check pages and info
        with PdfParser.PdfParser(pdf_filename) as pdf:
-            self.assertEqual(len(pdf.info), 6)
+            self.assertEqual(len(pdf.info), 8)
            self.assertEqual(pdf.info.Title, "title")
            self.assertEqual(pdf.info.Author, "author")
            self.assertEqual(pdf.info.Subject, "subject")
            self.assertEqual(pdf.info.Keywords, "keywords")
            self.assertEqual(pdf.info.Creator, "creator")
            self.assertEqual(pdf.info.Producer, "producer")
+            self.assertEqual(pdf.info.CreationDate,
+                             time.strptime("2000", "%Y"))
+            self.assertEqual(pdf.info.ModDate, time.strptime("2001", "%Y"))
            self.check_pdf_pages_consistency(pdf)

    def test_pdf_append_to_bytesio(self):
--- a/Tests/test_pdfparser.py
+++ b/Tests/test_pdfparser.py
@ -3,6 +3,7 @@ from helper import unittest, PillowTestCase
 from PIL.PdfParser import IndirectObjectDef, IndirectReference, PdfBinary, \
                          PdfDict, PdfFormatError, PdfName, PdfParser, \
                          PdfStream, decode_text, encode_text, pdf_repr
+import time


 class TestPdfParser(PillowTestCase):
@ -80,6 +81,19 @@ class TestPdfParser(PillowTestCase):
        self.assertIsInstance(s, PdfStream)
        self.assertEqual(s.dictionary.Name, "value")
        self.assertEqual(s.decode(), b"abcde")
+        for name in ["CreationDate", "ModDate"]:
+            for date, value in {
+                b"20180729214124": "20180729214124",
+                b"D:20180729214124": "20180729214124",
+                b"D:2018072921": "20180729210000",
+                b"D:20180729214124Z": "20180729214124",
+                b"D:20180729214124+08'00'": "20180729134124",
+                b"D:20180729214124-05'00'": "20180730024124"
+            }.items():
+                d = PdfParser.get_value(
+                    b"<</"+name.encode()+b" ("+date+b")>>", 0)[0]
+                self.assertEqual(
+                    time.strftime("%Y%m%d%H%M%S", getattr(d, name)), value)

    def test_pdf_repr(self):
        self.assertEqual(bytes(IndirectReference(1, 2)), b"1 2 R")
--- a/docs/handbook/image-file-formats.rst
+++ b/docs/handbook/image-file-formats.rst
@ -1029,7 +1029,8 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum
    saved in the PDF.

 **title**
-    The document’s title.
+    The document’s title. If not appending to an existing PDF file, this will
+    default to the base name of the file, without its extension.

    .. versionadded:: 5.1.0

@ -1061,6 +1062,18 @@ The :py:meth:`~PIL.Image.Image.save` method can take the following keyword argum

    .. versionadded:: 5.1.0

+**creationDate**
+    The creation date of the document. If not appending to an existing PDF
+    file, this will default to the current time (UTC).
+
+    .. versionadded:: 5.3.0
+
+**modDate**
+    The modification date of the document. If not appending to an existing PDF
+    file, this will default to the current time (UTC).
+
+    .. versionadded:: 5.3.0
+
 XV Thumbnails
 ^^^^^^^^^^^^^

--- a/src/PIL/PdfImagePlugin.py
+++ b/src/PIL/PdfImagePlugin.py
@ -22,6 +22,8 @@

 from . import Image, ImageFile, ImageSequence, PdfParser
 import io
+import os
+import time

 __version__ = "0.5"

@ -45,32 +47,30 @@ def _save_all(im, fp, filename):
 # (Internal) Image save plugin for the PDF format.

 def _save(im, fp, filename, save_all=False):
-    resolution = im.encoderinfo.get("resolution", 72.0)
    is_appending = im.encoderinfo.get("append", False)
-    title = im.encoderinfo.get("title", None)
-    author = im.encoderinfo.get("author", None)
-    subject = im.encoderinfo.get("subject", None)
-    keywords = im.encoderinfo.get("keywords", None)
-    creator = im.encoderinfo.get("creator", None)
-    producer = im.encoderinfo.get("producer", None)
-
    if is_appending:
        existing_pdf = PdfParser.PdfParser(f=fp, filename=filename, mode="r+b")
    else:
        existing_pdf = PdfParser.PdfParser(f=fp, filename=filename, mode="w+b")

-    if title:
-        existing_pdf.info.Title = title
-    if author:
-        existing_pdf.info.Author = author
-    if subject:
-        existing_pdf.info.Subject = subject
-    if keywords:
-        existing_pdf.info.Keywords = keywords
-    if creator:
-        existing_pdf.info.Creator = creator
-    if producer:
-        existing_pdf.info.Producer = producer
+    resolution = im.encoderinfo.get("resolution", 72.0)
+
+    info = {
+        "title": None if is_appending else os.path.splitext(
+                                               os.path.basename(filename)
+                                           )[0],
+        "author": None,
+        "subject": None,
+        "keywords": None,
+        "creator": None,
+        "producer": None,
+        "creationDate": None if is_appending else time.gmtime(),
+        "modDate": None if is_appending else time.gmtime()
+    }
+    for k, default in info.items():
+        v = im.encoderinfo.get(k) if k in im.encoderinfo else default
+        if v:
+            existing_pdf.info[k[0].upper() + k[1:]] = v

    #
    # make sure image data is available
--- a/src/PIL/PdfParser.py
+++ b/src/PIL/PdfParser.py
@ -1,8 +1,10 @@
+import calendar
 import codecs
 import collections
 import mmap
 import os
 import re
+import time
 import zlib
 from ._util import py3

@ -280,9 +282,26 @@ class PdfDict(UserDict):
            except KeyError:
                raise AttributeError(key)
        if isinstance(value, bytes):
-            return decode_text(value)
-        else:
-            return value
+            value = decode_text(value)
+        if key.endswith("Date"):
+            if value.startswith("D:"):
+                value = value[2:]
+
+            relationship = 'Z'
+            if len(value) > 17:
+                relationship = value[14]
+                offset = int(value[15:17]) * 60
+                if len(value) > 20:
+                    offset += int(value[18:20])
+
+            format = '%Y%m%d%H%M%S'[:len(value) - 2]
+            value = time.strptime(value[:len(format)+2], format)
+            if relationship in ['+', '-']:
+                offset *= 60
+                if relationship == '+':
+                    offset *= -1
+                value = time.gmtime(calendar.timegm(value) + offset)
+        return value

    def __bytes__(self):
        out = bytearray(b"<<")
@ -347,6 +366,8 @@ def pdf_repr(x):
        return bytes(x)
    elif isinstance(x, int):
        return str(x).encode("us-ascii")
+    elif isinstance(x, time.struct_time):
+        return b'(D:'+time.strftime('%Y%m%d%H%M%SZ', x).encode("us-ascii")+b')'
    elif isinstance(x, dict):
        return bytes(PdfDict(x))
    elif isinstance(x, list):