From a6336e7eccafe8ba42423588db7f4072fe9b545f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20=C5=A0uppa?= Date: Thu, 16 Jan 2014 12:20:19 +0100 Subject: [PATCH 001/101] Removed empty line It seems that this empty line is what makes this code in the tutorial not highlighted http://pillow.readthedocs.org/en/latest/handbook/tutorial.html#a-sequence-iterator-class --- docs/handbook/tutorial.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/handbook/tutorial.rst b/docs/handbook/tutorial.rst index d7bb98386..9ce50da7d 100644 --- a/docs/handbook/tutorial.rst +++ b/docs/handbook/tutorial.rst @@ -105,7 +105,6 @@ Create JPEG thumbnails except IOError: print("cannot create thumbnail for", infile) - It is important to note that the library doesn’t decode or load the raster data unless it really has to. When you open a file, the file header is read to determine the file format and extract things like mode, size, and other From 8324a9a3e08413423c8ade7fb02db7ace0637c43 Mon Sep 17 00:00:00 2001 From: Sandro Mani Date: Sun, 26 Jan 2014 00:11:15 +0100 Subject: [PATCH 002/101] Fix issue 447: Apparently some drivers only emit SANE_STATUS_EOF once, and SANE_STATUS_IO_ERROR after that. The code however assumed that the driver keeps emitting SANE_STATUS_EOF. This commit fixes this. --- Sane/_sane.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/Sane/_sane.c b/Sane/_sane.c index 1c62610be..2ebcb1834 100644 --- a/Sane/_sane.c +++ b/Sane/_sane.c @@ -916,10 +916,13 @@ SaneDev_snap(SaneDevObject *self, PyObject *args) call which returns SANE_STATUS_EOF in order to start a new frame. 
*/ - do { - st = sane_read(self->h, buffer, READSIZE, &len); - } - while (st == SANE_STATUS_GOOD); + if (st != SANE_STATUS_EOF) + { + do { + st = sane_read(self->h, buffer, READSIZE, &len); + } + while (st == SANE_STATUS_GOOD); + } if (st != SANE_STATUS_EOF) { Py_BLOCK_THREADS @@ -937,10 +940,13 @@ SaneDev_snap(SaneDevObject *self, PyObject *args) } } /* enforce SANE_STATUS_EOF. Can be necessary for ADF scans for some backends */ - do { - st = sane_read(self->h, buffer, READSIZE, &len); - } - while (st == SANE_STATUS_GOOD); + if (st != SANE_STATUS_EOF) + { + do { + st = sane_read(self->h, buffer, READSIZE, &len); + } + while (st == SANE_STATUS_GOOD); + } if (st != SANE_STATUS_EOF) { sane_cancel(self->h); From f05f8001c556a25d409b03aa50e967c11008dfba Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:13:14 +0200 Subject: [PATCH 003/101] Original version of OleFileIO.py from PIL, dated 2005-03-25, still current in PIL 1.1.7 on the 2011-10-20 --- PIL/OleFileIO-README.txt | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 PIL/OleFileIO-README.txt diff --git a/PIL/OleFileIO-README.txt b/PIL/OleFileIO-README.txt new file mode 100644 index 000000000..3e037a00c --- /dev/null +++ b/PIL/OleFileIO-README.txt @@ -0,0 +1,30 @@ +OleFileIO_PL module: + +OleFileIO_PL is a Python module to read Microsoft OLE2 files (Structured +Storage), such as Microsoft Office documents, Image Composer and FlashPix files, + Outlook messages, ... + +This is an improved version of the OleFileIO module from PIL library v1.1.6 +(See: http://www.pythonware.com/products/pil/index.htm) + +WARNING: THIS IS (STILL) WORK IN PROGRESS. + + + +INSTALLATION: + +- on Windows, launch install.bat +- on other systems, launch: setup.py install + + + +HOW TO USE THIS MODULE: + +See http://www.decalage.info/python/olefileio +See main at the end of the module, and also docstrings. + + + +LICENSE: + +See LICENSE.txt. 
From fad61ba20c086206d74c5278e2fade0efdbb3673 Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:23:32 +0200 Subject: [PATCH 004/101] version 0.13 2007-09-04 --- PIL/OleFileIO.py | 429 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 394 insertions(+), 35 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 36e598a9b..8e2052a0b 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -1,3 +1,59 @@ +#!/usr/local/bin/python +# -*- coding: latin-1 -*- +""" +OleFileIO_PL: + Module to read Microsoft OLE2 files (Structured Storage), such as + Microsoft Office documents, Image Composer and FlashPix files, + Outlook messages, ... + +version 0.13 2007-09-04 Philippe Lagadec - http://lagasoft.free.fr + +Improved version of OleFileIO module from PIL library v1.1.6 +See: http://www.pythonware.com/products/pil/index.htm + +The Python Imaging Library (PIL) is + Copyright (c) 1997-2005 by Secret Labs AB + Copyright (c) 1995-2005 by Fredrik Lundh +OleFileIO_PL changes are Copyright (c) 2005-2007 by Philippe Lagadec + +See source code and LICENSE.txt for information on usage and redistribution. + +WARNING: THIS IS (STILL) WORK IN PROGRESS. 
+""" + +__author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" +__date__ = "2007-09-04" +__version__ = '0.13' + +#----------------------------------------------------------------------------- +# CHANGELOG: (OleFileIO_PL changes only) +# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility +# (all changes flagged with [PL]) +# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise +# exceptions in _OleStream.__init__() +# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat) +# - added some constants +# - added header values checks +# - added some docstrings +# - getsect: bugfix in case sectors >512 bytes +# - getsect: added conformity checks +# - DEBUG_MODE constant to activate debug display +# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments +# - updated license +# - converted tabs to 4 spaces + +#----------------------------------------------------------------------------- +# TODO: +# - fix Unicode names handling +# - fix handling of DIFSECT blocks in FAT (not stop) +# - add stricter checks in decoding +# - add (optional) checks on FAT block chains integrity to detect crossed +# sectors, loops, ... +# - in __main__ display the whole object tree (not only 1st level), and allow +# to extract objects, or provide a sample script to do it. +# - see also original notes and FIXME below +#----------------------------------------------------------------------------- + # # THIS IS WORK IN PROGRESS # @@ -36,18 +92,70 @@ # See the README file for information on usage and redistribution. # -import string, StringIO +#--- LICENSE ------------------------------------------------------------------ +# OleFileIO_PL is an improved version of the OleFileIO module from the +# Python Imaging Library (PIL). 
+ +# OleFileIO_PL changes are Copyright (c) 2005-2007 by Philippe Lagadec +# +# The Python Imaging Library (PIL) is +# Copyright (c) 1997-2005 by Secret Labs AB +# Copyright (c) 1995-2005 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its associated +# documentation, you agree that you have read, understood, and will comply with +# the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and its +# associated documentation for any purpose and without fee is hereby granted, +# provided that the above copyright notice appears in all copies, and that both +# that copyright notice and this permission notice appear in supporting +# documentation, and that the name of Secret Labs AB or the author(s) not be used +# in advertising or publicity pertaining to distribution of the software +# without specific, written prior permission. +# +# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS +# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. +# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, +# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +# PERFORMANCE OF THIS SOFTWARE. 
+ +#------------------------------------------------------------------------------ + +import string, StringIO, struct, array, os.path + +#[PL] DEBUG display mode: +DEBUG_MODE = False + +if DEBUG_MODE: + def debug(msg): + print msg +else: + def debug(msg): + pass def i16(c, o = 0): return ord(c[o])+(ord(c[o+1])<<8) def i32(c, o = 0): - return ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24) + return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24)) + # [PL]: added int() because "<<" gives long int since Python 2.4 MAGIC = '\320\317\021\340\241\261\032\341' +# [PL]: added constants (from AAF specifications) +MAXREGSECT = 0xFFFFFFFAL; # maximum SECT +DIFSECT = 0xFFFFFFFCL; # denotes a DIFAT sector in a FAT +FATSECT = 0xFFFFFFFDL; # denotes a FAT sector in a FAT +ENDOFCHAIN = 0xFFFFFFFEL; # end of a virtual stream chain +FREESECT = 0xFFFFFFFFL; # unallocated sector +MAXREGSID = 0xFFFFFFFAL; # maximum directory entry ID +NOSTREAM = 0xFFFFFFFFL; # unallocated directory entry + # # -------------------------------------------------------------------- # property types @@ -80,7 +188,6 @@ WORD_CLSID = "00020900-0000-0000-C000-000000000046" # -------------------------------------------------------------------- class _OleStream(StringIO.StringIO): - """OLE2 Stream Returns a read-only file object which can be used to read @@ -97,12 +204,15 @@ class _OleStream(StringIO.StringIO): # loading it all in one go. def __init__(self, fp, sect, size, offset, sectorsize, fat): - data = [] - while sect != -2: # 0xFFFFFFFEL: + # [PL] while sect != -2: # 0xFFFFFFFEL: + while sect != ENDOFCHAIN: fp.seek(offset + sectorsize * sect) data.append(fp.read(sectorsize)) + # [PL] if pointer is out of the FAT an exception is raised + if sect >= len(fat) : + raise IOError, 'incorrect FAT' sect = fat[sect] data = string.join(data, "") @@ -132,6 +242,10 @@ class _OleDirectoryEntry: # a complete list of directory entries, as read from # the directory stream. 
+ # [PL] conformity check + if sid >= len(sidlist) : + raise IOError, 'incorrect SID' + name, type, sect, size, sids, clsid = sidlist[sid] self.sid = sid @@ -147,7 +261,12 @@ class _OleDirectoryEntry: sid = sidlist[sid][4][2] - if sid != -1: + # [PL]: original code from PIL 1.1.5 + #if sid != -1 + # [PL]: necessary fix for Python 2.4 + #if sid != -1 and sid != 0xFFFFFFFFL: + # [PL]: new fix 22/02/2006 + if sid != NOSTREAM: # the directory entries are organized as a red-black tree. # the following piece of code does an ordered traversal of @@ -159,7 +278,8 @@ class _OleDirectoryEntry: left, right, child = sidlist[sid][4] - while left != -1: # 0xFFFFFFFFL: + #[PL] while left != -1 and left != 0xFFFFFFFFL: + if left != NOSTREAM: stack.append(sid) sid = left left, right, child = sidlist[sid][4] @@ -169,13 +289,25 @@ class _OleDirectoryEntry: self.kids.append(_OleDirectoryEntry(sidlist, sid)) # try to move right + + # [PL] conformity check + if sid >= len(sidlist) : + raise IOError, 'incorrect SID' + left, right, child = sidlist[sid][4] - if right != -1: # 0xFFFFFFFFL: + #[PL] if right != -1 and right != 0xFFFFFFFFL: + if right != NOSTREAM: # and then back to the left sid = right while 1: + + # [PL] conformity check + if sid >= len(sidlist) : + raise IOError, 'incorrect SID' + left, right, child = sidlist[sid][4] - if left == -1: # 0xFFFFFFFFL: + #[PL] if left == -1 or left == 0xFFFFFFFFL: + if left == NOSTREAM: break stack.append(sid) sid = left @@ -199,12 +331,10 @@ class _OleDirectoryEntry: def __cmp__(self, other): "Compare entries by name" - return cmp(self.name, other.name) def dump(self, tab = 0): "Dump this entry, and all its subentries (for debug purposes only)" - TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", "(property)", "(root)"] @@ -255,7 +385,6 @@ class OleFileIO: """ def __init__(self, filename = None): - if filename: self.open(filename) @@ -264,7 +393,6 @@ class OleFileIO: def open(self, filename): """Open an OLE2 file""" - if 
type(filename) == type(""): self.fp = open(filename, "rb") else: @@ -275,6 +403,93 @@ class OleFileIO: if len(header) != 512 or header[:8] != MAGIC: raise IOError, "not an OLE2 structured storage file" + # [PL] header structure according to AAF specifications: + ##Header + ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)] + ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, + ## // 0x1a, 0xe1} for current version + ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/ + ## // GetClassFile uses root directory class id) + ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is + ## // written by reference implementation + ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for + ## // 512-byte sectors, 4 for 4 KB sectors + ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering + ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two; + ## // typically 9 indicating 512-byte sectors + ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two; + ## // typically 6 indicating 64-byte mini-sectors + ##USHORT _usReserved; // [22H,02] reserved, must be zero + ##ULONG _ulReserved1; // [24H,04] reserved, must be zero + ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors, + ## // number of SECTs in directory chain for 4 KB + ## // sectors + ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain + ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain + ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must + ## // be zero. 
The reference implementation + ## // does not support transactions + ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream; + ## // typically 4096 bytes + ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain + ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain + ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain + ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain + ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors + ##}; + + # [PL] header decoding: + # '<' indicates little-endian byte ordering for Intel (cf. struct module help) + fmt_header = '<8s16sHHHHHHLLLLLLLLLL' + header_size = struct.calcsize(fmt_header) + debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) ) + header1 = header[:header_size] + (Sig, clsid, MinorVersion, DllVersion, ByteOrder, SectorShift, + MiniSectorShift, Reserved, Reserved1, csectDir, self.csectFat, sectDirStart, + signature, MiniSectorCutoff, MiniFatStart, csectMiniFat, self.sectDifStart, + self.csectDif) = struct.unpack(fmt_header, header1) + debug( struct.unpack(fmt_header, header1)) + + if Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': + raise IOError, "incorrect OLE signature" + if clsid != '\x00'*16: + raise IOError, "incorrect CLSID in OLE header" + debug( "MinorVersion = %d" % MinorVersion ) + debug( "DllVersion = %d" % DllVersion ) + if DllVersion not in [3, 4]: + raise IOError, "incorrect DllVersion in OLE header" + debug( "ByteOrder = %X" % ByteOrder ) + if ByteOrder != 0xFFFE: + raise IOError, "incorrect ByteOrder in OLE header" + SectorSize = 2**SectorShift + debug( "SectorSize = %d" % SectorSize ) + if SectorSize not in [512, 4096]: + raise IOError, "incorrect SectorSize in OLE header" + MiniSectorSize = 2**MiniSectorShift + debug( "MiniSectorSize = %d" % MiniSectorSize ) + if MiniSectorSize not in [64]: + raise IOError, "incorrect MiniSectorSize in OLE header" + if Reserved != 0 or Reserved1 != 
0: + raise IOError, "incorrect OLE header" + debug( "csectDir = %d" % csectDir ) + if SectorSize==512 and csectDir!=0: + raise IOError, "incorrect csectDir in OLE header" + debug( "csectFat = %d" % self.csectFat ) + debug( "sectDirStart = %X" % sectDirStart ) + debug( "signature = %d" % signature ) + if signature != 0: + raise IOError, "incorrect OLE header" + debug( "MiniSectorCutoff = %d" % MiniSectorCutoff ) + debug( "MiniFatStart = %X" % MiniFatStart ) + debug( "csectMiniFat = %d" % csectMiniFat ) + debug( "sectDifStart = %X" % self.sectDifStart ) + debug( "csectDif = %d" % self.csectDif ) + + # calculate the number of sectors in the file + # (-1 because header doesn't count) + self.nb_sect = (os.path.getsize(filename) / SectorSize) - 1 + debug( "Number of sectors in the file: %d" % self.nb_sect ) + # file clsid (probably never used, so we don't store it) clsid = self._clsid(header[8:24]) @@ -295,34 +510,169 @@ class OleFileIO: self.ministream = None self.minifatsect = i32(header, 60) + def dumpfat(self, fat, firstindex=0): + "Displays a part of FAT in human-readable form for debugging purpose" + # [PL] added only for debug + if not DEBUG_MODE: + return + # dictionary to convert special FAT values in human-readable strings + VPL=8 # valeurs par ligne (8+1 * 8+1 = 81) + fatnames = { + FREESECT: "..free..", + ENDOFCHAIN: "[ END. ]", + FATSECT: "FATSECT ", + DIFSECT: "DIFSECT " + } + nbsect = len(fat) + nlines = (nbsect+VPL-1)/VPL + print "index", + for i in range(VPL): + print ("%8X" % i), + print "" + for l in range(nlines): + index = l*VPL + print ("%8X:" % (firstindex+index)), + for i in range(index, index+VPL): + if i>=nbsect: + break + sect = fat[i] + if sect in fatnames: + nom = fatnames[sect] + else: + if sect == i+1: + nom = " --->" + else: + nom = "%8X" % sect + print nom, + print "" + + def dumpsect(self, sector, firstindex=0): + "Displays a sector in a human-readable form, for debugging purpose." 
+ if not DEBUG_MODE: + return + VPL=8 # number of values per line (8+1 * 8+1 = 81) + tab = array.array('L', sector) + nbsect = len(tab) + nlines = (nbsect+VPL-1)/VPL + print "index", + for i in range(VPL): + print ("%8X" % i), + print "" + for l in range(nlines): + index = l*VPL + print ("%8X:" % (firstindex+index)), + for i in range(index, index+VPL): + if i>=nbsect: + break + sect = tab[i] + nom = "%8X" % sect + print nom, + print "" + + + + def loadfat_sect(self, sect): + "Adds the indexes of the given sector to the FAT" + # un secteur de FAT est un tableau d'ulong + if isinstance(sect, array.array): + fat1 = sect + else: + fat1 = array.array('L', sect) + self.dumpsect(sect) + # la FAT est une chaîne de secteurs débutant au 1er index d'elle-même + for isect in fat1: + #print "isect = %X" % isect + if isect == ENDOFCHAIN or isect == FREESECT: + break + s = self.getsect(isect) + self.fat = self.fat + array.array('L', s) + return isect + + def loadfat(self, header): - # Load the FAT table. The header contains a sector numbers + """ + Load the FAT table. + """ + # The header contains a sector numbers # for the first 109 FAT sectors. Additional sectors are - # described by DIF blocks (FIXME: not yet implemented) + # described by DIF blocks sect = header[76:512] - fat = [] - for i in range(0, len(sect), 4): - ix = i32(sect, i) - if ix == -2 or ix == -1: # ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: - break - s = self.getsect(ix) - fat = fat + map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) - self.fat = fat + debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)/4) ) + #fat = [] + # [PL] FAT is an array of 32 bits unsigned ints, it's more effective + # to use an array than a list in Python. 
+ # It's initialized as empty first: + self.fat = array.array('L') + self.loadfat_sect(sect) + #self.dumpfat(self.fat) +## for i in range(0, len(sect), 4): +## ix = i32(sect, i) +## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: +## if ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: +## break +## s = self.getsect(ix) +## #fat = fat + map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) +## fat = fat + array.array('L', s) + if self.csectDif != 0: + # [PL] There's a DIFAT because file is larger than 6.8MB + # some checks just in case: + if self.csectFat <= 109: + # there must be at least 109 blocks in header and the rest in + # DIFAT, so number of sectors must be >109. + raise IOError, 'incorrect DIFAT, not enough sectors' + if self.sectDifStart >= self.nb_sect: + # initial DIFAT block index must be valid + raise IOError, 'incorrect DIFAT, first index out of range' + debug( "DIFAT analysis..." ) + # We compute the necessary number of DIFAT sectors : + # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) + nb_difat = (self.csectFat-109 + 126)/127 + debug( "nb_difat = %d" % nb_difat ) + if self.csectDif != nb_difat: + raise IOError, 'incorrect DIFAT' + isect_difat = self.sectDifStart + for i in xrange(nb_difat): + debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) + sector_difat = self.getsect(isect_difat) + difat = array.array('L', sector_difat) + self.dumpsect(sector_difat) + self.loadfat_sect(difat[:127]) + # last DIFAT pointer is next DIFAT sector: + isect_difat = difat[127] + debug( "next DIFAT sector: %X" % isect_difat ) + # checks: + if isect_difat not in [ENDOFCHAIN, FREESECT]: + # last DIFAT pointer value must be ENDOFCHAIN or FREESECT + raise IOError, 'incorrect end of DIFAT' +## if len(self.fat) != self.csectFat: +## # FAT should contain csectFat blocks +## print "FAT length: %d instead of %d" % (len(self.fat), self.csectFat) +## raise IOError, 'incorrect DIFAT' + self.dumpfat(self.fat) def loadminifat(self): - # Load the 
MINIFAT table. This is stored in a standard sub- + "Load the MINIFAT table." + # This is stored in a standard sub- # stream, pointed to by a header field. - s = self._open(self.minifatsect).read() - self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) def getsect(self, sect): - # Read given sector - - self.fp.seek(512 + self.sectorsize * sect) - return self.fp.read(self.sectorsize) + "Read given sector" + # [PL] this original code was wrong when sectors are 4KB instead of + # 512 bytes: + #self.fp.seek(512 + self.sectorsize * sect) + #[PL]: added safety checks: + #print "getsect(%X)" % sect + try: + self.fp.seek(self.sectorsize * (sect+1)) + except: + raise IOError, 'wrong index for OLE sector' + sector = self.fp.read(self.sectorsize) + if len(sector) != self.sectorsize: + raise IOError, 'incomplete OLE sector' + return sector def _unicode(self, s): # Map unicode string to Latin 1 @@ -332,7 +682,10 @@ class OleFileIO: return filter(ord, s) def loaddirectory(self, sect): - # Load the directory. The directory is stored in a standard + """ + Load the directory. + """ + # The directory is stored in a standard # substream, independent of its size. # read directory stream @@ -356,7 +709,6 @@ class OleFileIO: def dumpdirectory(self): # Dump directory (for debugging only) - self.root.dump() def _clsid(self, clsid): @@ -509,8 +861,15 @@ if __name__ == "__main__": import sys + # [PL] display quick usage info if launched from command-line + if len(sys.argv) <= 1: + print __doc__ + print "Launched from command line, this script parses OLE files and prints info." 
+ print "" + sys.exit("usage: OleFileIO_PL.py [file2 ...]") + for file in sys.argv[1:]: - try: +## try: ole = OleFileIO(file) print "-" * 68 print file @@ -524,5 +883,5 @@ if __name__ == "__main__": props.sort() for k, v in props: print " ", k, v - except IOError, v: - print "***", "cannot read", file, "-", v +## except IOError, v: +## print "***", "cannot read", file, "-", v From d6d3f50205d58488139c7d0fe92dc087b7e3f7e1 Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:25:30 +0200 Subject: [PATCH 005/101] version 0.14 2007-11-19 --- PIL/OleFileIO.py | 284 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 213 insertions(+), 71 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 8e2052a0b..ffa1a12ab 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -version 0.13 2007-09-04 Philippe Lagadec - http://lagasoft.free.fr +version 0.14 2007-11-19 Philippe Lagadec - http://lagasoft.free.fr Improved version of OleFileIO module from PIL library v1.1.6 See: http://www.pythonware.com/products/pil/index.htm @@ -22,8 +22,8 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. 
""" __author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2007-09-04" -__version__ = '0.13' +__date__ = "2007-11-19" +__version__ = '0.14' #----------------------------------------------------------------------------- # CHANGELOG: (OleFileIO_PL changes only) @@ -41,17 +41,22 @@ __version__ = '0.13' # 2007-09-04 v0.13 PL: - improved/translated (lots of) comments # - updated license # - converted tabs to 4 spaces +# 2007-11-19 v0.14 PL: - added OleFileIO.raise_defect() to adapt sensitivity +# - improved _unicode() to use Python 2.x unicode support +# - fixed bug in _OleDirectoryEntry #----------------------------------------------------------------------------- # TODO: -# - fix Unicode names handling +# - replace all raised exceptions with raise_defect (at least in OleFileIO) +# - fix Unicode names handling (find some way to stay compatible with Py1.5.2) +# => if possible avoid converting names to Latin-1 # - fix handling of DIFSECT blocks in FAT (not stop) # - add stricter checks in decoding # - add (optional) checks on FAT block chains integrity to detect crossed # sectors, loops, ... -# - in __main__ display the whole object tree (not only 1st level), and allow -# to extract objects, or provide a sample script to do it. 
+# - improve docstrings to show more sample uses # - see also original notes and FIXME below +# - remove all obsolete FIXMEs #----------------------------------------------------------------------------- # @@ -149,12 +154,21 @@ MAGIC = '\320\317\021\340\241\261\032\341' # [PL]: added constants (from AAF specifications) MAXREGSECT = 0xFFFFFFFAL; # maximum SECT -DIFSECT = 0xFFFFFFFCL; # denotes a DIFAT sector in a FAT -FATSECT = 0xFFFFFFFDL; # denotes a FAT sector in a FAT -ENDOFCHAIN = 0xFFFFFFFEL; # end of a virtual stream chain -FREESECT = 0xFFFFFFFFL; # unallocated sector +DIFSECT = 0xFFFFFFFCL; # (-4) denotes a DIFAT sector in a FAT +FATSECT = 0xFFFFFFFDL; # (-3) denotes a FAT sector in a FAT +ENDOFCHAIN = 0xFFFFFFFEL; # (-2) end of a virtual stream chain +FREESECT = 0xFFFFFFFFL; # (-1) unallocated sector MAXREGSID = 0xFFFFFFFAL; # maximum directory entry ID -NOSTREAM = 0xFFFFFFFFL; # unallocated directory entry +NOSTREAM = 0xFFFFFFFFL; # (-1) unallocated directory entry + +#[PL] object types in storage (from AAF specifications) +STGTY_INVALID = 0 # unknown storage type +STGTY_STORAGE = 1 # element is a storage object +STGTY_STREAM = 2 # element is a stream object +STGTY_LOCKBYTES = 3 # element is an ILockBytes object +STGTY_PROPERTY = 4 # element is an IPropertyStorage object +STGTY_ROOT = 5 # element is a root storage + # # -------------------------------------------------------------------- @@ -183,16 +197,24 @@ for k, v in vars().items(): WORD_CLSID = "00020900-0000-0000-C000-000000000046" +#[PL]: Defect levels to classify parsing errors - see OleFileIO.raise_defect() +DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect +DEFECT_POTENTIAL = 20 # a potential defect +DEFECT_INCORRECT = 30 # an error according to specifications, but parsing + # can go on +DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is + # impossible # # -------------------------------------------------------------------- class 
_OleStream(StringIO.StringIO): - """OLE2 Stream + """ + OLE2 Stream Returns a read-only file object which can be used to read - the contents of a OLE stream. To open a stream, use the - openstream method in the OleFile class. + the contents of a OLE stream (instance of the StringIO class). + To open a stream, use the openstream method in the OleFile class. This function can be used with either ordinary streams, or ministreams, depending on the offset, sectorsize, and @@ -204,8 +226,13 @@ class _OleStream(StringIO.StringIO): # loading it all in one go. def __init__(self, fp, sect, size, offset, sectorsize, fat): + """ + Constructor for _OleStream class + """ + # optimization(?): data is first a list of strings, and join() is called + # at the end to concatenate all in one string. + # (this may not be really useful with recent Python versions) data = [] - # [PL] while sect != -2: # 0xFFFFFFFEL: while sect != ENDOFCHAIN: fp.seek(offset + sectorsize * sect) @@ -213,13 +240,12 @@ class _OleStream(StringIO.StringIO): # [PL] if pointer is out of the FAT an exception is raised if sect >= len(fat) : raise IOError, 'incorrect FAT' + # jump to next sector in the FAT: sect = fat[sect] - data = string.join(data, "") - - # print len(data), size - + # when all data is read in memory, StringIO constructor is called StringIO.StringIO.__init__(self, data[:size]) + # Then the _OleStream object can be used as a read-only file object. # # -------------------------------------------------------------------- @@ -264,14 +290,16 @@ class _OleDirectoryEntry: # [PL]: original code from PIL 1.1.5 #if sid != -1 # [PL]: necessary fix for Python 2.4 - #if sid != -1 and sid != 0xFFFFFFFFL: - # [PL]: new fix 22/02/2006 if sid != NOSTREAM: # the directory entries are organized as a red-black tree. 
# the following piece of code does an ordered traversal of # such a tree (at least that's what I hope ;-) + #[PL] Note from OpenOffice documentation: the safest way is to + # recreate the tree because some implementations may store broken + # red-black trees... + stack = [self.sid] # start at leftmost position @@ -279,7 +307,7 @@ class _OleDirectoryEntry: left, right, child = sidlist[sid][4] #[PL] while left != -1 and left != 0xFFFFFFFFL: - if left != NOSTREAM: + while left != NOSTREAM: stack.append(sid) sid = left left, right, child = sidlist[sid][4] @@ -384,15 +412,42 @@ class OleFileIO: TIFF files). """ - def __init__(self, filename = None): + def __init__(self, filename = None, raise_defects=DEFECT_FATAL): + """ + Constructor for OleFileIO class. + filename: file to open. + raise_defects: minimal level for defects to be raised as exceptions. + (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a + security-oriented application, see source code for details) + """ + self.raise_defects_level = raise_defects if filename: self.open(filename) - ## - # Open an OLE2 file. + + def raise_defect(self, defect_level, message): + """ + This method should be called for any defect found during file parsing. + It may raise an IOError exception according to the minimal level chosen + for the OleFileIO object. + + defect_level: defect level, possible values are: + DEFECT_UNSURE : a case which looks weird, but not sure it's a defect + DEFECT_POTENTIAL : a potential defect + DEFECT_INCORRECT : an error according to specifications, but parsing can go on + DEFECT_FATAL : an error which cannot be ignored, parsing is impossible + message: string describing the defect, used with raised exception. + """ + # added by [PL] + if defect_level >= self.raise_defects_level: + raise IOError, message + def open(self, filename): - """Open an OLE2 file""" + """ + Open an OLE2 file. + Reads the header, FAT and directory. 
+ """ if type(filename) == type(""): self.fp = open(filename, "rb") else: @@ -401,7 +456,7 @@ class OleFileIO: header = self.fp.read(512) if len(header) != 512 or header[:8] != MAGIC: - raise IOError, "not an OLE2 structured storage file" + self.raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file") # [PL] header structure according to AAF specifications: ##Header @@ -451,34 +506,42 @@ class OleFileIO: debug( struct.unpack(fmt_header, header1)) if Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': - raise IOError, "incorrect OLE signature" + # OLE signature should always be present + self.raise_defect(DEFECT_FATAL, "incorrect OLE signature") if clsid != '\x00'*16: - raise IOError, "incorrect CLSID in OLE header" + # according to AAF specs, CLSID should always be zero + self.raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") debug( "MinorVersion = %d" % MinorVersion ) debug( "DllVersion = %d" % DllVersion ) if DllVersion not in [3, 4]: - raise IOError, "incorrect DllVersion in OLE header" + # version 3: usual format, 512 bytes per sector + # version 4: large format, 4K per sector + self.raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") debug( "ByteOrder = %X" % ByteOrder ) if ByteOrder != 0xFFFE: - raise IOError, "incorrect ByteOrder in OLE header" + # For now only common little-endian documents are handled correctly + self.raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") + # TODO: add big-endian support for documents created on Mac ? 
SectorSize = 2**SectorShift debug( "SectorSize = %d" % SectorSize ) if SectorSize not in [512, 4096]: - raise IOError, "incorrect SectorSize in OLE header" + self.raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header") + if (DllVersion==3 and SectorSize!=512) or (DllVersion==4 and SectorSize!=4096): + self.raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header") MiniSectorSize = 2**MiniSectorShift debug( "MiniSectorSize = %d" % MiniSectorSize ) if MiniSectorSize not in [64]: - raise IOError, "incorrect MiniSectorSize in OLE header" + self.raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header") if Reserved != 0 or Reserved1 != 0: - raise IOError, "incorrect OLE header" + self.raise_defect(DEFECT_INCORRECT, "incorrect OLE header") debug( "csectDir = %d" % csectDir ) if SectorSize==512 and csectDir!=0: - raise IOError, "incorrect csectDir in OLE header" + self.raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header") debug( "csectFat = %d" % self.csectFat ) debug( "sectDirStart = %X" % sectDirStart ) debug( "signature = %d" % signature ) if signature != 0: - raise IOError, "incorrect OLE header" + self.raise_defect(DEFECT_INCORRECT, "incorrect OLE header") debug( "MiniSectorCutoff = %d" % MiniSectorCutoff ) debug( "MiniFatStart = %X" % MiniFatStart ) debug( "csectMiniFat = %d" % csectMiniFat ) @@ -573,13 +636,13 @@ class OleFileIO: def loadfat_sect(self, sect): "Adds the indexes of the given sector to the FAT" - # un secteur de FAT est un tableau d'ulong + # a FAT sector is an array of ulong integers. if isinstance(sect, array.array): fat1 = sect else: fat1 = array.array('L', sect) self.dumpsect(sect) - # la FAT est une chaîne de secteurs débutant au 1er index d'elle-même + # The FAT is a sector chain starting a the first index of itself. 
for isect in fat1: #print "isect = %X" % isect if isect == ENDOFCHAIN or isect == FREESECT: @@ -620,10 +683,10 @@ class OleFileIO: if self.csectFat <= 109: # there must be at least 109 blocks in header and the rest in # DIFAT, so number of sectors must be >109. - raise IOError, 'incorrect DIFAT, not enough sectors' + self.raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors') if self.sectDifStart >= self.nb_sect: # initial DIFAT block index must be valid - raise IOError, 'incorrect DIFAT, first index out of range' + self.raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') debug( "DIFAT analysis..." ) # We compute the necessary number of DIFAT sectors : # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) @@ -634,6 +697,7 @@ class OleFileIO: isect_difat = self.sectDifStart for i in xrange(nb_difat): debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) + #TODO: check if corresponding FAT SID = DIFSECT sector_difat = self.getsect(isect_difat) difat = array.array('L', sector_difat) self.dumpsect(sector_difat) @@ -652,14 +716,23 @@ class OleFileIO: self.dumpfat(self.fat) def loadminifat(self): - "Load the MINIFAT table." - # This is stored in a standard sub- - # stream, pointed to by a header field. + """ + Load the MiniFAT table. + """ + # This is stored in a standard sub-stream, pointed to by a header + # field. s = self._open(self.minifatsect).read() - self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) + #[PL] Old code replaced by an array: + #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) + self.minifat = array.array('L', s) + def getsect(self, sect): - "Read given sector" + """ + Read given sector from file on disk. + sect: sector index + returns a string containing the sector data. 
+ """ # [PL] this original code was wrong when sectors are 4KB instead of # 512 bytes: #self.fp.seek(512 + self.sectorsize * sect) @@ -668,27 +741,51 @@ class OleFileIO: try: self.fp.seek(self.sectorsize * (sect+1)) except: - raise IOError, 'wrong index for OLE sector' + self.raise_defect(DEFECT_FATAL, 'wrong index for OLE sector') sector = self.fp.read(self.sectorsize) if len(sector) != self.sectorsize: - raise IOError, 'incomplete OLE sector' + self.raise_defect(DEFECT_FATAL, 'incomplete OLE sector') return sector def _unicode(self, s): - # Map unicode string to Latin 1 - + """ + Map unicode string to Latin 1. + """ # FIXME: some day, Python will provide an official way to handle # Unicode strings, but until then, this will have to do... - return filter(ord, s) + + #[PL]: use Python Unicode when available (Python>=2.0): + try: + # First the string is converted to plain Unicode: + # (assuming it is encoded as UTF-16 little-endian) + u = unicode(s, 'UTF-16LE') + # Second the string is converted to Latin-1 + return u.encode('latin_1') + except ValueError: + # there was an error during UTF-16 to Unicode decoding: + self.raise_defect(DEFECT_INCORRECT, 'incorrect Unicode name') + # if no exception raised, fallback to foolproof version: + return filter(ord, s) + except UnicodeError: + # there was an error during Unicode to Latin-1 encoding: + self.raise_defect(DEFECT_INCORRECT, 'incorrect Unicode name') + # if no exception raised, fallback to foolproof version: + return filter(ord, s) + except: + # we assume this is an old Python version without Unicode support. + # Null bytes are simply removed: + return filter(ord, s) + def loaddirectory(self, sect): """ Load the directory. + sect: sector index of directory stream. """ # The directory is stored in a standard # substream, independent of its size. 
- # read directory stream + # open directory stream as a read-only file: fp = self._open(sect) # create list of sid entries @@ -697,6 +794,36 @@ class OleFileIO: entry = fp.read(128) if not entry: break + #[PL] décodage DirEntry + fmt_entry = "<64sHBBLLL16sLQQLLH" + len_entry = struct.calcsize(fmt_entry) + #debug("taille DirEntry: %d" % len_entry) + (name, namelength, type, color, sid_left, sid_right, sid_child, + clsid, dwUserFlags, createTime, modifyTime, isectStart, sizeLow, + sizeHigh) = struct.unpack(fmt_entry, entry[:len_entry]) + #debug("namelength = %d" % namelength) + if type == STGTY_INVALID: + break + if type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM]: + raise IOError, 'unhandled storage type' + #debug (struct.unpack(fmt_entry, entry[:len_entry])) + # vérification et conversion du nom Unicode + # on a au maximum 31 caractères + le zéro terminal + if namelength>64: + raise IOError, 'incorrect DirEntry name length' + # on ne garde que les caractères sans le zéro terminal + name = name[:(namelength-2)] + # on convertit la chaîne d'octets en véritable chaîne Unicode + name = unicode(name, 'utf_16_le') + debug("DirEntry: '%s'" % name) + # Si cette chaîne contient un caractère nul c'est que le champ + # namelength est incorrect: + if unichr(0) in name: + debug(len(name)) + debug(binascii.hexlify(name)) + raise IOError, 'incorrect DirEntry name length' + debug("type:%d" % type) + type = ord(entry[66]) name = self._unicode(entry[0:0+i16(entry, 64)]) ptrs = i32(entry, 68), i32(entry, 72), i32(entry, 76) @@ -712,6 +839,7 @@ class OleFileIO: self.root.dump() def _clsid(self, clsid): + "Converts a CLSID to a human-readable string" if clsid == "\0" * len(clsid): return "" return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % @@ -719,7 +847,7 @@ class OleFileIO: tuple(map(ord, clsid[8:16])))) def _list(self, files, prefix, node): - # listdir helper + "listdir helper" prefix = prefix + [node.name] for entry in node.kids: @@ -729,8 +857,14 @@ class OleFileIO: 
files.append(prefix[1:] + [entry.name]) def _find(self, filename): - # openstream helper - + """ + Returns directory entry of given filename. + filename: list of storage filenames, path to the desired stream/storage. + Example: ['Root Entry', 'storage_1', 'storage_1.2', 'stream'] + (openstream helper) + """ + #TODO: if filename is a string instead of a list, split it on slashes + # to allow a more common way of expressing paths ? node = self.root for name in filename: for kid in node.kids: @@ -742,26 +876,33 @@ class OleFileIO: return node.sid def _open(self, start, size = 0x7FFFFFFF): - # openstream helper. - + """ + Opens a stream, either in FAT or MiniFAT according to its size. + (openstream helper) + start: index of first sector + size: size of stream + """ + # stream size is compared to the MiniSectorCutoff threshold: if size < self.minisectorcutoff: # ministream object if not self.ministream: + # load MiniFAT if it wasn't already done: self.loadminifat() self.ministream = self._open(self.sidlist[0][2]) return _OleStream(self.ministream, start, size, 0, self.minisectorsize, self.minifat) - - # standard stream - return _OleStream(self.fp, start, size, 512, - self.sectorsize, self.fat) + else: + # standard stream + return _OleStream(self.fp, start, size, 512, + self.sectorsize, self.fat) ## # Returns a list of streams stored in this file. def listdir(self): - """Return a list of streams stored in this file""" - + """ + Return a list of streams stored in this file + """ files = [] self._list(files, [], self.root) return files @@ -770,8 +911,9 @@ class OleFileIO: # Opens a stream as a read-only file object. 
def openstream(self, filename): - """Open a stream as a read-only file object""" - + """ + Open a stream as a read-only file object + """ slot = self._find(filename) name, type, sect, size, sids, clsid = self.sidlist[slot] if type != 2: @@ -868,17 +1010,17 @@ if __name__ == "__main__": print "" sys.exit("usage: OleFileIO_PL.py [file2 ...]") - for file in sys.argv[1:]: + for filename in sys.argv[1:]: ## try: - ole = OleFileIO(file) + ole = OleFileIO(filename) print "-" * 68 - print file + print filename print "-" * 68 ole.dumpdirectory() - for file in ole.listdir(): - if file[-1][0] == "\005": - print file - props = ole.getproperties(file) + for streamname in ole.listdir(): + if streamname[-1][0] == "\005": + print streamname, ": properties" + props = ole.getproperties(streamname) props = props.items() props.sort() for k, v in props: From e614631025130c7c9d5d44e5718ca4865c16dd05 Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:26:24 +0200 Subject: [PATCH 006/101] version 0.15 2007-11-25 --- PIL/OleFileIO.py | 708 ++++++++++++++++++++++++++++++----------------- 1 file changed, 452 insertions(+), 256 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index ffa1a12ab..1329cbbf9 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,9 @@ OleFileIO_PL: Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -version 0.14 2007-11-19 Philippe Lagadec - http://lagasoft.free.fr +version 0.15 2007-11-25 Philippe Lagadec - http://lagasoft.free.fr + +Project website: http://lagasoft.free.fr/python/olefileio Improved version of OleFileIO module from PIL library v1.1.6 See: http://www.pythonware.com/products/pil/index.htm @@ -22,8 +24,8 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. 
""" __author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2007-11-19" -__version__ = '0.14' +__date__ = "2007-11-25" +__version__ = '0.15' #----------------------------------------------------------------------------- # CHANGELOG: (OleFileIO_PL changes only) @@ -44,10 +46,23 @@ __version__ = '0.14' # 2007-11-19 v0.14 PL: - added OleFileIO.raise_defect() to adapt sensitivity # - improved _unicode() to use Python 2.x unicode support # - fixed bug in _OleDirectoryEntry +# 2007-11-25 v0.15 PL: - added safety checks to detect malformed documents +# - fixed _OleStream which didn't check stream size +# - added/improved many docstrings and comments +# - moved helper functions _unicode and _clsid out of +# OleFileIO class +# - improved OleFileIO._find() to add Unix path syntax +# - OleFileIO._find() is now case-insensitive +# - added get_type() and get_rootentry_name() +# - rewritten loaddirectory and _OleDirectoryEntry #----------------------------------------------------------------------------- # TODO: +# - add underscore to each private method/constant, to avoid their display in +# pydoc/epydoc documentation # - replace all raised exceptions with raise_defect (at least in OleFileIO) +# - add dictionary of directory entries indexed on filenames to avoid using +# _find() each time ? # - fix Unicode names handling (find some way to stay compatible with Py1.5.2) # => if possible avoid converting names to Latin-1 # - fix handling of DIFSECT blocks in FAT (not stop) @@ -55,6 +70,7 @@ __version__ = '0.14' # - add (optional) checks on FAT block chains integrity to detect crossed # sectors, loops, ... 
# - improve docstrings to show more sample uses +# - fix docstrings to follow epydoc format # - see also original notes and FIXME below # - remove all obsolete FIXMEs #----------------------------------------------------------------------------- @@ -142,27 +158,21 @@ else: def debug(msg): pass -def i16(c, o = 0): - return ord(c[o])+(ord(c[o+1])<<8) - -def i32(c, o = 0): - return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24)) - # [PL]: added int() because "<<" gives long int since Python 2.4 - - MAGIC = '\320\317\021\340\241\261\032\341' -# [PL]: added constants (from AAF specifications) +#[PL]: added constants for Sector IDs (from AAF specifications) MAXREGSECT = 0xFFFFFFFAL; # maximum SECT DIFSECT = 0xFFFFFFFCL; # (-4) denotes a DIFAT sector in a FAT FATSECT = 0xFFFFFFFDL; # (-3) denotes a FAT sector in a FAT ENDOFCHAIN = 0xFFFFFFFEL; # (-2) end of a virtual stream chain FREESECT = 0xFFFFFFFFL; # (-1) unallocated sector + +#[PL]: added constants for Directory Entry IDs (from AAF specifications) MAXREGSID = 0xFFFFFFFAL; # maximum directory entry ID NOSTREAM = 0xFFFFFFFFL; # (-1) unallocated directory entry #[PL] object types in storage (from AAF specifications) -STGTY_INVALID = 0 # unknown storage type +STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) STGTY_STORAGE = 1 # element is a storage object STGTY_STREAM = 2 # element is a stream object STGTY_LOCKBYTES = 3 # element is an ILockBytes object @@ -205,8 +215,79 @@ DEFECT_INCORRECT = 30 # an error according to specifications, but parsing DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is # impossible -# -# -------------------------------------------------------------------- +#--- FUNCTIONS ---------------------------------------------------------------- + +#TODO: replace i16 and i32 with more readable struct.unpack equivalent +def i16(c, o = 0): + """ + Converts a 2-bytes (16 bits) string to an integer. 
+ + c: string containing bytes to convert + o: offset of bytes to convert in string + """ + return ord(c[o])+(ord(c[o+1])<<8) + + +def i32(c, o = 0): + """ + Converts a 4-bytes (32 bits) string to an integer. + + c: string containing bytes to convert + o: offset of bytes to convert in string + """ + return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24)) + # [PL]: added int() because "<<" gives long int since Python 2.4 + + +def _clsid(clsid): + """ + Converts a CLSID to a human-readable string. + clsid: string of length 16. + """ + assert len(clsid) == 16 + if clsid == "\0" * len(clsid): + return "" + return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % + ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + + tuple(map(ord, clsid[8:16])))) + + +def _unicode(s): + """ + Map unicode string to Latin 1. + """ + #[PL]: use Python Unicode features when available (Python>=2.0): + #TODO: test this with old Python versions <2.0 + #TODO: test if it OleFileIO works with Unicode strings, instead of + # converting to Latin-1. + try: + # First the string is converted to plain Unicode: + # (assuming it is encoded as UTF-16 little-endian) + u = unicode(s, 'UTF-16LE') + except NameError: + # If the unicode function does not exist, we assume this is an old + # Python version without Unicode support. 
+ # Null bytes are simply removed (this only works with usual Latin-1 + # strings which do not contain unicode characters>256): + return filter(ord, s) + except ValueError: + # there was an error during UTF-16 to Unicode decoding: + self.raise_defect(DEFECT_INCORRECT, 'incorrect Unicode name') + # if no exception raised, fallback to foolproof version: + return filter(ord, s) + try: + # Second the unicode string is converted to Latin-1 + return u.encode('latin_1') + except UnicodeError: # possible issue: this exception didn't exist before + # there was an error during Unicode to Latin-1 encoding: + self.raise_defect(DEFECT_INCORRECT, 'incorrect Unicode name') + # if no exception raised, fallback to foolproof version: + return filter(ord, s) + + +#=== CLASSES ================================================================== + +#--- _OleStream --------------------------------------------------------------- class _OleStream(StringIO.StringIO): """ @@ -219,6 +300,9 @@ class _OleStream(StringIO.StringIO): This function can be used with either ordinary streams, or ministreams, depending on the offset, sectorsize, and fat table arguments. + + Attributes: + - size: actual size of data stream, after it was opened. """ # FIXME: should store the list of sects obtained by following @@ -227,169 +311,267 @@ class _OleStream(StringIO.StringIO): def __init__(self, fp, sect, size, offset, sectorsize, fat): """ - Constructor for _OleStream class + Constructor for _OleStream class. 
+ + fp : file object, the OLE container + sect : sector index of first sector in the stream + size : total size of the stream + offset : offset in bytes for the first FAT or MiniFAT sector + sectorsize: size of one sector + fat : array/list of sector indexes (FAT or MiniFAT) + return : a StringIO instance containing the OLE stream """ + debug('_OleStream.__init__:') + debug(' size=%d, offset=%d, sectorsize=%d, len(fat)=%d' + %(size,offset,sectorsize,len(fat))) + #[PL] To detect malformed documents with FAT loops, we compute the + # expected number of sectors in the stream: + if size==0x7FFFFFFF: + # this is the case when called from OleFileIO._open(), and stream + # size is not known in advance (for example when reading the + # Directory stream). Then we can only guess maximum size: + size = len(fat)*sectorsize + nb_sectors = (size + (sectorsize-1)) / sectorsize + # This number should (at least) be less than the total number of + # sectors in the given FAT: + if nb_sectors > len(fat): + raise IOError, 'malformed OLE document, stream too large' # optimization(?): data is first a list of strings, and join() is called # at the end to concatenate all in one string. 
# (this may not be really useful with recent Python versions) data = [] - # [PL] while sect != -2: # 0xFFFFFFFEL: - while sect != ENDOFCHAIN: + #[PL] first sector index should be within FAT or ENDOFCHAIN: + if sect != ENDOFCHAIN and (sect<0 or sect>=len(fat)): + raise IOError, 'incorrect OLE FAT, sector index out of range' + #[PL] A fixed-length for loop is used instead of an undefined while + # loop to avoid DoS attacks: + for i in xrange(nb_sectors): + #TODO: check if this works with 4K sectors: fp.seek(offset + sectorsize * sect) - data.append(fp.read(sectorsize)) - # [PL] if pointer is out of the FAT an exception is raised - if sect >= len(fat) : - raise IOError, 'incorrect FAT' + sector_data = fp.read(sectorsize) + # [PL] check if there was enough data: + if len(sector_data) != sectorsize: + raise IOError, 'incomplete OLE sector' + data.append(sector_data) # jump to next sector in the FAT: - sect = fat[sect] + try: + #[PL] sector index should not be negative, but Python allows it + if sect<0: raise IndexError + sect = fat[sect] + if sect == ENDOFCHAIN: + # this may happen when size was not known: + break + except IndexError: + # [PL] if pointer is out of the FAT an exception is raised + raise IOError, 'incorrect OLE FAT, sector index out of range' + #[PL] Last sector should be a "end of chain" marker: + if sect != ENDOFCHAIN: + raise IOError, 'incorrect last sector index in OLE stream' data = string.join(data, "") + # Data is truncated to the actual stream size: + if len(data) > size: + data = data[:size] + # actual stream size is stored for future use: + self.size = size + else: + # actual stream size was not known, now we know the size of read + # data: + self.size = len(data) # when all data is read in memory, StringIO constructor is called - StringIO.StringIO.__init__(self, data[:size]) + StringIO.StringIO.__init__(self, data) # Then the _OleStream object can be used as a read-only file object. 
-# -# -------------------------------------------------------------------- + +#--- _OleDirectoryEntry ------------------------------------------------------- # FIXME: should add a counter in here to avoid looping forever # if the tree is broken. class _OleDirectoryEntry: - """OLE2 Directory Entry - - Encapsulates a stream directory entry. Note that the - constructor builds a tree of all subentries, so we only - have to call it with the root object. + """ + OLE2 Directory Entry """ - def __init__(self, sidlist, sid): + #[PL] parsing code moved from OleFileIO.loaddirectory - # store directory parameters. the caller provides - # a complete list of directory entries, as read from - # the directory stream. + # struct to parse directory entries: + # <: little-endian byte order + # 64s: string containing entry name in unicode (max 31 chars) + null char + # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 + # B: uint8, dir entry type (between 0 and 5) + # B: uint8, color: 0=black, 1=red + # I: uint32, index of left child node in the red-black tree, NOSTREAM if none + # I: uint32, index of right child node in the red-black tree, NOSTREAM if none + # I: uint32, index of child root node if it is a storage, else NOSTREAM + # 16s: CLSID, unique identifier (only used if it is a storage) + # I: uint32, user flags + # 8s: uint64, creation timestamp or zero + # 8s: uint64, modification timestamp or zero + # I: uint32, SID of first sector if stream or ministream, SID of 1st sector + # of stream containing ministreams if root entry, 0 otherwise + # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise + # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise + STRUCT_DIRENTRY = '<64sHBBIII16sI8s8sIII' + # size of a directory entry: 128 bytes + DIRENTRY_SIZE = 128 + assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE - # [PL] conformity check - if sid >= len(sidlist) : - raise IOError, 'incorrect SID' - - name, 
type, sect, size, sids, clsid = sidlist[sid] - - self.sid = sid - self.name = name - self.type = type # 1=storage 2=stream - self.sect = sect - self.size = size - self.clsid = clsid - - # process child nodes, if any + def __init__(self, entry, sid, olefile): + """ + Constructor for an _OleDirectoryEntry object. + Parses a 128-bytes entry from the OLE Directory stream. + + entry: string (must be 128 bytes long) + olefile: OleFileIO containing this directory entry + """ + self.sid = sid + # ref to olefile is stored for future use + self.olefile = olefile + # kids is the list of children entries, if this entry is a storage: + # (list of _OleDirectoryEntry objects) self.kids = [] + # flag used to detect if the entry is referenced more than once in + # directory: + self.used = False + # decode DirEntry + ( + name, + namelength, + self.entry_type, + self.color, + self.sid_left, + self.sid_right, + self.sid_child, + clsid, + self.dwUserFlags, + self.createTime, + self.modifyTime, + self.isectStart, + sizeLow, + sizeHigh + ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry) + if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]: + olefile.raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type') + #debug (struct.unpack(fmt_entry, entry[:len_entry])) + # name should be at most 31 unicode characters + null character, + # so 64 bytes in total (31*2 + 2): + if namelength>64: + olefile.raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length') + # if exception not raised, namelength is set to the maximum value: + namelength = 64 + # only characters without ending null char are kept: + name = name[:(namelength-2)] + # name is converted from unicode to Latin-1: + self.name = _unicode(name) + # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes + # sectors: + if olefile.sectorsize == 512 and sizeHigh != 0: + olefile.raise_defect(DEFECT_INCORRECT, 'incorrect OLE stream size') + self.size = sizeLow + 
(long(sizeHigh)<<32) + self.clsid = _clsid(clsid) - sid = sidlist[sid][4][2] + debug('DirEntry SID=%d: %s' % (self.sid, self.name)) + debug(' - type: %d' % self.entry_type) + debug(' - sect: %d' % self.isectStart) + debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh)) + debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left, + self.sid_right, self.sid_child)) - # [PL]: original code from PIL 1.1.5 - #if sid != -1 - # [PL]: necessary fix for Python 2.4 - if sid != NOSTREAM: - # the directory entries are organized as a red-black tree. - # the following piece of code does an ordered traversal of - # such a tree (at least that's what I hope ;-) + def build_storage_tree(self): + """ + Read and build the red-black tree attached to this _OleDirectoryEntry + object, if it is a storage. + Note that this method builds a tree of all subentries, so it should + only be called for the root object once. + """ + debug('build_storage_tree: SID=%d - %s - sid_child=%d' + % (self.sid, self.name, self.sid_child)) + if self.sid_child != NOSTREAM: + # if child SID is not NOSTREAM, then this entry is a storage. + # Let's walk through the tree of children to fill the kids list: + self.append_kids(self.sid_child) - #[PL] Note from OpenOffice documentation: the safest way is to + # Note from OpenOffice documentation: the safest way is to # recreate the tree because some implementations may store broken # red-black trees... 
- stack = [self.sid] - - # start at leftmost position - - left, right, child = sidlist[sid][4] - - #[PL] while left != -1 and left != 0xFFFFFFFFL: - while left != NOSTREAM: - stack.append(sid) - sid = left - left, right, child = sidlist[sid][4] - - while sid != self.sid: - - self.kids.append(_OleDirectoryEntry(sidlist, sid)) - - # try to move right - - # [PL] conformity check - if sid >= len(sidlist) : - raise IOError, 'incorrect SID' - - left, right, child = sidlist[sid][4] - #[PL] if right != -1 and right != 0xFFFFFFFFL: - if right != NOSTREAM: - # and then back to the left - sid = right - while 1: - - # [PL] conformity check - if sid >= len(sidlist) : - raise IOError, 'incorrect SID' - - left, right, child = sidlist[sid][4] - #[PL] if left == -1 or left == 0xFFFFFFFFL: - if left == NOSTREAM: - break - stack.append(sid) - sid = left - else: - # couldn't move right; move up instead - while 1: - ptr = stack[-1] - del stack[-1] - left, right, child = sidlist[ptr][4] - if right != sid: - break - sid = right - left, right, child = sidlist[sid][4] - if right != ptr: - sid = ptr - # in the OLE file, entries are sorted on (length, name). - # for convenience, we sort them on name instead. - + # for convenience, we sort them on name instead: + # (see __cmp__ method in this class) self.kids.sort() + + def append_kids(self, child_sid): + """ + Walk through red-black tree of children of this directory entry to add + all of them to the kids list. (recursive method) + + child_sid : index of child directory entry to use, or None when called + first time for the root. (only used during recursion) + """ + #[PL] this method was added to use simple recursion instead of a complex + # algorithm. 
+ # if this is not a storage or a leaf of the tree, nothing to do: + if child_sid == NOSTREAM: + return + # check if child SID is in the proper range: + if child_sid<0 or child_sid>=len(self.olefile.direntries): + self.olefile.raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range') + # get child direntry: + child = self.olefile.direntries[child_sid] + debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d' + % (child.sid, child.name, child.sid_left, child.sid_right, child.sid_child)) + # the directory entries are organized as a red-black tree. + # (cf. Wikipedia for details) + # First walk through left side of the tree: + self.append_kids(child.sid_left) + # Then the child_sid _OleDirectoryEntry object is appended to the + # kids list: + self.kids.append(child) + # Check if kid was not already referenced in a storage: + if child.used: + self.olefile.raise_defect(DEFECT_INCORRECT, + 'OLE Entry referenced more than once') + child.used = True + # Finally walk through right side of the tree: + self.append_kids(child.sid_right) + # Afterwards build kid's own tree if it's also a storage: + child.build_storage_tree() + + def __cmp__(self, other): "Compare entries by name" return cmp(self.name, other.name) + def dump(self, tab = 0): "Dump this entry, and all its subentries (for debug purposes only)" TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", "(property)", "(root)"] - print " "*tab + repr(self.name), TYPES[self.type], - if self.type in (2, 5): + print " "*tab + repr(self.name), TYPES[self.entry_type], + if self.entry_type in (STGTY_STREAM, STGTY_ROOT): print self.size, "bytes", print - if self.type in (1, 5) and self.clsid: + if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid: print " "*tab + "{%s}" % self.clsid for kid in self.kids: kid.dump(tab + 2) -# -# -------------------------------------------------------------------- -## -# This class encapsulates the interface to an OLE 2 structured -# storage file. 
Use the {@link listdir} and {@link openstream} -# methods to access the contents of this file. +#--- OleFileIO ---------------------------------------------------------------- class OleFileIO: - """OLE container object + """ + OLE container object This class encapsulates the interface to an OLE 2 structured - storage file. Use the listdir and openstream methods to access - the contents of this file. + storage file. Use the {@link listdir} and {@link openstream} methods to + access the contents of this file. Object names are given as a list of strings, one for each subentry level. The root entry should be omitted. For example, the following @@ -415,6 +597,7 @@ class OleFileIO: def __init__(self, filename = None, raise_defects=DEFECT_FATAL): """ Constructor for OleFileIO class. + filename: file to open. raise_defects: minimal level for defects to be raised as exceptions. (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a @@ -554,9 +737,7 @@ class OleFileIO: debug( "Number of sectors in the file: %d" % self.nb_sect ) # file clsid (probably never used, so we don't store it) - clsid = self._clsid(header[8:24]) - - # FIXME: could check version and byte order fields + clsid = _clsid(header[8:24]) self.sectorsize = 1 << i16(header, 30) self.minisectorsize = 1 << i16(header, 32) @@ -566,13 +747,14 @@ class OleFileIO: # Load file allocation tables self.loadfat(header) - # Load direcory. This sets both the sidlist (ordered by id) + # Load direcory. This sets both the direntries list (ordered by sid) # and the root (ordered by hierarchy) members. self.loaddirectory(i32(header, 48)) self.ministream = None self.minifatsect = i32(header, 60) + def dumpfat(self, fat, firstindex=0): "Displays a part of FAT in human-readable form for debugging purpose" # [PL] added only for debug @@ -609,6 +791,7 @@ class OleFileIO: print nom, print "" + def dumpsect(self, sector, firstindex=0): "Displays a sector in a human-readable form, for debugging purpose." 
if not DEBUG_MODE: @@ -635,19 +818,29 @@ class OleFileIO: def loadfat_sect(self, sect): - "Adds the indexes of the given sector to the FAT" + """ + Adds the indexes of the given sector to the FAT + sect: string containing the first FAT sector, or array of long integers + return: index of last FAT sector. + """ # a FAT sector is an array of ulong integers. if isinstance(sect, array.array): + # if sect is already an array it is directly used fat1 = sect else: + # if it's a raw sector, it is parsed in an array fat1 = array.array('L', sect) self.dumpsect(sect) - # The FAT is a sector chain starting a the first index of itself. + # The FAT is a sector chain starting at the first index of itself. for isect in fat1: #print "isect = %X" % isect if isect == ENDOFCHAIN or isect == FREESECT: + # the end of the sector chain has been reached break + # read the FAT sector s = self.getsect(isect) + # parse it as an array of 32 bits integers, and add it to the + # global FAT array self.fat = self.fat + array.array('L', s) return isect @@ -715,6 +908,7 @@ class OleFileIO: ## raise IOError, 'incorrect DIFAT' self.dumpfat(self.fat) + def loadminifat(self): """ Load the MiniFAT table. @@ -747,35 +941,6 @@ class OleFileIO: self.raise_defect(DEFECT_FATAL, 'incomplete OLE sector') return sector - def _unicode(self, s): - """ - Map unicode string to Latin 1. - """ - # FIXME: some day, Python will provide an official way to handle - # Unicode strings, but until then, this will have to do... 
- - #[PL]: use Python Unicode when available (Python>=2.0): - try: - # First the string is converted to plain Unicode: - # (assuming it is encoded as UTF-16 little-endian) - u = unicode(s, 'UTF-16LE') - # Second the string is converted to Latin-1 - return u.encode('latin_1') - except ValueError: - # there was an error during UTF-16 to Unicode decoding: - self.raise_defect(DEFECT_INCORRECT, 'incorrect Unicode name') - # if no exception raised, fallback to foolproof version: - return filter(ord, s) - except UnicodeError: - # there was an error during Unicode to Latin-1 encoding: - self.raise_defect(DEFECT_INCORRECT, 'incorrect Unicode name') - # if no exception raised, fallback to foolproof version: - return filter(ord, s) - except: - # we assume this is an old Python version without Unicode support. - # Null bytes are simply removed: - return filter(ord, s) - def loaddirectory(self, sect): """ @@ -786,101 +951,41 @@ class OleFileIO: # substream, independent of its size. # open directory stream as a read-only file: + # (stream size is not known in advance) fp = self._open(sect) - # create list of sid entries - self.sidlist = [] - while 1: + #[PL] to detect malformed documents and avoid DoS attacks, the maximum + # number of directory entries can be calculated: + max_entries = fp.size / 128 + debug('loaddirectory: size=%d, max_entries=%d' % (fp.size, max_entries)) + + # Create list of directory entries + self.direntries = [] + for sid in xrange(max_entries): entry = fp.read(128) if not entry: break - #[PL] décodage DirEntry - fmt_entry = "<64sHBBLLL16sLQQLLH" - len_entry = struct.calcsize(fmt_entry) - #debug("taille DirEntry: %d" % len_entry) - (name, namelength, type, color, sid_left, sid_right, sid_child, - clsid, dwUserFlags, createTime, modifyTime, isectStart, sizeLow, - sizeHigh) = struct.unpack(fmt_entry, entry[:len_entry]) - #debug("namelength = %d" % namelength) - if type == STGTY_INVALID: - break - if type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM]: - 
raise IOError, 'unhandled storage type' - #debug (struct.unpack(fmt_entry, entry[:len_entry])) - # vérification et conversion du nom Unicode - # on a au maximum 31 caractères + le zéro terminal - if namelength>64: - raise IOError, 'incorrect DirEntry name length' - # on ne garde que les caractères sans le zéro terminal - name = name[:(namelength-2)] - # on convertit la chaîne d'octets en véritable chaîne Unicode - name = unicode(name, 'utf_16_le') - debug("DirEntry: '%s'" % name) - # Si cette chaîne contient un caractère nul c'est que le champ - # namelength est incorrect: - if unichr(0) in name: - debug(len(name)) - debug(binascii.hexlify(name)) - raise IOError, 'incorrect DirEntry name length' - debug("type:%d" % type) + self.direntries.append(_OleDirectoryEntry(entry, sid, self)) + # Root entry is the first entry: + self.root = self.direntries[0] + # read and build all storage trees, starting from the root: + self.root.build_storage_tree() - type = ord(entry[66]) - name = self._unicode(entry[0:0+i16(entry, 64)]) - ptrs = i32(entry, 68), i32(entry, 72), i32(entry, 76) - sect, size = i32(entry, 116), i32(entry, 120) - clsid = self._clsid(entry[80:96]) - self.sidlist.append((name, type, sect, size, ptrs, clsid)) - - # create hierarchical list of directory entries - self.root = _OleDirectoryEntry(self.sidlist, 0) def dumpdirectory(self): - # Dump directory (for debugging only) + """ + Dump directory (for debugging only) + """ self.root.dump() - def _clsid(self, clsid): - "Converts a CLSID to a human-readable string" - if clsid == "\0" * len(clsid): - return "" - return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % - ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + - tuple(map(ord, clsid[8:16])))) - - def _list(self, files, prefix, node): - "listdir helper" - - prefix = prefix + [node.name] - for entry in node.kids: - if entry.kids: - self._list(files, prefix, entry) - else: - files.append(prefix[1:] + [entry.name]) - - def _find(self, filename): - """ - Returns 
directory entry of given filename. - filename: list of storage filenames, path to the desired stream/storage. - Example: ['Root Entry', 'storage_1', 'storage_1.2', 'stream'] - (openstream helper) - """ - #TODO: if filename is a string instead of a list, split it on slashes - # to allow a more common way of expressing paths ? - node = self.root - for name in filename: - for kid in node.kids: - if kid.name == name: - break - else: - raise IOError, "file not found" - node = kid - return node.sid def _open(self, start, size = 0x7FFFFFFF): """ - Opens a stream, either in FAT or MiniFAT according to its size. + Open a stream, either in FAT or MiniFAT according to its size. (openstream helper) + start: index of first sector - size: size of stream + size: size of stream (or nothing if size is unknown) """ # stream size is compared to the MiniSectorCutoff threshold: if size < self.minisectorcutoff: @@ -888,7 +993,9 @@ class OleFileIO: if not self.ministream: # load MiniFAT if it wasn't already done: self.loadminifat() - self.ministream = self._open(self.sidlist[0][2]) + # The first sector index of the miniFAT stream is stored in the + # root directory entry: + self.ministream = self._open(self.root.isectStart) return _OleStream(self.ministream, start, size, 0, self.minisectorsize, self.minifat) else: @@ -896,8 +1003,21 @@ class OleFileIO: return _OleStream(self.fp, start, size, 512, self.sectorsize, self.fat) - ## - # Returns a list of streams stored in this file. 
+ + def _list(self, files, prefix, node): + """ + (listdir helper) + files: list of files to fill in + prefix: current location in storage tree (list of names) + node: current node (_OleDirectoryEntry object) + """ + prefix = prefix + [node.name] + for entry in node.kids: + if entry.kids: + self._list(files, prefix, entry) + else: + files.append(prefix[1:] + [entry.name]) + def listdir(self): """ @@ -907,36 +1027,108 @@ class OleFileIO: self._list(files, [], self.root) return files - ## - # Opens a stream as a read-only file object. + + def _find(self, filename): + """ + Returns directory entry of given filename. (openstream helper) + Note: this method is case-insensitive. + + filename: path of stream in storage tree (except root entry), either: + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + return: sid of requested filename + raise IOError if file not found + """ + + # if filename is a string instead of a list, split it on slashes to + # convert to a list: + if isinstance(filename, basestring): + filename = filename.split('/') + # walk across storage tree, following given path: + node = self.root + for name in filename: + for kid in node.kids: + if kid.name.lower() == name.lower(): + break + else: + raise IOError, "file not found" + node = kid + return node.sid + def openstream(self, filename): """ - Open a stream as a read-only file object + Open a stream as a read-only file object (StringIO). + + filename: path of stream in storage tree (except root entry), either: + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + return: file object (read-only) + raise IOError if filename not found, or if this is not a stream. 
""" - slot = self._find(filename) - name, type, sect, size, sids, clsid = self.sidlist[slot] - if type != 2: + sid = self._find(filename) + entry = self.direntries[sid] + if entry.entry_type != STGTY_STREAM: raise IOError, "this file is not a stream" - return self._open(sect, size) + return self._open(entry.isectStart, entry.size) + + + def get_type(self, filename): + """ + Test if given filename exists as a stream or a storage in the OLE + container, and return its type. + + filename: path of stream in storage tree (except root entry), either: + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + return: False if object does not exist, its entry type (>0) otherwise: + - STGTY_STREAM: a stream + - STGTY_STORAGE: a storage + - STGTY_ROOT: the root entry + """ + try: + sid = self._find(filename) + entry = self.direntries[sid] + return entry.entry_type + except: + return False + + + def get_rootentry_name(self): + """ + Return root entry name. Should usually be 'Root Entry' or 'R' in most + implementations. + """ + return self.root.name - ## - # Gets a list of properties described in substream. def getproperties(self, filename): - """Return properties described in substream""" + """ + Return properties described in substream + filename: path of stream in storage tree (except root entry), either: + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - a list of storage filenames, path to the desired stream/storage. 
+ Example: ['storage_1', 'storage_1.2', 'stream'] + """ fp = self.openstream(filename) data = {} # header s = fp.read(28) - clsid = self._clsid(s[8:24]) + clsid = _clsid(s[8:24]) # format id s = fp.read(20) - fmtid = self._clsid(s[:16]) + fmtid = _clsid(s[:16]) fp.seek(i32(s, 16)) # get section @@ -978,7 +1170,7 @@ class OleFileIO: elif type == VT_UI1: value = ord(s[offset+4]) elif type == VT_CLSID: - value = self._clsid(s[offset+4:offset+20]) + value = _clsid(s[offset+4:offset+20]) elif type == VT_CF: count = i32(s, offset+4) value = s[offset+8:offset+8+count] @@ -1012,7 +1204,7 @@ if __name__ == "__main__": for filename in sys.argv[1:]: ## try: - ole = OleFileIO(filename) + ole = OleFileIO(filename, raise_defects=DEFECT_INCORRECT) print "-" * 68 print filename print "-" * 68 @@ -1025,5 +1217,9 @@ if __name__ == "__main__": props.sort() for k, v in props: print " ", k, v + root = ole.get_rootentry_name() + print 'Root entry name: "%s"' % root + if ole.get_type('macros/vba'): + print "This may be a Word document with VBA macros." ## except IOError, v: ## print "***", "cannot read", file, "-", v From 18333d8edf7fa103a4d5106f997855a0f3a52c38 Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:29:01 +0200 Subject: [PATCH 007/101] version 0.17 2007-12-04 --- PIL/OleFileIO.py | 607 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 443 insertions(+), 164 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 1329cbbf9..965bc0fc8 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -2,15 +2,15 @@ # -*- coding: latin-1 -*- """ OleFileIO_PL: - Module to read Microsoft OLE2 files (Structured Storage), such as - Microsoft Office documents, Image Composer and FlashPix files, - Outlook messages, ... + Module to read Microsoft OLE2 files (also called Structured Storage or + Microsoft Compound Document File Format), such as Microsoft Office + documents, Image Composer and FlashPix files, Outlook messages, ... 
-version 0.15 2007-11-25 Philippe Lagadec - http://lagasoft.free.fr +version 0.17 2007-12-04 Philippe Lagadec - http://lagasoft.free.fr Project website: http://lagasoft.free.fr/python/olefileio -Improved version of OleFileIO module from PIL library v1.1.6 +Improved version of the OleFileIO module from PIL library v1.1.6 See: http://www.pythonware.com/products/pil/index.htm The Python Imaging Library (PIL) is @@ -24,11 +24,42 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. """ __author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2007-11-25" -__version__ = '0.15' +__date__ = "2007-12-04" +__version__ = '0.17' + +#--- LICENSE ------------------------------------------------------------------ + +# OleFileIO_PL is an improved version of the OleFileIO module from the +# Python Imaging Library (PIL). + +# OleFileIO_PL changes are Copyright (c) 2005-2007 by Philippe Lagadec +# +# The Python Imaging Library (PIL) is +# Copyright (c) 1997-2005 by Secret Labs AB +# Copyright (c) 1995-2005 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its associated +# documentation, you agree that you have read, understood, and will comply with +# the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and its +# associated documentation for any purpose and without fee is hereby granted, +# provided that the above copyright notice appears in all copies, and that both +# that copyright notice and this permission notice appear in supporting +# documentation, and that the name of Secret Labs AB or the author(s) not be used +# in advertising or publicity pertaining to distribution of the software +# without specific, written prior permission. +# +# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS +# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
+# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, +# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +# PERFORMANCE OF THIS SOFTWARE. #----------------------------------------------------------------------------- -# CHANGELOG: (OleFileIO_PL changes only) +# CHANGELOG: (only OleFileIO_PL changes compared to PIL 1.1.6) # 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility # (all changes flagged with [PL]) # 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise @@ -43,10 +74,10 @@ __version__ = '0.15' # 2007-09-04 v0.13 PL: - improved/translated (lots of) comments # - updated license # - converted tabs to 4 spaces -# 2007-11-19 v0.14 PL: - added OleFileIO.raise_defect() to adapt sensitivity +# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity # - improved _unicode() to use Python 2.x unicode support # - fixed bug in _OleDirectoryEntry -# 2007-11-25 v0.15 PL: - added safety checks to detect malformed documents +# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops # - fixed _OleStream which didn't check stream size # - added/improved many docstrings and comments # - moved helper functions _unicode and _clsid out of @@ -55,24 +86,61 @@ __version__ = '0.15' # - OleFileIO._find() is now case-insensitive # - added get_type() and get_rootentry_name() # - rewritten loaddirectory and _OleDirectoryEntry +# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict +# - added detection of duplicate filenames in storages +# - added detection of duplicate references to streams +# - added get_size() and exists() to _OleDirectoryEntry +# - added isOleFile to check header before parsing +# - added __all__ list to control public keywords in pydoc +# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug 
in loaddirectory +# - improved _unicode(), added workarounds for Python <2.3 +# - added set_debug_mode and -d option to set debug mode +# - fixed bugs in OleFileIO.open and _OleDirectoryEntry +# - added safety check in main for large or binary +# properties +# - allow size>0 for storages for some implementations #----------------------------------------------------------------------------- -# TODO: -# - add underscore to each private method/constant, to avoid their display in +# TODO (for version 1.0): +# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... +# - add underscore to each private method, to avoid their display in # pydoc/epydoc documentation -# - replace all raised exceptions with raise_defect (at least in OleFileIO) -# - add dictionary of directory entries indexed on filenames to avoid using -# _find() each time ? +# - replace all raised exceptions with _raise_defect (at least in OleFileIO) +# - add method to check all streams (follow sectors chains without storing all +# stream in memory, and report anomalies) +# - use _OleDirectoryEntry.kids_dict to improve _find and _list ? # - fix Unicode names handling (find some way to stay compatible with Py1.5.2) # => if possible avoid converting names to Latin-1 -# - fix handling of DIFSECT blocks in FAT (not stop) -# - add stricter checks in decoding -# - add (optional) checks on FAT block chains integrity to detect crossed -# sectors, loops, ... +# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop) +# - rewrite OleFileIO.getproperties # - improve docstrings to show more sample uses -# - fix docstrings to follow epydoc format # - see also original notes and FIXME below # - remove all obsolete FIXMEs + +# IDEAS: +# - allow _raise_defect to raise different exceptions, not only IOError +# - provide a class with named attributes to get well-known properties of +# MS Office documents (title, author, ...) ? 
+# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for +# streams with unknown size +# - use arrays of int instead of long integers for FAT/MiniFAT, to improve +# performance and reduce memory usage ? (possible issue with values >2^31) +# - provide tests with unittest (may need write support to create samples) +# - move all debug code (and maybe dump methods) to a separate module, with +# a class which inherits OleFileIO ? +# - fix docstrings to follow epydoc format +# - add support for 4K sectors ? +# - add support for big endian byte order ? +# - create a simple OLE explorer with wxPython + +# FUTURE EVOLUTIONS to add write support: +# 1) add ability to write a stream back on disk from StringIO (same size, no +# change in FAT/MiniFAT). +# 2) rename a stream/storage if it doesn't change the RB tree +# 3) use rbtree module to update the red-black tree + any rename +# 4) remove a stream/storage: free sectors in FAT/MiniFAT +# 5) allocate new sectors in FAT/MiniFAT +# 6) create new storage/stream #----------------------------------------------------------------------------- # @@ -113,51 +181,60 @@ __version__ = '0.15' # See the README file for information on usage and redistribution. # -#--- LICENSE ------------------------------------------------------------------ - -# OleFileIO_PL is an improved version of the OleFileIO module from the -# Python Imaging Library (PIL). 
- -# OleFileIO_PL changes are Copyright (c) 2005-2007 by Philippe Lagadec -# -# The Python Imaging Library (PIL) is -# Copyright (c) 1997-2005 by Secret Labs AB -# Copyright (c) 1995-2005 by Fredrik Lundh -# -# By obtaining, using, and/or copying this software and/or its associated -# documentation, you agree that you have read, understood, and will comply with -# the following terms and conditions: -# -# Permission to use, copy, modify, and distribute this software and its -# associated documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appears in all copies, and that both -# that copyright notice and this permission notice appear in supporting -# documentation, and that the name of Secret Labs AB or the author(s) not be used -# in advertising or publicity pertaining to distribution of the software -# without specific, written prior permission. -# -# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS -# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. -# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, -# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR -# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -# PERFORMANCE OF THIS SOFTWARE. - #------------------------------------------------------------------------------ import string, StringIO, struct, array, os.path -#[PL] DEBUG display mode: +#[PL] Define explicitly the public API to avoid private objects in pydoc: +__all__ = ['OleFileIO', 'isOleFile'] + + +#[PL] These workarounds were inspired from the Path module +# (see http://www.jorendorff.com/articles/python/path/) +#TODO: test with old Python versions + +# Pre-2.3 workaround for booleans +try: + True, False +except NameError: + True, False = 1, 0 + +# Pre-2.3 workaround for basestring. 
+try: + basestring +except NameError: + try: + # is Unicode supported (Python >2.0 or >1.6 ?) + basestring = (str, unicode) + except NameError: + basestring = str + +#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode +# if False (default PIL behaviour), all filenames are converted to Latin-1. +KEEP_UNICODE_NAMES = False + +#[PL] DEBUG display mode: False by default, use set_debug_mode() or "-d" on +# command line to change it. DEBUG_MODE = False +def debug_print(msg): + print msg +def debug_pass(msg): + pass +debug = debug_pass -if DEBUG_MODE: - def debug(msg): - print msg -else: - def debug(msg): - pass +def set_debug_mode(debug_mode): + """ + Set debug mode on or off, to control display of debugging messages. + mode: True or False + """ + global DEBUG_MODE, debug + DEBUG_MODE = debug_mode + if debug_mode: + debug = debug_print + else: + debug = debug_pass +#TODO: convert this to hex MAGIC = '\320\317\021\340\241\261\032\341' #[PL]: added constants for Sector IDs (from AAF specifications) @@ -197,17 +274,18 @@ VT_VECTOR=0x1000; # map property id to name (for debugging purposes) VT = {} -for k, v in vars().items(): - if k[:3] == "VT_": - VT[v] = k +for keyword, var in vars().items(): + if keyword[:3] == "VT_": + VT[var] = keyword # # -------------------------------------------------------------------- # Some common document types (root.clsid fields) WORD_CLSID = "00020900-0000-0000-C000-000000000046" +#TODO: check Excel, PPT, ... 
-#[PL]: Defect levels to classify parsing errors - see OleFileIO.raise_defect() +#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect() DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect DEFECT_POTENTIAL = 20 # a potential defect DEFECT_INCORRECT = 30 # an error according to specifications, but parsing @@ -215,8 +293,28 @@ DEFECT_INCORRECT = 30 # an error according to specifications, but parsing DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is # impossible +#[PL] add useful constants to __all__: +for key in vars().keys(): + if key.startswith('STGTY_') or key.startswith('DEFECT_'): + __all__.append(key) + + #--- FUNCTIONS ---------------------------------------------------------------- +def isOleFile (filename): + """ + Test if file is an OLE container (according to its header). + filename: file name or path (str, unicode) + return: True if OLE, False otherwise. + """ + f = open(filename, 'rb') + header = f.read(len(MAGIC)) + if header == MAGIC: + return True + else: + return False + + #TODO: replace i16 and i32 with more readable struct.unpack equivalent def i16(c, o = 0): """ @@ -252,37 +350,51 @@ def _clsid(clsid): tuple(map(ord, clsid[8:16])))) -def _unicode(s): - """ - Map unicode string to Latin 1. - """ - #[PL]: use Python Unicode features when available (Python>=2.0): - #TODO: test this with old Python versions <2.0 - #TODO: test if it OleFileIO works with Unicode strings, instead of - # converting to Latin-1. - try: - # First the string is converted to plain Unicode: - # (assuming it is encoded as UTF-16 little-endian) - u = unicode(s, 'UTF-16LE') - except NameError: + +# UNICODE support for Old Python versions: +# (necessary to handle storages/streams names which use Unicode) + +try: + # is Unicode supported ? + unicode + + def _unicode(s, errors='replace'): + """ + Map unicode string to Latin 1. 
(Python with Unicode support) + + s: UTF-16LE unicode string to convert to Latin-1 + errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode() + """ + #TODO: test if it OleFileIO works with Unicode strings, instead of + # converting to Latin-1. + try: + # First the string is converted to plain Unicode: + # (assuming it is encoded as UTF-16 little-endian) + u = s.decode('UTF-16LE', errors) + if KEEP_UNICODE_NAMES: + return u + else: + # Second the unicode string is converted to Latin-1 + return u.encode('latin_1', errors) + except: + # there was an error during Unicode to Latin-1 conversion: + raise IOError, 'incorrect Unicode name' + +except NameError: + def _unicode(s, errors='replace'): + """ + Map unicode string to Latin 1. (Python without native Unicode support) + + s: UTF-16LE unicode string to convert to Latin-1 + errors: 'replace', 'ignore' or 'strict'. (ignored in this version) + """ # If the unicode function does not exist, we assume this is an old # Python version without Unicode support. # Null bytes are simply removed (this only works with usual Latin-1 # strings which do not contain unicode characters>256): return filter(ord, s) - except ValueError: - # there was an error during UTF-16 to Unicode decoding: - self.raise_defect(DEFECT_INCORRECT, 'incorrect Unicode name') - # if no exception raised, fallback to foolproof version: - return filter(ord, s) - try: - # Second the unicode string is converted to Latin-1 - return u.encode('latin_1') - except UnicodeError: # possible issue: this exception didn't exist before - # there was an error during Unicode to Latin-1 encoding: - self.raise_defect(DEFECT_INCORRECT, 'incorrect Unicode name') - # if no exception raised, fallback to foolproof version: - return filter(ord, s) + + #=== CLASSES ================================================================== @@ -313,7 +425,7 @@ class _OleStream(StringIO.StringIO): """ Constructor for _OleStream class. 
- fp : file object, the OLE container + fp : file object, the OLE container or the MiniFAT stream sect : sector index of first sector in the stream size : total size of the stream offset : offset in bytes for the first FAT or MiniFAT sector @@ -326,11 +438,15 @@ class _OleStream(StringIO.StringIO): %(size,offset,sectorsize,len(fat))) #[PL] To detect malformed documents with FAT loops, we compute the # expected number of sectors in the stream: + unknown_size = False if size==0x7FFFFFFF: # this is the case when called from OleFileIO._open(), and stream # size is not known in advance (for example when reading the # Directory stream). Then we can only guess maximum size: size = len(fat)*sectorsize + # and we keep a record that size was unknown: + unknown_size = True + debug(' stream with UNKNOWN SIZE') nb_sectors = (size + (sectorsize-1)) / sectorsize # This number should (at least) be less than the total number of # sectors in the given FAT: @@ -340,12 +456,26 @@ class _OleStream(StringIO.StringIO): # at the end to concatenate all in one string. 
# (this may not be really useful with recent Python versions) data = [] - #[PL] first sector index should be within FAT or ENDOFCHAIN: - if sect != ENDOFCHAIN and (sect<0 or sect>=len(fat)): - raise IOError, 'incorrect OLE FAT, sector index out of range' + # if size is zero, then first sector index should be ENDOFCHAIN: + if size == 0 and sect != ENDOFCHAIN: + raise IOError, 'incorrect OLE sector index for empty stream' #[PL] A fixed-length for loop is used instead of an undefined while # loop to avoid DoS attacks: for i in xrange(nb_sectors): + # Sector index may be ENDOFCHAIN, but only if size was unknown + if sect == ENDOFCHAIN: + if unknown_size: + break + else: + # else this means that the stream is smaller than declared: + raise IOError, 'incomplete OLE stream' + # sector index should be within FAT: + if sect<0 or sect>=len(fat): + debug('fp = '+ repr(fp)) + debug('file size: %d' % os.path.getsize(fp.name)) + debug('offset=%d, sectorsize=%d, sect=%d, seek=%d, len read=%d, len(fat)=%d' % + (offset, sectorsize, sect, offset + sectorsize * sect, len(sector_data), len(fat))) + raise IOError, 'incorrect OLE FAT, sector index out of range' #TODO: check if this works with 4K sectors: fp.seek(offset + sectorsize * sect) sector_data = fp.read(sectorsize) @@ -355,12 +485,7 @@ class _OleStream(StringIO.StringIO): data.append(sector_data) # jump to next sector in the FAT: try: - #[PL] sector index should not be negative, but Python allows it - if sect<0: raise IndexError sect = fat[sect] - if sect == ENDOFCHAIN: - # this may happen when size was not known: - break except IndexError: # [PL] if pointer is out of the FAT an exception is raised raise IOError, 'incorrect OLE FAT, sector index out of range' @@ -384,15 +509,11 @@ class _OleStream(StringIO.StringIO): #--- _OleDirectoryEntry ------------------------------------------------------- -# FIXME: should add a counter in here to avoid looping forever -# if the tree is broken. 
- class _OleDirectoryEntry: """ OLE2 Directory Entry """ - #[PL] parsing code moved from OleFileIO.loaddirectory # struct to parse directory entries: @@ -423,15 +544,20 @@ class _OleDirectoryEntry: Constructor for an _OleDirectoryEntry object. Parses a 128-bytes entry from the OLE Directory stream. - entry: string (must be 128 bytes long) + entry : string (must be 128 bytes long) + sid : index of this directory entry in the OLE file directory olefile: OleFileIO containing this directory entry """ self.sid = sid # ref to olefile is stored for future use self.olefile = olefile - # kids is the list of children entries, if this entry is a storage: + # kids is a list of children entries, if this entry is a storage: # (list of _OleDirectoryEntry objects) self.kids = [] + # kids_dict is a dictionary of children entries, indexed by their + # name in lowercase: used to quickly find an entry, and to detect + # duplicates + self.kids_dict = {} # flag used to detect if the entry is referenced more than once in # directory: self.used = False @@ -453,32 +579,59 @@ class _OleDirectoryEntry: sizeHigh ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry) if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]: - olefile.raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type') + olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type') + # only first directory entry can (and should) be root: + if self.entry_type == STGTY_ROOT and sid != 0: + olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry') + if sid == 0 and self.entry_type != STGTY_ROOT: + olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry') #debug (struct.unpack(fmt_entry, entry[:len_entry])) # name should be at most 31 unicode characters + null character, # so 64 bytes in total (31*2 + 2): if namelength>64: - olefile.raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length') + olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name 
length') # if exception not raised, namelength is set to the maximum value: namelength = 64 # only characters without ending null char are kept: name = name[:(namelength-2)] # name is converted from unicode to Latin-1: self.name = _unicode(name) - # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes - # sectors: - if olefile.sectorsize == 512 and sizeHigh != 0: - olefile.raise_defect(DEFECT_INCORRECT, 'incorrect OLE stream size') - self.size = sizeLow + (long(sizeHigh)<<32) - self.clsid = _clsid(clsid) debug('DirEntry SID=%d: %s' % (self.sid, self.name)) debug(' - type: %d' % self.entry_type) debug(' - sect: %d' % self.isectStart) - debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh)) debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left, self.sid_right, self.sid_child)) + # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes + # sectors, BUT apparently some implementations set it as 0xFFFFFFFFL, 1 + # or some other value so it cannot be raised as a defect in general: + if olefile.sectorsize == 512: + if sizeHigh != 0 and sizeHigh != 0xFFFFFFFFL: + debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' % + (olefile.sectorsize, sizeLow, sizeHigh, sizeHigh)) + olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size') + self.size = sizeLow + else: + self.size = sizeLow + (long(sizeHigh)<<32) + debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh)) + + self.clsid = _clsid(clsid) + # a storage should have a null size, BUT some implementations such as + # Word 8 for Mac seem to allow non-null values => Potential defect: + if self.entry_type == STGTY_STORAGE and self.size != 0: + olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0') + # check if stream is not already referenced elsewhere: + if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0: + if self.size < olefile.minisectorcutoff \ + and self.entry_type==STGTY_STREAM: # only streams can be 
in MiniFAT + # ministream object + minifat = True + else: + minifat = False + olefile._check_duplicate_stream(self.isectStart, minifat) + + def build_storage_tree(self): """ @@ -519,21 +672,27 @@ class _OleDirectoryEntry: return # check if child SID is in the proper range: if child_sid<0 or child_sid>=len(self.olefile.direntries): - self.olefile.raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range') + self.olefile._raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range') # get child direntry: - child = self.olefile.direntries[child_sid] + child = self.olefile._load_direntry(child_sid) #direntries[child_sid] debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d' % (child.sid, child.name, child.sid_left, child.sid_right, child.sid_child)) # the directory entries are organized as a red-black tree. # (cf. Wikipedia for details) # First walk through left side of the tree: self.append_kids(child.sid_left) + # Check if its name is not already used (case-insensitive): + name_lower = child.name.lower() + if self.kids_dict.has_key(name_lower): + self.olefile._raise_defect(DEFECT_INCORRECT, + "Duplicate filename in OLE storage") # Then the child_sid _OleDirectoryEntry object is appended to the - # kids list: + # kids list and dictionary: self.kids.append(child) + self.kids_dict[name_lower] = child # Check if kid was not already referenced in a storage: if child.used: - self.olefile.raise_defect(DEFECT_INCORRECT, + self.olefile._raise_defect(DEFECT_INCORRECT, 'OLE Entry referenced more than once') child.used = True # Finally walk through right side of the tree: @@ -545,13 +704,14 @@ class _OleDirectoryEntry: def __cmp__(self, other): "Compare entries by name" return cmp(self.name, other.name) + #TODO: replace by the same function as MS implementation ? 
+ # (order by name length first, then case-insensitive order) def dump(self, tab = 0): "Dump this entry, and all its subentries (for debug purposes only)" TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", "(property)", "(root)"] - print " "*tab + repr(self.name), TYPES[self.entry_type], if self.entry_type in (STGTY_STREAM, STGTY_ROOT): print self.size, "bytes", @@ -603,12 +763,12 @@ class OleFileIO: (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a security-oriented application, see source code for details) """ - self.raise_defects_level = raise_defects + self._raise_defects_level = raise_defects if filename: self.open(filename) - def raise_defect(self, defect_level, message): + def _raise_defect(self, defect_level, message): """ This method should be called for any defect found during file parsing. It may raise an IOError exception according to the minimal level chosen @@ -622,7 +782,7 @@ class OleFileIO: message: string describing the defect, used with raised exception. 
""" # added by [PL] - if defect_level >= self.raise_defects_level: + if defect_level >= self._raise_defects_level: raise IOError, message @@ -636,10 +796,15 @@ class OleFileIO: else: self.fp = filename + # lists of streams in FAT and MiniFAT, to detect duplicate references + # (list of indexes of first sectors of each stream) + self._used_streams_fat = [] + self._used_streams_minifat = [] + header = self.fp.read(512) if len(header) != 512 or header[:8] != MAGIC: - self.raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file") + self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file") # [PL] header structure according to AAF specifications: ##Header @@ -690,41 +855,43 @@ class OleFileIO: if Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': # OLE signature should always be present - self.raise_defect(DEFECT_FATAL, "incorrect OLE signature") + self._raise_defect(DEFECT_FATAL, "incorrect OLE signature") if clsid != '\x00'*16: # according to AAF specs, CLSID should always be zero - self.raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") + self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") debug( "MinorVersion = %d" % MinorVersion ) debug( "DllVersion = %d" % DllVersion ) if DllVersion not in [3, 4]: # version 3: usual format, 512 bytes per sector # version 4: large format, 4K per sector - self.raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") + self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") debug( "ByteOrder = %X" % ByteOrder ) if ByteOrder != 0xFFFE: # For now only common little-endian documents are handled correctly - self.raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") + self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") # TODO: add big-endian support for documents created on Mac ? 
SectorSize = 2**SectorShift debug( "SectorSize = %d" % SectorSize ) if SectorSize not in [512, 4096]: - self.raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header") + self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header") if (DllVersion==3 and SectorSize!=512) or (DllVersion==4 and SectorSize!=4096): - self.raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header") + self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header") MiniSectorSize = 2**MiniSectorShift debug( "MiniSectorSize = %d" % MiniSectorSize ) if MiniSectorSize not in [64]: - self.raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header") + self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header") if Reserved != 0 or Reserved1 != 0: - self.raise_defect(DEFECT_INCORRECT, "incorrect OLE header") + self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)") debug( "csectDir = %d" % csectDir ) if SectorSize==512 and csectDir!=0: - self.raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header") + self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header") debug( "csectFat = %d" % self.csectFat ) debug( "sectDirStart = %X" % sectDirStart ) debug( "signature = %d" % signature ) + # Signature should be zero, BUT some implementations do not follow this + # rule => only a potential defect: if signature != 0: - self.raise_defect(DEFECT_INCORRECT, "incorrect OLE header") + self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)") debug( "MiniSectorCutoff = %d" % MiniSectorCutoff ) debug( "MiniFatStart = %X" % MiniFatStart ) debug( "csectMiniFat = %d" % csectMiniFat ) @@ -738,23 +905,54 @@ class OleFileIO: # file clsid (probably never used, so we don't store it) clsid = _clsid(header[8:24]) - self.sectorsize = 1 << i16(header, 30) self.minisectorsize = 1 << i16(header, 32) - self.minisectorcutoff = i32(header, 56) + # check 
known streams for duplicate references (these are always in FAT, + # never in MiniFAT): + self._check_duplicate_stream(sectDirStart) + # check MiniFAT only if it is not empty: + if csectMiniFat: + self._check_duplicate_stream(MiniFatStart) + # check DIFAT only if it is not empty: + if self.csectDif: + self._check_duplicate_stream(self.sectDifStart) + # Load file allocation tables self.loadfat(header) - # Load direcory. This sets both the direntries list (ordered by sid) # and the root (ordered by hierarchy) members. self.loaddirectory(i32(header, 48)) - self.ministream = None self.minifatsect = i32(header, 60) + def _check_duplicate_stream(self, first_sect, minifat=False): + """ + Checks if a stream has not been already referenced elsewhere. + This method should only be called once for each known stream, and only + if stream size is not null. + first_sect: index of first sector of the stream in FAT + minifat: if True, stream is located in the MiniFAT, else in the FAT + """ + if minifat: + debug('_check_duplicate_stream: sect=%d in MiniFAT' % first_sect) + used_streams = self._used_streams_minifat + else: + debug('_check_duplicate_stream: sect=%d in FAT' % first_sect) + # some values can be safely ignored (not a real stream): + if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): + return + used_streams = self._used_streams_fat + #TODO: would it be more efficient using a dict or hash values, instead + # of a list of long ? + if first_sect in used_streams: + self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') + else: + used_streams.append(first_sect) + + def dumpfat(self, fat, firstindex=0): "Displays a part of FAT in human-readable form for debugging purpose" # [PL] added only for debug @@ -876,10 +1074,10 @@ class OleFileIO: if self.csectFat <= 109: # there must be at least 109 blocks in header and the rest in # DIFAT, so number of sectors must be >109. 
- self.raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors') + self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors') if self.sectDifStart >= self.nb_sect: # initial DIFAT block index must be valid - self.raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') + self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') debug( "DIFAT analysis..." ) # We compute the necessary number of DIFAT sectors : # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) @@ -935,10 +1133,10 @@ class OleFileIO: try: self.fp.seek(self.sectorsize * (sect+1)) except: - self.raise_defect(DEFECT_FATAL, 'wrong index for OLE sector') + self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') sector = self.fp.read(self.sectorsize) if len(sector) != self.sectorsize: - self.raise_defect(DEFECT_FATAL, 'incomplete OLE sector') + self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') return sector @@ -952,24 +1150,53 @@ class OleFileIO: # open directory stream as a read-only file: # (stream size is not known in advance) - fp = self._open(sect) + self.directory_fp = self._open(sect) #[PL] to detect malformed documents and avoid DoS attacks, the maximum # number of directory entries can be calculated: - max_entries = fp.size / 128 - debug('loaddirectory: size=%d, max_entries=%d' % (fp.size, max_entries)) + max_entries = self.directory_fp.size / 128 + debug('loaddirectory: size=%d, max_entries=%d' % + (self.directory_fp.size, max_entries)) # Create list of directory entries - self.direntries = [] - for sid in xrange(max_entries): - entry = fp.read(128) - if not entry: - break - self.direntries.append(_OleDirectoryEntry(entry, sid, self)) + #self.direntries = [] + # We start with a list of "None" object + self.direntries = [None] * max_entries +## for sid in xrange(max_entries): +## entry = fp.read(128) +## if not entry: +## break +## self.direntries.append(_OleDirectoryEntry(entry, sid, self)) + # load 
root entry: + root_entry = self._load_direntry(0) # Root entry is the first entry: self.root = self.direntries[0] # read and build all storage trees, starting from the root: self.root.build_storage_tree() + + + def _load_direntry (self, sid): + """ + Load a directory entry from the directory. + This method should only be called once for each storage/stream when + loading the directory. + sid: index of storage/stream in the directory. + return: a _OleDirectoryEntry object + raise: IOError if the entry has always been referenced. + """ + # check if SID is OK: + if sid<0 or sid>=len(self.direntries): + self._raise_defect(DEFECT_FATAL, "OLE directory index out of range") + # check if entry was already referenced: + if self.direntries[sid] is not None: + self._raise_defect(DEFECT_INCORRECT, + "double reference for OLE stream/storage") + # if exception not raised, return the object + return self.direntries[sid] + self.directory_fp.seek(sid * 128) + entry = self.directory_fp.read(128) + self.direntries[sid] = _OleDirectoryEntry(entry, sid, self) + return self.direntries[sid] def dumpdirectory(self): @@ -1082,11 +1309,7 @@ class OleFileIO: Test if given filename exists as a stream or a storage in the OLE container, and return its type. - filename: path of stream in storage tree (except root entry), either: - - a string using Unix path syntax, for example: - 'storage_1/storage_1.2/stream' - - a list of storage filenames, path to the desired stream/storage. - Example: ['storage_1', 'storage_1.2', 'stream'] + filename: path of stream in storage tree. (see openstream for syntax) return: False if object does not exist, its entry type (>0) otherwise: - STGTY_STREAM: a stream - STGTY_STORAGE: a storage @@ -1100,6 +1323,37 @@ class OleFileIO: return False + def exists(self, filename): + """ + Test if given filename exists as a stream or a storage in the OLE + container. + + filename: path of stream in storage tree. 
(see openstream for syntax) + return: True if object exist, else False. + """ + try: + sid = self._find(filename) + return True + except: + return False + + + def get_size(self, filename): + """ + Return size of a stream in the OLE container, in bytes. + + filename: path of stream in storage tree (see openstream for syntax) + return: size in bytes (long integer) + raise: IOError if file not found, TypeError if this is not a stream. + """ + sid = self._find(filename) + entry = self.direntries[sid] + if entry.entry_type != STGTY_STREAM: + #TODO: Should it return zero instead of raising an exception ? + raise TypeError, 'object is not an OLE stream' + return entry.size + + def get_rootentry_name(self): """ Return root entry name. Should usually be 'Root Entry' or 'R' in most @@ -1110,13 +1364,10 @@ class OleFileIO: def getproperties(self, filename): """ - Return properties described in substream + Return properties described in substream. - filename: path of stream in storage tree (except root entry), either: - - a string using Unix path syntax, for example: - 'storage_1/storage_1.2/stream' - - a list of storage filenames, path to the desired stream/storage. - Example: ['storage_1', 'storage_1.2', 'stream'] + filename: path of stream in storage tree (see openstream for syntax) + return: a dictionary of values indexed by id (integer) """ fp = self.openstream(filename) @@ -1139,6 +1390,8 @@ class OleFileIO: id = i32(s, 8+i*8) offset = i32(s, 12+i*8) type = i32(s, offset) + + debug ('property id=%d: type=%d offset=%X' % (id, type, offset)) # test for common types first (should perhaps use # a dictionary instead?) @@ -1198,12 +1451,22 @@ if __name__ == "__main__": # [PL] display quick usage info if launched from command-line if len(sys.argv) <= 1: print __doc__ - print "Launched from command line, this script parses OLE files and prints info." 
- print "" - sys.exit("usage: OleFileIO_PL.py [file2 ...]") + print """ +Launched from command line, this script parses OLE files and prints info. + +Usage: OleFileIO_PL.py [-d] [file2 ...] + +Options: +-d : debug mode (display a lot of messages, for developers only) +""" + sys.exit() for filename in sys.argv[1:]: ## try: + if filename == '-d': + # option to switch debug mode on: + set_debug_mode(True) + continue ole = OleFileIO(filename, raise_defects=DEFECT_INCORRECT) print "-" * 68 print filename @@ -1216,10 +1479,26 @@ if __name__ == "__main__": props = props.items() props.sort() for k, v in props: + #[PL]: avoid to display too large or binary values: + if isinstance(v, basestring): + if len(v) > 50: + v = v[:50] + # quick and dirty binary check: + for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20, + 21,22,23,24,25,26,27,28,29,30,31): + if chr(c) in v: + v = '(binary data)' + break print " ", k, v + + #[PL] Test a few new methods: root = ole.get_rootentry_name() print 'Root entry name: "%s"' % root - if ole.get_type('macros/vba'): - print "This may be a Word document with VBA macros." + if ole.exists('worddocument'): + print "This is a Word document." + print "type of stream 'WordDocument':", ole.get_type('worddocument') + print "size :", ole.get_size('worddocument') + if ole.exists('macros/vba'): + print "This document may contain VBA macros." 
## except IOError, v: ## print "***", "cannot read", file, "-", v From 70a99619bc910fb0fb92edaed2e0b9fa87e43996 Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:29:49 +0200 Subject: [PATCH 008/101] version 0.18 2007-12-05 --- PIL/OleFileIO.py | 237 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 175 insertions(+), 62 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 965bc0fc8..d1cc456ee 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -version 0.17 2007-12-04 Philippe Lagadec - http://lagasoft.free.fr +version 0.18 2007-12-05 Philippe Lagadec - http://lagasoft.free.fr Project website: http://lagasoft.free.fr/python/olefileio @@ -24,8 +24,8 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. """ __author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2007-12-04" -__version__ = '0.17' +__date__ = "2007-12-08" +__version__ = '0.18' #--- LICENSE ------------------------------------------------------------------ @@ -99,6 +99,9 @@ __version__ = '0.17' # - added safety check in main for large or binary # properties # - allow size>0 for storages for some implementations +# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and +# streams +# - added option '-c' in main to check all streams #----------------------------------------------------------------------------- # TODO (for version 1.0): @@ -106,6 +109,8 @@ __version__ = '0.17' # - add underscore to each private method, to avoid their display in # pydoc/epydoc documentation # - replace all raised exceptions with _raise_defect (at least in OleFileIO) +# - merge code from _OleStream and OleFileIO.getsect to read sectors +# (maybe add a class for FAT and MiniFAT ?) 
# - add method to check all streams (follow sectors chains without storing all # stream in memory, and report anomalies) # - use _OleDirectoryEntry.kids_dict to improve _find and _list ? @@ -434,8 +439,13 @@ class _OleStream(StringIO.StringIO): return : a StringIO instance containing the OLE stream """ debug('_OleStream.__init__:') - debug(' size=%d, offset=%d, sectorsize=%d, len(fat)=%d' - %(size,offset,sectorsize,len(fat))) + debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s' + %(sect,sect,size,offset,sectorsize,len(fat), repr(fp))) + # for debugging messages, size of file where stream is read: + if isinstance(fp, StringIO.StringIO): + filesize = len(fp.getvalue()) # file in MiniFAT + else: + filesize = os.path.getsize(fp.name) # file on disk #[PL] To detect malformed documents with FAT loops, we compute the # expected number of sectors in the stream: unknown_size = False @@ -448,6 +458,7 @@ class _OleStream(StringIO.StringIO): unknown_size = True debug(' stream with UNKNOWN SIZE') nb_sectors = (size + (sectorsize-1)) / sectorsize + debug('nb_sectors = %d' % nb_sectors) # This number should (at least) be less than the total number of # sectors in the given FAT: if nb_sectors > len(fat): @@ -458,6 +469,7 @@ class _OleStream(StringIO.StringIO): data = [] # if size is zero, then first sector index should be ENDOFCHAIN: if size == 0 and sect != ENDOFCHAIN: + debug('size == 0 and sect != ENDOFCHAIN:') raise IOError, 'incorrect OLE sector index for empty stream' #[PL] A fixed-length for loop is used instead of an undefined while # loop to avoid DoS attacks: @@ -468,19 +480,35 @@ class _OleStream(StringIO.StringIO): break else: # else this means that the stream is smaller than declared: + debug('sect=ENDOFCHAIN before expected size') raise IOError, 'incomplete OLE stream' # sector index should be within FAT: if sect<0 or sect>=len(fat): - debug('fp = '+ repr(fp)) - debug('file size: %d' % os.path.getsize(fp.name)) - debug('offset=%d, 
sectorsize=%d, sect=%d, seek=%d, len read=%d, len(fat)=%d' % - (offset, sectorsize, sect, offset + sectorsize * sect, len(sector_data), len(fat))) + debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat))) + debug('i=%d / nb_sectors=%d' %(i, nb_sectors)) +## tmp_data = string.join(data, "") +## f = open('test_debug.bin', 'wb') +## f.write(tmp_data) +## f.close() +## debug('data read so far: %d bytes' % len(tmp_data)) raise IOError, 'incorrect OLE FAT, sector index out of range' + #TODO: merge this code with OleFileIO.getsect() ? #TODO: check if this works with 4K sectors: - fp.seek(offset + sectorsize * sect) + try: + fp.seek(offset + sectorsize * sect) + except: + debug('sect=%d, seek=%d, filesize=%d' % + (sect, offset+sectorsize*sect, filesize)) + raise IOError, 'OLE sector index out of range' sector_data = fp.read(sectorsize) # [PL] check if there was enough data: - if len(sector_data) != sectorsize: + # Note: if sector is the last of the file, sometimes it is not a + # complete sector (of 512 or 4K), so we may read less than + # sectorsize. 
+ if len(sector_data)!=sectorsize and sect!=(len(fat)-1): + debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' % + (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data))) + debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data))) raise IOError, 'incomplete OLE sector' data.append(sector_data) # jump to next sector in the FAT: @@ -494,14 +522,18 @@ class _OleStream(StringIO.StringIO): raise IOError, 'incorrect last sector index in OLE stream' data = string.join(data, "") # Data is truncated to the actual stream size: - if len(data) > size: + if len(data) >= size: data = data[:size] # actual stream size is stored for future use: self.size = size - else: + elif unknown_size: # actual stream size was not known, now we know the size of read # data: self.size = len(data) + else: + # read data is less than expected: + debug('len(data)=%d, size=%d' % (len(data), size)) + raise IOError, 'OLE stream size is less than declared' # when all data is read in memory, StringIO constructor is called StringIO.StringIO.__init__(self, data) # Then the _OleStream object can be used as a read-only file object. @@ -597,7 +629,7 @@ class _OleDirectoryEntry: # name is converted from unicode to Latin-1: self.name = _unicode(name) - debug('DirEntry SID=%d: %s' % (self.sid, self.name)) + debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) debug(' - type: %d' % self.entry_type) debug(' - sect: %d' % self.isectStart) debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left, @@ -641,7 +673,7 @@ class _OleDirectoryEntry: only be called for the root object once. """ debug('build_storage_tree: SID=%d - %s - sid_child=%d' - % (self.sid, self.name, self.sid_child)) + % (self.sid, repr(self.name), self.sid_child)) if self.sid_child != NOSTREAM: # if child SID is not NOSTREAM, then this entry is a storage. 
# Let's walk through the tree of children to fill the kids list: @@ -676,7 +708,7 @@ class _OleDirectoryEntry: # get child direntry: child = self.olefile._load_direntry(child_sid) #direntries[child_sid] debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d' - % (child.sid, child.name, child.sid_left, child.sid_right, child.sid_child)) + % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child)) # the directory entries are organized as a red-black tree. # (cf. Wikipedia for details) # First walk through left side of the tree: @@ -847,74 +879,91 @@ class OleFileIO: header_size = struct.calcsize(fmt_header) debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) ) header1 = header[:header_size] - (Sig, clsid, MinorVersion, DllVersion, ByteOrder, SectorShift, - MiniSectorShift, Reserved, Reserved1, csectDir, self.csectFat, sectDirStart, - signature, MiniSectorCutoff, MiniFatStart, csectMiniFat, self.sectDifStart, - self.csectDif) = struct.unpack(fmt_header, header1) + ( + self.Sig, + self.clsid, + self.MinorVersion, + self.DllVersion, + self.ByteOrder, + self.SectorShift, + self.MiniSectorShift, + self.Reserved, self.Reserved1, + self.csectDir, + self.csectFat, + self.sectDirStart, + self.signature, + self.MiniSectorCutoff, + self.MiniFatStart, + self.csectMiniFat, + self.sectDifStart, + self.csectDif + ) = struct.unpack(fmt_header, header1) debug( struct.unpack(fmt_header, header1)) - if Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': + if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': # OLE signature should always be present self._raise_defect(DEFECT_FATAL, "incorrect OLE signature") - if clsid != '\x00'*16: + if self.clsid != '\x00'*16: # according to AAF specs, CLSID should always be zero self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") - debug( "MinorVersion = %d" % MinorVersion ) - debug( "DllVersion = %d" % DllVersion ) - if DllVersion not in [3, 4]: + debug( "MinorVersion = 
%d" % self.MinorVersion ) + debug( "DllVersion = %d" % self.DllVersion ) + if self.DllVersion not in [3, 4]: # version 3: usual format, 512 bytes per sector # version 4: large format, 4K per sector self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") - debug( "ByteOrder = %X" % ByteOrder ) - if ByteOrder != 0xFFFE: + debug( "ByteOrder = %X" % self.ByteOrder ) + if self.ByteOrder != 0xFFFE: # For now only common little-endian documents are handled correctly self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") # TODO: add big-endian support for documents created on Mac ? - SectorSize = 2**SectorShift - debug( "SectorSize = %d" % SectorSize ) - if SectorSize not in [512, 4096]: + self.SectorSize = 2**self.SectorShift + debug( "SectorSize = %d" % self.SectorSize ) + if self.SectorSize not in [512, 4096]: self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header") - if (DllVersion==3 and SectorSize!=512) or (DllVersion==4 and SectorSize!=4096): + if (self.DllVersion==3 and self.SectorSize!=512) \ + or (self.DllVersion==4 and self.SectorSize!=4096): self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header") - MiniSectorSize = 2**MiniSectorShift - debug( "MiniSectorSize = %d" % MiniSectorSize ) - if MiniSectorSize not in [64]: + self.MiniSectorSize = 2**self.MiniSectorShift + debug( "MiniSectorSize = %d" % self.MiniSectorSize ) + if self.MiniSectorSize not in [64]: self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header") - if Reserved != 0 or Reserved1 != 0: + if self.Reserved != 0 or self.Reserved1 != 0: self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)") - debug( "csectDir = %d" % csectDir ) - if SectorSize==512 and csectDir!=0: + debug( "csectDir = %d" % self.csectDir ) + if self.SectorSize==512 and self.csectDir!=0: self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header") debug( "csectFat = %d" % self.csectFat ) 
- debug( "sectDirStart = %X" % sectDirStart ) - debug( "signature = %d" % signature ) + debug( "sectDirStart = %X" % self.sectDirStart ) + debug( "signature = %d" % self.signature ) # Signature should be zero, BUT some implementations do not follow this # rule => only a potential defect: - if signature != 0: + if self.signature != 0: self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)") - debug( "MiniSectorCutoff = %d" % MiniSectorCutoff ) - debug( "MiniFatStart = %X" % MiniFatStart ) - debug( "csectMiniFat = %d" % csectMiniFat ) - debug( "sectDifStart = %X" % self.sectDifStart ) - debug( "csectDif = %d" % self.csectDif ) + debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff ) + debug( "MiniFatStart = %X" % self.MiniFatStart ) + debug( "csectMiniFat = %d" % self.csectMiniFat ) + debug( "sectDifStart = %X" % self.sectDifStart ) + debug( "csectDif = %d" % self.csectDif ) # calculate the number of sectors in the file # (-1 because header doesn't count) - self.nb_sect = (os.path.getsize(filename) / SectorSize) - 1 + filesize = os.path.getsize(filename) + self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1 debug( "Number of sectors in the file: %d" % self.nb_sect ) # file clsid (probably never used, so we don't store it) clsid = _clsid(header[8:24]) - self.sectorsize = 1 << i16(header, 30) - self.minisectorsize = 1 << i16(header, 32) - self.minisectorcutoff = i32(header, 56) + self.sectorsize = self.SectorSize #1 << i16(header, 30) + self.minisectorsize = self.MiniSectorSize #1 << i16(header, 32) + self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56) # check known streams for duplicate references (these are always in FAT, # never in MiniFAT): - self._check_duplicate_stream(sectDirStart) + self._check_duplicate_stream(self.sectDirStart) # check MiniFAT only if it is not empty: - if csectMiniFat: - self._check_duplicate_stream(MiniFatStart) + if self.csectMiniFat: + self._check_duplicate_stream(self.MiniFatStart) # 
check DIFAT only if it is not empty: if self.csectDif: self._check_duplicate_stream(self.sectDifStart) @@ -923,9 +972,9 @@ class OleFileIO: self.loadfat(header) # Load direcory. This sets both the direntries list (ordered by sid) # and the root (ordered by hierarchy) members. - self.loaddirectory(i32(header, 48)) + self.loaddirectory(self.sectDirStart)#i32(header, 48)) self.ministream = None - self.minifatsect = i32(header, 60) + self.minifatsect = self.MiniFatStart #i32(header, 60) def _check_duplicate_stream(self, first_sect, minifat=False): @@ -1104,6 +1153,13 @@ class OleFileIO: ## # FAT should contain csectFat blocks ## print "FAT length: %d instead of %d" % (len(self.fat), self.csectFat) ## raise IOError, 'incorrect DIFAT' + # since FAT is read from fixed-size sectors, it may contain more values + # than the actual number of sectors in the file. + # Keep only the relevant sector indexes: + if len(self.fat) > self.nb_sect: + debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect)) + self.fat = self.fat[:self.nb_sect] + debug('\nFAT:') self.dumpfat(self.fat) @@ -1111,13 +1167,35 @@ class OleFileIO: """ Load the MiniFAT table. """ - # This is stored in a standard sub-stream, pointed to by a header + # MiniFAT is stored in a standard sub-stream, pointed to by a header # field. - s = self._open(self.minifatsect).read() + # NOTE: there are two sizes to take into account for this stream: + # 1) Stream size is calculated according to the number of sectors + # declared in the OLE header. This allocated stream may be more than + # needed to store the actual sector indexes. 
+ # (self.csectMiniFat is the number of sectors of size self.SectorSize) + stream_size = self.csectMiniFat * self.SectorSize + # 2) Actually used size is calculated by dividing the MiniStream size + # (given by root entry size) by the size of mini sectors, *4 for + # 32 bits indexes: + nb_minisectors = (self.root.size + self.MiniSectorSize-1) / self.MiniSectorSize + used_size = nb_minisectors * 4 + debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' % + (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors)) + if used_size > stream_size: + # This is not really a problem, but may indicate a wrong implementation: + self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT') + # In any case, first read stream_size: + s = self._open(self.minifatsect, stream_size, force_FAT=True).read() #[PL] Old code replaced by an array: #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) self.minifat = array.array('L', s) - + # Then shrink the array to used size, to avoid indexes out of MiniStream: + debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors)) + self.minifat = self.minifat[:nb_minisectors] + debug('loadminifat(): len=%d' % len(self.minifat)) + debug('\nMiniFAT:') + self.dumpfat(self.minifat) def getsect(self, sect): """ @@ -1133,9 +1211,13 @@ class OleFileIO: try: self.fp.seek(self.sectorsize * (sect+1)) except: + debug('getsect(): sect=%X, seek=%d, filesize=%d' % + (sect, self.sectorsize*(sect+1), os.path.getsize(self.fp.name))) self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') sector = self.fp.read(self.sectorsize) if len(sector) != self.sectorsize: + debug('getsect(): sect=%X, read=%d, sectorsize=%d' % + (sect, len(sector), self.sectorsize)) self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') return sector @@ -1206,23 +1288,31 @@ class OleFileIO: self.root.dump() - def _open(self, start, size = 0x7FFFFFFF): + 
def _open(self, start, size = 0x7FFFFFFF, force_FAT=False): """ Open a stream, either in FAT or MiniFAT according to its size. (openstream helper) start: index of first sector size: size of stream (or nothing if size is unknown) + force_FAT: if False (default), stream will be opened in FAT or MiniFAT + according to size. If True, it will always be opened in FAT. """ + debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' % + (start, size, str(force_FAT))) # stream size is compared to the MiniSectorCutoff threshold: - if size < self.minisectorcutoff: + if size < self.minisectorcutoff and not force_FAT: # ministream object if not self.ministream: # load MiniFAT if it wasn't already done: self.loadminifat() # The first sector index of the miniFAT stream is stored in the # root directory entry: - self.ministream = self._open(self.root.isectStart) + size_ministream = self.root.size + debug('Opening MiniStream: sect=%d, size=%d' % + (self.root.isectStart, size_ministream)) + self.ministream = self._open(self.root.isectStart, + size_ministream, force_FAT=True) return _OleStream(self.ministream, start, size, 0, self.minisectorsize, self.minifat) else: @@ -1454,19 +1544,27 @@ if __name__ == "__main__": print """ Launched from command line, this script parses OLE files and prints info. -Usage: OleFileIO_PL.py [-d] [file2 ...] +Usage: OleFileIO_PL.py [-d] [-s] [file2 ...] 
Options: --d : debug mode (display a lot of messages, for developers only) +-d : debug mode (display a lot of debug information, for developers only) +-s : check all streams (for debugging purposes) """ sys.exit() + check_streams = False for filename in sys.argv[1:]: ## try: + # OPTIONS: if filename == '-d': # option to switch debug mode on: set_debug_mode(True) continue + if filename == '-c': + # option to switch check streams mode on: + check_streams = True + continue + ole = OleFileIO(filename, raise_defects=DEFECT_INCORRECT) print "-" * 68 print filename @@ -1490,6 +1588,21 @@ Options: v = '(binary data)' break print " ", k, v + + if check_streams: + # Read all streams to check if there are errors: + print '\nChecking streams...' + for streamname in ole.listdir(): + # print name using repr() to convert binary chars to \xNN: + print '-', repr('/'.join(streamname)),'-', + st_type = ole.get_type(streamname) + if st_type == STGTY_STREAM: + print 'size %d' % ole.get_size(streamname) + # just try to read stream in memory: + ole.openstream(streamname) + else: + print 'NOT a stream : type=%d' % st_type + print '' #[PL] Test a few new methods: root = ole.get_rootentry_name() From 61a3ceb812ee4b3d4fd1dd4ed1820f6f00038217 Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:34:48 +0200 Subject: [PATCH 009/101] version 0.19 2009-12-10 --- PIL/OleFileIO.py | 70 ++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index d1cc456ee..32233b195 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,9 +6,9 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... 
-version 0.18 2007-12-05 Philippe Lagadec - http://lagasoft.free.fr +version 0.19 2009-12-10 Philippe Lagadec - http://www.decalage.info -Project website: http://lagasoft.free.fr/python/olefileio +Project website: http://www.decalage.info/python/olefileio Improved version of the OleFileIO module from PIL library v1.1.6 See: http://www.pythonware.com/products/pil/index.htm @@ -16,7 +16,7 @@ See: http://www.pythonware.com/products/pil/index.htm The Python Imaging Library (PIL) is Copyright (c) 1997-2005 by Secret Labs AB Copyright (c) 1995-2005 by Fredrik Lundh -OleFileIO_PL changes are Copyright (c) 2005-2007 by Philippe Lagadec +OleFileIO_PL changes are Copyright (c) 2005-2009 by Philippe Lagadec See source code and LICENSE.txt for information on usage and redistribution. @@ -24,15 +24,15 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. """ __author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2007-12-08" -__version__ = '0.18' +__date__ = "2009-12-10" +__version__ = '0.19' #--- LICENSE ------------------------------------------------------------------ # OleFileIO_PL is an improved version of the OleFileIO module from the # Python Imaging Library (PIL). -# OleFileIO_PL changes are Copyright (c) 2005-2007 by Philippe Lagadec +# OleFileIO_PL changes are Copyright (c) 2005-2009 by Philippe Lagadec # # The Python Imaging Library (PIL) is # Copyright (c) 1997-2005 by Secret Labs AB @@ -102,6 +102,8 @@ __version__ = '0.18' # 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and # streams # - added option '-c' in main to check all streams +# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms +# (thanks to Ben G. 
and Martijn for reporting the bug) #----------------------------------------------------------------------------- # TODO (for version 1.0): @@ -193,6 +195,16 @@ import string, StringIO, struct, array, os.path #[PL] Define explicitly the public API to avoid private objects in pydoc: __all__ = ['OleFileIO', 'isOleFile'] +#[PL] workaround to fix an issue with array item size on 64 bits systems: +if array.array('L').itemsize == 4: + # on 32 bits platforms, long integers in an array are 32 bits: + UINT32 = 'L' +elif array.array('I').itemsize == 4: + # on 64 bits platforms, integers in an array are 32 bits: + UINT32 = 'I' +else: + raise ValueError, 'Need to fix a bug with 32 bit arrays, please contact author...' + #[PL] These workarounds were inspired from the Path module # (see http://www.jorendorff.com/articles/python/path/) @@ -362,11 +374,11 @@ def _clsid(clsid): try: # is Unicode supported ? unicode - + def _unicode(s, errors='replace'): """ Map unicode string to Latin 1. (Python with Unicode support) - + s: UTF-16LE unicode string to convert to Latin-1 errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode() """ @@ -381,7 +393,7 @@ try: else: # Second the unicode string is converted to Latin-1 return u.encode('latin_1', errors) - except: + except: # there was an error during Unicode to Latin-1 conversion: raise IOError, 'incorrect Unicode name' @@ -575,7 +587,7 @@ class _OleDirectoryEntry: """ Constructor for an _OleDirectoryEntry object. Parses a 128-bytes entry from the OLE Directory stream. - + entry : string (must be 128 bytes long) sid : index of this directory entry in the OLE file directory olefile: OleFileIO containing this directory entry @@ -662,7 +674,7 @@ class _OleDirectoryEntry: else: minifat = False olefile._check_duplicate_stream(self.isectStart, minifat) - + def build_storage_tree(self): @@ -789,7 +801,7 @@ class OleFileIO: def __init__(self, filename = None, raise_defects=DEFECT_FATAL): """ Constructor for OleFileIO class. 
- + filename: file to open. raise_defects: minimal level for defects to be raised as exceptions. (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a @@ -816,7 +828,7 @@ class OleFileIO: # added by [PL] if defect_level >= self._raise_defects_level: raise IOError, message - + def open(self, filename): """ @@ -1044,7 +1056,7 @@ class OleFileIO: if not DEBUG_MODE: return VPL=8 # number of values per line (8+1 * 8+1 = 81) - tab = array.array('L', sector) + tab = array.array(UINT32, sector) nbsect = len(tab) nlines = (nbsect+VPL-1)/VPL print "index", @@ -1076,7 +1088,7 @@ class OleFileIO: fat1 = sect else: # if it's a raw sector, it is parsed in an array - fat1 = array.array('L', sect) + fat1 = array.array(UINT32, sect) self.dumpsect(sect) # The FAT is a sector chain starting at the first index of itself. for isect in fat1: @@ -1088,7 +1100,7 @@ class OleFileIO: s = self.getsect(isect) # parse it as an array of 32 bits integers, and add it to the # global FAT array - self.fat = self.fat + array.array('L', s) + self.fat = self.fat + array.array(UINT32, s) return isect @@ -1098,7 +1110,7 @@ class OleFileIO: """ # The header contains a sector numbers # for the first 109 FAT sectors. Additional sectors are - # described by DIF blocks + # described by DIF blocks sect = header[76:512] debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)/4) ) @@ -1106,7 +1118,7 @@ class OleFileIO: # [PL] FAT is an array of 32 bits unsigned ints, it's more effective # to use an array than a list in Python. 
# It's initialized as empty first: - self.fat = array.array('L') + self.fat = array.array(UINT32) self.loadfat_sect(sect) #self.dumpfat(self.fat) ## for i in range(0, len(sect), 4): @@ -1116,7 +1128,7 @@ class OleFileIO: ## break ## s = self.getsect(ix) ## #fat = fat + map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) -## fat = fat + array.array('L', s) +## fat = fat + array.array(UINT32, s) if self.csectDif != 0: # [PL] There's a DIFAT because file is larger than 6.8MB # some checks just in case: @@ -1139,7 +1151,7 @@ class OleFileIO: debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) #TODO: check if corresponding FAT SID = DIFSECT sector_difat = self.getsect(isect_difat) - difat = array.array('L', sector_difat) + difat = array.array(UINT32, sector_difat) self.dumpsect(sector_difat) self.loadfat_sect(difat[:127]) # last DIFAT pointer is next DIFAT sector: @@ -1189,7 +1201,7 @@ class OleFileIO: s = self._open(self.minifatsect, stream_size, force_FAT=True).read() #[PL] Old code replaced by an array: #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) - self.minifat = array.array('L', s) + self.minifat = array.array(UINT32, s) # Then shrink the array to used size, to avoid indexes out of MiniStream: debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors)) self.minifat = self.minifat[:nb_minisectors] @@ -1255,8 +1267,8 @@ class OleFileIO: self.root = self.direntries[0] # read and build all storage trees, starting from the root: self.root.build_storage_tree() - - + + def _load_direntry (self, sid): """ Load a directory entry from the directory. @@ -1292,7 +1304,7 @@ class OleFileIO: """ Open a stream, either in FAT or MiniFAT according to its size. 
(openstream helper) - + start: index of first sector size: size of stream (or nothing if size is unknown) force_FAT: if False (default), stream will be opened in FAT or MiniFAT @@ -1378,7 +1390,7 @@ class OleFileIO: def openstream(self, filename): """ Open a stream as a read-only file object (StringIO). - + filename: path of stream in storage tree (except root entry), either: - a string using Unix path syntax, for example: 'storage_1/storage_1.2/stream' @@ -1480,7 +1492,7 @@ class OleFileIO: id = i32(s, 8+i*8) offset = i32(s, 12+i*8) type = i32(s, offset) - + debug ('property id=%d: type=%d offset=%X' % (id, type, offset)) # test for common types first (should perhaps use @@ -1544,11 +1556,11 @@ if __name__ == "__main__": print """ Launched from command line, this script parses OLE files and prints info. -Usage: OleFileIO_PL.py [-d] [-s] [file2 ...] +Usage: OleFileIO_PL.py [-d] [-c] [file2 ...] Options: -d : debug mode (display a lot of debug information, for developers only) --s : check all streams (for debugging purposes) +-c : check all streams (for debugging purposes) """ sys.exit() @@ -1588,7 +1600,7 @@ Options: v = '(binary data)' break print " ", k, v - + if check_streams: # Read all streams to check if there are errors: print '\nChecking streams...' From 4248acb7a73edaf3ce41f4d6b85fe977cfdca068 Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:35:53 +0200 Subject: [PATCH 010/101] version 0.20 2009-12-11 --- PIL/OleFileIO.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 32233b195..5e296ef52 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... 
-version 0.19 2009-12-10 Philippe Lagadec - http://www.decalage.info +version 0.20 2009-12-11 Philippe Lagadec - http://www.decalage.info Project website: http://www.decalage.info/python/olefileio @@ -24,8 +24,8 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. """ __author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2009-12-10" -__version__ = '0.19' +__date__ = "2009-12-11" +__version__ = '0.20' #--- LICENSE ------------------------------------------------------------------ @@ -104,6 +104,7 @@ __version__ = '0.19' # - added option '-c' in main to check all streams # 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms # (thanks to Ben G. and Martijn for reporting the bug) +# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str #----------------------------------------------------------------------------- # TODO (for version 1.0): @@ -834,11 +835,22 @@ class OleFileIO: """ Open an OLE2 file. Reads the header, FAT and directory. 
+ + filename: string-like or file-like object """ - if type(filename) == type(""): - self.fp = open(filename, "rb") - else: + #[PL] check if filename is a string-like or file-like object: + # (it is better to check for a read() method) + if hasattr(filename, 'read'): + # file-like object self.fp = filename + else: + # string-like object + self.fp = open(filename, "rb") + # old code fails if filename is not a plain string: + #if type(filename) == type(""): + # self.fp = open(filename, "rb") + #else: + # self.fp = filename # lists of streams in FAT and MiniFAT, to detect duplicate references # (list of indexes of first sectors of each stream) From 87a7ec955ed8146bea35816d54273364dc488f7e Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:40:42 +0200 Subject: [PATCH 011/101] version 0.21 2010-01-22 --- PIL/OleFileIO.py | 3294 +++++++++++++++++++++++----------------------- 1 file changed, 1653 insertions(+), 1641 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 5e296ef52..bcac82e55 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -1,1641 +1,1653 @@ -#!/usr/local/bin/python -# -*- coding: latin-1 -*- -""" -OleFileIO_PL: - Module to read Microsoft OLE2 files (also called Structured Storage or - Microsoft Compound Document File Format), such as Microsoft Office - documents, Image Composer and FlashPix files, Outlook messages, ... - -version 0.20 2009-12-11 Philippe Lagadec - http://www.decalage.info - -Project website: http://www.decalage.info/python/olefileio - -Improved version of the OleFileIO module from PIL library v1.1.6 -See: http://www.pythonware.com/products/pil/index.htm - -The Python Imaging Library (PIL) is - Copyright (c) 1997-2005 by Secret Labs AB - Copyright (c) 1995-2005 by Fredrik Lundh -OleFileIO_PL changes are Copyright (c) 2005-2009 by Philippe Lagadec - -See source code and LICENSE.txt for information on usage and redistribution. - -WARNING: THIS IS (STILL) WORK IN PROGRESS. 
-""" - -__author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2009-12-11" -__version__ = '0.20' - -#--- LICENSE ------------------------------------------------------------------ - -# OleFileIO_PL is an improved version of the OleFileIO module from the -# Python Imaging Library (PIL). - -# OleFileIO_PL changes are Copyright (c) 2005-2009 by Philippe Lagadec -# -# The Python Imaging Library (PIL) is -# Copyright (c) 1997-2005 by Secret Labs AB -# Copyright (c) 1995-2005 by Fredrik Lundh -# -# By obtaining, using, and/or copying this software and/or its associated -# documentation, you agree that you have read, understood, and will comply with -# the following terms and conditions: -# -# Permission to use, copy, modify, and distribute this software and its -# associated documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appears in all copies, and that both -# that copyright notice and this permission notice appear in supporting -# documentation, and that the name of Secret Labs AB or the author(s) not be used -# in advertising or publicity pertaining to distribution of the software -# without specific, written prior permission. -# -# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS -# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. -# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, -# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR -# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -# PERFORMANCE OF THIS SOFTWARE. 
- -#----------------------------------------------------------------------------- -# CHANGELOG: (only OleFileIO_PL changes compared to PIL 1.1.6) -# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility -# (all changes flagged with [PL]) -# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise -# exceptions in _OleStream.__init__() -# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat) -# - added some constants -# - added header values checks -# - added some docstrings -# - getsect: bugfix in case sectors >512 bytes -# - getsect: added conformity checks -# - DEBUG_MODE constant to activate debug display -# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments -# - updated license -# - converted tabs to 4 spaces -# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity -# - improved _unicode() to use Python 2.x unicode support -# - fixed bug in _OleDirectoryEntry -# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops -# - fixed _OleStream which didn't check stream size -# - added/improved many docstrings and comments -# - moved helper functions _unicode and _clsid out of -# OleFileIO class -# - improved OleFileIO._find() to add Unix path syntax -# - OleFileIO._find() is now case-insensitive -# - added get_type() and get_rootentry_name() -# - rewritten loaddirectory and _OleDirectoryEntry -# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict -# - added detection of duplicate filenames in storages -# - added detection of duplicate references to streams -# - added get_size() and exists() to _OleDirectoryEntry -# - added isOleFile to check header before parsing -# - added __all__ list to control public keywords in pydoc -# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory -# - improved _unicode(), added workarounds for Python <2.3 -# - added set_debug_mode and -d option to set debug mode -# - fixed bugs in OleFileIO.open and _OleDirectoryEntry -# - 
added safety check in main for large or binary -# properties -# - allow size>0 for storages for some implementations -# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and -# streams -# - added option '-c' in main to check all streams -# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms -# (thanks to Ben G. and Martijn for reporting the bug) -# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str - -#----------------------------------------------------------------------------- -# TODO (for version 1.0): -# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... -# - add underscore to each private method, to avoid their display in -# pydoc/epydoc documentation -# - replace all raised exceptions with _raise_defect (at least in OleFileIO) -# - merge code from _OleStream and OleFileIO.getsect to read sectors -# (maybe add a class for FAT and MiniFAT ?) -# - add method to check all streams (follow sectors chains without storing all -# stream in memory, and report anomalies) -# - use _OleDirectoryEntry.kids_dict to improve _find and _list ? -# - fix Unicode names handling (find some way to stay compatible with Py1.5.2) -# => if possible avoid converting names to Latin-1 -# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop) -# - rewrite OleFileIO.getproperties -# - improve docstrings to show more sample uses -# - see also original notes and FIXME below -# - remove all obsolete FIXMEs - -# IDEAS: -# - allow _raise_defect to raise different exceptions, not only IOError -# - provide a class with named attributes to get well-known properties of -# MS Office documents (title, author, ...) ? -# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for -# streams with unknown size -# - use arrays of int instead of long integers for FAT/MiniFAT, to improve -# performance and reduce memory usage ? 
(possible issue with values >2^31) -# - provide tests with unittest (may need write support to create samples) -# - move all debug code (and maybe dump methods) to a separate module, with -# a class which inherits OleFileIO ? -# - fix docstrings to follow epydoc format -# - add support for 4K sectors ? -# - add support for big endian byte order ? -# - create a simple OLE explorer with wxPython - -# FUTURE EVOLUTIONS to add write support: -# 1) add ability to write a stream back on disk from StringIO (same size, no -# change in FAT/MiniFAT). -# 2) rename a stream/storage if it doesn't change the RB tree -# 3) use rbtree module to update the red-black tree + any rename -# 4) remove a stream/storage: free sectors in FAT/MiniFAT -# 5) allocate new sectors in FAT/MiniFAT -# 6) create new storage/stream -#----------------------------------------------------------------------------- - -# -# THIS IS WORK IN PROGRESS -# -# The Python Imaging Library -# $Id$ -# -# stuff to deal with OLE2 Structured Storage files. this module is -# used by PIL to read Image Composer and FlashPix files, but can also -# be used to read other files of this type. -# -# History: -# 1997-01-20 fl Created -# 1997-01-22 fl Fixed 64-bit portability quirk -# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle) -# 2004-02-29 fl Changed long hex constants to signed integers -# -# Notes: -# FIXME: sort out sign problem (eliminate long hex constants) -# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"] -# FIXME: provide a glob mechanism function (using fnmatchcase) -# -# Literature: -# -# "FlashPix Format Specification, Appendix A", Kodak and Microsoft, -# September 1996. -# -# Quotes: -# -# "If this document and functionality of the Software conflict, -# the actual functionality of the Software represents the correct -# functionality" -- Microsoft, in the OLE format specification -# -# Copyright (c) Secret Labs AB 1997. -# Copyright (c) Fredrik Lundh 1997. 
-# -# See the README file for information on usage and redistribution. -# - -#------------------------------------------------------------------------------ - -import string, StringIO, struct, array, os.path - -#[PL] Define explicitly the public API to avoid private objects in pydoc: -__all__ = ['OleFileIO', 'isOleFile'] - -#[PL] workaround to fix an issue with array item size on 64 bits systems: -if array.array('L').itemsize == 4: - # on 32 bits platforms, long integers in an array are 32 bits: - UINT32 = 'L' -elif array.array('I').itemsize == 4: - # on 64 bits platforms, integers in an array are 32 bits: - UINT32 = 'I' -else: - raise ValueError, 'Need to fix a bug with 32 bit arrays, please contact author...' - - -#[PL] These workarounds were inspired from the Path module -# (see http://www.jorendorff.com/articles/python/path/) -#TODO: test with old Python versions - -# Pre-2.3 workaround for booleans -try: - True, False -except NameError: - True, False = 1, 0 - -# Pre-2.3 workaround for basestring. -try: - basestring -except NameError: - try: - # is Unicode supported (Python >2.0 or >1.6 ?) - basestring = (str, unicode) - except NameError: - basestring = str - -#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode -# if False (default PIL behaviour), all filenames are converted to Latin-1. -KEEP_UNICODE_NAMES = False - -#[PL] DEBUG display mode: False by default, use set_debug_mode() or "-d" on -# command line to change it. -DEBUG_MODE = False -def debug_print(msg): - print msg -def debug_pass(msg): - pass -debug = debug_pass - -def set_debug_mode(debug_mode): - """ - Set debug mode on or off, to control display of debugging messages. 
- mode: True or False - """ - global DEBUG_MODE, debug - DEBUG_MODE = debug_mode - if debug_mode: - debug = debug_print - else: - debug = debug_pass - -#TODO: convert this to hex -MAGIC = '\320\317\021\340\241\261\032\341' - -#[PL]: added constants for Sector IDs (from AAF specifications) -MAXREGSECT = 0xFFFFFFFAL; # maximum SECT -DIFSECT = 0xFFFFFFFCL; # (-4) denotes a DIFAT sector in a FAT -FATSECT = 0xFFFFFFFDL; # (-3) denotes a FAT sector in a FAT -ENDOFCHAIN = 0xFFFFFFFEL; # (-2) end of a virtual stream chain -FREESECT = 0xFFFFFFFFL; # (-1) unallocated sector - -#[PL]: added constants for Directory Entry IDs (from AAF specifications) -MAXREGSID = 0xFFFFFFFAL; # maximum directory entry ID -NOSTREAM = 0xFFFFFFFFL; # (-1) unallocated directory entry - -#[PL] object types in storage (from AAF specifications) -STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) -STGTY_STORAGE = 1 # element is a storage object -STGTY_STREAM = 2 # element is a stream object -STGTY_LOCKBYTES = 3 # element is an ILockBytes object -STGTY_PROPERTY = 4 # element is an IPropertyStorage object -STGTY_ROOT = 5 # element is a root storage - - -# -# -------------------------------------------------------------------- -# property types - -VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6; -VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11; -VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17; -VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23; -VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28; -VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64; -VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68; -VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72; -VT_VECTOR=0x1000; - -# map property id to name (for debugging purposes) - -VT = {} -for keyword, var in vars().items(): - if keyword[:3] == "VT_": - VT[var] = keyword - -# -# 
-------------------------------------------------------------------- -# Some common document types (root.clsid fields) - -WORD_CLSID = "00020900-0000-0000-C000-000000000046" -#TODO: check Excel, PPT, ... - -#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect() -DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect -DEFECT_POTENTIAL = 20 # a potential defect -DEFECT_INCORRECT = 30 # an error according to specifications, but parsing - # can go on -DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is - # impossible - -#[PL] add useful constants to __all__: -for key in vars().keys(): - if key.startswith('STGTY_') or key.startswith('DEFECT_'): - __all__.append(key) - - -#--- FUNCTIONS ---------------------------------------------------------------- - -def isOleFile (filename): - """ - Test if file is an OLE container (according to its header). - filename: file name or path (str, unicode) - return: True if OLE, False otherwise. - """ - f = open(filename, 'rb') - header = f.read(len(MAGIC)) - if header == MAGIC: - return True - else: - return False - - -#TODO: replace i16 and i32 with more readable struct.unpack equivalent -def i16(c, o = 0): - """ - Converts a 2-bytes (16 bits) string to an integer. - - c: string containing bytes to convert - o: offset of bytes to convert in string - """ - return ord(c[o])+(ord(c[o+1])<<8) - - -def i32(c, o = 0): - """ - Converts a 4-bytes (32 bits) string to an integer. - - c: string containing bytes to convert - o: offset of bytes to convert in string - """ - return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24)) - # [PL]: added int() because "<<" gives long int since Python 2.4 - - -def _clsid(clsid): - """ - Converts a CLSID to a human-readable string. - clsid: string of length 16. 
- """ - assert len(clsid) == 16 - if clsid == "\0" * len(clsid): - return "" - return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % - ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + - tuple(map(ord, clsid[8:16])))) - - - -# UNICODE support for Old Python versions: -# (necessary to handle storages/streams names which use Unicode) - -try: - # is Unicode supported ? - unicode - - def _unicode(s, errors='replace'): - """ - Map unicode string to Latin 1. (Python with Unicode support) - - s: UTF-16LE unicode string to convert to Latin-1 - errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode() - """ - #TODO: test if it OleFileIO works with Unicode strings, instead of - # converting to Latin-1. - try: - # First the string is converted to plain Unicode: - # (assuming it is encoded as UTF-16 little-endian) - u = s.decode('UTF-16LE', errors) - if KEEP_UNICODE_NAMES: - return u - else: - # Second the unicode string is converted to Latin-1 - return u.encode('latin_1', errors) - except: - # there was an error during Unicode to Latin-1 conversion: - raise IOError, 'incorrect Unicode name' - -except NameError: - def _unicode(s, errors='replace'): - """ - Map unicode string to Latin 1. (Python without native Unicode support) - - s: UTF-16LE unicode string to convert to Latin-1 - errors: 'replace', 'ignore' or 'strict'. (ignored in this version) - """ - # If the unicode function does not exist, we assume this is an old - # Python version without Unicode support. - # Null bytes are simply removed (this only works with usual Latin-1 - # strings which do not contain unicode characters>256): - return filter(ord, s) - - - - -#=== CLASSES ================================================================== - -#--- _OleStream --------------------------------------------------------------- - -class _OleStream(StringIO.StringIO): - """ - OLE2 Stream - - Returns a read-only file object which can be used to read - the contents of a OLE stream (instance of the StringIO class). 
    To open a stream, use the openstream method in the OleFile class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:

        size: actual size of data stream, after it was opened.
    """

    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat):
        """
        Constructor for _OleStream class.

        fp        : file object, the OLE container or the MiniFAT stream
        sect      : sector index of first sector in the stream
        size      : total size of the stream
        offset    : offset in bytes for the first FAT or MiniFAT sector
        sectorsize: size of one sector
        fat       : array/list of sector indexes (FAT or MiniFAT)
        return    : a StringIO instance containing the OLE stream
        raise     : IOError if the stream is malformed (FAT chain shorter or
                    longer than the declared size, sector index out of range,
                    or an incomplete sector read)
        """
        debug('_OleStream.__init__:')
        debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        # for debugging messages, size of file where stream is read:
        if isinstance(fp, StringIO.StringIO):
            filesize = len(fp.getvalue())   # file in MiniFAT
        else:
            filesize = os.path.getsize(fp.name) # file on disk
        #[PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size==0x7FFFFFFF:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            debug('  stream with UNKNOWN SIZE')
        # ceiling division (the +(sectorsize-1) trick with Py2 integer /):
        nb_sectors = (size + (sectorsize-1)) / sectorsize
        debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            raise IOError, 'malformed OLE document, stream too large'
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            debug('size == 0 and sect != ENDOFCHAIN:')
            raise IOError, 'incorrect OLE sector index for empty stream'
        #[PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks:
        for i in xrange(nb_sectors):
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    debug('sect=ENDOFCHAIN before expected size')
                    raise IOError, 'incomplete OLE stream'
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
##                tmp_data = string.join(data, "")
##                f = open('test_debug.bin', 'wb')
##                f.write(tmp_data)
##                f.close()
##                debug('data read so far: %d bytes' % len(tmp_data))
                raise IOError, 'incorrect OLE FAT, sector index out of range'
            #TODO: merge this code with OleFileIO.getsect() ?
            #TODO: check if this works with 4K sectors:
            try:
                fp.seek(offset + sectorsize * sect)
            except:
                debug('sect=%d, seek=%d, filesize=%d' %
                    (sect, offset+sectorsize*sect, filesize))
                raise IOError, 'OLE sector index out of range'
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                raise IOError, 'incomplete OLE sector'
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                sect = fat[sect]
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                raise IOError, 'incorrect OLE FAT, sector index out of range'
        #[PL] Last sector should be a "end of chain" marker:
        if sect != ENDOFCHAIN:
            raise IOError, 'incorrect last sector index in OLE stream'
        data = string.join(data, "")
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            self.size = len(data)
        else:
            # read data is less than expected:
            debug('len(data)=%d, size=%d' % (len(data), size))
            raise IOError, 'OLE stream size is less than declared'
        # when all data is read in memory, StringIO constructor is called
        StringIO.StringIO.__init__(self, data)
        # Then the _OleStream object can be used as a read-only file object.
- - -#--- _OleDirectoryEntry ------------------------------------------------------- - -class _OleDirectoryEntry: - - """ - OLE2 Directory Entry - """ - #[PL] parsing code moved from OleFileIO.loaddirectory - - # struct to parse directory entries: - # <: little-endian byte order - # 64s: string containing entry name in unicode (max 31 chars) + null char - # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 - # B: uint8, dir entry type (between 0 and 5) - # B: uint8, color: 0=black, 1=red - # I: uint32, index of left child node in the red-black tree, NOSTREAM if none - # I: uint32, index of right child node in the red-black tree, NOSTREAM if none - # I: uint32, index of child root node if it is a storage, else NOSTREAM - # 16s: CLSID, unique identifier (only used if it is a storage) - # I: uint32, user flags - # 8s: uint64, creation timestamp or zero - # 8s: uint64, modification timestamp or zero - # I: uint32, SID of first sector if stream or ministream, SID of 1st sector - # of stream containing ministreams if root entry, 0 otherwise - # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise - # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise - STRUCT_DIRENTRY = '<64sHBBIII16sI8s8sIII' - # size of a directory entry: 128 bytes - DIRENTRY_SIZE = 128 - assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE - - - def __init__(self, entry, sid, olefile): - """ - Constructor for an _OleDirectoryEntry object. - Parses a 128-bytes entry from the OLE Directory stream. 
- - entry : string (must be 128 bytes long) - sid : index of this directory entry in the OLE file directory - olefile: OleFileIO containing this directory entry - """ - self.sid = sid - # ref to olefile is stored for future use - self.olefile = olefile - # kids is a list of children entries, if this entry is a storage: - # (list of _OleDirectoryEntry objects) - self.kids = [] - # kids_dict is a dictionary of children entries, indexed by their - # name in lowercase: used to quickly find an entry, and to detect - # duplicates - self.kids_dict = {} - # flag used to detect if the entry is referenced more than once in - # directory: - self.used = False - # decode DirEntry - ( - name, - namelength, - self.entry_type, - self.color, - self.sid_left, - self.sid_right, - self.sid_child, - clsid, - self.dwUserFlags, - self.createTime, - self.modifyTime, - self.isectStart, - sizeLow, - sizeHigh - ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry) - if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]: - olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type') - # only first directory entry can (and should) be root: - if self.entry_type == STGTY_ROOT and sid != 0: - olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry') - if sid == 0 and self.entry_type != STGTY_ROOT: - olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry') - #debug (struct.unpack(fmt_entry, entry[:len_entry])) - # name should be at most 31 unicode characters + null character, - # so 64 bytes in total (31*2 + 2): - if namelength>64: - olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length') - # if exception not raised, namelength is set to the maximum value: - namelength = 64 - # only characters without ending null char are kept: - name = name[:(namelength-2)] - # name is converted from unicode to Latin-1: - self.name = _unicode(name) - - debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) - debug(' - type: %d' 
% self.entry_type) - debug(' - sect: %d' % self.isectStart) - debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left, - self.sid_right, self.sid_child)) - - # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes - # sectors, BUT apparently some implementations set it as 0xFFFFFFFFL, 1 - # or some other value so it cannot be raised as a defect in general: - if olefile.sectorsize == 512: - if sizeHigh != 0 and sizeHigh != 0xFFFFFFFFL: - debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' % - (olefile.sectorsize, sizeLow, sizeHigh, sizeHigh)) - olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size') - self.size = sizeLow - else: - self.size = sizeLow + (long(sizeHigh)<<32) - debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh)) - - self.clsid = _clsid(clsid) - # a storage should have a null size, BUT some implementations such as - # Word 8 for Mac seem to allow non-null values => Potential defect: - if self.entry_type == STGTY_STORAGE and self.size != 0: - olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0') - # check if stream is not already referenced elsewhere: - if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0: - if self.size < olefile.minisectorcutoff \ - and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT - # ministream object - minifat = True - else: - minifat = False - olefile._check_duplicate_stream(self.isectStart, minifat) - - - - def build_storage_tree(self): - """ - Read and build the red-black tree attached to this _OleDirectoryEntry - object, if it is a storage. - Note that this method builds a tree of all subentries, so it should - only be called for the root object once. - """ - debug('build_storage_tree: SID=%d - %s - sid_child=%d' - % (self.sid, repr(self.name), self.sid_child)) - if self.sid_child != NOSTREAM: - # if child SID is not NOSTREAM, then this entry is a storage. 
- # Let's walk through the tree of children to fill the kids list: - self.append_kids(self.sid_child) - - # Note from OpenOffice documentation: the safest way is to - # recreate the tree because some implementations may store broken - # red-black trees... - - # in the OLE file, entries are sorted on (length, name). - # for convenience, we sort them on name instead: - # (see __cmp__ method in this class) - self.kids.sort() - - - def append_kids(self, child_sid): - """ - Walk through red-black tree of children of this directory entry to add - all of them to the kids list. (recursive method) - - child_sid : index of child directory entry to use, or None when called - first time for the root. (only used during recursion) - """ - #[PL] this method was added to use simple recursion instead of a complex - # algorithm. - # if this is not a storage or a leaf of the tree, nothing to do: - if child_sid == NOSTREAM: - return - # check if child SID is in the proper range: - if child_sid<0 or child_sid>=len(self.olefile.direntries): - self.olefile._raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range') - # get child direntry: - child = self.olefile._load_direntry(child_sid) #direntries[child_sid] - debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d' - % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child)) - # the directory entries are organized as a red-black tree. - # (cf. 
Wikipedia for details) - # First walk through left side of the tree: - self.append_kids(child.sid_left) - # Check if its name is not already used (case-insensitive): - name_lower = child.name.lower() - if self.kids_dict.has_key(name_lower): - self.olefile._raise_defect(DEFECT_INCORRECT, - "Duplicate filename in OLE storage") - # Then the child_sid _OleDirectoryEntry object is appended to the - # kids list and dictionary: - self.kids.append(child) - self.kids_dict[name_lower] = child - # Check if kid was not already referenced in a storage: - if child.used: - self.olefile._raise_defect(DEFECT_INCORRECT, - 'OLE Entry referenced more than once') - child.used = True - # Finally walk through right side of the tree: - self.append_kids(child.sid_right) - # Afterwards build kid's own tree if it's also a storage: - child.build_storage_tree() - - - def __cmp__(self, other): - "Compare entries by name" - return cmp(self.name, other.name) - #TODO: replace by the same function as MS implementation ? - # (order by name length first, then case-insensitive order) - - - def dump(self, tab = 0): - "Dump this entry, and all its subentries (for debug purposes only)" - TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", - "(property)", "(root)"] - print " "*tab + repr(self.name), TYPES[self.entry_type], - if self.entry_type in (STGTY_STREAM, STGTY_ROOT): - print self.size, "bytes", - print - if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid: - print " "*tab + "{%s}" % self.clsid - - for kid in self.kids: - kid.dump(tab + 2) - - -#--- OleFileIO ---------------------------------------------------------------- - -class OleFileIO: - """ - OLE container object - - This class encapsulates the interface to an OLE 2 structured - storage file. Use the {@link listdir} and {@link openstream} methods to - access the contents of this file. - - Object names are given as a list of strings, one for each subentry - level. The root entry should be omitted. 
For example, the following - code extracts all image streams from a Microsoft Image Composer file: - - ole = OleFileIO("fan.mic") - - for entry in ole.listdir(): - if entry[1:2] == "Image": - fin = ole.openstream(entry) - fout = open(entry[0:1], "wb") - while 1: - s = fin.read(8192) - if not s: - break - fout.write(s) - - You can use the viewer application provided with the Python Imaging - Library to view the resulting files (which happens to be standard - TIFF files). - """ - - def __init__(self, filename = None, raise_defects=DEFECT_FATAL): - """ - Constructor for OleFileIO class. - - filename: file to open. - raise_defects: minimal level for defects to be raised as exceptions. - (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a - security-oriented application, see source code for details) - """ - self._raise_defects_level = raise_defects - if filename: - self.open(filename) - - - def _raise_defect(self, defect_level, message): - """ - This method should be called for any defect found during file parsing. - It may raise an IOError exception according to the minimal level chosen - for the OleFileIO object. - - defect_level: defect level, possible values are: - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect - DEFECT_POTENTIAL : a potential defect - DEFECT_INCORRECT : an error according to specifications, but parsing can go on - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible - message: string describing the defect, used with raised exception. - """ - # added by [PL] - if defect_level >= self._raise_defects_level: - raise IOError, message - - - def open(self, filename): - """ - Open an OLE2 file. - Reads the header, FAT and directory. 
- - filename: string-like or file-like object - """ - #[PL] check if filename is a string-like or file-like object: - # (it is better to check for a read() method) - if hasattr(filename, 'read'): - # file-like object - self.fp = filename - else: - # string-like object - self.fp = open(filename, "rb") - # old code fails if filename is not a plain string: - #if type(filename) == type(""): - # self.fp = open(filename, "rb") - #else: - # self.fp = filename - - # lists of streams in FAT and MiniFAT, to detect duplicate references - # (list of indexes of first sectors of each stream) - self._used_streams_fat = [] - self._used_streams_minifat = [] - - header = self.fp.read(512) - - if len(header) != 512 or header[:8] != MAGIC: - self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file") - - # [PL] header structure according to AAF specifications: - ##Header - ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)] - ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, - ## // 0x1a, 0xe1} for current version - ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/ - ## // GetClassFile uses root directory class id) - ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is - ## // written by reference implementation - ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for - ## // 512-byte sectors, 4 for 4 KB sectors - ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering - ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two; - ## // typically 9 indicating 512-byte sectors - ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two; - ## // typically 6 indicating 64-byte mini-sectors - ##USHORT _usReserved; // [22H,02] reserved, must be zero - ##ULONG _ulReserved1; // [24H,04] reserved, must be zero - ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors, - ## // number of SECTs in directory chain for 4 KB - ## // 
sectors - ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain - ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain - ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must - ## // be zero. The reference implementation - ## // does not support transactions - ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream; - ## // typically 4096 bytes - ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain - ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain - ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain - ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain - ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors - ##}; - - # [PL] header decoding: - # '<' indicates little-endian byte ordering for Intel (cf. struct module help) - fmt_header = '<8s16sHHHHHHLLLLLLLLLL' - header_size = struct.calcsize(fmt_header) - debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) ) - header1 = header[:header_size] - ( - self.Sig, - self.clsid, - self.MinorVersion, - self.DllVersion, - self.ByteOrder, - self.SectorShift, - self.MiniSectorShift, - self.Reserved, self.Reserved1, - self.csectDir, - self.csectFat, - self.sectDirStart, - self.signature, - self.MiniSectorCutoff, - self.MiniFatStart, - self.csectMiniFat, - self.sectDifStart, - self.csectDif - ) = struct.unpack(fmt_header, header1) - debug( struct.unpack(fmt_header, header1)) - - if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': - # OLE signature should always be present - self._raise_defect(DEFECT_FATAL, "incorrect OLE signature") - if self.clsid != '\x00'*16: - # according to AAF specs, CLSID should always be zero - self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") - debug( "MinorVersion = %d" % self.MinorVersion ) - debug( "DllVersion = %d" % self.DllVersion ) - if self.DllVersion not in [3, 4]: - # version 3: usual format, 512 
bytes per sector - # version 4: large format, 4K per sector - self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") - debug( "ByteOrder = %X" % self.ByteOrder ) - if self.ByteOrder != 0xFFFE: - # For now only common little-endian documents are handled correctly - self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") - # TODO: add big-endian support for documents created on Mac ? - self.SectorSize = 2**self.SectorShift - debug( "SectorSize = %d" % self.SectorSize ) - if self.SectorSize not in [512, 4096]: - self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header") - if (self.DllVersion==3 and self.SectorSize!=512) \ - or (self.DllVersion==4 and self.SectorSize!=4096): - self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header") - self.MiniSectorSize = 2**self.MiniSectorShift - debug( "MiniSectorSize = %d" % self.MiniSectorSize ) - if self.MiniSectorSize not in [64]: - self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header") - if self.Reserved != 0 or self.Reserved1 != 0: - self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)") - debug( "csectDir = %d" % self.csectDir ) - if self.SectorSize==512 and self.csectDir!=0: - self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header") - debug( "csectFat = %d" % self.csectFat ) - debug( "sectDirStart = %X" % self.sectDirStart ) - debug( "signature = %d" % self.signature ) - # Signature should be zero, BUT some implementations do not follow this - # rule => only a potential defect: - if self.signature != 0: - self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)") - debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff ) - debug( "MiniFatStart = %X" % self.MiniFatStart ) - debug( "csectMiniFat = %d" % self.csectMiniFat ) - debug( "sectDifStart = %X" % self.sectDifStart ) - debug( "csectDif = %d" % self.csectDif ) - - # calculate the number of sectors in 
the file - # (-1 because header doesn't count) - filesize = os.path.getsize(filename) - self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1 - debug( "Number of sectors in the file: %d" % self.nb_sect ) - - # file clsid (probably never used, so we don't store it) - clsid = _clsid(header[8:24]) - self.sectorsize = self.SectorSize #1 << i16(header, 30) - self.minisectorsize = self.MiniSectorSize #1 << i16(header, 32) - self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56) - - # check known streams for duplicate references (these are always in FAT, - # never in MiniFAT): - self._check_duplicate_stream(self.sectDirStart) - # check MiniFAT only if it is not empty: - if self.csectMiniFat: - self._check_duplicate_stream(self.MiniFatStart) - # check DIFAT only if it is not empty: - if self.csectDif: - self._check_duplicate_stream(self.sectDifStart) - - # Load file allocation tables - self.loadfat(header) - # Load direcory. This sets both the direntries list (ordered by sid) - # and the root (ordered by hierarchy) members. - self.loaddirectory(self.sectDirStart)#i32(header, 48)) - self.ministream = None - self.minifatsect = self.MiniFatStart #i32(header, 60) - - - def _check_duplicate_stream(self, first_sect, minifat=False): - """ - Checks if a stream has not been already referenced elsewhere. - This method should only be called once for each known stream, and only - if stream size is not null. 
- first_sect: index of first sector of the stream in FAT - minifat: if True, stream is located in the MiniFAT, else in the FAT - """ - if minifat: - debug('_check_duplicate_stream: sect=%d in MiniFAT' % first_sect) - used_streams = self._used_streams_minifat - else: - debug('_check_duplicate_stream: sect=%d in FAT' % first_sect) - # some values can be safely ignored (not a real stream): - if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): - return - used_streams = self._used_streams_fat - #TODO: would it be more efficient using a dict or hash values, instead - # of a list of long ? - if first_sect in used_streams: - self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') - else: - used_streams.append(first_sect) - - - def dumpfat(self, fat, firstindex=0): - "Displays a part of FAT in human-readable form for debugging purpose" - # [PL] added only for debug - if not DEBUG_MODE: - return - # dictionary to convert special FAT values in human-readable strings - VPL=8 # valeurs par ligne (8+1 * 8+1 = 81) - fatnames = { - FREESECT: "..free..", - ENDOFCHAIN: "[ END. ]", - FATSECT: "FATSECT ", - DIFSECT: "DIFSECT " - } - nbsect = len(fat) - nlines = (nbsect+VPL-1)/VPL - print "index", - for i in range(VPL): - print ("%8X" % i), - print "" - for l in range(nlines): - index = l*VPL - print ("%8X:" % (firstindex+index)), - for i in range(index, index+VPL): - if i>=nbsect: - break - sect = fat[i] - if sect in fatnames: - nom = fatnames[sect] - else: - if sect == i+1: - nom = " --->" - else: - nom = "%8X" % sect - print nom, - print "" - - - def dumpsect(self, sector, firstindex=0): - "Displays a sector in a human-readable form, for debugging purpose." 
- if not DEBUG_MODE: - return - VPL=8 # number of values per line (8+1 * 8+1 = 81) - tab = array.array(UINT32, sector) - nbsect = len(tab) - nlines = (nbsect+VPL-1)/VPL - print "index", - for i in range(VPL): - print ("%8X" % i), - print "" - for l in range(nlines): - index = l*VPL - print ("%8X:" % (firstindex+index)), - for i in range(index, index+VPL): - if i>=nbsect: - break - sect = tab[i] - nom = "%8X" % sect - print nom, - print "" - - - - def loadfat_sect(self, sect): - """ - Adds the indexes of the given sector to the FAT - sect: string containing the first FAT sector, or array of long integers - return: index of last FAT sector. - """ - # a FAT sector is an array of ulong integers. - if isinstance(sect, array.array): - # if sect is already an array it is directly used - fat1 = sect - else: - # if it's a raw sector, it is parsed in an array - fat1 = array.array(UINT32, sect) - self.dumpsect(sect) - # The FAT is a sector chain starting at the first index of itself. - for isect in fat1: - #print "isect = %X" % isect - if isect == ENDOFCHAIN or isect == FREESECT: - # the end of the sector chain has been reached - break - # read the FAT sector - s = self.getsect(isect) - # parse it as an array of 32 bits integers, and add it to the - # global FAT array - self.fat = self.fat + array.array(UINT32, s) - return isect - - - def loadfat(self, header): - """ - Load the FAT table. - """ - # The header contains a sector numbers - # for the first 109 FAT sectors. Additional sectors are - # described by DIF blocks - - sect = header[76:512] - debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)/4) ) - #fat = [] - # [PL] FAT is an array of 32 bits unsigned ints, it's more effective - # to use an array than a list in Python. 
- # It's initialized as empty first: - self.fat = array.array(UINT32) - self.loadfat_sect(sect) - #self.dumpfat(self.fat) -## for i in range(0, len(sect), 4): -## ix = i32(sect, i) -## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: -## if ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: -## break -## s = self.getsect(ix) -## #fat = fat + map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) -## fat = fat + array.array(UINT32, s) - if self.csectDif != 0: - # [PL] There's a DIFAT because file is larger than 6.8MB - # some checks just in case: - if self.csectFat <= 109: - # there must be at least 109 blocks in header and the rest in - # DIFAT, so number of sectors must be >109. - self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors') - if self.sectDifStart >= self.nb_sect: - # initial DIFAT block index must be valid - self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') - debug( "DIFAT analysis..." ) - # We compute the necessary number of DIFAT sectors : - # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) - nb_difat = (self.csectFat-109 + 126)/127 - debug( "nb_difat = %d" % nb_difat ) - if self.csectDif != nb_difat: - raise IOError, 'incorrect DIFAT' - isect_difat = self.sectDifStart - for i in xrange(nb_difat): - debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) - #TODO: check if corresponding FAT SID = DIFSECT - sector_difat = self.getsect(isect_difat) - difat = array.array(UINT32, sector_difat) - self.dumpsect(sector_difat) - self.loadfat_sect(difat[:127]) - # last DIFAT pointer is next DIFAT sector: - isect_difat = difat[127] - debug( "next DIFAT sector: %X" % isect_difat ) - # checks: - if isect_difat not in [ENDOFCHAIN, FREESECT]: - # last DIFAT pointer value must be ENDOFCHAIN or FREESECT - raise IOError, 'incorrect end of DIFAT' -## if len(self.fat) != self.csectFat: -## # FAT should contain csectFat blocks -## print "FAT length: %d instead of %d" % (len(self.fat), 
self.csectFat) -## raise IOError, 'incorrect DIFAT' - # since FAT is read from fixed-size sectors, it may contain more values - # than the actual number of sectors in the file. - # Keep only the relevant sector indexes: - if len(self.fat) > self.nb_sect: - debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect)) - self.fat = self.fat[:self.nb_sect] - debug('\nFAT:') - self.dumpfat(self.fat) - - - def loadminifat(self): - """ - Load the MiniFAT table. - """ - # MiniFAT is stored in a standard sub-stream, pointed to by a header - # field. - # NOTE: there are two sizes to take into account for this stream: - # 1) Stream size is calculated according to the number of sectors - # declared in the OLE header. This allocated stream may be more than - # needed to store the actual sector indexes. - # (self.csectMiniFat is the number of sectors of size self.SectorSize) - stream_size = self.csectMiniFat * self.SectorSize - # 2) Actually used size is calculated by dividing the MiniStream size - # (given by root entry size) by the size of mini sectors, *4 for - # 32 bits indexes: - nb_minisectors = (self.root.size + self.MiniSectorSize-1) / self.MiniSectorSize - used_size = nb_minisectors * 4 - debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' % - (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors)) - if used_size > stream_size: - # This is not really a problem, but may indicate a wrong implementation: - self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT') - # In any case, first read stream_size: - s = self._open(self.minifatsect, stream_size, force_FAT=True).read() - #[PL] Old code replaced by an array: - #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) - self.minifat = array.array(UINT32, s) - # Then shrink the array to used size, to avoid indexes out of MiniStream: - debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), 
nb_minisectors)) - self.minifat = self.minifat[:nb_minisectors] - debug('loadminifat(): len=%d' % len(self.minifat)) - debug('\nMiniFAT:') - self.dumpfat(self.minifat) - - def getsect(self, sect): - """ - Read given sector from file on disk. - sect: sector index - returns a string containing the sector data. - """ - # [PL] this original code was wrong when sectors are 4KB instead of - # 512 bytes: - #self.fp.seek(512 + self.sectorsize * sect) - #[PL]: added safety checks: - #print "getsect(%X)" % sect - try: - self.fp.seek(self.sectorsize * (sect+1)) - except: - debug('getsect(): sect=%X, seek=%d, filesize=%d' % - (sect, self.sectorsize*(sect+1), os.path.getsize(self.fp.name))) - self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') - sector = self.fp.read(self.sectorsize) - if len(sector) != self.sectorsize: - debug('getsect(): sect=%X, read=%d, sectorsize=%d' % - (sect, len(sector), self.sectorsize)) - self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') - return sector - - - def loaddirectory(self, sect): - """ - Load the directory. - sect: sector index of directory stream. - """ - # The directory is stored in a standard - # substream, independent of its size. 

        # open directory stream as a read-only file:
        # (stream size is not known in advance)
        self.directory_fp = self._open(sect)

        #[PL] to detect malformed documents and avoid DoS attacks, the maximum
        # number of directory entries can be calculated:
        # (integer division: a trailing incomplete 128-byte entry is ignored)
        max_entries = self.directory_fp.size / 128
        debug('loaddirectory: size=%d, max_entries=%d' %
            (self.directory_fp.size, max_entries))

        # Create list of directory entries
        #self.direntries = []
        # We start with a list of "None" object
        self.direntries = [None] * max_entries
##        for sid in xrange(max_entries):
##            entry = fp.read(128)
##            if not entry:
##                break
##            self.direntries.append(_OleDirectoryEntry(entry, sid, self))
        # load root entry:
        root_entry = self._load_direntry(0)
        # Root entry is the first entry:
        self.root = self.direntries[0]
        # read and build all storage trees, starting from the root:
        self.root.build_storage_tree()


    def _load_direntry (self, sid):
        """
        Load a directory entry from the directory.
        This method should only be called once for each storage/stream when
        loading the directory.

        sid: index of storage/stream in the directory.
        return: a _OleDirectoryEntry object
        raise: IOError if the entry has already been referenced.
        """
        # check if SID is OK:
        if sid<0 or sid>=len(self.direntries):
            self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
        # check if entry was already referenced:
        if self.direntries[sid] is not None:
            self._raise_defect(DEFECT_INCORRECT,
                "double reference for OLE stream/storage")
            # if exception not raised, return the object
            # (cached entry: _raise_defect only raises at or above the
            # chosen defect level)
            return self.direntries[sid]
        self.directory_fp.seek(sid * 128)
        entry = self.directory_fp.read(128)
        self.direntries[sid] = _OleDirectoryEntry(entry, sid, self)
        return self.direntries[sid]


    def dumpdirectory(self):
        """
        Dump directory (for debugging only)
        """
        self.root.dump()


    def _open(self, start, size = 0x7FFFFFFF, force_FAT=False):
        """
        Open a stream, either in FAT or MiniFAT according to its size.
        (openstream helper)

        start: index of first sector
        size: size of stream (or nothing if size is unknown)
        force_FAT: if False (default), stream will be opened in FAT or MiniFAT
            according to size. If True, it will always be opened in FAT.
- """ - debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' % - (start, size, str(force_FAT))) - # stream size is compared to the MiniSectorCutoff threshold: - if size < self.minisectorcutoff and not force_FAT: - # ministream object - if not self.ministream: - # load MiniFAT if it wasn't already done: - self.loadminifat() - # The first sector index of the miniFAT stream is stored in the - # root directory entry: - size_ministream = self.root.size - debug('Opening MiniStream: sect=%d, size=%d' % - (self.root.isectStart, size_ministream)) - self.ministream = self._open(self.root.isectStart, - size_ministream, force_FAT=True) - return _OleStream(self.ministream, start, size, 0, - self.minisectorsize, self.minifat) - else: - # standard stream - return _OleStream(self.fp, start, size, 512, - self.sectorsize, self.fat) - - - def _list(self, files, prefix, node): - """ - (listdir helper) - files: list of files to fill in - prefix: current location in storage tree (list of names) - node: current node (_OleDirectoryEntry object) - """ - prefix = prefix + [node.name] - for entry in node.kids: - if entry.kids: - self._list(files, prefix, entry) - else: - files.append(prefix[1:] + [entry.name]) - - - def listdir(self): - """ - Return a list of streams stored in this file - """ - files = [] - self._list(files, [], self.root) - return files - - - def _find(self, filename): - """ - Returns directory entry of given filename. (openstream helper) - Note: this method is case-insensitive. - - filename: path of stream in storage tree (except root entry), either: - - a string using Unix path syntax, for example: - 'storage_1/storage_1.2/stream' - - a list of storage filenames, path to the desired stream/storage. 
- Example: ['storage_1', 'storage_1.2', 'stream'] - return: sid of requested filename - raise IOError if file not found - """ - - # if filename is a string instead of a list, split it on slashes to - # convert to a list: - if isinstance(filename, basestring): - filename = filename.split('/') - # walk across storage tree, following given path: - node = self.root - for name in filename: - for kid in node.kids: - if kid.name.lower() == name.lower(): - break - else: - raise IOError, "file not found" - node = kid - return node.sid - - - def openstream(self, filename): - """ - Open a stream as a read-only file object (StringIO). - - filename: path of stream in storage tree (except root entry), either: - - a string using Unix path syntax, for example: - 'storage_1/storage_1.2/stream' - - a list of storage filenames, path to the desired stream/storage. - Example: ['storage_1', 'storage_1.2', 'stream'] - return: file object (read-only) - raise IOError if filename not found, or if this is not a stream. - """ - sid = self._find(filename) - entry = self.direntries[sid] - if entry.entry_type != STGTY_STREAM: - raise IOError, "this file is not a stream" - return self._open(entry.isectStart, entry.size) - - - def get_type(self, filename): - """ - Test if given filename exists as a stream or a storage in the OLE - container, and return its type. - - filename: path of stream in storage tree. (see openstream for syntax) - return: False if object does not exist, its entry type (>0) otherwise: - - STGTY_STREAM: a stream - - STGTY_STORAGE: a storage - - STGTY_ROOT: the root entry - """ - try: - sid = self._find(filename) - entry = self.direntries[sid] - return entry.entry_type - except: - return False - - - def exists(self, filename): - """ - Test if given filename exists as a stream or a storage in the OLE - container. - - filename: path of stream in storage tree. (see openstream for syntax) - return: True if object exist, else False. 
- """ - try: - sid = self._find(filename) - return True - except: - return False - - - def get_size(self, filename): - """ - Return size of a stream in the OLE container, in bytes. - - filename: path of stream in storage tree (see openstream for syntax) - return: size in bytes (long integer) - raise: IOError if file not found, TypeError if this is not a stream. - """ - sid = self._find(filename) - entry = self.direntries[sid] - if entry.entry_type != STGTY_STREAM: - #TODO: Should it return zero instead of raising an exception ? - raise TypeError, 'object is not an OLE stream' - return entry.size - - - def get_rootentry_name(self): - """ - Return root entry name. Should usually be 'Root Entry' or 'R' in most - implementations. - """ - return self.root.name - - - def getproperties(self, filename): - """ - Return properties described in substream. - - filename: path of stream in storage tree (see openstream for syntax) - return: a dictionary of values indexed by id (integer) - """ - fp = self.openstream(filename) - - data = {} - - # header - s = fp.read(28) - clsid = _clsid(s[8:24]) - - # format id - s = fp.read(20) - fmtid = _clsid(s[:16]) - fp.seek(i32(s, 16)) - - # get section - s = "****" + fp.read(i32(fp.read(4))-4) - - for i in range(i32(s, 4)): - - id = i32(s, 8+i*8) - offset = i32(s, 12+i*8) - type = i32(s, offset) - - debug ('property id=%d: type=%d offset=%X' % (id, type, offset)) - - # test for common types first (should perhaps use - # a dictionary instead?) 
- - if type == VT_I2: - value = i16(s, offset+4) - if value >= 32768: - value = value - 65536 - elif type == VT_UI2: - value = i16(s, offset+4) - elif type in (VT_I4, VT_ERROR): - value = i32(s, offset+4) - elif type == VT_UI4: - value = i32(s, offset+4) # FIXME - elif type in (VT_BSTR, VT_LPSTR): - count = i32(s, offset+4) - value = s[offset+8:offset+8+count-1] - elif type == VT_BLOB: - count = i32(s, offset+4) - value = s[offset+8:offset+8+count] - elif type == VT_LPWSTR: - count = i32(s, offset+4) - value = self._unicode(s[offset+8:offset+8+count*2]) - elif type == VT_FILETIME: - value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) - # FIXME: this is a 64-bit int: "number of 100ns periods - # since Jan 1,1601". Should map this to Python time - value = value / 10000000L # seconds - elif type == VT_UI1: - value = ord(s[offset+4]) - elif type == VT_CLSID: - value = _clsid(s[offset+4:offset+20]) - elif type == VT_CF: - count = i32(s, offset+4) - value = s[offset+8:offset+8+count] - else: - value = None # everything else yields "None" - - # FIXME: add support for VT_VECTOR - - #print "%08x" % id, repr(value), - #print "(%s)" % VT[i32(s, offset) & 0xFFF] - - data[id] = value - - return data - -# -# -------------------------------------------------------------------- -# This script can be used to dump the directory of any OLE2 structured -# storage file. - -if __name__ == "__main__": - - import sys - - # [PL] display quick usage info if launched from command-line - if len(sys.argv) <= 1: - print __doc__ - print """ -Launched from command line, this script parses OLE files and prints info. - -Usage: OleFileIO_PL.py [-d] [-c] [file2 ...] 
- -Options: --d : debug mode (display a lot of debug information, for developers only) --c : check all streams (for debugging purposes) -""" - sys.exit() - - check_streams = False - for filename in sys.argv[1:]: -## try: - # OPTIONS: - if filename == '-d': - # option to switch debug mode on: - set_debug_mode(True) - continue - if filename == '-c': - # option to switch check streams mode on: - check_streams = True - continue - - ole = OleFileIO(filename, raise_defects=DEFECT_INCORRECT) - print "-" * 68 - print filename - print "-" * 68 - ole.dumpdirectory() - for streamname in ole.listdir(): - if streamname[-1][0] == "\005": - print streamname, ": properties" - props = ole.getproperties(streamname) - props = props.items() - props.sort() - for k, v in props: - #[PL]: avoid to display too large or binary values: - if isinstance(v, basestring): - if len(v) > 50: - v = v[:50] - # quick and dirty binary check: - for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20, - 21,22,23,24,25,26,27,28,29,30,31): - if chr(c) in v: - v = '(binary data)' - break - print " ", k, v - - if check_streams: - # Read all streams to check if there are errors: - print '\nChecking streams...' - for streamname in ole.listdir(): - # print name using repr() to convert binary chars to \xNN: - print '-', repr('/'.join(streamname)),'-', - st_type = ole.get_type(streamname) - if st_type == STGTY_STREAM: - print 'size %d' % ole.get_size(streamname) - # just try to read stream in memory: - ole.openstream(streamname) - else: - print 'NOT a stream : type=%d' % st_type - print '' - - #[PL] Test a few new methods: - root = ole.get_rootentry_name() - print 'Root entry name: "%s"' % root - if ole.exists('worddocument'): - print "This is a Word document." - print "type of stream 'WordDocument':", ole.get_type('worddocument') - print "size :", ole.get_size('worddocument') - if ole.exists('macros/vba'): - print "This document may contain VBA macros." 
-## except IOError, v: -## print "***", "cannot read", file, "-", v +#!/usr/local/bin/python +# -*- coding: latin-1 -*- +""" +OleFileIO_PL: + Module to read Microsoft OLE2 files (also called Structured Storage or + Microsoft Compound Document File Format), such as Microsoft Office + documents, Image Composer and FlashPix files, Outlook messages, ... + +version 0.21 2010-01-22 Philippe Lagadec - http://www.decalage.info + +Project website: http://www.decalage.info/python/olefileio + +Improved version of the OleFileIO module from PIL library v1.1.6 +See: http://www.pythonware.com/products/pil/index.htm + +The Python Imaging Library (PIL) is + Copyright (c) 1997-2005 by Secret Labs AB + Copyright (c) 1995-2005 by Fredrik Lundh +OleFileIO_PL changes are Copyright (c) 2005-2010 by Philippe Lagadec + +See source code and LICENSE.txt for information on usage and redistribution. + +WARNING: THIS IS (STILL) WORK IN PROGRESS. +""" + +__author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" +__date__ = "2010-01-22" +__version__ = '0.21' + +#--- LICENSE ------------------------------------------------------------------ + +# OleFileIO_PL is an improved version of the OleFileIO module from the +# Python Imaging Library (PIL). 
+ +# OleFileIO_PL changes are Copyright (c) 2005-2010 by Philippe Lagadec +# +# The Python Imaging Library (PIL) is +# Copyright (c) 1997-2005 by Secret Labs AB +# Copyright (c) 1995-2005 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its associated +# documentation, you agree that you have read, understood, and will comply with +# the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and its +# associated documentation for any purpose and without fee is hereby granted, +# provided that the above copyright notice appears in all copies, and that both +# that copyright notice and this permission notice appear in supporting +# documentation, and that the name of Secret Labs AB or the author(s) not be used +# in advertising or publicity pertaining to distribution of the software +# without specific, written prior permission. +# +# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS +# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. +# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, +# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +# PERFORMANCE OF THIS SOFTWARE. 
+ +#----------------------------------------------------------------------------- +# CHANGELOG: (only OleFileIO_PL changes compared to PIL 1.1.6) +# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility +# (all changes flagged with [PL]) +# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise +# exceptions in _OleStream.__init__() +# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat) +# - added some constants +# - added header values checks +# - added some docstrings +# - getsect: bugfix in case sectors >512 bytes +# - getsect: added conformity checks +# - DEBUG_MODE constant to activate debug display +# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments +# - updated license +# - converted tabs to 4 spaces +# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity +# - improved _unicode() to use Python 2.x unicode support +# - fixed bug in _OleDirectoryEntry +# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops +# - fixed _OleStream which didn't check stream size +# - added/improved many docstrings and comments +# - moved helper functions _unicode and _clsid out of +# OleFileIO class +# - improved OleFileIO._find() to add Unix path syntax +# - OleFileIO._find() is now case-insensitive +# - added get_type() and get_rootentry_name() +# - rewritten loaddirectory and _OleDirectoryEntry +# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict +# - added detection of duplicate filenames in storages +# - added detection of duplicate references to streams +# - added get_size() and exists() to _OleDirectoryEntry +# - added isOleFile to check header before parsing +# - added __all__ list to control public keywords in pydoc +# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory +# - improved _unicode(), added workarounds for Python <2.3 +# - added set_debug_mode and -d option to set debug mode +# - fixed bugs in OleFileIO.open and _OleDirectoryEntry +# - 
added safety check in main for large or binary +# properties +# - allow size>0 for storages for some implementations +# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and +# streams +# - added option '-c' in main to check all streams +# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms +# (thanks to Ben G. and Martijn for reporting the bug) +# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str +# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs + +#----------------------------------------------------------------------------- +# TODO (for version 1.0): +# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... +# - add underscore to each private method, to avoid their display in +# pydoc/epydoc documentation +# - replace all raised exceptions with _raise_defect (at least in OleFileIO) +# - merge code from _OleStream and OleFileIO.getsect to read sectors +# (maybe add a class for FAT and MiniFAT ?) +# - add method to check all streams (follow sectors chains without storing all +# stream in memory, and report anomalies) +# - use _OleDirectoryEntry.kids_dict to improve _find and _list ? +# - fix Unicode names handling (find some way to stay compatible with Py1.5.2) +# => if possible avoid converting names to Latin-1 +# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop) +# - rewrite OleFileIO.getproperties +# - improve docstrings to show more sample uses +# - see also original notes and FIXME below +# - remove all obsolete FIXMEs + +# IDEAS: +# - allow _raise_defect to raise different exceptions, not only IOError +# - provide a class with named attributes to get well-known properties of +# MS Office documents (title, author, ...) ? 
+# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for +# streams with unknown size +# - use arrays of int instead of long integers for FAT/MiniFAT, to improve +# performance and reduce memory usage ? (possible issue with values >2^31) +# - provide tests with unittest (may need write support to create samples) +# - move all debug code (and maybe dump methods) to a separate module, with +# a class which inherits OleFileIO ? +# - fix docstrings to follow epydoc format +# - add support for 4K sectors ? +# - add support for big endian byte order ? +# - create a simple OLE explorer with wxPython + +# FUTURE EVOLUTIONS to add write support: +# 1) add ability to write a stream back on disk from StringIO (same size, no +# change in FAT/MiniFAT). +# 2) rename a stream/storage if it doesn't change the RB tree +# 3) use rbtree module to update the red-black tree + any rename +# 4) remove a stream/storage: free sectors in FAT/MiniFAT +# 5) allocate new sectors in FAT/MiniFAT +# 6) create new storage/stream +#----------------------------------------------------------------------------- + +# +# THIS IS WORK IN PROGRESS +# +# The Python Imaging Library +# $Id$ +# +# stuff to deal with OLE2 Structured Storage files. this module is +# used by PIL to read Image Composer and FlashPix files, but can also +# be used to read other files of this type. +# +# History: +# 1997-01-20 fl Created +# 1997-01-22 fl Fixed 64-bit portability quirk +# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle) +# 2004-02-29 fl Changed long hex constants to signed integers +# +# Notes: +# FIXME: sort out sign problem (eliminate long hex constants) +# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"] +# FIXME: provide a glob mechanism function (using fnmatchcase) +# +# Literature: +# +# "FlashPix Format Specification, Appendix A", Kodak and Microsoft, +# September 1996. 
+# +# Quotes: +# +# "If this document and functionality of the Software conflict, +# the actual functionality of the Software represents the correct +# functionality" -- Microsoft, in the OLE format specification +# +# Copyright (c) Secret Labs AB 1997. +# Copyright (c) Fredrik Lundh 1997. +# +# See the README file for information on usage and redistribution. +# + +#------------------------------------------------------------------------------ + +import string, StringIO, struct, array, os.path, sys + +#[PL] Define explicitly the public API to avoid private objects in pydoc: +__all__ = ['OleFileIO', 'isOleFile'] + +#[PL] workaround to fix an issue with array item size on 64 bits systems: +if array.array('L').itemsize == 4: + # on 32 bits platforms, long integers in an array are 32 bits: + UINT32 = 'L' +elif array.array('I').itemsize == 4: + # on 64 bits platforms, integers in an array are 32 bits: + UINT32 = 'I' +else: + raise ValueError, 'Need to fix a bug with 32 bit arrays, please contact author...' + + +#[PL] These workarounds were inspired from the Path module +# (see http://www.jorendorff.com/articles/python/path/) +#TODO: test with old Python versions + +# Pre-2.3 workaround for booleans +try: + True, False +except NameError: + True, False = 1, 0 + +# Pre-2.3 workaround for basestring. +try: + basestring +except NameError: + try: + # is Unicode supported (Python >2.0 or >1.6 ?) + basestring = (str, unicode) + except NameError: + basestring = str + +#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode +# if False (default PIL behaviour), all filenames are converted to Latin-1. +KEEP_UNICODE_NAMES = False + +#[PL] DEBUG display mode: False by default, use set_debug_mode() or "-d" on +# command line to change it. +DEBUG_MODE = False +def debug_print(msg): + print msg +def debug_pass(msg): + pass +debug = debug_pass + +def set_debug_mode(debug_mode): + """ + Set debug mode on or off, to control display of debugging messages. 
+ mode: True or False + """ + global DEBUG_MODE, debug + DEBUG_MODE = debug_mode + if debug_mode: + debug = debug_print + else: + debug = debug_pass + +#TODO: convert this to hex +MAGIC = '\320\317\021\340\241\261\032\341' + +#[PL]: added constants for Sector IDs (from AAF specifications) +MAXREGSECT = 0xFFFFFFFAL; # maximum SECT +DIFSECT = 0xFFFFFFFCL; # (-4) denotes a DIFAT sector in a FAT +FATSECT = 0xFFFFFFFDL; # (-3) denotes a FAT sector in a FAT +ENDOFCHAIN = 0xFFFFFFFEL; # (-2) end of a virtual stream chain +FREESECT = 0xFFFFFFFFL; # (-1) unallocated sector + +#[PL]: added constants for Directory Entry IDs (from AAF specifications) +MAXREGSID = 0xFFFFFFFAL; # maximum directory entry ID +NOSTREAM = 0xFFFFFFFFL; # (-1) unallocated directory entry + +#[PL] object types in storage (from AAF specifications) +STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) +STGTY_STORAGE = 1 # element is a storage object +STGTY_STREAM = 2 # element is a stream object +STGTY_LOCKBYTES = 3 # element is an ILockBytes object +STGTY_PROPERTY = 4 # element is an IPropertyStorage object +STGTY_ROOT = 5 # element is a root storage + + +# +# -------------------------------------------------------------------- +# property types + +VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6; +VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11; +VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17; +VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23; +VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28; +VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64; +VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68; +VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72; +VT_VECTOR=0x1000; + +# map property id to name (for debugging purposes) + +VT = {} +for keyword, var in vars().items(): + if keyword[:3] == "VT_": + VT[var] = keyword + +# +# 
-------------------------------------------------------------------- +# Some common document types (root.clsid fields) + +WORD_CLSID = "00020900-0000-0000-C000-000000000046" +#TODO: check Excel, PPT, ... + +#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect() +DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect +DEFECT_POTENTIAL = 20 # a potential defect +DEFECT_INCORRECT = 30 # an error according to specifications, but parsing + # can go on +DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is + # impossible + +#[PL] add useful constants to __all__: +for key in vars().keys(): + if key.startswith('STGTY_') or key.startswith('DEFECT_'): + __all__.append(key) + + +#--- FUNCTIONS ---------------------------------------------------------------- + +def isOleFile (filename): + """ + Test if file is an OLE container (according to its header). + filename: file name or path (str, unicode) + return: True if OLE, False otherwise. + """ + f = open(filename, 'rb') + header = f.read(len(MAGIC)) + if header == MAGIC: + return True + else: + return False + + +#TODO: replace i16 and i32 with more readable struct.unpack equivalent +def i16(c, o = 0): + """ + Converts a 2-bytes (16 bits) string to an integer. + + c: string containing bytes to convert + o: offset of bytes to convert in string + """ + return ord(c[o])+(ord(c[o+1])<<8) + + +def i32(c, o = 0): + """ + Converts a 4-bytes (32 bits) string to an integer. + + c: string containing bytes to convert + o: offset of bytes to convert in string + """ + return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24)) + # [PL]: added int() because "<<" gives long int since Python 2.4 + + +def _clsid(clsid): + """ + Converts a CLSID to a human-readable string. + clsid: string of length 16. 
+ """ + assert len(clsid) == 16 + if clsid == "\0" * len(clsid): + return "" + return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % + ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + + tuple(map(ord, clsid[8:16])))) + + + +# UNICODE support for Old Python versions: +# (necessary to handle storages/streams names which use Unicode) + +try: + # is Unicode supported ? + unicode + + def _unicode(s, errors='replace'): + """ + Map unicode string to Latin 1. (Python with Unicode support) + + s: UTF-16LE unicode string to convert to Latin-1 + errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode() + """ + #TODO: test if it OleFileIO works with Unicode strings, instead of + # converting to Latin-1. + try: + # First the string is converted to plain Unicode: + # (assuming it is encoded as UTF-16 little-endian) + u = s.decode('UTF-16LE', errors) + if KEEP_UNICODE_NAMES: + return u + else: + # Second the unicode string is converted to Latin-1 + return u.encode('latin_1', errors) + except: + # there was an error during Unicode to Latin-1 conversion: + raise IOError, 'incorrect Unicode name' + +except NameError: + def _unicode(s, errors='replace'): + """ + Map unicode string to Latin 1. (Python without native Unicode support) + + s: UTF-16LE unicode string to convert to Latin-1 + errors: 'replace', 'ignore' or 'strict'. (ignored in this version) + """ + # If the unicode function does not exist, we assume this is an old + # Python version without Unicode support. + # Null bytes are simply removed (this only works with usual Latin-1 + # strings which do not contain unicode characters>256): + return filter(ord, s) + + + + +#=== CLASSES ================================================================== + +#--- _OleStream --------------------------------------------------------------- + +class _OleStream(StringIO.StringIO): + """ + OLE2 Stream + + Returns a read-only file object which can be used to read + the contents of a OLE stream (instance of the StringIO class). 
+ To open a stream, use the openstream method in the OleFile class. + + This function can be used with either ordinary streams, + or ministreams, depending on the offset, sectorsize, and + fat table arguments. + + Attributes: + - size: actual size of data stream, after it was opened. + """ + + # FIXME: should store the list of sects obtained by following + # the fat chain, and load new sectors on demand instead of + # loading it all in one go. + + def __init__(self, fp, sect, size, offset, sectorsize, fat): + """ + Constructor for _OleStream class. + + fp : file object, the OLE container or the MiniFAT stream + sect : sector index of first sector in the stream + size : total size of the stream + offset : offset in bytes for the first FAT or MiniFAT sector + sectorsize: size of one sector + fat : array/list of sector indexes (FAT or MiniFAT) + return : a StringIO instance containing the OLE stream + """ + debug('_OleStream.__init__:') + debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s' + %(sect,sect,size,offset,sectorsize,len(fat), repr(fp))) + # for debugging messages, size of file where stream is read: + if isinstance(fp, StringIO.StringIO): + filesize = len(fp.getvalue()) # file in MiniFAT + else: + filesize = os.path.getsize(fp.name) # file on disk + #[PL] To detect malformed documents with FAT loops, we compute the + # expected number of sectors in the stream: + unknown_size = False + if size==0x7FFFFFFF: + # this is the case when called from OleFileIO._open(), and stream + # size is not known in advance (for example when reading the + # Directory stream). 
Then we can only guess maximum size: + size = len(fat)*sectorsize + # and we keep a record that size was unknown: + unknown_size = True + debug(' stream with UNKNOWN SIZE') + nb_sectors = (size + (sectorsize-1)) / sectorsize + debug('nb_sectors = %d' % nb_sectors) + # This number should (at least) be less than the total number of + # sectors in the given FAT: + if nb_sectors > len(fat): + raise IOError, 'malformed OLE document, stream too large' + # optimization(?): data is first a list of strings, and join() is called + # at the end to concatenate all in one string. + # (this may not be really useful with recent Python versions) + data = [] + # if size is zero, then first sector index should be ENDOFCHAIN: + if size == 0 and sect != ENDOFCHAIN: + debug('size == 0 and sect != ENDOFCHAIN:') + raise IOError, 'incorrect OLE sector index for empty stream' + #[PL] A fixed-length for loop is used instead of an undefined while + # loop to avoid DoS attacks: + for i in xrange(nb_sectors): + # Sector index may be ENDOFCHAIN, but only if size was unknown + if sect == ENDOFCHAIN: + if unknown_size: + break + else: + # else this means that the stream is smaller than declared: + debug('sect=ENDOFCHAIN before expected size') + raise IOError, 'incomplete OLE stream' + # sector index should be within FAT: + if sect<0 or sect>=len(fat): + debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat))) + debug('i=%d / nb_sectors=%d' %(i, nb_sectors)) +## tmp_data = string.join(data, "") +## f = open('test_debug.bin', 'wb') +## f.write(tmp_data) +## f.close() +## debug('data read so far: %d bytes' % len(tmp_data)) + raise IOError, 'incorrect OLE FAT, sector index out of range' + #TODO: merge this code with OleFileIO.getsect() ? 
+ #TODO: check if this works with 4K sectors: + try: + fp.seek(offset + sectorsize * sect) + except: + debug('sect=%d, seek=%d, filesize=%d' % + (sect, offset+sectorsize*sect, filesize)) + raise IOError, 'OLE sector index out of range' + sector_data = fp.read(sectorsize) + # [PL] check if there was enough data: + # Note: if sector is the last of the file, sometimes it is not a + # complete sector (of 512 or 4K), so we may read less than + # sectorsize. + if len(sector_data)!=sectorsize and sect!=(len(fat)-1): + debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' % + (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data))) + debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data))) + raise IOError, 'incomplete OLE sector' + data.append(sector_data) + # jump to next sector in the FAT: + try: + sect = fat[sect] + except IndexError: + # [PL] if pointer is out of the FAT an exception is raised + raise IOError, 'incorrect OLE FAT, sector index out of range' + #[PL] Last sector should be a "end of chain" marker: + if sect != ENDOFCHAIN: + raise IOError, 'incorrect last sector index in OLE stream' + data = string.join(data, "") + # Data is truncated to the actual stream size: + if len(data) >= size: + data = data[:size] + # actual stream size is stored for future use: + self.size = size + elif unknown_size: + # actual stream size was not known, now we know the size of read + # data: + self.size = len(data) + else: + # read data is less than expected: + debug('len(data)=%d, size=%d' % (len(data), size)) + raise IOError, 'OLE stream size is less than declared' + # when all data is read in memory, StringIO constructor is called + StringIO.StringIO.__init__(self, data) + # Then the _OleStream object can be used as a read-only file object. 
+ + +#--- _OleDirectoryEntry ------------------------------------------------------- + +class _OleDirectoryEntry: + + """ + OLE2 Directory Entry + """ + #[PL] parsing code moved from OleFileIO.loaddirectory + + # struct to parse directory entries: + # <: little-endian byte order + # 64s: string containing entry name in unicode (max 31 chars) + null char + # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 + # B: uint8, dir entry type (between 0 and 5) + # B: uint8, color: 0=black, 1=red + # I: uint32, index of left child node in the red-black tree, NOSTREAM if none + # I: uint32, index of right child node in the red-black tree, NOSTREAM if none + # I: uint32, index of child root node if it is a storage, else NOSTREAM + # 16s: CLSID, unique identifier (only used if it is a storage) + # I: uint32, user flags + # 8s: uint64, creation timestamp or zero + # 8s: uint64, modification timestamp or zero + # I: uint32, SID of first sector if stream or ministream, SID of 1st sector + # of stream containing ministreams if root entry, 0 otherwise + # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise + # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise + STRUCT_DIRENTRY = '<64sHBBIII16sI8s8sIII' + # size of a directory entry: 128 bytes + DIRENTRY_SIZE = 128 + assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE + + + def __init__(self, entry, sid, olefile): + """ + Constructor for an _OleDirectoryEntry object. + Parses a 128-bytes entry from the OLE Directory stream. 
+ + entry : string (must be 128 bytes long) + sid : index of this directory entry in the OLE file directory + olefile: OleFileIO containing this directory entry + """ + self.sid = sid + # ref to olefile is stored for future use + self.olefile = olefile + # kids is a list of children entries, if this entry is a storage: + # (list of _OleDirectoryEntry objects) + self.kids = [] + # kids_dict is a dictionary of children entries, indexed by their + # name in lowercase: used to quickly find an entry, and to detect + # duplicates + self.kids_dict = {} + # flag used to detect if the entry is referenced more than once in + # directory: + self.used = False + # decode DirEntry + ( + name, + namelength, + self.entry_type, + self.color, + self.sid_left, + self.sid_right, + self.sid_child, + clsid, + self.dwUserFlags, + self.createTime, + self.modifyTime, + self.isectStart, + sizeLow, + sizeHigh + ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry) + if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]: + olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type') + # only first directory entry can (and should) be root: + if self.entry_type == STGTY_ROOT and sid != 0: + olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry') + if sid == 0 and self.entry_type != STGTY_ROOT: + olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry') + #debug (struct.unpack(fmt_entry, entry[:len_entry])) + # name should be at most 31 unicode characters + null character, + # so 64 bytes in total (31*2 + 2): + if namelength>64: + olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length') + # if exception not raised, namelength is set to the maximum value: + namelength = 64 + # only characters without ending null char are kept: + name = name[:(namelength-2)] + # name is converted from unicode to Latin-1: + self.name = _unicode(name) + + debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) + debug(' - type: %d' 
% self.entry_type) + debug(' - sect: %d' % self.isectStart) + debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left, + self.sid_right, self.sid_child)) + + # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes + # sectors, BUT apparently some implementations set it as 0xFFFFFFFFL, 1 + # or some other value so it cannot be raised as a defect in general: + if olefile.sectorsize == 512: + if sizeHigh != 0 and sizeHigh != 0xFFFFFFFFL: + debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' % + (olefile.sectorsize, sizeLow, sizeHigh, sizeHigh)) + olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size') + self.size = sizeLow + else: + self.size = sizeLow + (long(sizeHigh)<<32) + debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh)) + + self.clsid = _clsid(clsid) + # a storage should have a null size, BUT some implementations such as + # Word 8 for Mac seem to allow non-null values => Potential defect: + if self.entry_type == STGTY_STORAGE and self.size != 0: + olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0') + # check if stream is not already referenced elsewhere: + if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0: + if self.size < olefile.minisectorcutoff \ + and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT + # ministream object + minifat = True + else: + minifat = False + olefile._check_duplicate_stream(self.isectStart, minifat) + + + + def build_storage_tree(self): + """ + Read and build the red-black tree attached to this _OleDirectoryEntry + object, if it is a storage. + Note that this method builds a tree of all subentries, so it should + only be called for the root object once. + """ + debug('build_storage_tree: SID=%d - %s - sid_child=%d' + % (self.sid, repr(self.name), self.sid_child)) + if self.sid_child != NOSTREAM: + # if child SID is not NOSTREAM, then this entry is a storage. 
+ # Let's walk through the tree of children to fill the kids list: + self.append_kids(self.sid_child) + + # Note from OpenOffice documentation: the safest way is to + # recreate the tree because some implementations may store broken + # red-black trees... + + # in the OLE file, entries are sorted on (length, name). + # for convenience, we sort them on name instead: + # (see __cmp__ method in this class) + self.kids.sort() + + + def append_kids(self, child_sid): + """ + Walk through red-black tree of children of this directory entry to add + all of them to the kids list. (recursive method) + + child_sid : index of child directory entry to use, or None when called + first time for the root. (only used during recursion) + """ + #[PL] this method was added to use simple recursion instead of a complex + # algorithm. + # if this is not a storage or a leaf of the tree, nothing to do: + if child_sid == NOSTREAM: + return + # check if child SID is in the proper range: + if child_sid<0 or child_sid>=len(self.olefile.direntries): + self.olefile._raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range') + # get child direntry: + child = self.olefile._load_direntry(child_sid) #direntries[child_sid] + debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d' + % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child)) + # the directory entries are organized as a red-black tree. + # (cf. 
Wikipedia for details) + # First walk through left side of the tree: + self.append_kids(child.sid_left) + # Check if its name is not already used (case-insensitive): + name_lower = child.name.lower() + if self.kids_dict.has_key(name_lower): + self.olefile._raise_defect(DEFECT_INCORRECT, + "Duplicate filename in OLE storage") + # Then the child_sid _OleDirectoryEntry object is appended to the + # kids list and dictionary: + self.kids.append(child) + self.kids_dict[name_lower] = child + # Check if kid was not already referenced in a storage: + if child.used: + self.olefile._raise_defect(DEFECT_INCORRECT, + 'OLE Entry referenced more than once') + child.used = True + # Finally walk through right side of the tree: + self.append_kids(child.sid_right) + # Afterwards build kid's own tree if it's also a storage: + child.build_storage_tree() + + + def __cmp__(self, other): + "Compare entries by name" + return cmp(self.name, other.name) + #TODO: replace by the same function as MS implementation ? + # (order by name length first, then case-insensitive order) + + + def dump(self, tab = 0): + "Dump this entry, and all its subentries (for debug purposes only)" + TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", + "(property)", "(root)"] + print " "*tab + repr(self.name), TYPES[self.entry_type], + if self.entry_type in (STGTY_STREAM, STGTY_ROOT): + print self.size, "bytes", + print + if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid: + print " "*tab + "{%s}" % self.clsid + + for kid in self.kids: + kid.dump(tab + 2) + + +#--- OleFileIO ---------------------------------------------------------------- + +class OleFileIO: + """ + OLE container object + + This class encapsulates the interface to an OLE 2 structured + storage file. Use the {@link listdir} and {@link openstream} methods to + access the contents of this file. + + Object names are given as a list of strings, one for each subentry + level. The root entry should be omitted. 
For example, the following + code extracts all image streams from a Microsoft Image Composer file: + + ole = OleFileIO("fan.mic") + + for entry in ole.listdir(): + if entry[1:2] == "Image": + fin = ole.openstream(entry) + fout = open(entry[0:1], "wb") + while 1: + s = fin.read(8192) + if not s: + break + fout.write(s) + + You can use the viewer application provided with the Python Imaging + Library to view the resulting files (which happens to be standard + TIFF files). + """ + + def __init__(self, filename = None, raise_defects=DEFECT_FATAL): + """ + Constructor for OleFileIO class. + + filename: file to open. + raise_defects: minimal level for defects to be raised as exceptions. + (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a + security-oriented application, see source code for details) + """ + self._raise_defects_level = raise_defects + if filename: + self.open(filename) + + + def _raise_defect(self, defect_level, message): + """ + This method should be called for any defect found during file parsing. + It may raise an IOError exception according to the minimal level chosen + for the OleFileIO object. + + defect_level: defect level, possible values are: + DEFECT_UNSURE : a case which looks weird, but not sure it's a defect + DEFECT_POTENTIAL : a potential defect + DEFECT_INCORRECT : an error according to specifications, but parsing can go on + DEFECT_FATAL : an error which cannot be ignored, parsing is impossible + message: string describing the defect, used with raised exception. + """ + # added by [PL] + if defect_level >= self._raise_defects_level: + raise IOError, message + + + def open(self, filename): + """ + Open an OLE2 file. + Reads the header, FAT and directory. 
+ + filename: string-like or file-like object + """ + #[PL] check if filename is a string-like or file-like object: + # (it is better to check for a read() method) + if hasattr(filename, 'read'): + # file-like object + self.fp = filename + else: + # string-like object + self.fp = open(filename, "rb") + # old code fails if filename is not a plain string: + #if type(filename) == type(""): + # self.fp = open(filename, "rb") + #else: + # self.fp = filename + + # lists of streams in FAT and MiniFAT, to detect duplicate references + # (list of indexes of first sectors of each stream) + self._used_streams_fat = [] + self._used_streams_minifat = [] + + header = self.fp.read(512) + + if len(header) != 512 or header[:8] != MAGIC: + self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file") + + # [PL] header structure according to AAF specifications: + ##Header + ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)] + ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, + ## // 0x1a, 0xe1} for current version + ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/ + ## // GetClassFile uses root directory class id) + ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is + ## // written by reference implementation + ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for + ## // 512-byte sectors, 4 for 4 KB sectors + ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering + ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two; + ## // typically 9 indicating 512-byte sectors + ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two; + ## // typically 6 indicating 64-byte mini-sectors + ##USHORT _usReserved; // [22H,02] reserved, must be zero + ##ULONG _ulReserved1; // [24H,04] reserved, must be zero + ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors, + ## // number of SECTs in directory chain for 4 KB + ## // 
sectors + ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain + ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain + ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must + ## // be zero. The reference implementation + ## // does not support transactions + ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream; + ## // typically 4096 bytes + ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain + ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain + ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain + ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain + ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors + ##}; + + # [PL] header decoding: + # '<' indicates little-endian byte ordering for Intel (cf. struct module help) + fmt_header = '<8s16sHHHHHHLLLLLLLLLL' + header_size = struct.calcsize(fmt_header) + debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) ) + header1 = header[:header_size] + ( + self.Sig, + self.clsid, + self.MinorVersion, + self.DllVersion, + self.ByteOrder, + self.SectorShift, + self.MiniSectorShift, + self.Reserved, self.Reserved1, + self.csectDir, + self.csectFat, + self.sectDirStart, + self.signature, + self.MiniSectorCutoff, + self.MiniFatStart, + self.csectMiniFat, + self.sectDifStart, + self.csectDif + ) = struct.unpack(fmt_header, header1) + debug( struct.unpack(fmt_header, header1)) + + if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': + # OLE signature should always be present + self._raise_defect(DEFECT_FATAL, "incorrect OLE signature") + if self.clsid != '\x00'*16: + # according to AAF specs, CLSID should always be zero + self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") + debug( "MinorVersion = %d" % self.MinorVersion ) + debug( "DllVersion = %d" % self.DllVersion ) + if self.DllVersion not in [3, 4]: + # version 3: usual format, 512 
bytes per sector + # version 4: large format, 4K per sector + self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") + debug( "ByteOrder = %X" % self.ByteOrder ) + if self.ByteOrder != 0xFFFE: + # For now only common little-endian documents are handled correctly + self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") + # TODO: add big-endian support for documents created on Mac ? + self.SectorSize = 2**self.SectorShift + debug( "SectorSize = %d" % self.SectorSize ) + if self.SectorSize not in [512, 4096]: + self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header") + if (self.DllVersion==3 and self.SectorSize!=512) \ + or (self.DllVersion==4 and self.SectorSize!=4096): + self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header") + self.MiniSectorSize = 2**self.MiniSectorShift + debug( "MiniSectorSize = %d" % self.MiniSectorSize ) + if self.MiniSectorSize not in [64]: + self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header") + if self.Reserved != 0 or self.Reserved1 != 0: + self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)") + debug( "csectDir = %d" % self.csectDir ) + if self.SectorSize==512 and self.csectDir!=0: + self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header") + debug( "csectFat = %d" % self.csectFat ) + debug( "sectDirStart = %X" % self.sectDirStart ) + debug( "signature = %d" % self.signature ) + # Signature should be zero, BUT some implementations do not follow this + # rule => only a potential defect: + if self.signature != 0: + self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)") + debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff ) + debug( "MiniFatStart = %X" % self.MiniFatStart ) + debug( "csectMiniFat = %d" % self.csectMiniFat ) + debug( "sectDifStart = %X" % self.sectDifStart ) + debug( "csectDif = %d" % self.csectDif ) + + # calculate the number of sectors in 
the file + # (-1 because header doesn't count) + filesize = os.path.getsize(filename) + self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1 + debug( "Number of sectors in the file: %d" % self.nb_sect ) + + # file clsid (probably never used, so we don't store it) + clsid = _clsid(header[8:24]) + self.sectorsize = self.SectorSize #1 << i16(header, 30) + self.minisectorsize = self.MiniSectorSize #1 << i16(header, 32) + self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56) + + # check known streams for duplicate references (these are always in FAT, + # never in MiniFAT): + self._check_duplicate_stream(self.sectDirStart) + # check MiniFAT only if it is not empty: + if self.csectMiniFat: + self._check_duplicate_stream(self.MiniFatStart) + # check DIFAT only if it is not empty: + if self.csectDif: + self._check_duplicate_stream(self.sectDifStart) + + # Load file allocation tables + self.loadfat(header) + # Load direcory. This sets both the direntries list (ordered by sid) + # and the root (ordered by hierarchy) members. + self.loaddirectory(self.sectDirStart)#i32(header, 48)) + self.ministream = None + self.minifatsect = self.MiniFatStart #i32(header, 60) + + + def _check_duplicate_stream(self, first_sect, minifat=False): + """ + Checks if a stream has not been already referenced elsewhere. + This method should only be called once for each known stream, and only + if stream size is not null. 
+ first_sect: index of first sector of the stream in FAT + minifat: if True, stream is located in the MiniFAT, else in the FAT + """ + if minifat: + debug('_check_duplicate_stream: sect=%d in MiniFAT' % first_sect) + used_streams = self._used_streams_minifat + else: + debug('_check_duplicate_stream: sect=%d in FAT' % first_sect) + # some values can be safely ignored (not a real stream): + if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): + return + used_streams = self._used_streams_fat + #TODO: would it be more efficient using a dict or hash values, instead + # of a list of long ? + if first_sect in used_streams: + self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') + else: + used_streams.append(first_sect) + + + def dumpfat(self, fat, firstindex=0): + "Displays a part of FAT in human-readable form for debugging purpose" + # [PL] added only for debug + if not DEBUG_MODE: + return + # dictionary to convert special FAT values in human-readable strings + VPL=8 # valeurs par ligne (8+1 * 8+1 = 81) + fatnames = { + FREESECT: "..free..", + ENDOFCHAIN: "[ END. ]", + FATSECT: "FATSECT ", + DIFSECT: "DIFSECT " + } + nbsect = len(fat) + nlines = (nbsect+VPL-1)/VPL + print "index", + for i in range(VPL): + print ("%8X" % i), + print "" + for l in range(nlines): + index = l*VPL + print ("%8X:" % (firstindex+index)), + for i in range(index, index+VPL): + if i>=nbsect: + break + sect = fat[i] + if sect in fatnames: + nom = fatnames[sect] + else: + if sect == i+1: + nom = " --->" + else: + nom = "%8X" % sect + print nom, + print "" + + + def dumpsect(self, sector, firstindex=0): + "Displays a sector in a human-readable form, for debugging purpose." 
+ if not DEBUG_MODE: + return + VPL=8 # number of values per line (8+1 * 8+1 = 81) + tab = array.array(UINT32, sector) + nbsect = len(tab) + nlines = (nbsect+VPL-1)/VPL + print "index", + for i in range(VPL): + print ("%8X" % i), + print "" + for l in range(nlines): + index = l*VPL + print ("%8X:" % (firstindex+index)), + for i in range(index, index+VPL): + if i>=nbsect: + break + sect = tab[i] + nom = "%8X" % sect + print nom, + print "" + + def sect2array(self, sect): + """ + convert a sector to an array of 32 bits unsigned integers, + swapping bytes on big endian CPUs such as PowerPC (old Macs) + """ + a = array.array(UINT32, sect) + # if CPU is big endian, swap bytes: + if sys.byteorder == 'big': + a.byteswap() + return a + + + def loadfat_sect(self, sect): + """ + Adds the indexes of the given sector to the FAT + sect: string containing the first FAT sector, or array of long integers + return: index of last FAT sector. + """ + # a FAT sector is an array of ulong integers. + if isinstance(sect, array.array): + # if sect is already an array it is directly used + fat1 = sect + else: + # if it's a raw sector, it is parsed in an array + fat1 = self.sect2array(sect) + self.dumpsect(sect) + # The FAT is a sector chain starting at the first index of itself. + for isect in fat1: + #print "isect = %X" % isect + if isect == ENDOFCHAIN or isect == FREESECT: + # the end of the sector chain has been reached + break + # read the FAT sector + s = self.getsect(isect) + # parse it as an array of 32 bits integers, and add it to the + # global FAT array + nextfat = self.sect2array(s) + self.fat = self.fat + nextfat + return isect + + + def loadfat(self, header): + """ + Load the FAT table. + """ + # The header contains a sector numbers + # for the first 109 FAT sectors. 
Additional sectors are + # described by DIF blocks + + sect = header[76:512] + debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)/4) ) + #fat = [] + # [PL] FAT is an array of 32 bits unsigned ints, it's more effective + # to use an array than a list in Python. + # It's initialized as empty first: + self.fat = array.array(UINT32) + self.loadfat_sect(sect) + #self.dumpfat(self.fat) +## for i in range(0, len(sect), 4): +## ix = i32(sect, i) +## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: +## if ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: +## break +## s = self.getsect(ix) +## #fat = fat + map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) +## fat = fat + array.array(UINT32, s) + if self.csectDif != 0: + # [PL] There's a DIFAT because file is larger than 6.8MB + # some checks just in case: + if self.csectFat <= 109: + # there must be at least 109 blocks in header and the rest in + # DIFAT, so number of sectors must be >109. + self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors') + if self.sectDifStart >= self.nb_sect: + # initial DIFAT block index must be valid + self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') + debug( "DIFAT analysis..." 
) + # We compute the necessary number of DIFAT sectors : + # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) + nb_difat = (self.csectFat-109 + 126)/127 + debug( "nb_difat = %d" % nb_difat ) + if self.csectDif != nb_difat: + raise IOError, 'incorrect DIFAT' + isect_difat = self.sectDifStart + for i in xrange(nb_difat): + debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) + #TODO: check if corresponding FAT SID = DIFSECT + sector_difat = self.getsect(isect_difat) + difat = self.sect2array(sector_difat) + self.dumpsect(sector_difat) + self.loadfat_sect(difat[:127]) + # last DIFAT pointer is next DIFAT sector: + isect_difat = difat[127] + debug( "next DIFAT sector: %X" % isect_difat ) + # checks: + if isect_difat not in [ENDOFCHAIN, FREESECT]: + # last DIFAT pointer value must be ENDOFCHAIN or FREESECT + raise IOError, 'incorrect end of DIFAT' +## if len(self.fat) != self.csectFat: +## # FAT should contain csectFat blocks +## print "FAT length: %d instead of %d" % (len(self.fat), self.csectFat) +## raise IOError, 'incorrect DIFAT' + # since FAT is read from fixed-size sectors, it may contain more values + # than the actual number of sectors in the file. + # Keep only the relevant sector indexes: + if len(self.fat) > self.nb_sect: + debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect)) + self.fat = self.fat[:self.nb_sect] + debug('\nFAT:') + self.dumpfat(self.fat) + + + def loadminifat(self): + """ + Load the MiniFAT table. + """ + # MiniFAT is stored in a standard sub-stream, pointed to by a header + # field. + # NOTE: there are two sizes to take into account for this stream: + # 1) Stream size is calculated according to the number of sectors + # declared in the OLE header. This allocated stream may be more than + # needed to store the actual sector indexes. 
+ # (self.csectMiniFat is the number of sectors of size self.SectorSize) + stream_size = self.csectMiniFat * self.SectorSize + # 2) Actually used size is calculated by dividing the MiniStream size + # (given by root entry size) by the size of mini sectors, *4 for + # 32 bits indexes: + nb_minisectors = (self.root.size + self.MiniSectorSize-1) / self.MiniSectorSize + used_size = nb_minisectors * 4 + debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' % + (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors)) + if used_size > stream_size: + # This is not really a problem, but may indicate a wrong implementation: + self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT') + # In any case, first read stream_size: + s = self._open(self.minifatsect, stream_size, force_FAT=True).read() + #[PL] Old code replaced by an array: + #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) + self.minifat = self.sect2array(s) + # Then shrink the array to used size, to avoid indexes out of MiniStream: + debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors)) + self.minifat = self.minifat[:nb_minisectors] + debug('loadminifat(): len=%d' % len(self.minifat)) + debug('\nMiniFAT:') + self.dumpfat(self.minifat) + + def getsect(self, sect): + """ + Read given sector from file on disk. + sect: sector index + returns a string containing the sector data. 
+ """ + # [PL] this original code was wrong when sectors are 4KB instead of + # 512 bytes: + #self.fp.seek(512 + self.sectorsize * sect) + #[PL]: added safety checks: + #print "getsect(%X)" % sect + try: + self.fp.seek(self.sectorsize * (sect+1)) + except: + debug('getsect(): sect=%X, seek=%d, filesize=%d' % + (sect, self.sectorsize*(sect+1), os.path.getsize(self.fp.name))) + self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') + sector = self.fp.read(self.sectorsize) + if len(sector) != self.sectorsize: + debug('getsect(): sect=%X, read=%d, sectorsize=%d' % + (sect, len(sector), self.sectorsize)) + self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') + return sector + + + def loaddirectory(self, sect): + """ + Load the directory. + sect: sector index of directory stream. + """ + # The directory is stored in a standard + # substream, independent of its size. + + # open directory stream as a read-only file: + # (stream size is not known in advance) + self.directory_fp = self._open(sect) + + #[PL] to detect malformed documents and avoid DoS attacks, the maximum + # number of directory entries can be calculated: + max_entries = self.directory_fp.size / 128 + debug('loaddirectory: size=%d, max_entries=%d' % + (self.directory_fp.size, max_entries)) + + # Create list of directory entries + #self.direntries = [] + # We start with a list of "None" object + self.direntries = [None] * max_entries +## for sid in xrange(max_entries): +## entry = fp.read(128) +## if not entry: +## break +## self.direntries.append(_OleDirectoryEntry(entry, sid, self)) + # load root entry: + root_entry = self._load_direntry(0) + # Root entry is the first entry: + self.root = self.direntries[0] + # read and build all storage trees, starting from the root: + self.root.build_storage_tree() + + + def _load_direntry (self, sid): + """ + Load a directory entry from the directory. + This method should only be called once for each storage/stream when + loading the directory. 
+ sid: index of storage/stream in the directory. + return: a _OleDirectoryEntry object + raise: IOError if the entry has always been referenced. + """ + # check if SID is OK: + if sid<0 or sid>=len(self.direntries): + self._raise_defect(DEFECT_FATAL, "OLE directory index out of range") + # check if entry was already referenced: + if self.direntries[sid] is not None: + self._raise_defect(DEFECT_INCORRECT, + "double reference for OLE stream/storage") + # if exception not raised, return the object + return self.direntries[sid] + self.directory_fp.seek(sid * 128) + entry = self.directory_fp.read(128) + self.direntries[sid] = _OleDirectoryEntry(entry, sid, self) + return self.direntries[sid] + + + def dumpdirectory(self): + """ + Dump directory (for debugging only) + """ + self.root.dump() + + + def _open(self, start, size = 0x7FFFFFFF, force_FAT=False): + """ + Open a stream, either in FAT or MiniFAT according to its size. + (openstream helper) + + start: index of first sector + size: size of stream (or nothing if size is unknown) + force_FAT: if False (default), stream will be opened in FAT or MiniFAT + according to size. If True, it will always be opened in FAT. 
+ """ + debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' % + (start, size, str(force_FAT))) + # stream size is compared to the MiniSectorCutoff threshold: + if size < self.minisectorcutoff and not force_FAT: + # ministream object + if not self.ministream: + # load MiniFAT if it wasn't already done: + self.loadminifat() + # The first sector index of the miniFAT stream is stored in the + # root directory entry: + size_ministream = self.root.size + debug('Opening MiniStream: sect=%d, size=%d' % + (self.root.isectStart, size_ministream)) + self.ministream = self._open(self.root.isectStart, + size_ministream, force_FAT=True) + return _OleStream(self.ministream, start, size, 0, + self.minisectorsize, self.minifat) + else: + # standard stream + return _OleStream(self.fp, start, size, 512, + self.sectorsize, self.fat) + + + def _list(self, files, prefix, node): + """ + (listdir helper) + files: list of files to fill in + prefix: current location in storage tree (list of names) + node: current node (_OleDirectoryEntry object) + """ + prefix = prefix + [node.name] + for entry in node.kids: + if entry.kids: + self._list(files, prefix, entry) + else: + files.append(prefix[1:] + [entry.name]) + + + def listdir(self): + """ + Return a list of streams stored in this file + """ + files = [] + self._list(files, [], self.root) + return files + + + def _find(self, filename): + """ + Returns directory entry of given filename. (openstream helper) + Note: this method is case-insensitive. + + filename: path of stream in storage tree (except root entry), either: + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - a list of storage filenames, path to the desired stream/storage. 
+ Example: ['storage_1', 'storage_1.2', 'stream'] + return: sid of requested filename + raise IOError if file not found + """ + + # if filename is a string instead of a list, split it on slashes to + # convert to a list: + if isinstance(filename, basestring): + filename = filename.split('/') + # walk across storage tree, following given path: + node = self.root + for name in filename: + for kid in node.kids: + if kid.name.lower() == name.lower(): + break + else: + raise IOError, "file not found" + node = kid + return node.sid + + + def openstream(self, filename): + """ + Open a stream as a read-only file object (StringIO). + + filename: path of stream in storage tree (except root entry), either: + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + return: file object (read-only) + raise IOError if filename not found, or if this is not a stream. + """ + sid = self._find(filename) + entry = self.direntries[sid] + if entry.entry_type != STGTY_STREAM: + raise IOError, "this file is not a stream" + return self._open(entry.isectStart, entry.size) + + + def get_type(self, filename): + """ + Test if given filename exists as a stream or a storage in the OLE + container, and return its type. + + filename: path of stream in storage tree. (see openstream for syntax) + return: False if object does not exist, its entry type (>0) otherwise: + - STGTY_STREAM: a stream + - STGTY_STORAGE: a storage + - STGTY_ROOT: the root entry + """ + try: + sid = self._find(filename) + entry = self.direntries[sid] + return entry.entry_type + except: + return False + + + def exists(self, filename): + """ + Test if given filename exists as a stream or a storage in the OLE + container. + + filename: path of stream in storage tree. (see openstream for syntax) + return: True if object exist, else False. 
+ """ + try: + sid = self._find(filename) + return True + except: + return False + + + def get_size(self, filename): + """ + Return size of a stream in the OLE container, in bytes. + + filename: path of stream in storage tree (see openstream for syntax) + return: size in bytes (long integer) + raise: IOError if file not found, TypeError if this is not a stream. + """ + sid = self._find(filename) + entry = self.direntries[sid] + if entry.entry_type != STGTY_STREAM: + #TODO: Should it return zero instead of raising an exception ? + raise TypeError, 'object is not an OLE stream' + return entry.size + + + def get_rootentry_name(self): + """ + Return root entry name. Should usually be 'Root Entry' or 'R' in most + implementations. + """ + return self.root.name + + + def getproperties(self, filename): + """ + Return properties described in substream. + + filename: path of stream in storage tree (see openstream for syntax) + return: a dictionary of values indexed by id (integer) + """ + fp = self.openstream(filename) + + data = {} + + # header + s = fp.read(28) + clsid = _clsid(s[8:24]) + + # format id + s = fp.read(20) + fmtid = _clsid(s[:16]) + fp.seek(i32(s, 16)) + + # get section + s = "****" + fp.read(i32(fp.read(4))-4) + + for i in range(i32(s, 4)): + + id = i32(s, 8+i*8) + offset = i32(s, 12+i*8) + type = i32(s, offset) + + debug ('property id=%d: type=%d offset=%X' % (id, type, offset)) + + # test for common types first (should perhaps use + # a dictionary instead?) 
+ + if type == VT_I2: + value = i16(s, offset+4) + if value >= 32768: + value = value - 65536 + elif type == VT_UI2: + value = i16(s, offset+4) + elif type in (VT_I4, VT_ERROR): + value = i32(s, offset+4) + elif type == VT_UI4: + value = i32(s, offset+4) # FIXME + elif type in (VT_BSTR, VT_LPSTR): + count = i32(s, offset+4) + value = s[offset+8:offset+8+count-1] + elif type == VT_BLOB: + count = i32(s, offset+4) + value = s[offset+8:offset+8+count] + elif type == VT_LPWSTR: + count = i32(s, offset+4) + value = self._unicode(s[offset+8:offset+8+count*2]) + elif type == VT_FILETIME: + value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) + # FIXME: this is a 64-bit int: "number of 100ns periods + # since Jan 1,1601". Should map this to Python time + value = value / 10000000L # seconds + elif type == VT_UI1: + value = ord(s[offset+4]) + elif type == VT_CLSID: + value = _clsid(s[offset+4:offset+20]) + elif type == VT_CF: + count = i32(s, offset+4) + value = s[offset+8:offset+8+count] + else: + value = None # everything else yields "None" + + # FIXME: add support for VT_VECTOR + + #print "%08x" % id, repr(value), + #print "(%s)" % VT[i32(s, offset) & 0xFFF] + + data[id] = value + + return data + +# +# -------------------------------------------------------------------- +# This script can be used to dump the directory of any OLE2 structured +# storage file. + +if __name__ == "__main__": + + import sys + + # [PL] display quick usage info if launched from command-line + if len(sys.argv) <= 1: + print __doc__ + print """ +Launched from command line, this script parses OLE files and prints info. + +Usage: OleFileIO_PL.py [-d] [-c] [file2 ...] 
+ +Options: +-d : debug mode (display a lot of debug information, for developers only) +-c : check all streams (for debugging purposes) +""" + sys.exit() + + check_streams = False + for filename in sys.argv[1:]: +## try: + # OPTIONS: + if filename == '-d': + # option to switch debug mode on: + set_debug_mode(True) + continue + if filename == '-c': + # option to switch check streams mode on: + check_streams = True + continue + + ole = OleFileIO(filename, raise_defects=DEFECT_INCORRECT) + print "-" * 68 + print filename + print "-" * 68 + ole.dumpdirectory() + for streamname in ole.listdir(): + if streamname[-1][0] == "\005": + print streamname, ": properties" + props = ole.getproperties(streamname) + props = props.items() + props.sort() + for k, v in props: + #[PL]: avoid to display too large or binary values: + if isinstance(v, basestring): + if len(v) > 50: + v = v[:50] + # quick and dirty binary check: + for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20, + 21,22,23,24,25,26,27,28,29,30,31): + if chr(c) in v: + v = '(binary data)' + break + print " ", k, v + + if check_streams: + # Read all streams to check if there are errors: + print '\nChecking streams...' + for streamname in ole.listdir(): + # print name using repr() to convert binary chars to \xNN: + print '-', repr('/'.join(streamname)),'-', + st_type = ole.get_type(streamname) + if st_type == STGTY_STREAM: + print 'size %d' % ole.get_size(streamname) + # just try to read stream in memory: + ole.openstream(streamname) + else: + print 'NOT a stream : type=%d' % st_type + print '' + + #[PL] Test a few new methods: + root = ole.get_rootentry_name() + print 'Root entry name: "%s"' % root + if ole.exists('worddocument'): + print "This is a Word document." + print "type of stream 'WordDocument':", ole.get_type('worddocument') + print "size :", ole.get_size('worddocument') + if ole.exists('macros/vba'): + print "This document may contain VBA macros." 
+## except IOError, v: +## print "***", "cannot read", file, "-", v From 491f3e9f99a08f292be6d23bd40de9fc2873bf84 Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 20 Oct 2011 05:50:14 +0200 Subject: [PATCH 012/101] updated readme --- PIL/OleFileIO-README.txt | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/PIL/OleFileIO-README.txt b/PIL/OleFileIO-README.txt index 3e037a00c..fe7ba8343 100644 --- a/PIL/OleFileIO-README.txt +++ b/PIL/OleFileIO-README.txt @@ -1,11 +1,17 @@ OleFileIO_PL module: -OleFileIO_PL is a Python module to read Microsoft OLE2 files (Structured -Storage), such as Microsoft Office documents, Image Composer and FlashPix files, - Outlook messages, ... +OleFileIO_PL is a Python module to read Microsoft OLE2 files (also called +Structured Storage or Compound Document File Format), such as Microsoft Office +documents, Image Composer and FlashPix files, Outlook messages, etc. -This is an improved version of the OleFileIO module from PIL library v1.1.6 -(See: http://www.pythonware.com/products/pil/index.htm) +This is an improved version of the OleFileIO module from PIL, the excellent +Python Imaging Library v1.1.6, created and maintained by Fredrik Lundh. +(See http://www.pythonware.com/products/pil/index.htm) + +The API is still compatible with PIL, but the internal implementation has been +improved a lot, with bugfixes and a more robust design. As far as I know, this +module is the most complete and robust Python implementation to read MS OLE2 +files, portable on several OSes. WARNING: THIS IS (STILL) WORK IN PROGRESS. 
From bd833a697265c9f7d607c4d69883f0e23cb3312c Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 16 Feb 2012 22:26:03 +0100 Subject: [PATCH 013/101] Fixed issue 7 in OleFileIO.getproperties reported by chuckleberryfinn --- PIL/OleFileIO.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index bcac82e55..02be06228 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -version 0.21 2010-01-22 Philippe Lagadec - http://www.decalage.info +version 0.22 2012-02-16 Philippe Lagadec - http://www.decalage.info Project website: http://www.decalage.info/python/olefileio @@ -16,7 +16,7 @@ See: http://www.pythonware.com/products/pil/index.htm The Python Imaging Library (PIL) is Copyright (c) 1997-2005 by Secret Labs AB Copyright (c) 1995-2005 by Fredrik Lundh -OleFileIO_PL changes are Copyright (c) 2005-2010 by Philippe Lagadec +OleFileIO_PL changes are Copyright (c) 2005-2012 by Philippe Lagadec See source code and LICENSE.txt for information on usage and redistribution. @@ -24,15 +24,15 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. """ __author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2010-01-22" -__version__ = '0.21' +__date__ = "2012-02-16" +__version__ = '0.22' #--- LICENSE ------------------------------------------------------------------ # OleFileIO_PL is an improved version of the OleFileIO module from the # Python Imaging Library (PIL). -# OleFileIO_PL changes are Copyright (c) 2005-2010 by Philippe Lagadec +# OleFileIO_PL changes are Copyright (c) 2005-2012 by Philippe Lagadec # # The Python Imaging Library (PIL) is # Copyright (c) 1997-2005 by Secret Labs AB @@ -106,6 +106,8 @@ __version__ = '0.21' # (thanks to Ben G. 
and Martijn for reporting the bug) # 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str # 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs +# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn +# (https://bitbucket.org/decalage/olefileio_pl/issue/7) #----------------------------------------------------------------------------- # TODO (for version 1.0): @@ -847,7 +849,7 @@ class OleFileIO: else: # string-like object self.fp = open(filename, "rb") - # old code fails if filename is not a plain string: + # old code fails if filename is not a plain string: #if type(filename) == type(""): # self.fp = open(filename, "rb") #else: @@ -1096,7 +1098,7 @@ class OleFileIO: # if CPU is big endian, swap bytes: if sys.byteorder == 'big': a.byteswap() - return a + return a def loadfat_sect(self, sect): @@ -1540,7 +1542,7 @@ class OleFileIO: value = s[offset+8:offset+8+count] elif type == VT_LPWSTR: count = i32(s, offset+4) - value = self._unicode(s[offset+8:offset+8+count*2]) + value = _unicode(s[offset+8:offset+8+count*2]) elif type == VT_FILETIME: value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) # FIXME: this is a 64-bit int: "number of 100ns periods From eaeb12f695ca26be83dc936dc50d9f25ae84cd28 Mon Sep 17 00:00:00 2001 From: decalage Date: Thu, 16 Feb 2012 22:50:13 +0100 Subject: [PATCH 014/101] Fixed issue 2 - added close method to OleFileIO class --- PIL/OleFileIO.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 02be06228..e0e85f9d8 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -108,6 +108,7 @@ __version__ = '0.22' # 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs # 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn # (https://bitbucket.org/decalage/olefileio_pl/issue/7) +# - added close method to OleFileIO (fixed issue #2) 
#----------------------------------------------------------------------------- # TODO (for version 1.0): @@ -1004,6 +1005,13 @@ class OleFileIO: self.minifatsect = self.MiniFatStart #i32(header, 60) + def close(self): + """ + close the OLE file, to release the file object + """ + self.fp.close() + + def _check_duplicate_stream(self, first_sect, minifat=False): """ Checks if a stream has not been already referenced elsewhere. From bbee2b5b8a495270b9d22e8e4a993fcb1f5a3a85 Mon Sep 17 00:00:00 2001 From: mete0r sarangbang Date: Mon, 16 Jul 2012 05:58:01 +0900 Subject: [PATCH 015/101] support file-like object fix #8 --- PIL/OleFileIO.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index e0e85f9d8..ec24bfd74 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -443,7 +443,7 @@ class _OleStream(StringIO.StringIO): # the fat chain, and load new sectors on demand instead of # loading it all in one go. - def __init__(self, fp, sect, size, offset, sectorsize, fat): + def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize): """ Constructor for _OleStream class. 
@@ -458,11 +458,6 @@ class _OleStream(StringIO.StringIO): debug('_OleStream.__init__:') debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s' %(sect,sect,size,offset,sectorsize,len(fat), repr(fp))) - # for debugging messages, size of file where stream is read: - if isinstance(fp, StringIO.StringIO): - filesize = len(fp.getvalue()) # file in MiniFAT - else: - filesize = os.path.getsize(fp.name) # file on disk #[PL] To detect malformed documents with FAT loops, we compute the # expected number of sectors in the stream: unknown_size = False @@ -855,6 +850,12 @@ class OleFileIO: # self.fp = open(filename, "rb") #else: # self.fp = filename + self.fp.seek(0, 2) + try: + filesize = self.fp.tell() + finally: + self.fp.seek(0) + self._filesize = filesize # lists of streams in FAT and MiniFAT, to detect duplicate references # (list of indexes of first sectors of each stream) @@ -976,7 +977,6 @@ class OleFileIO: # calculate the number of sectors in the file # (-1 because header doesn't count) - filesize = os.path.getsize(filename) self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1 debug( "Number of sectors in the file: %d" % self.nb_sect ) @@ -1258,7 +1258,7 @@ class OleFileIO: self.fp.seek(self.sectorsize * (sect+1)) except: debug('getsect(): sect=%X, seek=%d, filesize=%d' % - (sect, self.sectorsize*(sect+1), os.path.getsize(self.fp.name))) + (sect, self.sectorsize*(sect+1), self._filesize)) self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') sector = self.fp.read(self.sectorsize) if len(sector) != self.sectorsize: @@ -1360,11 +1360,12 @@ class OleFileIO: self.ministream = self._open(self.root.isectStart, size_ministream, force_FAT=True) return _OleStream(self.ministream, start, size, 0, - self.minisectorsize, self.minifat) + self.minisectorsize, self.minifat, + self.ministream.size) else: # standard stream return _OleStream(self.fp, start, size, 512, - self.sectorsize, self.fat) + self.sectorsize, self.fat, 
self._filesize) def _list(self, files, prefix, node): From ab541b1b4d377a29e5b1774bcc250a6d3c292b73 Mon Sep 17 00:00:00 2001 From: decalage Date: Wed, 25 Jul 2012 07:15:29 +0200 Subject: [PATCH 016/101] Added comments for patch submitted by mete0r_kr, changed version --- PIL/OleFileIO.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index ec24bfd74..508ced552 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -version 0.22 2012-02-16 Philippe Lagadec - http://www.decalage.info +version 0.23 2012-07-25 Philippe Lagadec - http://www.decalage.info Project website: http://www.decalage.info/python/olefileio @@ -24,8 +24,8 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. """ __author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2012-02-16" -__version__ = '0.22' +__date__ = "2012-07-25" +__version__ = '0.23' #--- LICENSE ------------------------------------------------------------------ @@ -109,6 +109,8 @@ __version__ = '0.22' # 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn # (https://bitbucket.org/decalage/olefileio_pl/issue/7) # - added close method to OleFileIO (fixed issue #2) +# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) + #----------------------------------------------------------------------------- # TODO (for version 1.0): @@ -453,6 +455,7 @@ class _OleStream(StringIO.StringIO): offset : offset in bytes for the first FAT or MiniFAT sector sectorsize: size of one sector fat : array/list of sector indexes (FAT or MiniFAT) + filesize : size of OLE file (for debugging) return : a StringIO instance containing the OLE stream """ debug('_OleStream.__init__:') @@ -843,14 +846,19 @@ class OleFileIO: # file-like object self.fp = filename else: - # 
string-like object + # string-like object: filename of file on disk + #TODO: if larger than 1024 bytes, this could be the actual data => StringIO self.fp = open(filename, "rb") # old code fails if filename is not a plain string: #if type(filename) == type(""): # self.fp = open(filename, "rb") #else: # self.fp = filename - self.fp.seek(0, 2) + # obtain the filesize by using seek and tell, which should work on most + # file-like objects: + #TODO: do it above, using getsize with filename when possible? + #TODO: fix code to fail with clear exception when filesize cannot be obtained + self.fp.seek(0, os.SEEK_END) try: filesize = self.fp.tell() finally: From f88809e5f33b21d140783b480df2c3b9fdce83a1 Mon Sep 17 00:00:00 2001 From: decalage Date: Tue, 11 Sep 2012 23:05:12 +0200 Subject: [PATCH 017/101] Updated readme in markdown and rst formats --- PIL/OleFileIO-README.md | 117 +++++++++++++++++++++++++++++++++++++++ PIL/OleFileIO-README.txt | 36 ------------ 2 files changed, 117 insertions(+), 36 deletions(-) create mode 100644 PIL/OleFileIO-README.md delete mode 100644 PIL/OleFileIO-README.txt diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md new file mode 100644 index 000000000..f29291de3 --- /dev/null +++ b/PIL/OleFileIO-README.md @@ -0,0 +1,117 @@ +OleFileIO_PL +============ + +[OleFileIO_PL](http://www.decalage.info/python/olefileio) is a Python module to read [Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format)](http://en.wikipedia.org/wiki/Compound_File_Binary_Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... + +This is an improved version of the OleFileIO module from [PIL](http://www.pythonware.com/products/pil/index.htm), the excellent Python Imaging Library, created and maintained by Fredrik Lundh. The API is still compatible with PIL, but I have improved the internal implementation significantly, with bugfixes and a more robust design. 
+ +As far as I know, this module is now the most complete and robust Python implementation to read MS OLE2 files, portable on several operating systems. (please tell me if you know other similar Python modules) + +WARNING: THIS IS (STILL) WORK IN PROGRESS. + +Main improvements over PIL version: +----------------------------------- + +- Better compatibility with Python 2.4 up to 2.7 +- Support for files larger than 6.8MB +- Robust: many checks to detect malformed files +- Improved API +- Added setup.py and install.bat to ease installation + +News +---- + +- 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 (added close method) +- 2011-10-20: code hosted on bitbucket to ease contributions and bug tracking +- 2010-01-24 v0.21: fixed support for big-endian CPUs, such as PowerPC Macs. +- 2009-12-11 v0.20: small bugfix in OleFileIO.open when filename is not plain str. +- 2009-12-10 v0.19: fixed support for 64 bits platforms (thanks to Ben G. and Martijn for reporting the bug) +- see changelog in source code for more info. + +Download: +--------- + +The archive is available on [the project page](https://bitbucket.org/decalage/olefileio_pl/downloads). + + +How to use this module: +----------------------- + +See sample code at the end of the module, and also docstrings. + +Here are a few examples: + + :::python + import OleFileIO_PL + + # Test if a file is an OLE container: + assert OleFileIO_PL.isOleFile('myfile.doc') + + # Open an OLE file: + ole = OleFileIO_PL.OleFileIO('myfile.doc') + + # Get list of streams: + print ole.listdir() + + # Test if known streams/storages exist: + if ole.exists('worddocument'): + print "This is a Word document." + print "size :", ole.get_size('worddocument') + if ole.exists('macros/vba'): + print "This document seems to contain VBA macros." 
+ + # Extract the "Pictures" stream from a PPT file: + if ole.exists('Pictures'): + pics = ole.openstream('Pictures') + data = pics.read() + f = open('Pictures.bin', 'w') + f.write(data) + f.close() + + +It can also be used as a script from the command-line to display the structure of an OLE file, for example: + + OleFileIO_PL.py myfile.doc + +A real-life example: [using OleFileIO_PL for malware analysis and forensics](http://blog.gregback.net/2011/03/using-remnux-for-forensic-puzzle-6/). + +How to contribute: +------------------ + +The code is available in [a Mercurial repository on bitbucket](https://bitbucket.org/decalage/olefileio_pl). You may use it to submit enhancements or to report any issue. + +If you would like to help us improve this module, or simply provide feedback, you may also send an e-mail to decalage(at)laposte.net. You can help in many ways: + +- test this module on different platforms / Python versions +- find and report bugs +- improve documentation, code samples, docstrings +- write unittest test cases +- provide tricky malformed files + +How to report bugs: +------------------- + +To report a bug, for example a normal file which is not parsed correctly, please use the [issue reporting page](https://bitbucket.org/decalage/olefileio_pl/issues?status=new&status=open), or send an e-mail with an attachment containing the debugging output of OleFileIO_PL. + +For this, launch the following command : + + OleFileIO_PL.py -d -c file >debug.txt + +License +------- + +OleFileIO_PL is open-source. + +OleFileIO_PL changes are Copyright (c) 2005-2012 by Philippe Lagadec. 
+ +The Python Imaging Library (PIL) is + +- Copyright (c) 1997-2005 by Secret Labs AB + +- Copyright (c) 1995-2005 by Fredrik Lundh + +By obtaining, using, and/or copying this software and/or its associated documentation, you agree that you have read, understood, and will comply with the following terms and conditions: + +Permission to use, copy, modify, and distribute this software and its associated documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appears in all copies, and that both that copyright notice and this permission notice appear in supporting documentation, and that the name of Secret Labs AB or the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. + +SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/PIL/OleFileIO-README.txt b/PIL/OleFileIO-README.txt deleted file mode 100644 index fe7ba8343..000000000 --- a/PIL/OleFileIO-README.txt +++ /dev/null @@ -1,36 +0,0 @@ -OleFileIO_PL module: - -OleFileIO_PL is a Python module to read Microsoft OLE2 files (also called -Structured Storage or Compound Document File Format), such as Microsoft Office -documents, Image Composer and FlashPix files, Outlook messages, etc. - -This is an improved version of the OleFileIO module from PIL, the excellent -Python Imaging Library v1.1.6, created and maintained by Fredrik Lundh. 
-(See http://www.pythonware.com/products/pil/index.htm) - -The API is still compatible with PIL, but the internal implementation has been -improved a lot, with bugfixes and a more robust design. As far as I know, this -module is the most complete and robust Python implementation to read MS OLE2 -files, portable on several OSes. - -WARNING: THIS IS (STILL) WORK IN PROGRESS. - - - -INSTALLATION: - -- on Windows, launch install.bat -- on other systems, launch: setup.py install - - - -HOW TO USE THIS MODULE: - -See http://www.decalage.info/python/olefileio -See main at the end of the module, and also docstrings. - - - -LICENSE: - -See LICENSE.txt. From 4bdd2b2bef7234dc3a30b7cad05cb8b0ca1a1224 Mon Sep 17 00:00:00 2001 From: decalage Date: Tue, 11 Sep 2012 23:57:37 +0200 Subject: [PATCH 018/101] version 0.23 - updated readme and setup.py --- PIL/OleFileIO-README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index f29291de3..067806c64 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ -21,6 +21,7 @@ Main improvements over PIL version: News ---- +- 2012-09-11 v0.23: added support for file-like objects, fixed [issue #8](https://bitbucket.org/decalage/olefileio_pl/issue/8/bug-with-file-object) - 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 (added close method) - 2011-10-20: code hosted on bitbucket to ease contributions and bug tracking - 2010-01-24 v0.21: fixed support for big-endian CPUs, such as PowerPC Macs. @@ -47,7 +48,7 @@ Here are a few examples: # Test if a file is an OLE container: assert OleFileIO_PL.isOleFile('myfile.doc') - # Open an OLE file: + # Open an OLE file from disk: ole = OleFileIO_PL.OleFileIO('myfile.doc') # Get list of streams: @@ -67,6 +68,16 @@ Here are a few examples: f = open('Pictures.bin', 'w') f.write(data) f.close() + + # Close the OLE file: + ole.close() + + # Work with a file-like object (e.g. 
StringIO) instead of a file on disk: + data = open('myfile.doc', 'rb').read() + f = StringIO.StringIO(data) + ole = OleFileIO_PL.OleFileIO(f) + print ole.listdir() + ole.close() It can also be used as a script from the command-line to display the structure of an OLE file, for example: From 25158fe8b17e876c3e86560cc0330b2cb9833774 Mon Sep 17 00:00:00 2001 From: decalage Date: Sun, 5 May 2013 16:37:59 +0200 Subject: [PATCH 019/101] - getproperties: added conversion from filetime to python datetime - main: displays properties with date format --- PIL/OleFileIO-README.md | 1 + PIL/OleFileIO.py | 40 +++++++++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index 067806c64..0d212866f 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ -21,6 +21,7 @@ Main improvements over PIL version: News ---- +- 2013-05-03 v0.24: improved get_properties to convert timestamps to Python datetime - 2012-09-11 v0.23: added support for file-like objects, fixed [issue #8](https://bitbucket.org/decalage/olefileio_pl/issue/8/bug-with-file-object) - 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 (added close method) - 2011-10-20: code hosted on bitbucket to ease contributions and bug tracking diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 508ced552..5574eafb6 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... 
-version 0.23 2012-07-25 Philippe Lagadec - http://www.decalage.info +version 0.24 2013-05-03 Philippe Lagadec - http://www.decalage.info Project website: http://www.decalage.info/python/olefileio @@ -16,23 +16,23 @@ See: http://www.pythonware.com/products/pil/index.htm The Python Imaging Library (PIL) is Copyright (c) 1997-2005 by Secret Labs AB Copyright (c) 1995-2005 by Fredrik Lundh -OleFileIO_PL changes are Copyright (c) 2005-2012 by Philippe Lagadec +OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec See source code and LICENSE.txt for information on usage and redistribution. WARNING: THIS IS (STILL) WORK IN PROGRESS. """ -__author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec" -__date__ = "2012-07-25" -__version__ = '0.23' +__author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" +__date__ = "2013-05-03" +__version__ = '0.24' #--- LICENSE ------------------------------------------------------------------ # OleFileIO_PL is an improved version of the OleFileIO module from the # Python Imaging Library (PIL). -# OleFileIO_PL changes are Copyright (c) 2005-2012 by Philippe Lagadec +# OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec # # The Python Imaging Library (PIL) is # Copyright (c) 1997-2005 by Secret Labs AB @@ -110,13 +110,18 @@ __version__ = '0.23' # (https://bitbucket.org/decalage/olefileio_pl/issue/7) # - added close method to OleFileIO (fixed issue #2) # 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) +# 2013-05-03 v0.24 PL: - getproperties: added conversion from filetime to python +# datetime +# - main: displays properties with date format #----------------------------------------------------------------------------- # TODO (for version 1.0): +# + add path attrib to _OleDirEntry, set it once and for all in init or +# append_kids (then listdir/_list can be simplified) # - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... 
# - add underscore to each private method, to avoid their display in -# pydoc/epydoc documentation +# pydoc/epydoc documentation - Remove it for classes to be documented # - replace all raised exceptions with _raise_defect (at least in OleFileIO) # - merge code from _OleStream and OleFileIO.getsect to read sectors # (maybe add a class for FAT and MiniFAT ?) @@ -197,7 +202,7 @@ __version__ = '0.23' #------------------------------------------------------------------------------ -import string, StringIO, struct, array, os.path, sys +import string, StringIO, struct, array, os.path, sys, datetime #[PL] Define explicitly the public API to avoid private objects in pydoc: __all__ = ['OleFileIO', 'isOleFile'] @@ -1507,11 +1512,12 @@ class OleFileIO: return self.root.name - def getproperties(self, filename): + def getproperties(self, filename, convert_time=False): """ Return properties described in substream. filename: path of stream in storage tree (see openstream for syntax) + convert_time: bool, if True timestamps will be converted to Python datetime return: a dictionary of values indexed by id (integer) """ fp = self.openstream(filename) @@ -1562,9 +1568,17 @@ class OleFileIO: value = _unicode(s[offset+8:offset+8+count*2]) elif type == VT_FILETIME: value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) - # FIXME: this is a 64-bit int: "number of 100ns periods - # since Jan 1,1601". Should map this to Python time - value = value / 10000000L # seconds + # FILETIME is a 64-bit int: "number of 100ns periods + # since Jan 1,1601". 
+ if convert_time: + # convert FILETIME to Python datetime.datetime + # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ + _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) + value = _FILETIME_null_date + datetime.timedelta(microseconds=value/10) + else: + # legacy code kept for backward compatibility: returns a + # number of seconds since Jan 1,1601 + value = value / 10000000L # seconds elif type == VT_UI1: value = ord(s[offset+4]) elif type == VT_CLSID: @@ -1628,7 +1642,7 @@ Options: for streamname in ole.listdir(): if streamname[-1][0] == "\005": print streamname, ": properties" - props = ole.getproperties(streamname) + props = ole.getproperties(streamname, convert_time=True) props = props.items() props.sort() for k, v in props: From d5166fd97ec238e4f4175399571f8b2d764ab6e8 Mon Sep 17 00:00:00 2001 From: decalage Date: Sun, 5 May 2013 23:52:20 +0200 Subject: [PATCH 020/101] - new class OleMetadata to parse standard properties - added get_metadata method to OleFileIO --- PIL/OleFileIO.py | 123 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 120 insertions(+), 3 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 5574eafb6..a995cb3a3 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -version 0.24 2013-05-03 Philippe Lagadec - http://www.decalage.info +version 0.24 2013-05-05 Philippe Lagadec - http://www.decalage.info Project website: http://www.decalage.info/python/olefileio @@ -24,7 +24,7 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. 
""" __author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" -__date__ = "2013-05-03" +__date__ = "2013-05-05" __version__ = '0.24' #--- LICENSE ------------------------------------------------------------------ @@ -110,9 +110,11 @@ __version__ = '0.24' # (https://bitbucket.org/decalage/olefileio_pl/issue/7) # - added close method to OleFileIO (fixed issue #2) # 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) -# 2013-05-03 v0.24 PL: - getproperties: added conversion from filetime to python +# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python # datetime # - main: displays properties with date format +# - new class OleMetadata to parse standard properties +# - added get_metadata method #----------------------------------------------------------------------------- @@ -428,6 +430,107 @@ except NameError: #=== CLASSES ================================================================== +class OleMetadata: + """ + class to parse and store metadata from standard properties of OLE files. 
+ + References for SummaryInformation stream: + - http://msdn.microsoft.com/en-us/library/dd942545.aspx + - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx + - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx + - http://msdn.microsoft.com/en-us/library/aa372045.aspx + - http://sedna-soft.de/summary-information-stream/ + - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html + + References for DocumentSummaryInformation stream: + - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx + - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx + - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html + """ + + # attribute names for SummaryInformation stream properties: + SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments', + 'template', 'last_saved_by', 'revision_number', 'total_edit_time', + 'last_printed', 'create_time', 'last_saved_time', 'num_pages', + 'num_words', 'num_chars', 'thumbnail', 'creating_application', + 'security'] + + # attribute names for DocumentSummaryInformation stream properties: + DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs', + 'slides', 'notes', 'hidden_slides', 'mm_clips', + 'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager', + 'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc', + 'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig', + 'content_type', 'content_status', 'language', 'doc_version'] + + def __init__(self): + self.codepage = None + self.title = None + self.subject = None + self.author = None + self.keywords = None + self.comments = None + self.template = None + self.last_saved_by = None + self.revision_number = None + self.total_edit_time = None + self.last_printed = None + self.create_time = None + self.last_saved_time = None + self.num_pages = None + 
self.num_words = None + self.num_chars = None + self.thumbnail = None + self.creating_application = None + self.security = None +## self. = None +## self. = None +## self. = None +## self. = None +## self. = None +## self. = None +## self. = None +## self. = None +## self. = None +## self. = None +## self. = None +## self. = None + + + def parse_properties(self, olefile): + """ + Parse standard properties of an OLE file + """ + if olefile.exists("\x05SummaryInformation"): + # get properties from the stream: + props = olefile.getproperties("\x05SummaryInformation", + convert_time=True) + # store them into this object's attributes: + for i in range(len(self.SUMMARY_ATTRIBS)): + # ids for standards properties start at 0x01, until 0x13 + value = props.get(i+1, None) + setattr(self, self.SUMMARY_ATTRIBS[i], value) + if olefile.exists("\x05DocumentSummaryInformation"): + # get properties from the stream: + props = olefile.getproperties("\x05DocumentSummaryInformation", + convert_time=True) + # store them into this object's attributes: + for i in range(len(self.DOCSUM_ATTRIBS)): + # ids for standards properties start at 0x01, until 0x13 + value = props.get(i+1, None) + setattr(self, self.DOCSUM_ATTRIBS[i], value) + + def dump(self): + print 'Properties from SummaryInformation stream:' + for prop in self.SUMMARY_ATTRIBS: + value = getattr(self, prop) + print '- %s: %s' % (prop, value) + print 'Properties from DocumentSummaryInformation stream:' + for prop in self.DOCSUM_ATTRIBS: + value = getattr(self, prop) + print '- %s: %s' % (prop, value) + + #--- _OleStream --------------------------------------------------------------- class _OleStream(StringIO.StringIO): @@ -1598,6 +1701,16 @@ class OleFileIO: return data + def get_metadata(self): + """ + Parse standard properties streams, return an OleMetadata object + containing all the available metadata. 
+ (also stored in the metadata attribute of the OleFileIO object) + """ + self.metadata = OleMetadata() + self.metadata.parse_properties(self) + return self.metadata + # # -------------------------------------------------------------------- # This script can be used to dump the directory of any OLE2 structured @@ -1673,6 +1786,10 @@ Options: print 'NOT a stream : type=%d' % st_type print '' + # parse and display metadata: + meta = ole.get_metadata() + meta.dump() + print '' #[PL] Test a few new methods: root = ole.get_rootentry_name() print 'Root entry name: "%s"' % root From c5cd0ccefcf1805f1df8e2c186f1990a77843da8 Mon Sep 17 00:00:00 2001 From: decalage Date: Mon, 6 May 2013 07:02:30 +0200 Subject: [PATCH 021/101] updated readme --- PIL/OleFileIO-README.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index 0d212866f..2ecf18445 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ -3,25 +3,26 @@ OleFileIO_PL [OleFileIO_PL](http://www.decalage.info/python/olefileio) is a Python module to read [Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format)](http://en.wikipedia.org/wiki/Compound_File_Binary_Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -This is an improved version of the OleFileIO module from [PIL](http://www.pythonware.com/products/pil/index.htm), the excellent Python Imaging Library, created and maintained by Fredrik Lundh. The API is still compatible with PIL, but I have improved the internal implementation significantly, with bugfixes and a more robust design. +This is an improved version of the OleFileIO module from [PIL](http://www.pythonware.com/products/pil/index.htm), the excellent Python Imaging Library, created and maintained by Fredrik Lundh. 
The API is still compatible with PIL, but I have improved the internal implementation significantly, with new features, bugfixes and a more robust design. As far as I know, this module is now the most complete and robust Python implementation to read MS OLE2 files, portable on several operating systems. (please tell me if you know other similar Python modules) WARNING: THIS IS (STILL) WORK IN PROGRESS. -Main improvements over PIL version: ------------------------------------ +Main improvements over PIL version of OleFileIO: +------------------------------------------------ - Better compatibility with Python 2.4 up to 2.7 - Support for files larger than 6.8MB - Robust: many checks to detect malformed files - Improved API +- New features: metadata extraction - Added setup.py and install.bat to ease installation News ---- -- 2013-05-03 v0.24: improved get_properties to convert timestamps to Python datetime +- 2013-05-05 v0.24: new features to extract metadata (get\_metadata method and OleMetadata class), improved getproperties to convert timestamps to Python datetime - 2012-09-11 v0.23: added support for file-like objects, fixed [issue #8](https://bitbucket.org/decalage/olefileio_pl/issue/8/bug-with-file-object) - 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 (added close method) - 2011-10-20: code hosted on bitbucket to ease contributions and bug tracking @@ -70,6 +71,14 @@ Here are a few examples: f.write(data) f.close() + # Extract metadata (new in v0.24): + meta = ole.get_metadata() + print 'Author:', meta.author + print 'Title:', meta.title + print 'Creation date:', meta.create_time + # print all metadata: + meta.dump() + # Close the OLE file: ole.close() @@ -114,7 +123,7 @@ License OleFileIO_PL is open-source. -OleFileIO_PL changes are Copyright (c) 2005-2012 by Philippe Lagadec. +OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec. 
The Python Imaging Library (PIL) is From 5b616ca1becc959bcd2da2207c7d0013d01963d6 Mon Sep 17 00:00:00 2001 From: decalage Date: Tue, 7 May 2013 23:44:27 +0200 Subject: [PATCH 022/101] v0.24: slight improvements in OleMetadata, updated readme. --- PIL/OleFileIO-README.md | 4 +-- PIL/OleFileIO.py | 78 +++++++++++++++++++++++++++++++++-------- 2 files changed, 65 insertions(+), 17 deletions(-) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index 2ecf18445..7ff2691d4 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ -22,7 +22,7 @@ Main improvements over PIL version of OleFileIO: News ---- -- 2013-05-05 v0.24: new features to extract metadata (get\_metadata method and OleMetadata class), improved getproperties to convert timestamps to Python datetime +- 2013-05-07 v0.24: new features to extract metadata (get\_metadata method and OleMetadata class), improved getproperties to convert timestamps to Python datetime - 2012-09-11 v0.23: added support for file-like objects, fixed [issue #8](https://bitbucket.org/decalage/olefileio_pl/issue/8/bug-with-file-object) - 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 (added close method) - 2011-10-20: code hosted on bitbucket to ease contributions and bug tracking @@ -71,7 +71,7 @@ Here are a few examples: f.write(data) f.close() - # Extract metadata (new in v0.24): + # Extract metadata (new in v0.24) - see source code for all attributes: meta = ole.get_metadata() print 'Author:', meta.author print 'Title:', meta.title diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index a995cb3a3..fa1b9ae07 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... 
-version 0.24 2013-05-05 Philippe Lagadec - http://www.decalage.info +version 0.24 2013-05-07 Philippe Lagadec - http://www.decalage.info Project website: http://www.decalage.info/python/olefileio @@ -24,7 +24,7 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. """ __author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" -__date__ = "2013-05-05" +__date__ = "2013-05-07" __version__ = '0.24' #--- LICENSE ------------------------------------------------------------------ @@ -115,6 +115,7 @@ __version__ = '0.24' # - main: displays properties with date format # - new class OleMetadata to parse standard properties # - added get_metadata method +# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata #----------------------------------------------------------------------------- @@ -434,6 +435,19 @@ class OleMetadata: """ class to parse and store metadata from standard properties of OLE files. + Available attributes: + codepage, title, subject, author, keywords, comments, template, + last_saved_by, revision_number, total_edit_time, last_printed, create_time, + last_saved_time, num_pages, num_words, num_chars, thumbnail, + creating_application, security, codepage_doc, category, presentation_target, + bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips, + scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty, + chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed, + version, dig_sig, content_type, content_status, language, doc_version + + Note: an attribute is set to None when not present in the properties of the + OLE file. 
+ References for SummaryInformation stream: - http://msdn.microsoft.com/en-us/library/dd942545.aspx - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx @@ -449,6 +463,7 @@ class OleMetadata: """ # attribute names for SummaryInformation stream properties: + # (ordered by property id, starting at 1) SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments', 'template', 'last_saved_by', 'revision_number', 'total_edit_time', 'last_printed', 'create_time', 'last_saved_time', 'num_pages', @@ -456,6 +471,7 @@ class OleMetadata: 'security'] # attribute names for DocumentSummaryInformation stream properties: + # (ordered by property id, starting at 1) DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs', 'slides', 'notes', 'hidden_slides', 'mm_clips', 'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager', @@ -464,6 +480,11 @@ class OleMetadata: 'content_type', 'content_status', 'language', 'doc_version'] def __init__(self): + """ + Constructor for OleMetadata + All attributes are set to None by default + """ + # properties from SummaryInformation stream self.codepage = None self.title = None self.subject = None @@ -483,24 +504,48 @@ class OleMetadata: self.thumbnail = None self.creating_application = None self.security = None -## self. = None -## self. = None -## self. = None -## self. = None -## self. = None -## self. = None -## self. = None -## self. = None -## self. = None -## self. = None -## self. = None -## self. 
= None + # properties from DocumentSummaryInformation stream + self.codepage_doc = None + self.category = None + self.presentation_target = None + self.bytes = None + self.lines = None + self.paragraphs = None + self.slides = None + self.notes = None + self.hidden_slides = None + self.mm_clips = None + self.scale_crop = None + self.heading_pairs = None + self.titles_of_parts = None + self.manager = None + self.company = None + self.links_dirty = None + self.chars_with_spaces = None + self.unused = None + self.shared_doc = None + self.link_base = None + self.hlinks = None + self.hlinks_changed = None + self.version = None + self.dig_sig = None + self.content_type = None + self.content_status = None + self.language = None + self.doc_version = None def parse_properties(self, olefile): """ - Parse standard properties of an OLE file + Parse standard properties of an OLE file, from the streams + "\x05SummaryInformation" and "\x05DocumentSummaryInformation", + if present. + Properties are converted to strings, integers or python datetime objects. + If a property is not present, its value is set to None. """ + # first set all attributes to None: + for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS): + setattr(self, attrib, None) if olefile.exists("\x05SummaryInformation"): # get properties from the stream: props = olefile.getproperties("\x05SummaryInformation", @@ -521,6 +566,9 @@ class OleMetadata: setattr(self, self.DOCSUM_ATTRIBS[i], value) def dump(self): + """ + Dump all metadata, for debugging purposes. 
+ """ print 'Properties from SummaryInformation stream:' for prop in self.SUMMARY_ATTRIBS: value = getattr(self, prop) From 90f0b6796e6e5764d0923424fa4d8596bfbbe9ce Mon Sep 17 00:00:00 2001 From: decalage Date: Tue, 7 May 2013 23:50:46 +0200 Subject: [PATCH 023/101] updated readme --- PIL/OleFileIO-README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index 7ff2691d4..c169ab73d 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ -23,6 +23,7 @@ News ---- - 2013-05-07 v0.24: new features to extract metadata (get\_metadata method and OleMetadata class), improved getproperties to convert timestamps to Python datetime +- 2012-10-09: published [python-oletools](http://www.decalage.info/python/oletools), a package of analysis tools based on OleFileIO_PL - 2012-09-11 v0.23: added support for file-like objects, fixed [issue #8](https://bitbucket.org/decalage/olefileio_pl/issue/8/bug-with-file-object) - 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 (added close method) - 2011-10-20: code hosted on bitbucket to ease contributions and bug tracking From 8e826441b2dec739a95bffba8f3382420de95873 Mon Sep 17 00:00:00 2001 From: decalage Date: Sat, 25 May 2013 00:28:36 +0200 Subject: [PATCH 024/101] 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps - OleMetaData: total_edit_time is now a number of seconds, not a timestamp - getproperties: added support for VT_BOOL, VT_INT, V_UINT - getproperties: filter out null chars from strings - getproperties: raise non-fatal defects instead of exceptions when properties cannot be parsed properly --- PIL/OleFileIO.py | 200 +++++++++++++++++++++++++++++++---------------- 1 file changed, 132 insertions(+), 68 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index fa1b9ae07..d3839c9e3 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as 
Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -version 0.24 2013-05-07 Philippe Lagadec - http://www.decalage.info +version 0.25 2013-05-24 Philippe Lagadec - http://www.decalage.info Project website: http://www.decalage.info/python/olefileio @@ -24,8 +24,8 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. """ __author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" -__date__ = "2013-05-07" -__version__ = '0.24' +__date__ = "2013-05-24" +__version__ = '0.25' #--- LICENSE ------------------------------------------------------------------ @@ -116,10 +116,19 @@ __version__ = '0.24' # - new class OleMetadata to parse standard properties # - added get_metadata method # 2013-05-07 v0.24 PL: - a few improvements in OleMetadata +# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps +# - OleMetaData: total_edit_time is now a number of seconds, +# not a timestamp +# - getproperties: added support for VT_BOOL, VT_INT, V_UINT +# - getproperties: filter out null chars from strings +# - getproperties: raise non-fatal defects instead of +# exceptions when properties cannot be parsed properly #----------------------------------------------------------------------------- # TODO (for version 1.0): +# + _raise_defect: store all defects that are not raised as exceptions, and +# display them in main for information. # + add path attrib to _OleDirEntry, set it once and for all in init or # append_kids (then listdir/_list can be simplified) # - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... 
@@ -138,6 +147,8 @@ __version__ = '0.24' # - improve docstrings to show more sample uses # - see also original notes and FIXME below # - remove all obsolete FIXMEs +# - OleMetadata: fix version attrib according to +# http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx # IDEAS: # - allow _raise_defect to raise different exceptions, not only IOError @@ -548,8 +559,10 @@ class OleMetadata: setattr(self, attrib, None) if olefile.exists("\x05SummaryInformation"): # get properties from the stream: + # (converting timestamps to python datetime, except total_edit_time, + # which is property #10) props = olefile.getproperties("\x05SummaryInformation", - convert_time=True) + convert_time=True, no_conversion=[10]) # store them into this object's attributes: for i in range(len(self.SUMMARY_ATTRIBS)): # ids for standards properties start at 0x01, until 0x13 @@ -572,11 +585,11 @@ class OleMetadata: print 'Properties from SummaryInformation stream:' for prop in self.SUMMARY_ATTRIBS: value = getattr(self, prop) - print '- %s: %s' % (prop, value) + print '- %s: %s' % (prop, repr(value)) print 'Properties from DocumentSummaryInformation stream:' for prop in self.DOCSUM_ATTRIBS: value = getattr(self, prop) - print '- %s: %s' % (prop, value) + print '- %s: %s' % (prop, repr(value)) #--- _OleStream --------------------------------------------------------------- @@ -1663,89 +1676,140 @@ class OleFileIO: return self.root.name - def getproperties(self, filename, convert_time=False): + def getproperties(self, filename, convert_time=False, no_conversion=None): """ Return properties described in substream. 
filename: path of stream in storage tree (see openstream for syntax) convert_time: bool, if True timestamps will be converted to Python datetime + no_conversion: None or list of int, timestamps not to be converted + (for example total editing time is not a real timestamp) return: a dictionary of values indexed by id (integer) """ + # make sure no_conversion is a list, just to simplify code below: + if no_conversion == None: + no_conversion = [] fp = self.openstream(filename) data = {} - # header - s = fp.read(28) - clsid = _clsid(s[8:24]) + try: + # header + s = fp.read(28) + clsid = _clsid(s[8:24]) - # format id - s = fp.read(20) - fmtid = _clsid(s[:16]) - fp.seek(i32(s, 16)) + # format id + s = fp.read(20) + fmtid = _clsid(s[:16]) + fp.seek(i32(s, 16)) - # get section - s = "****" + fp.read(i32(fp.read(4))-4) + # get section + s = "****" + fp.read(i32(fp.read(4))-4) + except: + # catch exception while parsing property header, and only raise + # a DEFECT_INCORRECT then return an empty dict, because this is not + # a fatal error when parsing the whole file + exctype, excvalue = sys.exc_info()[:2] + self._raise_defect(DEFECT_INCORRECT, excvalue) + return data for i in range(i32(s, 4)): + try: + id = i32(s, 8+i*8) + offset = i32(s, 12+i*8) + type = i32(s, offset) - id = i32(s, 8+i*8) - offset = i32(s, 12+i*8) - type = i32(s, offset) + debug ('property id=%d: type=%d offset=%X' % (id, type, offset)) - debug ('property id=%d: type=%d offset=%X' % (id, type, offset)) + # test for common types first (should perhaps use + # a dictionary instead?) - # test for common types first (should perhaps use - # a dictionary instead?) 
- - if type == VT_I2: - value = i16(s, offset+4) - if value >= 32768: - value = value - 65536 - elif type == VT_UI2: - value = i16(s, offset+4) - elif type in (VT_I4, VT_ERROR): - value = i32(s, offset+4) - elif type == VT_UI4: - value = i32(s, offset+4) # FIXME - elif type in (VT_BSTR, VT_LPSTR): - count = i32(s, offset+4) - value = s[offset+8:offset+8+count-1] - elif type == VT_BLOB: - count = i32(s, offset+4) - value = s[offset+8:offset+8+count] - elif type == VT_LPWSTR: - count = i32(s, offset+4) - value = _unicode(s[offset+8:offset+8+count*2]) - elif type == VT_FILETIME: - value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) - # FILETIME is a 64-bit int: "number of 100ns periods - # since Jan 1,1601". - if convert_time: - # convert FILETIME to Python datetime.datetime - # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ - _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) - value = _FILETIME_null_date + datetime.timedelta(microseconds=value/10) + if type == VT_I2: # 16-bit signed integer + value = i16(s, offset+4) + if value >= 32768: + value = value - 65536 + elif type == VT_UI2: # 2-byte unsigned integer + value = i16(s, offset+4) + elif type in (VT_I4, VT_INT, VT_ERROR): + # VT_I4: 32-bit signed integer + # VT_ERROR: HRESULT, similar to 32-bit signed integer, + # see http://msdn.microsoft.com/en-us/library/cc230330.aspx + value = i32(s, offset+4) + elif type in (VT_UI4, VT_UINT): # 4-byte unsigned integer + value = i32(s, offset+4) # FIXME + elif type in (VT_BSTR, VT_LPSTR): + # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx + # size is a 32 bits integer, including the null terminator, and + # possibly trailing or embedded null chars + #TODO: if codepage is unicode, the string should be converted as such + count = i32(s, offset+4) + value = s[offset+8:offset+8+count-1] + # remove all null chars: + value = value.replace('\x00', '') + elif type == VT_BLOB: + # binary large object 
(BLOB) + # see http://msdn.microsoft.com/en-us/library/dd942282.aspx + count = i32(s, offset+4) + value = s[offset+8:offset+8+count] + elif type == VT_LPWSTR: + # UnicodeString + # see http://msdn.microsoft.com/en-us/library/dd942313.aspx + # "the string should NOT contain embedded or additional trailing + # null characters." + count = i32(s, offset+4) + value = _unicode(s[offset+8:offset+8+count*2]) + elif type == VT_FILETIME: + value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) + # FILETIME is a 64-bit int: "number of 100ns periods + # since Jan 1,1601". + if convert_time and id not in no_conversion: + debug('Converting property #%d to python datetime, value=%d=%fs' + %(id, value, float(value)/10000000L)) + # convert FILETIME to Python datetime.datetime + # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ + _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) + debug('timedelta days=%d' % (value/(10*1000000*3600*24))) + value = _FILETIME_null_date + datetime.timedelta(microseconds=value/10) + else: + # legacy code kept for backward compatibility: returns a + # number of seconds since Jan 1,1601 + value = value / 10000000L # seconds + elif type == VT_UI1: # 1-byte unsigned integer + value = ord(s[offset+4]) + elif type == VT_CLSID: + value = _clsid(s[offset+4:offset+20]) + elif type == VT_CF: + # PropertyIdentifier or ClipboardData?? 
+ # see http://msdn.microsoft.com/en-us/library/dd941945.aspx + count = i32(s, offset+4) + value = s[offset+8:offset+8+count] + elif type == VT_BOOL: + # VARIANT_BOOL, 16 bits bool, 0x0000=Fals, 0xFFFF=True + # see http://msdn.microsoft.com/en-us/library/cc237864.aspx + value = bool(i16(s, offset+4)) else: - # legacy code kept for backward compatibility: returns a - # number of seconds since Jan 1,1601 - value = value / 10000000L # seconds - elif type == VT_UI1: - value = ord(s[offset+4]) - elif type == VT_CLSID: - value = _clsid(s[offset+4:offset+20]) - elif type == VT_CF: - count = i32(s, offset+4) - value = s[offset+8:offset+8+count] - else: - value = None # everything else yields "None" + value = None # everything else yields "None" + debug ('property id=%d: type=%d not implemented in parser yet' % (id, type)) - # FIXME: add support for VT_VECTOR + # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE, + # VT_DECIMAL, VT_I1, VT_I8, VT_UI8, + # see http://msdn.microsoft.com/en-us/library/dd942033.aspx - #print "%08x" % id, repr(value), - #print "(%s)" % VT[i32(s, offset) & 0xFFF] + # FIXME: add support for VT_VECTOR + # VT_VECTOR is a 32 uint giving the number of items, followed by + # the items in sequence. The VT_VECTOR value is combined with the + # type of items, e.g. 
VT_VECTOR|VT_BSTR + # see http://msdn.microsoft.com/en-us/library/dd942011.aspx - data[id] = value + #print "%08x" % id, repr(value), + #print "(%s)" % VT[i32(s, offset) & 0xFFF] + + data[id] = value + except: + # catch exception while parsing each property, and only raise + # a DEFECT_INCORRECT, because parsing can go on + exctype, excvalue = sys.exc_info()[:2] + self._raise_defect(DEFECT_INCORRECT, excvalue) return data @@ -1795,7 +1859,7 @@ Options: check_streams = True continue - ole = OleFileIO(filename, raise_defects=DEFECT_INCORRECT) + ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT) print "-" * 68 print filename print "-" * 68 From 7d37cf071f9d29028d05b46e948c9307727dcb86 Mon Sep 17 00:00:00 2001 From: decalage Date: Mon, 27 May 2013 07:27:04 +0200 Subject: [PATCH 025/101] - getproperties: improved exception handling - _raise_defect: added option to set exception type - all non-fatal issues are now recorded, and displayed when run as a script --- PIL/OleFileIO.py | 51 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index d3839c9e3..f7c12010f 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -version 0.25 2013-05-24 Philippe Lagadec - http://www.decalage.info +version 0.25 2013-05-27 Philippe Lagadec - http://www.decalage.info Project website: http://www.decalage.info/python/olefileio @@ -24,7 +24,7 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. 
""" __author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" -__date__ = "2013-05-24" +__date__ = "2013-05-27" __version__ = '0.25' #--- LICENSE ------------------------------------------------------------------ @@ -123,12 +123,14 @@ __version__ = '0.25' # - getproperties: filter out null chars from strings # - getproperties: raise non-fatal defects instead of # exceptions when properties cannot be parsed properly +# 2013-05-27 PL: - getproperties: improved exception handling +# - _raise_defect: added option to set exception type +# - all non-fatal issues are now recorded, and displayed +# when run as a script #----------------------------------------------------------------------------- # TODO (for version 1.0): -# + _raise_defect: store all defects that are not raised as exceptions, and -# display them in main for information. # + add path attrib to _OleDirEntry, set it once and for all in init or # append_kids (then listdir/_list can be simplified) # - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... @@ -151,9 +153,6 @@ __version__ = '0.25' # http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx # IDEAS: -# - allow _raise_defect to raise different exceptions, not only IOError -# - provide a class with named attributes to get well-known properties of -# MS Office documents (title, author, ...) ? 
# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for # streams with unknown size # - use arrays of int instead of long integers for FAT/MiniFAT, to improve @@ -979,12 +978,16 @@ class OleFileIO: (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a security-oriented application, see source code for details) """ + # minimal level for defects to be raised as exceptions: self._raise_defects_level = raise_defects + # list of defects/issues not raised as exceptions: + # tuples of (exception type, message) + self.parsing_issues = [] if filename: self.open(filename) - def _raise_defect(self, defect_level, message): + def _raise_defect(self, defect_level, message, exception_type=IOError): """ This method should be called for any defect found during file parsing. It may raise an IOError exception according to the minimal level chosen @@ -996,10 +999,14 @@ class OleFileIO: DEFECT_INCORRECT : an error according to specifications, but parsing can go on DEFECT_FATAL : an error which cannot be ignored, parsing is impossible message: string describing the defect, used with raised exception. 
+ exception_type: exception class to be raised, IOError by default """ # added by [PL] if defect_level >= self._raise_defects_level: - raise IOError, message + raise exception_type, message + else: + # just record the issue, no exception raised: + self.parsing_issues.append((exception_type, message)) def open(self, filename): @@ -1689,6 +1696,11 @@ class OleFileIO: # make sure no_conversion is a list, just to simplify code below: if no_conversion == None: no_conversion = [] + # stream path as a string to report exceptions: + streampath = filename + if not isinstance(streampath, str): + streampath = '/'.join(streampath) + fp = self.openstream(filename) data = {} @@ -1705,16 +1717,21 @@ class OleFileIO: # get section s = "****" + fp.read(i32(fp.read(4))-4) + # number of properties: + num_props = i32(s, 4) except: # catch exception while parsing property header, and only raise # a DEFECT_INCORRECT then return an empty dict, because this is not # a fatal error when parsing the whole file exctype, excvalue = sys.exc_info()[:2] - self._raise_defect(DEFECT_INCORRECT, excvalue) + msg = 'Error while parsing properties header in stream %s: %s' % ( + repr(streampath), excvalue) + self._raise_defect(DEFECT_INCORRECT, msg, exctype) return data - for i in range(i32(s, 4)): + for i in range(num_props): try: + id = 0 # just in case of an exception id = i32(s, 8+i*8) offset = i32(s, 12+i*8) type = i32(s, offset) @@ -1809,7 +1826,9 @@ class OleFileIO: # catch exception while parsing each property, and only raise # a DEFECT_INCORRECT, because parsing can go on exctype, excvalue = sys.exc_info()[:2] - self._raise_defect(DEFECT_INCORRECT, excvalue) + msg = 'Error while parsing property id %d in stream %s: %s' % ( + id, repr(streampath), excvalue) + self._raise_defect(DEFECT_INCORRECT, msg, exctype) return data @@ -1911,5 +1930,13 @@ Options: print "size :", ole.get_size('worddocument') if ole.exists('macros/vba'): print "This document may contain VBA macros." 
+ + # print parsing issues: + print '\nNon-fatal issues raised during parsing:' + if ole.parsing_issues: + for exctype, msg in ole.parsing_issues: + print '- %s: %s' % (exctype.__name__, msg) + else: + print 'None' ## except IOError, v: ## print "***", "cannot read", file, "-", v From e1fe77b48bfccc4b9a23c979c468c0a3cf01fa60 Mon Sep 17 00:00:00 2001 From: decalage Date: Mon, 27 May 2013 07:34:36 +0200 Subject: [PATCH 026/101] fix #12 updated readme --- PIL/OleFileIO-README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index c169ab73d..700e92d20 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ -22,6 +22,7 @@ Main improvements over PIL version of OleFileIO: News ---- +- 2013-05-27 v0.25: improved metadata extraction, properties parsing and exception handling, fixed [issue #12](https://bitbucket.org/decalage/olefileio_pl/issue/12/error-when-converting-timestamps-in-ole) - 2013-05-07 v0.24: new features to extract metadata (get\_metadata method and OleMetadata class), improved getproperties to convert timestamps to Python datetime - 2012-10-09: published [python-oletools](http://www.decalage.info/python/oletools), a package of analysis tools based on OleFileIO_PL - 2012-09-11 v0.23: added support for file-like objects, fixed [issue #8](https://bitbucket.org/decalage/olefileio_pl/issue/8/bug-with-file-object) From a468016b630e544d16f113c17806991de5581978 Mon Sep 17 00:00:00 2001 From: decalage Date: Wed, 24 Jul 2013 00:34:12 +0200 Subject: [PATCH 027/101] v0.26: added methods to parse stream/storage timestamps, improved listdir to include storages, fixed parsing of direntry timestamps --- PIL/OleFileIO-README.md | 3 +- PIL/OleFileIO.py | 132 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 121 insertions(+), 14 deletions(-) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index 700e92d20..e94d72593 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ 
-16,12 +16,13 @@ Main improvements over PIL version of OleFileIO: - Support for files larger than 6.8MB - Robust: many checks to detect malformed files - Improved API -- New features: metadata extraction +- New features: metadata extraction, stream/storage timestamps - Added setup.py and install.bat to ease installation News ---- +- 2013-07-24 v0.26: added methods to parse stream/storage timestamps, improved listdir to include storages - 2013-05-27 v0.25: improved metadata extraction, properties parsing and exception handling, fixed [issue #12](https://bitbucket.org/decalage/olefileio_pl/issue/12/error-when-converting-timestamps-in-ole) - 2013-05-07 v0.24: new features to extract metadata (get\_metadata method and OleMetadata class), improved getproperties to convert timestamps to Python datetime - 2012-10-09: published [python-oletools](http://www.decalage.info/python/oletools), a package of analysis tools based on OleFileIO_PL diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index f7c12010f..8f10d7b33 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -6,7 +6,7 @@ OleFileIO_PL: Microsoft Compound Document File Format), such as Microsoft Office documents, Image Composer and FlashPix files, Outlook messages, ... -version 0.25 2013-05-27 Philippe Lagadec - http://www.decalage.info +version 0.26 2013-07-24 Philippe Lagadec - http://www.decalage.info Project website: http://www.decalage.info/python/olefileio @@ -24,8 +24,8 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. 
""" __author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" -__date__ = "2013-05-27" -__version__ = '0.25' +__date__ = "2013-07-24" +__version__ = '0.26' #--- LICENSE ------------------------------------------------------------------ @@ -127,7 +127,10 @@ __version__ = '0.25' # - _raise_defect: added option to set exception type # - all non-fatal issues are now recorded, and displayed # when run as a script - +# 2013-07-11 v0.26 PL: - added methods to get modification and creation times +# of a directory entry or a storage/stream +# - fixed parsing of direntry timestamps +# 2013-07-24 PL: - new options in listdir to list storages and/or streams #----------------------------------------------------------------------------- # TODO (for version 1.0): @@ -437,6 +440,16 @@ except NameError: return filter(ord, s) +def filetime2datetime(filetime): + """ + convert FILETIME (64 bits int) to Python datetime.datetime + """ + # TODO: manage exception when microseconds is too large + # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ + _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) + #debug('timedelta days=%d' % (filetime/(10*1000000*3600*24))) + return _FILETIME_null_date + datetime.timedelta(microseconds=filetime/10) + #=== CLASSES ================================================================== @@ -470,6 +483,8 @@ class OleMetadata: - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html + + new in version 0.25 """ # attribute names for SummaryInformation stream properties: @@ -732,7 +747,8 @@ class _OleDirectoryEntry: #[PL] parsing code moved from OleFileIO.loaddirectory # struct to parse directory entries: - # <: little-endian byte order + # <: little-endian byte order, standard sizes + # (note: this should guarantee that Q returns a 64 
bits int) # 64s: string containing entry name in unicode (max 31 chars) + null char # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 # B: uint8, dir entry type (between 0 and 5) @@ -742,13 +758,13 @@ class _OleDirectoryEntry: # I: uint32, index of child root node if it is a storage, else NOSTREAM # 16s: CLSID, unique identifier (only used if it is a storage) # I: uint32, user flags - # 8s: uint64, creation timestamp or zero - # 8s: uint64, modification timestamp or zero + # Q (was 8s): uint64, creation timestamp or zero + # Q (was 8s): uint64, modification timestamp or zero # I: uint32, SID of first sector if stream or ministream, SID of 1st sector # of stream containing ministreams if root entry, 0 otherwise # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise - STRUCT_DIRENTRY = '<64sHBBIII16sI8s8sIII' + STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII' # size of a directory entry: 128 bytes DIRENTRY_SIZE = 128 assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE @@ -938,6 +954,34 @@ class _OleDirectoryEntry: kid.dump(tab + 2) + def getmtime(self): + """ + Return modification time of a directory entry. + + return: None if modification time is null, a python datetime object + otherwise (UTC timezone) + + new in version 0.26 + """ + if self.modifyTime == 0: + return None + return filetime2datetime(self.modifyTime) + + + def getctime(self): + """ + Return creation time of a directory entry. 
+ + return: None if modification time is null, a python datetime object + otherwise (UTC timezone) + + new in version 0.26 + """ + if self.createTime == 0: + return None + return filetime2datetime(self.createTime) + + #--- OleFileIO ---------------------------------------------------------------- class OleFileIO: @@ -1552,27 +1596,42 @@ class OleFileIO: self.sectorsize, self.fat, self._filesize) - def _list(self, files, prefix, node): + def _list(self, files, prefix, node, streams=True, storages=False): """ (listdir helper) files: list of files to fill in prefix: current location in storage tree (list of names) node: current node (_OleDirectoryEntry object) + streams: bool, include streams if True (True by default) - new in v0.26 + storages: bool, include storages if True (False by default) - new in v0.26 + (note: the root storage is never included) """ prefix = prefix + [node.name] for entry in node.kids: if entry.kids: - self._list(files, prefix, entry) + # this is a storage + if storages: + # add it to the list + files.append(prefix[1:] + [entry.name]) + # check its kids + self._list(files, prefix, entry, streams, storages) else: - files.append(prefix[1:] + [entry.name]) + # this is a stream + if streams: + # add it to the list + files.append(prefix[1:] + [entry.name]) - def listdir(self): + def listdir(self, streams=True, storages=False): """ Return a list of streams stored in this file + + streams: bool, include streams if True (True by default) - new in v0.26 + storages: bool, include storages if True (False by default) - new in v0.26 + (note: the root storage is never included) """ files = [] - self._list(files, [], self.root) + self._list(files, [], self.root, streams, storages) return files @@ -1644,6 +1703,38 @@ class OleFileIO: return False + def getmtime(self, filename): + """ + Return modification time of a stream/storage. + + filename: path of stream/storage in storage tree. 
(see openstream for + syntax) + return: None if modification time is null, a python datetime object + otherwise (UTC timezone) + + new in version 0.26 + """ + sid = self._find(filename) + entry = self.direntries[sid] + return entry.getmtime() + + + def getctime(self, filename): + """ + Return creation time of a stream/storage. + + filename: path of stream/storage in storage tree. (see openstream for + syntax) + return: None if creation time is null, a python datetime object + otherwise (UTC timezone) + + new in version 0.26 + """ + sid = self._find(filename) + entry = self.direntries[sid] + return entry.getctime() + + def exists(self, filename): """ Test if given filename exists as a stream or a storage in the OLE @@ -1837,6 +1928,8 @@ class OleFileIO: Parse standard properties streams, return an OleMetadata object containing all the available metadata. (also stored in the metadata attribute of the OleFileIO object) + + new in version 0.25 """ self.metadata = OleMetadata() self.metadata.parse_properties(self) @@ -1917,6 +2010,19 @@ Options: print 'NOT a stream : type=%d' % st_type print '' +## for streamname in ole.listdir(): +## # print name using repr() to convert binary chars to \xNN: +## print '-', repr('/'.join(streamname)),'-', +## print ole.getmtime(streamname) +## print '' + + print 'Modification/Creation times of all directory entries:' + for entry in ole.direntries: + if entry is not None: + print '- %s: mtime=%s ctime=%s' % (entry.name, + entry.getmtime(), entry.getctime()) + print '' + # parse and display metadata: meta = ole.get_metadata() meta.dump() From 0b79c83709a55f8d7b2721ae7fa32c5cd2f53e40 Mon Sep 17 00:00:00 2001 From: decalage Date: Wed, 24 Jul 2013 00:37:53 +0200 Subject: [PATCH 028/101] updated readme --- PIL/OleFileIO-README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index e94d72593..3914a11a7 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ 
-22,7 +22,7 @@ Main improvements over PIL version of OleFileIO: News ---- -- 2013-07-24 v0.26: added methods to parse stream/storage timestamps, improved listdir to include storages +- 2013-07-24 v0.26: added methods to parse stream/storage timestamps, improved listdir to include storages, fixed parsing of direntry timestamps - 2013-05-27 v0.25: improved metadata extraction, properties parsing and exception handling, fixed [issue #12](https://bitbucket.org/decalage/olefileio_pl/issue/12/error-when-converting-timestamps-in-ole) - 2013-05-07 v0.24: new features to extract metadata (get\_metadata method and OleMetadata class), improved getproperties to convert timestamps to Python datetime - 2012-10-09: published [python-oletools](http://www.decalage.info/python/oletools), a package of analysis tools based on OleFileIO_PL From ac5cb028be80f232e2bfd27bf5d89214a49c57b3 Mon Sep 17 00:00:00 2001 From: Martin Panter Date: Tue, 28 Jan 2014 23:42:24 +0000 Subject: [PATCH 029/101] Convert OleFileIO.py to LF newlines before merging --- PIL/OleFileIO.py | 4096 +++++++++++++++++++++++----------------------- 1 file changed, 2048 insertions(+), 2048 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 8f10d7b33..631a8ed84 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -1,2048 +1,2048 @@ -#!/usr/local/bin/python -# -*- coding: latin-1 -*- -""" -OleFileIO_PL: - Module to read Microsoft OLE2 files (also called Structured Storage or - Microsoft Compound Document File Format), such as Microsoft Office - documents, Image Composer and FlashPix files, Outlook messages, ... 
- -version 0.26 2013-07-24 Philippe Lagadec - http://www.decalage.info - -Project website: http://www.decalage.info/python/olefileio - -Improved version of the OleFileIO module from PIL library v1.1.6 -See: http://www.pythonware.com/products/pil/index.htm - -The Python Imaging Library (PIL) is - Copyright (c) 1997-2005 by Secret Labs AB - Copyright (c) 1995-2005 by Fredrik Lundh -OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec - -See source code and LICENSE.txt for information on usage and redistribution. - -WARNING: THIS IS (STILL) WORK IN PROGRESS. -""" - -__author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" -__date__ = "2013-07-24" -__version__ = '0.26' - -#--- LICENSE ------------------------------------------------------------------ - -# OleFileIO_PL is an improved version of the OleFileIO module from the -# Python Imaging Library (PIL). - -# OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec -# -# The Python Imaging Library (PIL) is -# Copyright (c) 1997-2005 by Secret Labs AB -# Copyright (c) 1995-2005 by Fredrik Lundh -# -# By obtaining, using, and/or copying this software and/or its associated -# documentation, you agree that you have read, understood, and will comply with -# the following terms and conditions: -# -# Permission to use, copy, modify, and distribute this software and its -# associated documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appears in all copies, and that both -# that copyright notice and this permission notice appear in supporting -# documentation, and that the name of Secret Labs AB or the author(s) not be used -# in advertising or publicity pertaining to distribution of the software -# without specific, written prior permission. -# -# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS -# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
-# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, -# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR -# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -# PERFORMANCE OF THIS SOFTWARE. - -#----------------------------------------------------------------------------- -# CHANGELOG: (only OleFileIO_PL changes compared to PIL 1.1.6) -# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility -# (all changes flagged with [PL]) -# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise -# exceptions in _OleStream.__init__() -# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat) -# - added some constants -# - added header values checks -# - added some docstrings -# - getsect: bugfix in case sectors >512 bytes -# - getsect: added conformity checks -# - DEBUG_MODE constant to activate debug display -# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments -# - updated license -# - converted tabs to 4 spaces -# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity -# - improved _unicode() to use Python 2.x unicode support -# - fixed bug in _OleDirectoryEntry -# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops -# - fixed _OleStream which didn't check stream size -# - added/improved many docstrings and comments -# - moved helper functions _unicode and _clsid out of -# OleFileIO class -# - improved OleFileIO._find() to add Unix path syntax -# - OleFileIO._find() is now case-insensitive -# - added get_type() and get_rootentry_name() -# - rewritten loaddirectory and _OleDirectoryEntry -# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict -# - added detection of duplicate filenames in storages -# - added detection of duplicate references to streams -# - added get_size() and exists() to _OleDirectoryEntry -# - added isOleFile to 
check header before parsing -# - added __all__ list to control public keywords in pydoc -# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory -# - improved _unicode(), added workarounds for Python <2.3 -# - added set_debug_mode and -d option to set debug mode -# - fixed bugs in OleFileIO.open and _OleDirectoryEntry -# - added safety check in main for large or binary -# properties -# - allow size>0 for storages for some implementations -# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and -# streams -# - added option '-c' in main to check all streams -# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms -# (thanks to Ben G. and Martijn for reporting the bug) -# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str -# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs -# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn -# (https://bitbucket.org/decalage/olefileio_pl/issue/7) -# - added close method to OleFileIO (fixed issue #2) -# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) -# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python -# datetime -# - main: displays properties with date format -# - new class OleMetadata to parse standard properties -# - added get_metadata method -# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata -# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps -# - OleMetaData: total_edit_time is now a number of seconds, -# not a timestamp -# - getproperties: added support for VT_BOOL, VT_INT, V_UINT -# - getproperties: filter out null chars from strings -# - getproperties: raise non-fatal defects instead of -# exceptions when properties cannot be parsed properly -# 2013-05-27 PL: - getproperties: improved exception handling -# - _raise_defect: added option to set exception type -# - all non-fatal issues are now 
recorded, and displayed -# when run as a script -# 2013-07-11 v0.26 PL: - added methods to get modification and creation times -# of a directory entry or a storage/stream -# - fixed parsing of direntry timestamps -# 2013-07-24 PL: - new options in listdir to list storages and/or streams - -#----------------------------------------------------------------------------- -# TODO (for version 1.0): -# + add path attrib to _OleDirEntry, set it once and for all in init or -# append_kids (then listdir/_list can be simplified) -# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... -# - add underscore to each private method, to avoid their display in -# pydoc/epydoc documentation - Remove it for classes to be documented -# - replace all raised exceptions with _raise_defect (at least in OleFileIO) -# - merge code from _OleStream and OleFileIO.getsect to read sectors -# (maybe add a class for FAT and MiniFAT ?) -# - add method to check all streams (follow sectors chains without storing all -# stream in memory, and report anomalies) -# - use _OleDirectoryEntry.kids_dict to improve _find and _list ? -# - fix Unicode names handling (find some way to stay compatible with Py1.5.2) -# => if possible avoid converting names to Latin-1 -# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop) -# - rewrite OleFileIO.getproperties -# - improve docstrings to show more sample uses -# - see also original notes and FIXME below -# - remove all obsolete FIXMEs -# - OleMetadata: fix version attrib according to -# http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx - -# IDEAS: -# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for -# streams with unknown size -# - use arrays of int instead of long integers for FAT/MiniFAT, to improve -# performance and reduce memory usage ? 
(possible issue with values >2^31) -# - provide tests with unittest (may need write support to create samples) -# - move all debug code (and maybe dump methods) to a separate module, with -# a class which inherits OleFileIO ? -# - fix docstrings to follow epydoc format -# - add support for 4K sectors ? -# - add support for big endian byte order ? -# - create a simple OLE explorer with wxPython - -# FUTURE EVOLUTIONS to add write support: -# 1) add ability to write a stream back on disk from StringIO (same size, no -# change in FAT/MiniFAT). -# 2) rename a stream/storage if it doesn't change the RB tree -# 3) use rbtree module to update the red-black tree + any rename -# 4) remove a stream/storage: free sectors in FAT/MiniFAT -# 5) allocate new sectors in FAT/MiniFAT -# 6) create new storage/stream -#----------------------------------------------------------------------------- - -# -# THIS IS WORK IN PROGRESS -# -# The Python Imaging Library -# $Id$ -# -# stuff to deal with OLE2 Structured Storage files. this module is -# used by PIL to read Image Composer and FlashPix files, but can also -# be used to read other files of this type. -# -# History: -# 1997-01-20 fl Created -# 1997-01-22 fl Fixed 64-bit portability quirk -# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle) -# 2004-02-29 fl Changed long hex constants to signed integers -# -# Notes: -# FIXME: sort out sign problem (eliminate long hex constants) -# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"] -# FIXME: provide a glob mechanism function (using fnmatchcase) -# -# Literature: -# -# "FlashPix Format Specification, Appendix A", Kodak and Microsoft, -# September 1996. -# -# Quotes: -# -# "If this document and functionality of the Software conflict, -# the actual functionality of the Software represents the correct -# functionality" -- Microsoft, in the OLE format specification -# -# Copyright (c) Secret Labs AB 1997. -# Copyright (c) Fredrik Lundh 1997. 
-# -# See the README file for information on usage and redistribution. -# - -#------------------------------------------------------------------------------ - -import string, StringIO, struct, array, os.path, sys, datetime - -#[PL] Define explicitly the public API to avoid private objects in pydoc: -__all__ = ['OleFileIO', 'isOleFile'] - -#[PL] workaround to fix an issue with array item size on 64 bits systems: -if array.array('L').itemsize == 4: - # on 32 bits platforms, long integers in an array are 32 bits: - UINT32 = 'L' -elif array.array('I').itemsize == 4: - # on 64 bits platforms, integers in an array are 32 bits: - UINT32 = 'I' -else: - raise ValueError, 'Need to fix a bug with 32 bit arrays, please contact author...' - - -#[PL] These workarounds were inspired from the Path module -# (see http://www.jorendorff.com/articles/python/path/) -#TODO: test with old Python versions - -# Pre-2.3 workaround for booleans -try: - True, False -except NameError: - True, False = 1, 0 - -# Pre-2.3 workaround for basestring. -try: - basestring -except NameError: - try: - # is Unicode supported (Python >2.0 or >1.6 ?) - basestring = (str, unicode) - except NameError: - basestring = str - -#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode -# if False (default PIL behaviour), all filenames are converted to Latin-1. -KEEP_UNICODE_NAMES = False - -#[PL] DEBUG display mode: False by default, use set_debug_mode() or "-d" on -# command line to change it. -DEBUG_MODE = False -def debug_print(msg): - print msg -def debug_pass(msg): - pass -debug = debug_pass - -def set_debug_mode(debug_mode): - """ - Set debug mode on or off, to control display of debugging messages. 
- mode: True or False - """ - global DEBUG_MODE, debug - DEBUG_MODE = debug_mode - if debug_mode: - debug = debug_print - else: - debug = debug_pass - -#TODO: convert this to hex -MAGIC = '\320\317\021\340\241\261\032\341' - -#[PL]: added constants for Sector IDs (from AAF specifications) -MAXREGSECT = 0xFFFFFFFAL; # maximum SECT -DIFSECT = 0xFFFFFFFCL; # (-4) denotes a DIFAT sector in a FAT -FATSECT = 0xFFFFFFFDL; # (-3) denotes a FAT sector in a FAT -ENDOFCHAIN = 0xFFFFFFFEL; # (-2) end of a virtual stream chain -FREESECT = 0xFFFFFFFFL; # (-1) unallocated sector - -#[PL]: added constants for Directory Entry IDs (from AAF specifications) -MAXREGSID = 0xFFFFFFFAL; # maximum directory entry ID -NOSTREAM = 0xFFFFFFFFL; # (-1) unallocated directory entry - -#[PL] object types in storage (from AAF specifications) -STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) -STGTY_STORAGE = 1 # element is a storage object -STGTY_STREAM = 2 # element is a stream object -STGTY_LOCKBYTES = 3 # element is an ILockBytes object -STGTY_PROPERTY = 4 # element is an IPropertyStorage object -STGTY_ROOT = 5 # element is a root storage - - -# -# -------------------------------------------------------------------- -# property types - -VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6; -VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11; -VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17; -VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23; -VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28; -VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64; -VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68; -VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72; -VT_VECTOR=0x1000; - -# map property id to name (for debugging purposes) - -VT = {} -for keyword, var in vars().items(): - if keyword[:3] == "VT_": - VT[var] = keyword - -# -# 
-------------------------------------------------------------------- -# Some common document types (root.clsid fields) - -WORD_CLSID = "00020900-0000-0000-C000-000000000046" -#TODO: check Excel, PPT, ... - -#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect() -DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect -DEFECT_POTENTIAL = 20 # a potential defect -DEFECT_INCORRECT = 30 # an error according to specifications, but parsing - # can go on -DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is - # impossible - -#[PL] add useful constants to __all__: -for key in vars().keys(): - if key.startswith('STGTY_') or key.startswith('DEFECT_'): - __all__.append(key) - - -#--- FUNCTIONS ---------------------------------------------------------------- - -def isOleFile (filename): - """ - Test if file is an OLE container (according to its header). - filename: file name or path (str, unicode) - return: True if OLE, False otherwise. - """ - f = open(filename, 'rb') - header = f.read(len(MAGIC)) - if header == MAGIC: - return True - else: - return False - - -#TODO: replace i16 and i32 with more readable struct.unpack equivalent -def i16(c, o = 0): - """ - Converts a 2-bytes (16 bits) string to an integer. - - c: string containing bytes to convert - o: offset of bytes to convert in string - """ - return ord(c[o])+(ord(c[o+1])<<8) - - -def i32(c, o = 0): - """ - Converts a 4-bytes (32 bits) string to an integer. - - c: string containing bytes to convert - o: offset of bytes to convert in string - """ - return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24)) - # [PL]: added int() because "<<" gives long int since Python 2.4 - - -def _clsid(clsid): - """ - Converts a CLSID to a human-readable string. - clsid: string of length 16. 
- """ - assert len(clsid) == 16 - if clsid == "\0" * len(clsid): - return "" - return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % - ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + - tuple(map(ord, clsid[8:16])))) - - - -# UNICODE support for Old Python versions: -# (necessary to handle storages/streams names which use Unicode) - -try: - # is Unicode supported ? - unicode - - def _unicode(s, errors='replace'): - """ - Map unicode string to Latin 1. (Python with Unicode support) - - s: UTF-16LE unicode string to convert to Latin-1 - errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode() - """ - #TODO: test if it OleFileIO works with Unicode strings, instead of - # converting to Latin-1. - try: - # First the string is converted to plain Unicode: - # (assuming it is encoded as UTF-16 little-endian) - u = s.decode('UTF-16LE', errors) - if KEEP_UNICODE_NAMES: - return u - else: - # Second the unicode string is converted to Latin-1 - return u.encode('latin_1', errors) - except: - # there was an error during Unicode to Latin-1 conversion: - raise IOError, 'incorrect Unicode name' - -except NameError: - def _unicode(s, errors='replace'): - """ - Map unicode string to Latin 1. (Python without native Unicode support) - - s: UTF-16LE unicode string to convert to Latin-1 - errors: 'replace', 'ignore' or 'strict'. (ignored in this version) - """ - # If the unicode function does not exist, we assume this is an old - # Python version without Unicode support. 
- # Null bytes are simply removed (this only works with usual Latin-1 - # strings which do not contain unicode characters>256): - return filter(ord, s) - - -def filetime2datetime(filetime): - """ - convert FILETIME (64 bits int) to Python datetime.datetime - """ - # TODO: manage exception when microseconds is too large - # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ - _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) - #debug('timedelta days=%d' % (filetime/(10*1000000*3600*24))) - return _FILETIME_null_date + datetime.timedelta(microseconds=filetime/10) - - - -#=== CLASSES ================================================================== - -class OleMetadata: - """ - class to parse and store metadata from standard properties of OLE files. - - Available attributes: - codepage, title, subject, author, keywords, comments, template, - last_saved_by, revision_number, total_edit_time, last_printed, create_time, - last_saved_time, num_pages, num_words, num_chars, thumbnail, - creating_application, security, codepage_doc, category, presentation_target, - bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips, - scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty, - chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed, - version, dig_sig, content_type, content_status, language, doc_version - - Note: an attribute is set to None when not present in the properties of the - OLE file. 
- - References for SummaryInformation stream: - - http://msdn.microsoft.com/en-us/library/dd942545.aspx - - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx - - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx - - http://msdn.microsoft.com/en-us/library/aa372045.aspx - - http://sedna-soft.de/summary-information-stream/ - - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html - - References for DocumentSummaryInformation stream: - - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx - - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx - - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html - - new in version 0.25 - """ - - # attribute names for SummaryInformation stream properties: - # (ordered by property id, starting at 1) - SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments', - 'template', 'last_saved_by', 'revision_number', 'total_edit_time', - 'last_printed', 'create_time', 'last_saved_time', 'num_pages', - 'num_words', 'num_chars', 'thumbnail', 'creating_application', - 'security'] - - # attribute names for DocumentSummaryInformation stream properties: - # (ordered by property id, starting at 1) - DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs', - 'slides', 'notes', 'hidden_slides', 'mm_clips', - 'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager', - 'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc', - 'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig', - 'content_type', 'content_status', 'language', 'doc_version'] - - def __init__(self): - """ - Constructor for OleMetadata - All attributes are set to None by default - """ - # properties from SummaryInformation stream - self.codepage = None - self.title = None - self.subject = None - self.author = None - self.keywords = None - 
self.comments = None - self.template = None - self.last_saved_by = None - self.revision_number = None - self.total_edit_time = None - self.last_printed = None - self.create_time = None - self.last_saved_time = None - self.num_pages = None - self.num_words = None - self.num_chars = None - self.thumbnail = None - self.creating_application = None - self.security = None - # properties from DocumentSummaryInformation stream - self.codepage_doc = None - self.category = None - self.presentation_target = None - self.bytes = None - self.lines = None - self.paragraphs = None - self.slides = None - self.notes = None - self.hidden_slides = None - self.mm_clips = None - self.scale_crop = None - self.heading_pairs = None - self.titles_of_parts = None - self.manager = None - self.company = None - self.links_dirty = None - self.chars_with_spaces = None - self.unused = None - self.shared_doc = None - self.link_base = None - self.hlinks = None - self.hlinks_changed = None - self.version = None - self.dig_sig = None - self.content_type = None - self.content_status = None - self.language = None - self.doc_version = None - - - def parse_properties(self, olefile): - """ - Parse standard properties of an OLE file, from the streams - "\x05SummaryInformation" and "\x05DocumentSummaryInformation", - if present. - Properties are converted to strings, integers or python datetime objects. - If a property is not present, its value is set to None. 
- """ - # first set all attributes to None: - for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS): - setattr(self, attrib, None) - if olefile.exists("\x05SummaryInformation"): - # get properties from the stream: - # (converting timestamps to python datetime, except total_edit_time, - # which is property #10) - props = olefile.getproperties("\x05SummaryInformation", - convert_time=True, no_conversion=[10]) - # store them into this object's attributes: - for i in range(len(self.SUMMARY_ATTRIBS)): - # ids for standards properties start at 0x01, until 0x13 - value = props.get(i+1, None) - setattr(self, self.SUMMARY_ATTRIBS[i], value) - if olefile.exists("\x05DocumentSummaryInformation"): - # get properties from the stream: - props = olefile.getproperties("\x05DocumentSummaryInformation", - convert_time=True) - # store them into this object's attributes: - for i in range(len(self.DOCSUM_ATTRIBS)): - # ids for standards properties start at 0x01, until 0x13 - value = props.get(i+1, None) - setattr(self, self.DOCSUM_ATTRIBS[i], value) - - def dump(self): - """ - Dump all metadata, for debugging purposes. - """ - print 'Properties from SummaryInformation stream:' - for prop in self.SUMMARY_ATTRIBS: - value = getattr(self, prop) - print '- %s: %s' % (prop, repr(value)) - print 'Properties from DocumentSummaryInformation stream:' - for prop in self.DOCSUM_ATTRIBS: - value = getattr(self, prop) - print '- %s: %s' % (prop, repr(value)) - - -#--- _OleStream --------------------------------------------------------------- - -class _OleStream(StringIO.StringIO): - """ - OLE2 Stream - - Returns a read-only file object which can be used to read - the contents of a OLE stream (instance of the StringIO class). - To open a stream, use the openstream method in the OleFile class. - - This function can be used with either ordinary streams, - or ministreams, depending on the offset, sectorsize, and - fat table arguments. 
- - Attributes: - - size: actual size of data stream, after it was opened. - """ - - # FIXME: should store the list of sects obtained by following - # the fat chain, and load new sectors on demand instead of - # loading it all in one go. - - def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize): - """ - Constructor for _OleStream class. - - fp : file object, the OLE container or the MiniFAT stream - sect : sector index of first sector in the stream - size : total size of the stream - offset : offset in bytes for the first FAT or MiniFAT sector - sectorsize: size of one sector - fat : array/list of sector indexes (FAT or MiniFAT) - filesize : size of OLE file (for debugging) - return : a StringIO instance containing the OLE stream - """ - debug('_OleStream.__init__:') - debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s' - %(sect,sect,size,offset,sectorsize,len(fat), repr(fp))) - #[PL] To detect malformed documents with FAT loops, we compute the - # expected number of sectors in the stream: - unknown_size = False - if size==0x7FFFFFFF: - # this is the case when called from OleFileIO._open(), and stream - # size is not known in advance (for example when reading the - # Directory stream). Then we can only guess maximum size: - size = len(fat)*sectorsize - # and we keep a record that size was unknown: - unknown_size = True - debug(' stream with UNKNOWN SIZE') - nb_sectors = (size + (sectorsize-1)) / sectorsize - debug('nb_sectors = %d' % nb_sectors) - # This number should (at least) be less than the total number of - # sectors in the given FAT: - if nb_sectors > len(fat): - raise IOError, 'malformed OLE document, stream too large' - # optimization(?): data is first a list of strings, and join() is called - # at the end to concatenate all in one string. 
- # (this may not be really useful with recent Python versions) - data = [] - # if size is zero, then first sector index should be ENDOFCHAIN: - if size == 0 and sect != ENDOFCHAIN: - debug('size == 0 and sect != ENDOFCHAIN:') - raise IOError, 'incorrect OLE sector index for empty stream' - #[PL] A fixed-length for loop is used instead of an undefined while - # loop to avoid DoS attacks: - for i in xrange(nb_sectors): - # Sector index may be ENDOFCHAIN, but only if size was unknown - if sect == ENDOFCHAIN: - if unknown_size: - break - else: - # else this means that the stream is smaller than declared: - debug('sect=ENDOFCHAIN before expected size') - raise IOError, 'incomplete OLE stream' - # sector index should be within FAT: - if sect<0 or sect>=len(fat): - debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat))) - debug('i=%d / nb_sectors=%d' %(i, nb_sectors)) -## tmp_data = string.join(data, "") -## f = open('test_debug.bin', 'wb') -## f.write(tmp_data) -## f.close() -## debug('data read so far: %d bytes' % len(tmp_data)) - raise IOError, 'incorrect OLE FAT, sector index out of range' - #TODO: merge this code with OleFileIO.getsect() ? - #TODO: check if this works with 4K sectors: - try: - fp.seek(offset + sectorsize * sect) - except: - debug('sect=%d, seek=%d, filesize=%d' % - (sect, offset+sectorsize*sect, filesize)) - raise IOError, 'OLE sector index out of range' - sector_data = fp.read(sectorsize) - # [PL] check if there was enough data: - # Note: if sector is the last of the file, sometimes it is not a - # complete sector (of 512 or 4K), so we may read less than - # sectorsize. 
- if len(sector_data)!=sectorsize and sect!=(len(fat)-1): - debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' % - (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data))) - debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data))) - raise IOError, 'incomplete OLE sector' - data.append(sector_data) - # jump to next sector in the FAT: - try: - sect = fat[sect] - except IndexError: - # [PL] if pointer is out of the FAT an exception is raised - raise IOError, 'incorrect OLE FAT, sector index out of range' - #[PL] Last sector should be a "end of chain" marker: - if sect != ENDOFCHAIN: - raise IOError, 'incorrect last sector index in OLE stream' - data = string.join(data, "") - # Data is truncated to the actual stream size: - if len(data) >= size: - data = data[:size] - # actual stream size is stored for future use: - self.size = size - elif unknown_size: - # actual stream size was not known, now we know the size of read - # data: - self.size = len(data) - else: - # read data is less than expected: - debug('len(data)=%d, size=%d' % (len(data), size)) - raise IOError, 'OLE stream size is less than declared' - # when all data is read in memory, StringIO constructor is called - StringIO.StringIO.__init__(self, data) - # Then the _OleStream object can be used as a read-only file object. 
- - -#--- _OleDirectoryEntry ------------------------------------------------------- - -class _OleDirectoryEntry: - - """ - OLE2 Directory Entry - """ - #[PL] parsing code moved from OleFileIO.loaddirectory - - # struct to parse directory entries: - # <: little-endian byte order, standard sizes - # (note: this should guarantee that Q returns a 64 bits int) - # 64s: string containing entry name in unicode (max 31 chars) + null char - # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 - # B: uint8, dir entry type (between 0 and 5) - # B: uint8, color: 0=black, 1=red - # I: uint32, index of left child node in the red-black tree, NOSTREAM if none - # I: uint32, index of right child node in the red-black tree, NOSTREAM if none - # I: uint32, index of child root node if it is a storage, else NOSTREAM - # 16s: CLSID, unique identifier (only used if it is a storage) - # I: uint32, user flags - # Q (was 8s): uint64, creation timestamp or zero - # Q (was 8s): uint64, modification timestamp or zero - # I: uint32, SID of first sector if stream or ministream, SID of 1st sector - # of stream containing ministreams if root entry, 0 otherwise - # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise - # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise - STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII' - # size of a directory entry: 128 bytes - DIRENTRY_SIZE = 128 - assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE - - - def __init__(self, entry, sid, olefile): - """ - Constructor for an _OleDirectoryEntry object. - Parses a 128-bytes entry from the OLE Directory stream. 
- - entry : string (must be 128 bytes long) - sid : index of this directory entry in the OLE file directory - olefile: OleFileIO containing this directory entry - """ - self.sid = sid - # ref to olefile is stored for future use - self.olefile = olefile - # kids is a list of children entries, if this entry is a storage: - # (list of _OleDirectoryEntry objects) - self.kids = [] - # kids_dict is a dictionary of children entries, indexed by their - # name in lowercase: used to quickly find an entry, and to detect - # duplicates - self.kids_dict = {} - # flag used to detect if the entry is referenced more than once in - # directory: - self.used = False - # decode DirEntry - ( - name, - namelength, - self.entry_type, - self.color, - self.sid_left, - self.sid_right, - self.sid_child, - clsid, - self.dwUserFlags, - self.createTime, - self.modifyTime, - self.isectStart, - sizeLow, - sizeHigh - ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry) - if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]: - olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type') - # only first directory entry can (and should) be root: - if self.entry_type == STGTY_ROOT and sid != 0: - olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry') - if sid == 0 and self.entry_type != STGTY_ROOT: - olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry') - #debug (struct.unpack(fmt_entry, entry[:len_entry])) - # name should be at most 31 unicode characters + null character, - # so 64 bytes in total (31*2 + 2): - if namelength>64: - olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length') - # if exception not raised, namelength is set to the maximum value: - namelength = 64 - # only characters without ending null char are kept: - name = name[:(namelength-2)] - # name is converted from unicode to Latin-1: - self.name = _unicode(name) - - debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) - debug(' - type: %d' 
% self.entry_type) - debug(' - sect: %d' % self.isectStart) - debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left, - self.sid_right, self.sid_child)) - - # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes - # sectors, BUT apparently some implementations set it as 0xFFFFFFFFL, 1 - # or some other value so it cannot be raised as a defect in general: - if olefile.sectorsize == 512: - if sizeHigh != 0 and sizeHigh != 0xFFFFFFFFL: - debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' % - (olefile.sectorsize, sizeLow, sizeHigh, sizeHigh)) - olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size') - self.size = sizeLow - else: - self.size = sizeLow + (long(sizeHigh)<<32) - debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh)) - - self.clsid = _clsid(clsid) - # a storage should have a null size, BUT some implementations such as - # Word 8 for Mac seem to allow non-null values => Potential defect: - if self.entry_type == STGTY_STORAGE and self.size != 0: - olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0') - # check if stream is not already referenced elsewhere: - if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0: - if self.size < olefile.minisectorcutoff \ - and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT - # ministream object - minifat = True - else: - minifat = False - olefile._check_duplicate_stream(self.isectStart, minifat) - - - - def build_storage_tree(self): - """ - Read and build the red-black tree attached to this _OleDirectoryEntry - object, if it is a storage. - Note that this method builds a tree of all subentries, so it should - only be called for the root object once. - """ - debug('build_storage_tree: SID=%d - %s - sid_child=%d' - % (self.sid, repr(self.name), self.sid_child)) - if self.sid_child != NOSTREAM: - # if child SID is not NOSTREAM, then this entry is a storage. 
- # Let's walk through the tree of children to fill the kids list: - self.append_kids(self.sid_child) - - # Note from OpenOffice documentation: the safest way is to - # recreate the tree because some implementations may store broken - # red-black trees... - - # in the OLE file, entries are sorted on (length, name). - # for convenience, we sort them on name instead: - # (see __cmp__ method in this class) - self.kids.sort() - - - def append_kids(self, child_sid): - """ - Walk through red-black tree of children of this directory entry to add - all of them to the kids list. (recursive method) - - child_sid : index of child directory entry to use, or None when called - first time for the root. (only used during recursion) - """ - #[PL] this method was added to use simple recursion instead of a complex - # algorithm. - # if this is not a storage or a leaf of the tree, nothing to do: - if child_sid == NOSTREAM: - return - # check if child SID is in the proper range: - if child_sid<0 or child_sid>=len(self.olefile.direntries): - self.olefile._raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range') - # get child direntry: - child = self.olefile._load_direntry(child_sid) #direntries[child_sid] - debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d' - % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child)) - # the directory entries are organized as a red-black tree. - # (cf. 
Wikipedia for details) - # First walk through left side of the tree: - self.append_kids(child.sid_left) - # Check if its name is not already used (case-insensitive): - name_lower = child.name.lower() - if self.kids_dict.has_key(name_lower): - self.olefile._raise_defect(DEFECT_INCORRECT, - "Duplicate filename in OLE storage") - # Then the child_sid _OleDirectoryEntry object is appended to the - # kids list and dictionary: - self.kids.append(child) - self.kids_dict[name_lower] = child - # Check if kid was not already referenced in a storage: - if child.used: - self.olefile._raise_defect(DEFECT_INCORRECT, - 'OLE Entry referenced more than once') - child.used = True - # Finally walk through right side of the tree: - self.append_kids(child.sid_right) - # Afterwards build kid's own tree if it's also a storage: - child.build_storage_tree() - - - def __cmp__(self, other): - "Compare entries by name" - return cmp(self.name, other.name) - #TODO: replace by the same function as MS implementation ? - # (order by name length first, then case-insensitive order) - - - def dump(self, tab = 0): - "Dump this entry, and all its subentries (for debug purposes only)" - TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", - "(property)", "(root)"] - print " "*tab + repr(self.name), TYPES[self.entry_type], - if self.entry_type in (STGTY_STREAM, STGTY_ROOT): - print self.size, "bytes", - print - if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid: - print " "*tab + "{%s}" % self.clsid - - for kid in self.kids: - kid.dump(tab + 2) - - - def getmtime(self): - """ - Return modification time of a directory entry. - - return: None if modification time is null, a python datetime object - otherwise (UTC timezone) - - new in version 0.26 - """ - if self.modifyTime == 0: - return None - return filetime2datetime(self.modifyTime) - - - def getctime(self): - """ - Return creation time of a directory entry. 
- - return: None if modification time is null, a python datetime object - otherwise (UTC timezone) - - new in version 0.26 - """ - if self.createTime == 0: - return None - return filetime2datetime(self.createTime) - - -#--- OleFileIO ---------------------------------------------------------------- - -class OleFileIO: - """ - OLE container object - - This class encapsulates the interface to an OLE 2 structured - storage file. Use the {@link listdir} and {@link openstream} methods to - access the contents of this file. - - Object names are given as a list of strings, one for each subentry - level. The root entry should be omitted. For example, the following - code extracts all image streams from a Microsoft Image Composer file: - - ole = OleFileIO("fan.mic") - - for entry in ole.listdir(): - if entry[1:2] == "Image": - fin = ole.openstream(entry) - fout = open(entry[0:1], "wb") - while 1: - s = fin.read(8192) - if not s: - break - fout.write(s) - - You can use the viewer application provided with the Python Imaging - Library to view the resulting files (which happens to be standard - TIFF files). - """ - - def __init__(self, filename = None, raise_defects=DEFECT_FATAL): - """ - Constructor for OleFileIO class. - - filename: file to open. - raise_defects: minimal level for defects to be raised as exceptions. - (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a - security-oriented application, see source code for details) - """ - # minimal level for defects to be raised as exceptions: - self._raise_defects_level = raise_defects - # list of defects/issues not raised as exceptions: - # tuples of (exception type, message) - self.parsing_issues = [] - if filename: - self.open(filename) - - - def _raise_defect(self, defect_level, message, exception_type=IOError): - """ - This method should be called for any defect found during file parsing. - It may raise an IOError exception according to the minimal level chosen - for the OleFileIO object. 
- - defect_level: defect level, possible values are: - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect - DEFECT_POTENTIAL : a potential defect - DEFECT_INCORRECT : an error according to specifications, but parsing can go on - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible - message: string describing the defect, used with raised exception. - exception_type: exception class to be raised, IOError by default - """ - # added by [PL] - if defect_level >= self._raise_defects_level: - raise exception_type, message - else: - # just record the issue, no exception raised: - self.parsing_issues.append((exception_type, message)) - - - def open(self, filename): - """ - Open an OLE2 file. - Reads the header, FAT and directory. - - filename: string-like or file-like object - """ - #[PL] check if filename is a string-like or file-like object: - # (it is better to check for a read() method) - if hasattr(filename, 'read'): - # file-like object - self.fp = filename - else: - # string-like object: filename of file on disk - #TODO: if larger than 1024 bytes, this could be the actual data => StringIO - self.fp = open(filename, "rb") - # old code fails if filename is not a plain string: - #if type(filename) == type(""): - # self.fp = open(filename, "rb") - #else: - # self.fp = filename - # obtain the filesize by using seek and tell, which should work on most - # file-like objects: - #TODO: do it above, using getsize with filename when possible? 
- #TODO: fix code to fail with clear exception when filesize cannot be obtained - self.fp.seek(0, os.SEEK_END) - try: - filesize = self.fp.tell() - finally: - self.fp.seek(0) - self._filesize = filesize - - # lists of streams in FAT and MiniFAT, to detect duplicate references - # (list of indexes of first sectors of each stream) - self._used_streams_fat = [] - self._used_streams_minifat = [] - - header = self.fp.read(512) - - if len(header) != 512 or header[:8] != MAGIC: - self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file") - - # [PL] header structure according to AAF specifications: - ##Header - ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)] - ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, - ## // 0x1a, 0xe1} for current version - ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/ - ## // GetClassFile uses root directory class id) - ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is - ## // written by reference implementation - ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for - ## // 512-byte sectors, 4 for 4 KB sectors - ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering - ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two; - ## // typically 9 indicating 512-byte sectors - ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two; - ## // typically 6 indicating 64-byte mini-sectors - ##USHORT _usReserved; // [22H,02] reserved, must be zero - ##ULONG _ulReserved1; // [24H,04] reserved, must be zero - ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors, - ## // number of SECTs in directory chain for 4 KB - ## // sectors - ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain - ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain - ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must - ## // be zero. 
The reference implementation - ## // does not support transactions - ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream; - ## // typically 4096 bytes - ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain - ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain - ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain - ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain - ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors - ##}; - - # [PL] header decoding: - # '<' indicates little-endian byte ordering for Intel (cf. struct module help) - fmt_header = '<8s16sHHHHHHLLLLLLLLLL' - header_size = struct.calcsize(fmt_header) - debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) ) - header1 = header[:header_size] - ( - self.Sig, - self.clsid, - self.MinorVersion, - self.DllVersion, - self.ByteOrder, - self.SectorShift, - self.MiniSectorShift, - self.Reserved, self.Reserved1, - self.csectDir, - self.csectFat, - self.sectDirStart, - self.signature, - self.MiniSectorCutoff, - self.MiniFatStart, - self.csectMiniFat, - self.sectDifStart, - self.csectDif - ) = struct.unpack(fmt_header, header1) - debug( struct.unpack(fmt_header, header1)) - - if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': - # OLE signature should always be present - self._raise_defect(DEFECT_FATAL, "incorrect OLE signature") - if self.clsid != '\x00'*16: - # according to AAF specs, CLSID should always be zero - self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") - debug( "MinorVersion = %d" % self.MinorVersion ) - debug( "DllVersion = %d" % self.DllVersion ) - if self.DllVersion not in [3, 4]: - # version 3: usual format, 512 bytes per sector - # version 4: large format, 4K per sector - self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") - debug( "ByteOrder = %X" % self.ByteOrder ) - if self.ByteOrder != 0xFFFE: - # For now only common 
little-endian documents are handled correctly - self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") - # TODO: add big-endian support for documents created on Mac ? - self.SectorSize = 2**self.SectorShift - debug( "SectorSize = %d" % self.SectorSize ) - if self.SectorSize not in [512, 4096]: - self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header") - if (self.DllVersion==3 and self.SectorSize!=512) \ - or (self.DllVersion==4 and self.SectorSize!=4096): - self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header") - self.MiniSectorSize = 2**self.MiniSectorShift - debug( "MiniSectorSize = %d" % self.MiniSectorSize ) - if self.MiniSectorSize not in [64]: - self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header") - if self.Reserved != 0 or self.Reserved1 != 0: - self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)") - debug( "csectDir = %d" % self.csectDir ) - if self.SectorSize==512 and self.csectDir!=0: - self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header") - debug( "csectFat = %d" % self.csectFat ) - debug( "sectDirStart = %X" % self.sectDirStart ) - debug( "signature = %d" % self.signature ) - # Signature should be zero, BUT some implementations do not follow this - # rule => only a potential defect: - if self.signature != 0: - self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)") - debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff ) - debug( "MiniFatStart = %X" % self.MiniFatStart ) - debug( "csectMiniFat = %d" % self.csectMiniFat ) - debug( "sectDifStart = %X" % self.sectDifStart ) - debug( "csectDif = %d" % self.csectDif ) - - # calculate the number of sectors in the file - # (-1 because header doesn't count) - self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1 - debug( "Number of sectors in the file: %d" % self.nb_sect ) - - # file clsid (probably never used, so we don't store 
it) - clsid = _clsid(header[8:24]) - self.sectorsize = self.SectorSize #1 << i16(header, 30) - self.minisectorsize = self.MiniSectorSize #1 << i16(header, 32) - self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56) - - # check known streams for duplicate references (these are always in FAT, - # never in MiniFAT): - self._check_duplicate_stream(self.sectDirStart) - # check MiniFAT only if it is not empty: - if self.csectMiniFat: - self._check_duplicate_stream(self.MiniFatStart) - # check DIFAT only if it is not empty: - if self.csectDif: - self._check_duplicate_stream(self.sectDifStart) - - # Load file allocation tables - self.loadfat(header) - # Load direcory. This sets both the direntries list (ordered by sid) - # and the root (ordered by hierarchy) members. - self.loaddirectory(self.sectDirStart)#i32(header, 48)) - self.ministream = None - self.minifatsect = self.MiniFatStart #i32(header, 60) - - - def close(self): - """ - close the OLE file, to release the file object - """ - self.fp.close() - - - def _check_duplicate_stream(self, first_sect, minifat=False): - """ - Checks if a stream has not been already referenced elsewhere. - This method should only be called once for each known stream, and only - if stream size is not null. - first_sect: index of first sector of the stream in FAT - minifat: if True, stream is located in the MiniFAT, else in the FAT - """ - if minifat: - debug('_check_duplicate_stream: sect=%d in MiniFAT' % first_sect) - used_streams = self._used_streams_minifat - else: - debug('_check_duplicate_stream: sect=%d in FAT' % first_sect) - # some values can be safely ignored (not a real stream): - if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): - return - used_streams = self._used_streams_fat - #TODO: would it be more efficient using a dict or hash values, instead - # of a list of long ? 
- if first_sect in used_streams: - self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') - else: - used_streams.append(first_sect) - - - def dumpfat(self, fat, firstindex=0): - "Displays a part of FAT in human-readable form for debugging purpose" - # [PL] added only for debug - if not DEBUG_MODE: - return - # dictionary to convert special FAT values in human-readable strings - VPL=8 # valeurs par ligne (8+1 * 8+1 = 81) - fatnames = { - FREESECT: "..free..", - ENDOFCHAIN: "[ END. ]", - FATSECT: "FATSECT ", - DIFSECT: "DIFSECT " - } - nbsect = len(fat) - nlines = (nbsect+VPL-1)/VPL - print "index", - for i in range(VPL): - print ("%8X" % i), - print "" - for l in range(nlines): - index = l*VPL - print ("%8X:" % (firstindex+index)), - for i in range(index, index+VPL): - if i>=nbsect: - break - sect = fat[i] - if sect in fatnames: - nom = fatnames[sect] - else: - if sect == i+1: - nom = " --->" - else: - nom = "%8X" % sect - print nom, - print "" - - - def dumpsect(self, sector, firstindex=0): - "Displays a sector in a human-readable form, for debugging purpose." 
- if not DEBUG_MODE: - return - VPL=8 # number of values per line (8+1 * 8+1 = 81) - tab = array.array(UINT32, sector) - nbsect = len(tab) - nlines = (nbsect+VPL-1)/VPL - print "index", - for i in range(VPL): - print ("%8X" % i), - print "" - for l in range(nlines): - index = l*VPL - print ("%8X:" % (firstindex+index)), - for i in range(index, index+VPL): - if i>=nbsect: - break - sect = tab[i] - nom = "%8X" % sect - print nom, - print "" - - def sect2array(self, sect): - """ - convert a sector to an array of 32 bits unsigned integers, - swapping bytes on big endian CPUs such as PowerPC (old Macs) - """ - a = array.array(UINT32, sect) - # if CPU is big endian, swap bytes: - if sys.byteorder == 'big': - a.byteswap() - return a - - - def loadfat_sect(self, sect): - """ - Adds the indexes of the given sector to the FAT - sect: string containing the first FAT sector, or array of long integers - return: index of last FAT sector. - """ - # a FAT sector is an array of ulong integers. - if isinstance(sect, array.array): - # if sect is already an array it is directly used - fat1 = sect - else: - # if it's a raw sector, it is parsed in an array - fat1 = self.sect2array(sect) - self.dumpsect(sect) - # The FAT is a sector chain starting at the first index of itself. - for isect in fat1: - #print "isect = %X" % isect - if isect == ENDOFCHAIN or isect == FREESECT: - # the end of the sector chain has been reached - break - # read the FAT sector - s = self.getsect(isect) - # parse it as an array of 32 bits integers, and add it to the - # global FAT array - nextfat = self.sect2array(s) - self.fat = self.fat + nextfat - return isect - - - def loadfat(self, header): - """ - Load the FAT table. - """ - # The header contains a sector numbers - # for the first 109 FAT sectors. 
Additional sectors are - # described by DIF blocks - - sect = header[76:512] - debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)/4) ) - #fat = [] - # [PL] FAT is an array of 32 bits unsigned ints, it's more effective - # to use an array than a list in Python. - # It's initialized as empty first: - self.fat = array.array(UINT32) - self.loadfat_sect(sect) - #self.dumpfat(self.fat) -## for i in range(0, len(sect), 4): -## ix = i32(sect, i) -## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: -## if ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: -## break -## s = self.getsect(ix) -## #fat = fat + map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) -## fat = fat + array.array(UINT32, s) - if self.csectDif != 0: - # [PL] There's a DIFAT because file is larger than 6.8MB - # some checks just in case: - if self.csectFat <= 109: - # there must be at least 109 blocks in header and the rest in - # DIFAT, so number of sectors must be >109. - self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors') - if self.sectDifStart >= self.nb_sect: - # initial DIFAT block index must be valid - self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') - debug( "DIFAT analysis..." 
) - # We compute the necessary number of DIFAT sectors : - # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) - nb_difat = (self.csectFat-109 + 126)/127 - debug( "nb_difat = %d" % nb_difat ) - if self.csectDif != nb_difat: - raise IOError, 'incorrect DIFAT' - isect_difat = self.sectDifStart - for i in xrange(nb_difat): - debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) - #TODO: check if corresponding FAT SID = DIFSECT - sector_difat = self.getsect(isect_difat) - difat = self.sect2array(sector_difat) - self.dumpsect(sector_difat) - self.loadfat_sect(difat[:127]) - # last DIFAT pointer is next DIFAT sector: - isect_difat = difat[127] - debug( "next DIFAT sector: %X" % isect_difat ) - # checks: - if isect_difat not in [ENDOFCHAIN, FREESECT]: - # last DIFAT pointer value must be ENDOFCHAIN or FREESECT - raise IOError, 'incorrect end of DIFAT' -## if len(self.fat) != self.csectFat: -## # FAT should contain csectFat blocks -## print "FAT length: %d instead of %d" % (len(self.fat), self.csectFat) -## raise IOError, 'incorrect DIFAT' - # since FAT is read from fixed-size sectors, it may contain more values - # than the actual number of sectors in the file. - # Keep only the relevant sector indexes: - if len(self.fat) > self.nb_sect: - debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect)) - self.fat = self.fat[:self.nb_sect] - debug('\nFAT:') - self.dumpfat(self.fat) - - - def loadminifat(self): - """ - Load the MiniFAT table. - """ - # MiniFAT is stored in a standard sub-stream, pointed to by a header - # field. - # NOTE: there are two sizes to take into account for this stream: - # 1) Stream size is calculated according to the number of sectors - # declared in the OLE header. This allocated stream may be more than - # needed to store the actual sector indexes. 
- # (self.csectMiniFat is the number of sectors of size self.SectorSize) - stream_size = self.csectMiniFat * self.SectorSize - # 2) Actually used size is calculated by dividing the MiniStream size - # (given by root entry size) by the size of mini sectors, *4 for - # 32 bits indexes: - nb_minisectors = (self.root.size + self.MiniSectorSize-1) / self.MiniSectorSize - used_size = nb_minisectors * 4 - debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' % - (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors)) - if used_size > stream_size: - # This is not really a problem, but may indicate a wrong implementation: - self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT') - # In any case, first read stream_size: - s = self._open(self.minifatsect, stream_size, force_FAT=True).read() - #[PL] Old code replaced by an array: - #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) - self.minifat = self.sect2array(s) - # Then shrink the array to used size, to avoid indexes out of MiniStream: - debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors)) - self.minifat = self.minifat[:nb_minisectors] - debug('loadminifat(): len=%d' % len(self.minifat)) - debug('\nMiniFAT:') - self.dumpfat(self.minifat) - - def getsect(self, sect): - """ - Read given sector from file on disk. - sect: sector index - returns a string containing the sector data. 
- """ - # [PL] this original code was wrong when sectors are 4KB instead of - # 512 bytes: - #self.fp.seek(512 + self.sectorsize * sect) - #[PL]: added safety checks: - #print "getsect(%X)" % sect - try: - self.fp.seek(self.sectorsize * (sect+1)) - except: - debug('getsect(): sect=%X, seek=%d, filesize=%d' % - (sect, self.sectorsize*(sect+1), self._filesize)) - self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') - sector = self.fp.read(self.sectorsize) - if len(sector) != self.sectorsize: - debug('getsect(): sect=%X, read=%d, sectorsize=%d' % - (sect, len(sector), self.sectorsize)) - self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') - return sector - - - def loaddirectory(self, sect): - """ - Load the directory. - sect: sector index of directory stream. - """ - # The directory is stored in a standard - # substream, independent of its size. - - # open directory stream as a read-only file: - # (stream size is not known in advance) - self.directory_fp = self._open(sect) - - #[PL] to detect malformed documents and avoid DoS attacks, the maximum - # number of directory entries can be calculated: - max_entries = self.directory_fp.size / 128 - debug('loaddirectory: size=%d, max_entries=%d' % - (self.directory_fp.size, max_entries)) - - # Create list of directory entries - #self.direntries = [] - # We start with a list of "None" object - self.direntries = [None] * max_entries -## for sid in xrange(max_entries): -## entry = fp.read(128) -## if not entry: -## break -## self.direntries.append(_OleDirectoryEntry(entry, sid, self)) - # load root entry: - root_entry = self._load_direntry(0) - # Root entry is the first entry: - self.root = self.direntries[0] - # read and build all storage trees, starting from the root: - self.root.build_storage_tree() - - - def _load_direntry (self, sid): - """ - Load a directory entry from the directory. - This method should only be called once for each storage/stream when - loading the directory. 
- sid: index of storage/stream in the directory. - return: a _OleDirectoryEntry object - raise: IOError if the entry has always been referenced. - """ - # check if SID is OK: - if sid<0 or sid>=len(self.direntries): - self._raise_defect(DEFECT_FATAL, "OLE directory index out of range") - # check if entry was already referenced: - if self.direntries[sid] is not None: - self._raise_defect(DEFECT_INCORRECT, - "double reference for OLE stream/storage") - # if exception not raised, return the object - return self.direntries[sid] - self.directory_fp.seek(sid * 128) - entry = self.directory_fp.read(128) - self.direntries[sid] = _OleDirectoryEntry(entry, sid, self) - return self.direntries[sid] - - - def dumpdirectory(self): - """ - Dump directory (for debugging only) - """ - self.root.dump() - - - def _open(self, start, size = 0x7FFFFFFF, force_FAT=False): - """ - Open a stream, either in FAT or MiniFAT according to its size. - (openstream helper) - - start: index of first sector - size: size of stream (or nothing if size is unknown) - force_FAT: if False (default), stream will be opened in FAT or MiniFAT - according to size. If True, it will always be opened in FAT. 
- """ - debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' % - (start, size, str(force_FAT))) - # stream size is compared to the MiniSectorCutoff threshold: - if size < self.minisectorcutoff and not force_FAT: - # ministream object - if not self.ministream: - # load MiniFAT if it wasn't already done: - self.loadminifat() - # The first sector index of the miniFAT stream is stored in the - # root directory entry: - size_ministream = self.root.size - debug('Opening MiniStream: sect=%d, size=%d' % - (self.root.isectStart, size_ministream)) - self.ministream = self._open(self.root.isectStart, - size_ministream, force_FAT=True) - return _OleStream(self.ministream, start, size, 0, - self.minisectorsize, self.minifat, - self.ministream.size) - else: - # standard stream - return _OleStream(self.fp, start, size, 512, - self.sectorsize, self.fat, self._filesize) - - - def _list(self, files, prefix, node, streams=True, storages=False): - """ - (listdir helper) - files: list of files to fill in - prefix: current location in storage tree (list of names) - node: current node (_OleDirectoryEntry object) - streams: bool, include streams if True (True by default) - new in v0.26 - storages: bool, include storages if True (False by default) - new in v0.26 - (note: the root storage is never included) - """ - prefix = prefix + [node.name] - for entry in node.kids: - if entry.kids: - # this is a storage - if storages: - # add it to the list - files.append(prefix[1:] + [entry.name]) - # check its kids - self._list(files, prefix, entry, streams, storages) - else: - # this is a stream - if streams: - # add it to the list - files.append(prefix[1:] + [entry.name]) - - - def listdir(self, streams=True, storages=False): - """ - Return a list of streams stored in this file - - streams: bool, include streams if True (True by default) - new in v0.26 - storages: bool, include storages if True (False by default) - new in v0.26 - (note: the root storage is never included) - """ - files = [] - 
self._list(files, [], self.root, streams, storages) - return files - - - def _find(self, filename): - """ - Returns directory entry of given filename. (openstream helper) - Note: this method is case-insensitive. - - filename: path of stream in storage tree (except root entry), either: - - a string using Unix path syntax, for example: - 'storage_1/storage_1.2/stream' - - a list of storage filenames, path to the desired stream/storage. - Example: ['storage_1', 'storage_1.2', 'stream'] - return: sid of requested filename - raise IOError if file not found - """ - - # if filename is a string instead of a list, split it on slashes to - # convert to a list: - if isinstance(filename, basestring): - filename = filename.split('/') - # walk across storage tree, following given path: - node = self.root - for name in filename: - for kid in node.kids: - if kid.name.lower() == name.lower(): - break - else: - raise IOError, "file not found" - node = kid - return node.sid - - - def openstream(self, filename): - """ - Open a stream as a read-only file object (StringIO). - - filename: path of stream in storage tree (except root entry), either: - - a string using Unix path syntax, for example: - 'storage_1/storage_1.2/stream' - - a list of storage filenames, path to the desired stream/storage. - Example: ['storage_1', 'storage_1.2', 'stream'] - return: file object (read-only) - raise IOError if filename not found, or if this is not a stream. - """ - sid = self._find(filename) - entry = self.direntries[sid] - if entry.entry_type != STGTY_STREAM: - raise IOError, "this file is not a stream" - return self._open(entry.isectStart, entry.size) - - - def get_type(self, filename): - """ - Test if given filename exists as a stream or a storage in the OLE - container, and return its type. - - filename: path of stream in storage tree. 
(see openstream for syntax) - return: False if object does not exist, its entry type (>0) otherwise: - - STGTY_STREAM: a stream - - STGTY_STORAGE: a storage - - STGTY_ROOT: the root entry - """ - try: - sid = self._find(filename) - entry = self.direntries[sid] - return entry.entry_type - except: - return False - - - def getmtime(self, filename): - """ - Return modification time of a stream/storage. - - filename: path of stream/storage in storage tree. (see openstream for - syntax) - return: None if modification time is null, a python datetime object - otherwise (UTC timezone) - - new in version 0.26 - """ - sid = self._find(filename) - entry = self.direntries[sid] - return entry.getmtime() - - - def getctime(self, filename): - """ - Return creation time of a stream/storage. - - filename: path of stream/storage in storage tree. (see openstream for - syntax) - return: None if creation time is null, a python datetime object - otherwise (UTC timezone) - - new in version 0.26 - """ - sid = self._find(filename) - entry = self.direntries[sid] - return entry.getctime() - - - def exists(self, filename): - """ - Test if given filename exists as a stream or a storage in the OLE - container. - - filename: path of stream in storage tree. (see openstream for syntax) - return: True if object exist, else False. - """ - try: - sid = self._find(filename) - return True - except: - return False - - - def get_size(self, filename): - """ - Return size of a stream in the OLE container, in bytes. - - filename: path of stream in storage tree (see openstream for syntax) - return: size in bytes (long integer) - raise: IOError if file not found, TypeError if this is not a stream. - """ - sid = self._find(filename) - entry = self.direntries[sid] - if entry.entry_type != STGTY_STREAM: - #TODO: Should it return zero instead of raising an exception ? - raise TypeError, 'object is not an OLE stream' - return entry.size - - - def get_rootentry_name(self): - """ - Return root entry name. 
Should usually be 'Root Entry' or 'R' in most - implementations. - """ - return self.root.name - - - def getproperties(self, filename, convert_time=False, no_conversion=None): - """ - Return properties described in substream. - - filename: path of stream in storage tree (see openstream for syntax) - convert_time: bool, if True timestamps will be converted to Python datetime - no_conversion: None or list of int, timestamps not to be converted - (for example total editing time is not a real timestamp) - return: a dictionary of values indexed by id (integer) - """ - # make sure no_conversion is a list, just to simplify code below: - if no_conversion == None: - no_conversion = [] - # stream path as a string to report exceptions: - streampath = filename - if not isinstance(streampath, str): - streampath = '/'.join(streampath) - - fp = self.openstream(filename) - - data = {} - - try: - # header - s = fp.read(28) - clsid = _clsid(s[8:24]) - - # format id - s = fp.read(20) - fmtid = _clsid(s[:16]) - fp.seek(i32(s, 16)) - - # get section - s = "****" + fp.read(i32(fp.read(4))-4) - # number of properties: - num_props = i32(s, 4) - except: - # catch exception while parsing property header, and only raise - # a DEFECT_INCORRECT then return an empty dict, because this is not - # a fatal error when parsing the whole file - exctype, excvalue = sys.exc_info()[:2] - msg = 'Error while parsing properties header in stream %s: %s' % ( - repr(streampath), excvalue) - self._raise_defect(DEFECT_INCORRECT, msg, exctype) - return data - - for i in range(num_props): - try: - id = 0 # just in case of an exception - id = i32(s, 8+i*8) - offset = i32(s, 12+i*8) - type = i32(s, offset) - - debug ('property id=%d: type=%d offset=%X' % (id, type, offset)) - - # test for common types first (should perhaps use - # a dictionary instead?) 
- - if type == VT_I2: # 16-bit signed integer - value = i16(s, offset+4) - if value >= 32768: - value = value - 65536 - elif type == VT_UI2: # 2-byte unsigned integer - value = i16(s, offset+4) - elif type in (VT_I4, VT_INT, VT_ERROR): - # VT_I4: 32-bit signed integer - # VT_ERROR: HRESULT, similar to 32-bit signed integer, - # see http://msdn.microsoft.com/en-us/library/cc230330.aspx - value = i32(s, offset+4) - elif type in (VT_UI4, VT_UINT): # 4-byte unsigned integer - value = i32(s, offset+4) # FIXME - elif type in (VT_BSTR, VT_LPSTR): - # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx - # size is a 32 bits integer, including the null terminator, and - # possibly trailing or embedded null chars - #TODO: if codepage is unicode, the string should be converted as such - count = i32(s, offset+4) - value = s[offset+8:offset+8+count-1] - # remove all null chars: - value = value.replace('\x00', '') - elif type == VT_BLOB: - # binary large object (BLOB) - # see http://msdn.microsoft.com/en-us/library/dd942282.aspx - count = i32(s, offset+4) - value = s[offset+8:offset+8+count] - elif type == VT_LPWSTR: - # UnicodeString - # see http://msdn.microsoft.com/en-us/library/dd942313.aspx - # "the string should NOT contain embedded or additional trailing - # null characters." - count = i32(s, offset+4) - value = _unicode(s[offset+8:offset+8+count*2]) - elif type == VT_FILETIME: - value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) - # FILETIME is a 64-bit int: "number of 100ns periods - # since Jan 1,1601". 
- if convert_time and id not in no_conversion: - debug('Converting property #%d to python datetime, value=%d=%fs' - %(id, value, float(value)/10000000L)) - # convert FILETIME to Python datetime.datetime - # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ - _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) - debug('timedelta days=%d' % (value/(10*1000000*3600*24))) - value = _FILETIME_null_date + datetime.timedelta(microseconds=value/10) - else: - # legacy code kept for backward compatibility: returns a - # number of seconds since Jan 1,1601 - value = value / 10000000L # seconds - elif type == VT_UI1: # 1-byte unsigned integer - value = ord(s[offset+4]) - elif type == VT_CLSID: - value = _clsid(s[offset+4:offset+20]) - elif type == VT_CF: - # PropertyIdentifier or ClipboardData?? - # see http://msdn.microsoft.com/en-us/library/dd941945.aspx - count = i32(s, offset+4) - value = s[offset+8:offset+8+count] - elif type == VT_BOOL: - # VARIANT_BOOL, 16 bits bool, 0x0000=Fals, 0xFFFF=True - # see http://msdn.microsoft.com/en-us/library/cc237864.aspx - value = bool(i16(s, offset+4)) - else: - value = None # everything else yields "None" - debug ('property id=%d: type=%d not implemented in parser yet' % (id, type)) - - # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE, - # VT_DECIMAL, VT_I1, VT_I8, VT_UI8, - # see http://msdn.microsoft.com/en-us/library/dd942033.aspx - - # FIXME: add support for VT_VECTOR - # VT_VECTOR is a 32 uint giving the number of items, followed by - # the items in sequence. The VT_VECTOR value is combined with the - # type of items, e.g. 
VT_VECTOR|VT_BSTR - # see http://msdn.microsoft.com/en-us/library/dd942011.aspx - - #print "%08x" % id, repr(value), - #print "(%s)" % VT[i32(s, offset) & 0xFFF] - - data[id] = value - except: - # catch exception while parsing each property, and only raise - # a DEFECT_INCORRECT, because parsing can go on - exctype, excvalue = sys.exc_info()[:2] - msg = 'Error while parsing property id %d in stream %s: %s' % ( - id, repr(streampath), excvalue) - self._raise_defect(DEFECT_INCORRECT, msg, exctype) - - return data - - def get_metadata(self): - """ - Parse standard properties streams, return an OleMetadata object - containing all the available metadata. - (also stored in the metadata attribute of the OleFileIO object) - - new in version 0.25 - """ - self.metadata = OleMetadata() - self.metadata.parse_properties(self) - return self.metadata - -# -# -------------------------------------------------------------------- -# This script can be used to dump the directory of any OLE2 structured -# storage file. - -if __name__ == "__main__": - - import sys - - # [PL] display quick usage info if launched from command-line - if len(sys.argv) <= 1: - print __doc__ - print """ -Launched from command line, this script parses OLE files and prints info. - -Usage: OleFileIO_PL.py [-d] [-c] [file2 ...] 
- -Options: --d : debug mode (display a lot of debug information, for developers only) --c : check all streams (for debugging purposes) -""" - sys.exit() - - check_streams = False - for filename in sys.argv[1:]: -## try: - # OPTIONS: - if filename == '-d': - # option to switch debug mode on: - set_debug_mode(True) - continue - if filename == '-c': - # option to switch check streams mode on: - check_streams = True - continue - - ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT) - print "-" * 68 - print filename - print "-" * 68 - ole.dumpdirectory() - for streamname in ole.listdir(): - if streamname[-1][0] == "\005": - print streamname, ": properties" - props = ole.getproperties(streamname, convert_time=True) - props = props.items() - props.sort() - for k, v in props: - #[PL]: avoid to display too large or binary values: - if isinstance(v, basestring): - if len(v) > 50: - v = v[:50] - # quick and dirty binary check: - for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20, - 21,22,23,24,25,26,27,28,29,30,31): - if chr(c) in v: - v = '(binary data)' - break - print " ", k, v - - if check_streams: - # Read all streams to check if there are errors: - print '\nChecking streams...' 
- for streamname in ole.listdir(): - # print name using repr() to convert binary chars to \xNN: - print '-', repr('/'.join(streamname)),'-', - st_type = ole.get_type(streamname) - if st_type == STGTY_STREAM: - print 'size %d' % ole.get_size(streamname) - # just try to read stream in memory: - ole.openstream(streamname) - else: - print 'NOT a stream : type=%d' % st_type - print '' - -## for streamname in ole.listdir(): -## # print name using repr() to convert binary chars to \xNN: -## print '-', repr('/'.join(streamname)),'-', -## print ole.getmtime(streamname) -## print '' - - print 'Modification/Creation times of all directory entries:' - for entry in ole.direntries: - if entry is not None: - print '- %s: mtime=%s ctime=%s' % (entry.name, - entry.getmtime(), entry.getctime()) - print '' - - # parse and display metadata: - meta = ole.get_metadata() - meta.dump() - print '' - #[PL] Test a few new methods: - root = ole.get_rootentry_name() - print 'Root entry name: "%s"' % root - if ole.exists('worddocument'): - print "This is a Word document." - print "type of stream 'WordDocument':", ole.get_type('worddocument') - print "size :", ole.get_size('worddocument') - if ole.exists('macros/vba'): - print "This document may contain VBA macros." - - # print parsing issues: - print '\nNon-fatal issues raised during parsing:' - if ole.parsing_issues: - for exctype, msg in ole.parsing_issues: - print '- %s: %s' % (exctype.__name__, msg) - else: - print 'None' -## except IOError, v: -## print "***", "cannot read", file, "-", v +#!/usr/local/bin/python +# -*- coding: latin-1 -*- +""" +OleFileIO_PL: + Module to read Microsoft OLE2 files (also called Structured Storage or + Microsoft Compound Document File Format), such as Microsoft Office + documents, Image Composer and FlashPix files, Outlook messages, ... 
+ +version 0.26 2013-07-24 Philippe Lagadec - http://www.decalage.info + +Project website: http://www.decalage.info/python/olefileio + +Improved version of the OleFileIO module from PIL library v1.1.6 +See: http://www.pythonware.com/products/pil/index.htm + +The Python Imaging Library (PIL) is + Copyright (c) 1997-2005 by Secret Labs AB + Copyright (c) 1995-2005 by Fredrik Lundh +OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec + +See source code and LICENSE.txt for information on usage and redistribution. + +WARNING: THIS IS (STILL) WORK IN PROGRESS. +""" + +__author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" +__date__ = "2013-07-24" +__version__ = '0.26' + +#--- LICENSE ------------------------------------------------------------------ + +# OleFileIO_PL is an improved version of the OleFileIO module from the +# Python Imaging Library (PIL). + +# OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec +# +# The Python Imaging Library (PIL) is +# Copyright (c) 1997-2005 by Secret Labs AB +# Copyright (c) 1995-2005 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its associated +# documentation, you agree that you have read, understood, and will comply with +# the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and its +# associated documentation for any purpose and without fee is hereby granted, +# provided that the above copyright notice appears in all copies, and that both +# that copyright notice and this permission notice appear in supporting +# documentation, and that the name of Secret Labs AB or the author(s) not be used +# in advertising or publicity pertaining to distribution of the software +# without specific, written prior permission. +# +# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS +# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
+# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, +# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +# PERFORMANCE OF THIS SOFTWARE. + +#----------------------------------------------------------------------------- +# CHANGELOG: (only OleFileIO_PL changes compared to PIL 1.1.6) +# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility +# (all changes flagged with [PL]) +# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise +# exceptions in _OleStream.__init__() +# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat) +# - added some constants +# - added header values checks +# - added some docstrings +# - getsect: bugfix in case sectors >512 bytes +# - getsect: added conformity checks +# - DEBUG_MODE constant to activate debug display +# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments +# - updated license +# - converted tabs to 4 spaces +# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity +# - improved _unicode() to use Python 2.x unicode support +# - fixed bug in _OleDirectoryEntry +# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops +# - fixed _OleStream which didn't check stream size +# - added/improved many docstrings and comments +# - moved helper functions _unicode and _clsid out of +# OleFileIO class +# - improved OleFileIO._find() to add Unix path syntax +# - OleFileIO._find() is now case-insensitive +# - added get_type() and get_rootentry_name() +# - rewritten loaddirectory and _OleDirectoryEntry +# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict +# - added detection of duplicate filenames in storages +# - added detection of duplicate references to streams +# - added get_size() and exists() to _OleDirectoryEntry +# - added isOleFile to 
check header before parsing +# - added __all__ list to control public keywords in pydoc +# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory +# - improved _unicode(), added workarounds for Python <2.3 +# - added set_debug_mode and -d option to set debug mode +# - fixed bugs in OleFileIO.open and _OleDirectoryEntry +# - added safety check in main for large or binary +# properties +# - allow size>0 for storages for some implementations +# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and +# streams +# - added option '-c' in main to check all streams +# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms +# (thanks to Ben G. and Martijn for reporting the bug) +# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str +# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs +# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn +# (https://bitbucket.org/decalage/olefileio_pl/issue/7) +# - added close method to OleFileIO (fixed issue #2) +# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) +# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python +# datetime +# - main: displays properties with date format +# - new class OleMetadata to parse standard properties +# - added get_metadata method +# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata +# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps +# - OleMetaData: total_edit_time is now a number of seconds, +# not a timestamp +# - getproperties: added support for VT_BOOL, VT_INT, V_UINT +# - getproperties: filter out null chars from strings +# - getproperties: raise non-fatal defects instead of +# exceptions when properties cannot be parsed properly +# 2013-05-27 PL: - getproperties: improved exception handling +# - _raise_defect: added option to set exception type +# - all non-fatal issues are now 
recorded, and displayed +# when run as a script +# 2013-07-11 v0.26 PL: - added methods to get modification and creation times +# of a directory entry or a storage/stream +# - fixed parsing of direntry timestamps +# 2013-07-24 PL: - new options in listdir to list storages and/or streams + +#----------------------------------------------------------------------------- +# TODO (for version 1.0): +# + add path attrib to _OleDirEntry, set it once and for all in init or +# append_kids (then listdir/_list can be simplified) +# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... +# - add underscore to each private method, to avoid their display in +# pydoc/epydoc documentation - Remove it for classes to be documented +# - replace all raised exceptions with _raise_defect (at least in OleFileIO) +# - merge code from _OleStream and OleFileIO.getsect to read sectors +# (maybe add a class for FAT and MiniFAT ?) +# - add method to check all streams (follow sectors chains without storing all +# stream in memory, and report anomalies) +# - use _OleDirectoryEntry.kids_dict to improve _find and _list ? +# - fix Unicode names handling (find some way to stay compatible with Py1.5.2) +# => if possible avoid converting names to Latin-1 +# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop) +# - rewrite OleFileIO.getproperties +# - improve docstrings to show more sample uses +# - see also original notes and FIXME below +# - remove all obsolete FIXMEs +# - OleMetadata: fix version attrib according to +# http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx + +# IDEAS: +# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for +# streams with unknown size +# - use arrays of int instead of long integers for FAT/MiniFAT, to improve +# performance and reduce memory usage ? 
(possible issue with values >2^31) +# - provide tests with unittest (may need write support to create samples) +# - move all debug code (and maybe dump methods) to a separate module, with +# a class which inherits OleFileIO ? +# - fix docstrings to follow epydoc format +# - add support for 4K sectors ? +# - add support for big endian byte order ? +# - create a simple OLE explorer with wxPython + +# FUTURE EVOLUTIONS to add write support: +# 1) add ability to write a stream back on disk from StringIO (same size, no +# change in FAT/MiniFAT). +# 2) rename a stream/storage if it doesn't change the RB tree +# 3) use rbtree module to update the red-black tree + any rename +# 4) remove a stream/storage: free sectors in FAT/MiniFAT +# 5) allocate new sectors in FAT/MiniFAT +# 6) create new storage/stream +#----------------------------------------------------------------------------- + +# +# THIS IS WORK IN PROGRESS +# +# The Python Imaging Library +# $Id$ +# +# stuff to deal with OLE2 Structured Storage files. this module is +# used by PIL to read Image Composer and FlashPix files, but can also +# be used to read other files of this type. +# +# History: +# 1997-01-20 fl Created +# 1997-01-22 fl Fixed 64-bit portability quirk +# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle) +# 2004-02-29 fl Changed long hex constants to signed integers +# +# Notes: +# FIXME: sort out sign problem (eliminate long hex constants) +# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"] +# FIXME: provide a glob mechanism function (using fnmatchcase) +# +# Literature: +# +# "FlashPix Format Specification, Appendix A", Kodak and Microsoft, +# September 1996. +# +# Quotes: +# +# "If this document and functionality of the Software conflict, +# the actual functionality of the Software represents the correct +# functionality" -- Microsoft, in the OLE format specification +# +# Copyright (c) Secret Labs AB 1997. +# Copyright (c) Fredrik Lundh 1997. 
+# +# See the README file for information on usage and redistribution. +# + +#------------------------------------------------------------------------------ + +import string, StringIO, struct, array, os.path, sys, datetime + +#[PL] Define explicitly the public API to avoid private objects in pydoc: +__all__ = ['OleFileIO', 'isOleFile'] + +#[PL] workaround to fix an issue with array item size on 64 bits systems: +if array.array('L').itemsize == 4: + # on 32 bits platforms, long integers in an array are 32 bits: + UINT32 = 'L' +elif array.array('I').itemsize == 4: + # on 64 bits platforms, integers in an array are 32 bits: + UINT32 = 'I' +else: + raise ValueError, 'Need to fix a bug with 32 bit arrays, please contact author...' + + +#[PL] These workarounds were inspired from the Path module +# (see http://www.jorendorff.com/articles/python/path/) +#TODO: test with old Python versions + +# Pre-2.3 workaround for booleans +try: + True, False +except NameError: + True, False = 1, 0 + +# Pre-2.3 workaround for basestring. +try: + basestring +except NameError: + try: + # is Unicode supported (Python >2.0 or >1.6 ?) + basestring = (str, unicode) + except NameError: + basestring = str + +#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode +# if False (default PIL behaviour), all filenames are converted to Latin-1. +KEEP_UNICODE_NAMES = False + +#[PL] DEBUG display mode: False by default, use set_debug_mode() or "-d" on +# command line to change it. +DEBUG_MODE = False +def debug_print(msg): + print msg +def debug_pass(msg): + pass +debug = debug_pass + +def set_debug_mode(debug_mode): + """ + Set debug mode on or off, to control display of debugging messages. 
+ mode: True or False + """ + global DEBUG_MODE, debug + DEBUG_MODE = debug_mode + if debug_mode: + debug = debug_print + else: + debug = debug_pass + +#TODO: convert this to hex +MAGIC = '\320\317\021\340\241\261\032\341' + +#[PL]: added constants for Sector IDs (from AAF specifications) +MAXREGSECT = 0xFFFFFFFAL; # maximum SECT +DIFSECT = 0xFFFFFFFCL; # (-4) denotes a DIFAT sector in a FAT +FATSECT = 0xFFFFFFFDL; # (-3) denotes a FAT sector in a FAT +ENDOFCHAIN = 0xFFFFFFFEL; # (-2) end of a virtual stream chain +FREESECT = 0xFFFFFFFFL; # (-1) unallocated sector + +#[PL]: added constants for Directory Entry IDs (from AAF specifications) +MAXREGSID = 0xFFFFFFFAL; # maximum directory entry ID +NOSTREAM = 0xFFFFFFFFL; # (-1) unallocated directory entry + +#[PL] object types in storage (from AAF specifications) +STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) +STGTY_STORAGE = 1 # element is a storage object +STGTY_STREAM = 2 # element is a stream object +STGTY_LOCKBYTES = 3 # element is an ILockBytes object +STGTY_PROPERTY = 4 # element is an IPropertyStorage object +STGTY_ROOT = 5 # element is a root storage + + +# +# -------------------------------------------------------------------- +# property types + +VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6; +VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11; +VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17; +VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23; +VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28; +VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64; +VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68; +VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72; +VT_VECTOR=0x1000; + +# map property id to name (for debugging purposes) + +VT = {} +for keyword, var in vars().items(): + if keyword[:3] == "VT_": + VT[var] = keyword + +# +# 
-------------------------------------------------------------------- +# Some common document types (root.clsid fields) + +WORD_CLSID = "00020900-0000-0000-C000-000000000046" +#TODO: check Excel, PPT, ... + +#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect() +DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect +DEFECT_POTENTIAL = 20 # a potential defect +DEFECT_INCORRECT = 30 # an error according to specifications, but parsing + # can go on +DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is + # impossible + +#[PL] add useful constants to __all__: +for key in vars().keys(): + if key.startswith('STGTY_') or key.startswith('DEFECT_'): + __all__.append(key) + + +#--- FUNCTIONS ---------------------------------------------------------------- + +def isOleFile (filename): + """ + Test if file is an OLE container (according to its header). + filename: file name or path (str, unicode) + return: True if OLE, False otherwise. + """ + f = open(filename, 'rb') + header = f.read(len(MAGIC)) + if header == MAGIC: + return True + else: + return False + + +#TODO: replace i16 and i32 with more readable struct.unpack equivalent +def i16(c, o = 0): + """ + Converts a 2-bytes (16 bits) string to an integer. + + c: string containing bytes to convert + o: offset of bytes to convert in string + """ + return ord(c[o])+(ord(c[o+1])<<8) + + +def i32(c, o = 0): + """ + Converts a 4-bytes (32 bits) string to an integer. + + c: string containing bytes to convert + o: offset of bytes to convert in string + """ + return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24)) + # [PL]: added int() because "<<" gives long int since Python 2.4 + + +def _clsid(clsid): + """ + Converts a CLSID to a human-readable string. + clsid: string of length 16. 
+ """ + assert len(clsid) == 16 + if clsid == "\0" * len(clsid): + return "" + return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % + ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + + tuple(map(ord, clsid[8:16])))) + + + +# UNICODE support for Old Python versions: +# (necessary to handle storages/streams names which use Unicode) + +try: + # is Unicode supported ? + unicode + + def _unicode(s, errors='replace'): + """ + Map unicode string to Latin 1. (Python with Unicode support) + + s: UTF-16LE unicode string to convert to Latin-1 + errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode() + """ + #TODO: test if it OleFileIO works with Unicode strings, instead of + # converting to Latin-1. + try: + # First the string is converted to plain Unicode: + # (assuming it is encoded as UTF-16 little-endian) + u = s.decode('UTF-16LE', errors) + if KEEP_UNICODE_NAMES: + return u + else: + # Second the unicode string is converted to Latin-1 + return u.encode('latin_1', errors) + except: + # there was an error during Unicode to Latin-1 conversion: + raise IOError, 'incorrect Unicode name' + +except NameError: + def _unicode(s, errors='replace'): + """ + Map unicode string to Latin 1. (Python without native Unicode support) + + s: UTF-16LE unicode string to convert to Latin-1 + errors: 'replace', 'ignore' or 'strict'. (ignored in this version) + """ + # If the unicode function does not exist, we assume this is an old + # Python version without Unicode support. 
+ # Null bytes are simply removed (this only works with usual Latin-1 + # strings which do not contain unicode characters>256): + return filter(ord, s) + + +def filetime2datetime(filetime): + """ + convert FILETIME (64 bits int) to Python datetime.datetime + """ + # TODO: manage exception when microseconds is too large + # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ + _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) + #debug('timedelta days=%d' % (filetime/(10*1000000*3600*24))) + return _FILETIME_null_date + datetime.timedelta(microseconds=filetime/10) + + + +#=== CLASSES ================================================================== + +class OleMetadata: + """ + class to parse and store metadata from standard properties of OLE files. + + Available attributes: + codepage, title, subject, author, keywords, comments, template, + last_saved_by, revision_number, total_edit_time, last_printed, create_time, + last_saved_time, num_pages, num_words, num_chars, thumbnail, + creating_application, security, codepage_doc, category, presentation_target, + bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips, + scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty, + chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed, + version, dig_sig, content_type, content_status, language, doc_version + + Note: an attribute is set to None when not present in the properties of the + OLE file. 
+ + References for SummaryInformation stream: + - http://msdn.microsoft.com/en-us/library/dd942545.aspx + - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx + - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx + - http://msdn.microsoft.com/en-us/library/aa372045.aspx + - http://sedna-soft.de/summary-information-stream/ + - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html + + References for DocumentSummaryInformation stream: + - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx + - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx + - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html + + new in version 0.25 + """ + + # attribute names for SummaryInformation stream properties: + # (ordered by property id, starting at 1) + SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments', + 'template', 'last_saved_by', 'revision_number', 'total_edit_time', + 'last_printed', 'create_time', 'last_saved_time', 'num_pages', + 'num_words', 'num_chars', 'thumbnail', 'creating_application', + 'security'] + + # attribute names for DocumentSummaryInformation stream properties: + # (ordered by property id, starting at 1) + DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs', + 'slides', 'notes', 'hidden_slides', 'mm_clips', + 'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager', + 'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc', + 'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig', + 'content_type', 'content_status', 'language', 'doc_version'] + + def __init__(self): + """ + Constructor for OleMetadata + All attributes are set to None by default + """ + # properties from SummaryInformation stream + self.codepage = None + self.title = None + self.subject = None + self.author = None + self.keywords = None + 
self.comments = None + self.template = None + self.last_saved_by = None + self.revision_number = None + self.total_edit_time = None + self.last_printed = None + self.create_time = None + self.last_saved_time = None + self.num_pages = None + self.num_words = None + self.num_chars = None + self.thumbnail = None + self.creating_application = None + self.security = None + # properties from DocumentSummaryInformation stream + self.codepage_doc = None + self.category = None + self.presentation_target = None + self.bytes = None + self.lines = None + self.paragraphs = None + self.slides = None + self.notes = None + self.hidden_slides = None + self.mm_clips = None + self.scale_crop = None + self.heading_pairs = None + self.titles_of_parts = None + self.manager = None + self.company = None + self.links_dirty = None + self.chars_with_spaces = None + self.unused = None + self.shared_doc = None + self.link_base = None + self.hlinks = None + self.hlinks_changed = None + self.version = None + self.dig_sig = None + self.content_type = None + self.content_status = None + self.language = None + self.doc_version = None + + + def parse_properties(self, olefile): + """ + Parse standard properties of an OLE file, from the streams + "\x05SummaryInformation" and "\x05DocumentSummaryInformation", + if present. + Properties are converted to strings, integers or python datetime objects. + If a property is not present, its value is set to None. 
+ """ + # first set all attributes to None: + for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS): + setattr(self, attrib, None) + if olefile.exists("\x05SummaryInformation"): + # get properties from the stream: + # (converting timestamps to python datetime, except total_edit_time, + # which is property #10) + props = olefile.getproperties("\x05SummaryInformation", + convert_time=True, no_conversion=[10]) + # store them into this object's attributes: + for i in range(len(self.SUMMARY_ATTRIBS)): + # ids for standards properties start at 0x01, until 0x13 + value = props.get(i+1, None) + setattr(self, self.SUMMARY_ATTRIBS[i], value) + if olefile.exists("\x05DocumentSummaryInformation"): + # get properties from the stream: + props = olefile.getproperties("\x05DocumentSummaryInformation", + convert_time=True) + # store them into this object's attributes: + for i in range(len(self.DOCSUM_ATTRIBS)): + # ids for standards properties start at 0x01, until 0x13 + value = props.get(i+1, None) + setattr(self, self.DOCSUM_ATTRIBS[i], value) + + def dump(self): + """ + Dump all metadata, for debugging purposes. + """ + print 'Properties from SummaryInformation stream:' + for prop in self.SUMMARY_ATTRIBS: + value = getattr(self, prop) + print '- %s: %s' % (prop, repr(value)) + print 'Properties from DocumentSummaryInformation stream:' + for prop in self.DOCSUM_ATTRIBS: + value = getattr(self, prop) + print '- %s: %s' % (prop, repr(value)) + + +#--- _OleStream --------------------------------------------------------------- + +class _OleStream(StringIO.StringIO): + """ + OLE2 Stream + + Returns a read-only file object which can be used to read + the contents of a OLE stream (instance of the StringIO class). + To open a stream, use the openstream method in the OleFile class. + + This function can be used with either ordinary streams, + or ministreams, depending on the offset, sectorsize, and + fat table arguments. 
+ + Attributes: + - size: actual size of data stream, after it was opened. + """ + + # FIXME: should store the list of sects obtained by following + # the fat chain, and load new sectors on demand instead of + # loading it all in one go. + + def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize): + """ + Constructor for _OleStream class. + + fp : file object, the OLE container or the MiniFAT stream + sect : sector index of first sector in the stream + size : total size of the stream + offset : offset in bytes for the first FAT or MiniFAT sector + sectorsize: size of one sector + fat : array/list of sector indexes (FAT or MiniFAT) + filesize : size of OLE file (for debugging) + return : a StringIO instance containing the OLE stream + """ + debug('_OleStream.__init__:') + debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s' + %(sect,sect,size,offset,sectorsize,len(fat), repr(fp))) + #[PL] To detect malformed documents with FAT loops, we compute the + # expected number of sectors in the stream: + unknown_size = False + if size==0x7FFFFFFF: + # this is the case when called from OleFileIO._open(), and stream + # size is not known in advance (for example when reading the + # Directory stream). Then we can only guess maximum size: + size = len(fat)*sectorsize + # and we keep a record that size was unknown: + unknown_size = True + debug(' stream with UNKNOWN SIZE') + nb_sectors = (size + (sectorsize-1)) / sectorsize + debug('nb_sectors = %d' % nb_sectors) + # This number should (at least) be less than the total number of + # sectors in the given FAT: + if nb_sectors > len(fat): + raise IOError, 'malformed OLE document, stream too large' + # optimization(?): data is first a list of strings, and join() is called + # at the end to concatenate all in one string. 
+ # (this may not be really useful with recent Python versions) + data = [] + # if size is zero, then first sector index should be ENDOFCHAIN: + if size == 0 and sect != ENDOFCHAIN: + debug('size == 0 and sect != ENDOFCHAIN:') + raise IOError, 'incorrect OLE sector index for empty stream' + #[PL] A fixed-length for loop is used instead of an undefined while + # loop to avoid DoS attacks: + for i in xrange(nb_sectors): + # Sector index may be ENDOFCHAIN, but only if size was unknown + if sect == ENDOFCHAIN: + if unknown_size: + break + else: + # else this means that the stream is smaller than declared: + debug('sect=ENDOFCHAIN before expected size') + raise IOError, 'incomplete OLE stream' + # sector index should be within FAT: + if sect<0 or sect>=len(fat): + debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat))) + debug('i=%d / nb_sectors=%d' %(i, nb_sectors)) +## tmp_data = string.join(data, "") +## f = open('test_debug.bin', 'wb') +## f.write(tmp_data) +## f.close() +## debug('data read so far: %d bytes' % len(tmp_data)) + raise IOError, 'incorrect OLE FAT, sector index out of range' + #TODO: merge this code with OleFileIO.getsect() ? + #TODO: check if this works with 4K sectors: + try: + fp.seek(offset + sectorsize * sect) + except: + debug('sect=%d, seek=%d, filesize=%d' % + (sect, offset+sectorsize*sect, filesize)) + raise IOError, 'OLE sector index out of range' + sector_data = fp.read(sectorsize) + # [PL] check if there was enough data: + # Note: if sector is the last of the file, sometimes it is not a + # complete sector (of 512 or 4K), so we may read less than + # sectorsize. 
+ if len(sector_data)!=sectorsize and sect!=(len(fat)-1): + debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' % + (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data))) + debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data))) + raise IOError, 'incomplete OLE sector' + data.append(sector_data) + # jump to next sector in the FAT: + try: + sect = fat[sect] + except IndexError: + # [PL] if pointer is out of the FAT an exception is raised + raise IOError, 'incorrect OLE FAT, sector index out of range' + #[PL] Last sector should be a "end of chain" marker: + if sect != ENDOFCHAIN: + raise IOError, 'incorrect last sector index in OLE stream' + data = string.join(data, "") + # Data is truncated to the actual stream size: + if len(data) >= size: + data = data[:size] + # actual stream size is stored for future use: + self.size = size + elif unknown_size: + # actual stream size was not known, now we know the size of read + # data: + self.size = len(data) + else: + # read data is less than expected: + debug('len(data)=%d, size=%d' % (len(data), size)) + raise IOError, 'OLE stream size is less than declared' + # when all data is read in memory, StringIO constructor is called + StringIO.StringIO.__init__(self, data) + # Then the _OleStream object can be used as a read-only file object. 
+ + +#--- _OleDirectoryEntry ------------------------------------------------------- + +class _OleDirectoryEntry: + + """ + OLE2 Directory Entry + """ + #[PL] parsing code moved from OleFileIO.loaddirectory + + # struct to parse directory entries: + # <: little-endian byte order, standard sizes + # (note: this should guarantee that Q returns a 64 bits int) + # 64s: string containing entry name in unicode (max 31 chars) + null char + # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 + # B: uint8, dir entry type (between 0 and 5) + # B: uint8, color: 0=black, 1=red + # I: uint32, index of left child node in the red-black tree, NOSTREAM if none + # I: uint32, index of right child node in the red-black tree, NOSTREAM if none + # I: uint32, index of child root node if it is a storage, else NOSTREAM + # 16s: CLSID, unique identifier (only used if it is a storage) + # I: uint32, user flags + # Q (was 8s): uint64, creation timestamp or zero + # Q (was 8s): uint64, modification timestamp or zero + # I: uint32, SID of first sector if stream or ministream, SID of 1st sector + # of stream containing ministreams if root entry, 0 otherwise + # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise + # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise + STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII' + # size of a directory entry: 128 bytes + DIRENTRY_SIZE = 128 + assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE + + + def __init__(self, entry, sid, olefile): + """ + Constructor for an _OleDirectoryEntry object. + Parses a 128-bytes entry from the OLE Directory stream. 
+ + entry : string (must be 128 bytes long) + sid : index of this directory entry in the OLE file directory + olefile: OleFileIO containing this directory entry + """ + self.sid = sid + # ref to olefile is stored for future use + self.olefile = olefile + # kids is a list of children entries, if this entry is a storage: + # (list of _OleDirectoryEntry objects) + self.kids = [] + # kids_dict is a dictionary of children entries, indexed by their + # name in lowercase: used to quickly find an entry, and to detect + # duplicates + self.kids_dict = {} + # flag used to detect if the entry is referenced more than once in + # directory: + self.used = False + # decode DirEntry + ( + name, + namelength, + self.entry_type, + self.color, + self.sid_left, + self.sid_right, + self.sid_child, + clsid, + self.dwUserFlags, + self.createTime, + self.modifyTime, + self.isectStart, + sizeLow, + sizeHigh + ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry) + if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]: + olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type') + # only first directory entry can (and should) be root: + if self.entry_type == STGTY_ROOT and sid != 0: + olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry') + if sid == 0 and self.entry_type != STGTY_ROOT: + olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry') + #debug (struct.unpack(fmt_entry, entry[:len_entry])) + # name should be at most 31 unicode characters + null character, + # so 64 bytes in total (31*2 + 2): + if namelength>64: + olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length') + # if exception not raised, namelength is set to the maximum value: + namelength = 64 + # only characters without ending null char are kept: + name = name[:(namelength-2)] + # name is converted from unicode to Latin-1: + self.name = _unicode(name) + + debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) + debug(' - type: %d' 
% self.entry_type) + debug(' - sect: %d' % self.isectStart) + debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left, + self.sid_right, self.sid_child)) + + # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes + # sectors, BUT apparently some implementations set it as 0xFFFFFFFFL, 1 + # or some other value so it cannot be raised as a defect in general: + if olefile.sectorsize == 512: + if sizeHigh != 0 and sizeHigh != 0xFFFFFFFFL: + debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' % + (olefile.sectorsize, sizeLow, sizeHigh, sizeHigh)) + olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size') + self.size = sizeLow + else: + self.size = sizeLow + (long(sizeHigh)<<32) + debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh)) + + self.clsid = _clsid(clsid) + # a storage should have a null size, BUT some implementations such as + # Word 8 for Mac seem to allow non-null values => Potential defect: + if self.entry_type == STGTY_STORAGE and self.size != 0: + olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0') + # check if stream is not already referenced elsewhere: + if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0: + if self.size < olefile.minisectorcutoff \ + and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT + # ministream object + minifat = True + else: + minifat = False + olefile._check_duplicate_stream(self.isectStart, minifat) + + + + def build_storage_tree(self): + """ + Read and build the red-black tree attached to this _OleDirectoryEntry + object, if it is a storage. + Note that this method builds a tree of all subentries, so it should + only be called for the root object once. + """ + debug('build_storage_tree: SID=%d - %s - sid_child=%d' + % (self.sid, repr(self.name), self.sid_child)) + if self.sid_child != NOSTREAM: + # if child SID is not NOSTREAM, then this entry is a storage. 
+ # Let's walk through the tree of children to fill the kids list: + self.append_kids(self.sid_child) + + # Note from OpenOffice documentation: the safest way is to + # recreate the tree because some implementations may store broken + # red-black trees... + + # in the OLE file, entries are sorted on (length, name). + # for convenience, we sort them on name instead: + # (see __cmp__ method in this class) + self.kids.sort() + + + def append_kids(self, child_sid): + """ + Walk through red-black tree of children of this directory entry to add + all of them to the kids list. (recursive method) + + child_sid : index of child directory entry to use, or None when called + first time for the root. (only used during recursion) + """ + #[PL] this method was added to use simple recursion instead of a complex + # algorithm. + # if this is not a storage or a leaf of the tree, nothing to do: + if child_sid == NOSTREAM: + return + # check if child SID is in the proper range: + if child_sid<0 or child_sid>=len(self.olefile.direntries): + self.olefile._raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range') + # get child direntry: + child = self.olefile._load_direntry(child_sid) #direntries[child_sid] + debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d' + % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child)) + # the directory entries are organized as a red-black tree. + # (cf. 
Wikipedia for details) + # First walk through left side of the tree: + self.append_kids(child.sid_left) + # Check if its name is not already used (case-insensitive): + name_lower = child.name.lower() + if self.kids_dict.has_key(name_lower): + self.olefile._raise_defect(DEFECT_INCORRECT, + "Duplicate filename in OLE storage") + # Then the child_sid _OleDirectoryEntry object is appended to the + # kids list and dictionary: + self.kids.append(child) + self.kids_dict[name_lower] = child + # Check if kid was not already referenced in a storage: + if child.used: + self.olefile._raise_defect(DEFECT_INCORRECT, + 'OLE Entry referenced more than once') + child.used = True + # Finally walk through right side of the tree: + self.append_kids(child.sid_right) + # Afterwards build kid's own tree if it's also a storage: + child.build_storage_tree() + + + def __cmp__(self, other): + "Compare entries by name" + return cmp(self.name, other.name) + #TODO: replace by the same function as MS implementation ? + # (order by name length first, then case-insensitive order) + + + def dump(self, tab = 0): + "Dump this entry, and all its subentries (for debug purposes only)" + TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", + "(property)", "(root)"] + print " "*tab + repr(self.name), TYPES[self.entry_type], + if self.entry_type in (STGTY_STREAM, STGTY_ROOT): + print self.size, "bytes", + print + if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid: + print " "*tab + "{%s}" % self.clsid + + for kid in self.kids: + kid.dump(tab + 2) + + + def getmtime(self): + """ + Return modification time of a directory entry. + + return: None if modification time is null, a python datetime object + otherwise (UTC timezone) + + new in version 0.26 + """ + if self.modifyTime == 0: + return None + return filetime2datetime(self.modifyTime) + + + def getctime(self): + """ + Return creation time of a directory entry. 
+ + return: None if modification time is null, a python datetime object + otherwise (UTC timezone) + + new in version 0.26 + """ + if self.createTime == 0: + return None + return filetime2datetime(self.createTime) + + +#--- OleFileIO ---------------------------------------------------------------- + +class OleFileIO: + """ + OLE container object + + This class encapsulates the interface to an OLE 2 structured + storage file. Use the {@link listdir} and {@link openstream} methods to + access the contents of this file. + + Object names are given as a list of strings, one for each subentry + level. The root entry should be omitted. For example, the following + code extracts all image streams from a Microsoft Image Composer file: + + ole = OleFileIO("fan.mic") + + for entry in ole.listdir(): + if entry[1:2] == "Image": + fin = ole.openstream(entry) + fout = open(entry[0:1], "wb") + while 1: + s = fin.read(8192) + if not s: + break + fout.write(s) + + You can use the viewer application provided with the Python Imaging + Library to view the resulting files (which happens to be standard + TIFF files). + """ + + def __init__(self, filename = None, raise_defects=DEFECT_FATAL): + """ + Constructor for OleFileIO class. + + filename: file to open. + raise_defects: minimal level for defects to be raised as exceptions. + (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a + security-oriented application, see source code for details) + """ + # minimal level for defects to be raised as exceptions: + self._raise_defects_level = raise_defects + # list of defects/issues not raised as exceptions: + # tuples of (exception type, message) + self.parsing_issues = [] + if filename: + self.open(filename) + + + def _raise_defect(self, defect_level, message, exception_type=IOError): + """ + This method should be called for any defect found during file parsing. + It may raise an IOError exception according to the minimal level chosen + for the OleFileIO object. 
+ + defect_level: defect level, possible values are: + DEFECT_UNSURE : a case which looks weird, but not sure it's a defect + DEFECT_POTENTIAL : a potential defect + DEFECT_INCORRECT : an error according to specifications, but parsing can go on + DEFECT_FATAL : an error which cannot be ignored, parsing is impossible + message: string describing the defect, used with raised exception. + exception_type: exception class to be raised, IOError by default + """ + # added by [PL] + if defect_level >= self._raise_defects_level: + raise exception_type, message + else: + # just record the issue, no exception raised: + self.parsing_issues.append((exception_type, message)) + + + def open(self, filename): + """ + Open an OLE2 file. + Reads the header, FAT and directory. + + filename: string-like or file-like object + """ + #[PL] check if filename is a string-like or file-like object: + # (it is better to check for a read() method) + if hasattr(filename, 'read'): + # file-like object + self.fp = filename + else: + # string-like object: filename of file on disk + #TODO: if larger than 1024 bytes, this could be the actual data => StringIO + self.fp = open(filename, "rb") + # old code fails if filename is not a plain string: + #if type(filename) == type(""): + # self.fp = open(filename, "rb") + #else: + # self.fp = filename + # obtain the filesize by using seek and tell, which should work on most + # file-like objects: + #TODO: do it above, using getsize with filename when possible? 
+ #TODO: fix code to fail with clear exception when filesize cannot be obtained + self.fp.seek(0, os.SEEK_END) + try: + filesize = self.fp.tell() + finally: + self.fp.seek(0) + self._filesize = filesize + + # lists of streams in FAT and MiniFAT, to detect duplicate references + # (list of indexes of first sectors of each stream) + self._used_streams_fat = [] + self._used_streams_minifat = [] + + header = self.fp.read(512) + + if len(header) != 512 or header[:8] != MAGIC: + self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file") + + # [PL] header structure according to AAF specifications: + ##Header + ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)] + ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, + ## // 0x1a, 0xe1} for current version + ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/ + ## // GetClassFile uses root directory class id) + ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is + ## // written by reference implementation + ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for + ## // 512-byte sectors, 4 for 4 KB sectors + ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering + ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two; + ## // typically 9 indicating 512-byte sectors + ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two; + ## // typically 6 indicating 64-byte mini-sectors + ##USHORT _usReserved; // [22H,02] reserved, must be zero + ##ULONG _ulReserved1; // [24H,04] reserved, must be zero + ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors, + ## // number of SECTs in directory chain for 4 KB + ## // sectors + ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain + ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain + ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must + ## // be zero. 
The reference implementation + ## // does not support transactions + ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream; + ## // typically 4096 bytes + ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain + ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain + ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain + ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain + ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors + ##}; + + # [PL] header decoding: + # '<' indicates little-endian byte ordering for Intel (cf. struct module help) + fmt_header = '<8s16sHHHHHHLLLLLLLLLL' + header_size = struct.calcsize(fmt_header) + debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) ) + header1 = header[:header_size] + ( + self.Sig, + self.clsid, + self.MinorVersion, + self.DllVersion, + self.ByteOrder, + self.SectorShift, + self.MiniSectorShift, + self.Reserved, self.Reserved1, + self.csectDir, + self.csectFat, + self.sectDirStart, + self.signature, + self.MiniSectorCutoff, + self.MiniFatStart, + self.csectMiniFat, + self.sectDifStart, + self.csectDif + ) = struct.unpack(fmt_header, header1) + debug( struct.unpack(fmt_header, header1)) + + if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': + # OLE signature should always be present + self._raise_defect(DEFECT_FATAL, "incorrect OLE signature") + if self.clsid != '\x00'*16: + # according to AAF specs, CLSID should always be zero + self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") + debug( "MinorVersion = %d" % self.MinorVersion ) + debug( "DllVersion = %d" % self.DllVersion ) + if self.DllVersion not in [3, 4]: + # version 3: usual format, 512 bytes per sector + # version 4: large format, 4K per sector + self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") + debug( "ByteOrder = %X" % self.ByteOrder ) + if self.ByteOrder != 0xFFFE: + # For now only common 
little-endian documents are handled correctly + self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") + # TODO: add big-endian support for documents created on Mac ? + self.SectorSize = 2**self.SectorShift + debug( "SectorSize = %d" % self.SectorSize ) + if self.SectorSize not in [512, 4096]: + self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header") + if (self.DllVersion==3 and self.SectorSize!=512) \ + or (self.DllVersion==4 and self.SectorSize!=4096): + self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header") + self.MiniSectorSize = 2**self.MiniSectorShift + debug( "MiniSectorSize = %d" % self.MiniSectorSize ) + if self.MiniSectorSize not in [64]: + self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header") + if self.Reserved != 0 or self.Reserved1 != 0: + self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)") + debug( "csectDir = %d" % self.csectDir ) + if self.SectorSize==512 and self.csectDir!=0: + self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header") + debug( "csectFat = %d" % self.csectFat ) + debug( "sectDirStart = %X" % self.sectDirStart ) + debug( "signature = %d" % self.signature ) + # Signature should be zero, BUT some implementations do not follow this + # rule => only a potential defect: + if self.signature != 0: + self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)") + debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff ) + debug( "MiniFatStart = %X" % self.MiniFatStart ) + debug( "csectMiniFat = %d" % self.csectMiniFat ) + debug( "sectDifStart = %X" % self.sectDifStart ) + debug( "csectDif = %d" % self.csectDif ) + + # calculate the number of sectors in the file + # (-1 because header doesn't count) + self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1 + debug( "Number of sectors in the file: %d" % self.nb_sect ) + + # file clsid (probably never used, so we don't store 
it) + clsid = _clsid(header[8:24]) + self.sectorsize = self.SectorSize #1 << i16(header, 30) + self.minisectorsize = self.MiniSectorSize #1 << i16(header, 32) + self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56) + + # check known streams for duplicate references (these are always in FAT, + # never in MiniFAT): + self._check_duplicate_stream(self.sectDirStart) + # check MiniFAT only if it is not empty: + if self.csectMiniFat: + self._check_duplicate_stream(self.MiniFatStart) + # check DIFAT only if it is not empty: + if self.csectDif: + self._check_duplicate_stream(self.sectDifStart) + + # Load file allocation tables + self.loadfat(header) + # Load direcory. This sets both the direntries list (ordered by sid) + # and the root (ordered by hierarchy) members. + self.loaddirectory(self.sectDirStart)#i32(header, 48)) + self.ministream = None + self.minifatsect = self.MiniFatStart #i32(header, 60) + + + def close(self): + """ + close the OLE file, to release the file object + """ + self.fp.close() + + + def _check_duplicate_stream(self, first_sect, minifat=False): + """ + Checks if a stream has not been already referenced elsewhere. + This method should only be called once for each known stream, and only + if stream size is not null. + first_sect: index of first sector of the stream in FAT + minifat: if True, stream is located in the MiniFAT, else in the FAT + """ + if minifat: + debug('_check_duplicate_stream: sect=%d in MiniFAT' % first_sect) + used_streams = self._used_streams_minifat + else: + debug('_check_duplicate_stream: sect=%d in FAT' % first_sect) + # some values can be safely ignored (not a real stream): + if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): + return + used_streams = self._used_streams_fat + #TODO: would it be more efficient using a dict or hash values, instead + # of a list of long ? 
+ if first_sect in used_streams: + self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') + else: + used_streams.append(first_sect) + + + def dumpfat(self, fat, firstindex=0): + "Displays a part of FAT in human-readable form for debugging purpose" + # [PL] added only for debug + if not DEBUG_MODE: + return + # dictionary to convert special FAT values in human-readable strings + VPL=8 # valeurs par ligne (8+1 * 8+1 = 81) + fatnames = { + FREESECT: "..free..", + ENDOFCHAIN: "[ END. ]", + FATSECT: "FATSECT ", + DIFSECT: "DIFSECT " + } + nbsect = len(fat) + nlines = (nbsect+VPL-1)/VPL + print "index", + for i in range(VPL): + print ("%8X" % i), + print "" + for l in range(nlines): + index = l*VPL + print ("%8X:" % (firstindex+index)), + for i in range(index, index+VPL): + if i>=nbsect: + break + sect = fat[i] + if sect in fatnames: + nom = fatnames[sect] + else: + if sect == i+1: + nom = " --->" + else: + nom = "%8X" % sect + print nom, + print "" + + + def dumpsect(self, sector, firstindex=0): + "Displays a sector in a human-readable form, for debugging purpose." 
+ if not DEBUG_MODE: + return + VPL=8 # number of values per line (8+1 * 8+1 = 81) + tab = array.array(UINT32, sector) + nbsect = len(tab) + nlines = (nbsect+VPL-1)/VPL + print "index", + for i in range(VPL): + print ("%8X" % i), + print "" + for l in range(nlines): + index = l*VPL + print ("%8X:" % (firstindex+index)), + for i in range(index, index+VPL): + if i>=nbsect: + break + sect = tab[i] + nom = "%8X" % sect + print nom, + print "" + + def sect2array(self, sect): + """ + convert a sector to an array of 32 bits unsigned integers, + swapping bytes on big endian CPUs such as PowerPC (old Macs) + """ + a = array.array(UINT32, sect) + # if CPU is big endian, swap bytes: + if sys.byteorder == 'big': + a.byteswap() + return a + + + def loadfat_sect(self, sect): + """ + Adds the indexes of the given sector to the FAT + sect: string containing the first FAT sector, or array of long integers + return: index of last FAT sector. + """ + # a FAT sector is an array of ulong integers. + if isinstance(sect, array.array): + # if sect is already an array it is directly used + fat1 = sect + else: + # if it's a raw sector, it is parsed in an array + fat1 = self.sect2array(sect) + self.dumpsect(sect) + # The FAT is a sector chain starting at the first index of itself. + for isect in fat1: + #print "isect = %X" % isect + if isect == ENDOFCHAIN or isect == FREESECT: + # the end of the sector chain has been reached + break + # read the FAT sector + s = self.getsect(isect) + # parse it as an array of 32 bits integers, and add it to the + # global FAT array + nextfat = self.sect2array(s) + self.fat = self.fat + nextfat + return isect + + + def loadfat(self, header): + """ + Load the FAT table. + """ + # The header contains a sector numbers + # for the first 109 FAT sectors. 
Additional sectors are + # described by DIF blocks + + sect = header[76:512] + debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)/4) ) + #fat = [] + # [PL] FAT is an array of 32 bits unsigned ints, it's more effective + # to use an array than a list in Python. + # It's initialized as empty first: + self.fat = array.array(UINT32) + self.loadfat_sect(sect) + #self.dumpfat(self.fat) +## for i in range(0, len(sect), 4): +## ix = i32(sect, i) +## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: +## if ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: +## break +## s = self.getsect(ix) +## #fat = fat + map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) +## fat = fat + array.array(UINT32, s) + if self.csectDif != 0: + # [PL] There's a DIFAT because file is larger than 6.8MB + # some checks just in case: + if self.csectFat <= 109: + # there must be at least 109 blocks in header and the rest in + # DIFAT, so number of sectors must be >109. + self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors') + if self.sectDifStart >= self.nb_sect: + # initial DIFAT block index must be valid + self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') + debug( "DIFAT analysis..." 
) + # We compute the necessary number of DIFAT sectors : + # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) + nb_difat = (self.csectFat-109 + 126)/127 + debug( "nb_difat = %d" % nb_difat ) + if self.csectDif != nb_difat: + raise IOError, 'incorrect DIFAT' + isect_difat = self.sectDifStart + for i in xrange(nb_difat): + debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) + #TODO: check if corresponding FAT SID = DIFSECT + sector_difat = self.getsect(isect_difat) + difat = self.sect2array(sector_difat) + self.dumpsect(sector_difat) + self.loadfat_sect(difat[:127]) + # last DIFAT pointer is next DIFAT sector: + isect_difat = difat[127] + debug( "next DIFAT sector: %X" % isect_difat ) + # checks: + if isect_difat not in [ENDOFCHAIN, FREESECT]: + # last DIFAT pointer value must be ENDOFCHAIN or FREESECT + raise IOError, 'incorrect end of DIFAT' +## if len(self.fat) != self.csectFat: +## # FAT should contain csectFat blocks +## print "FAT length: %d instead of %d" % (len(self.fat), self.csectFat) +## raise IOError, 'incorrect DIFAT' + # since FAT is read from fixed-size sectors, it may contain more values + # than the actual number of sectors in the file. + # Keep only the relevant sector indexes: + if len(self.fat) > self.nb_sect: + debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect)) + self.fat = self.fat[:self.nb_sect] + debug('\nFAT:') + self.dumpfat(self.fat) + + + def loadminifat(self): + """ + Load the MiniFAT table. + """ + # MiniFAT is stored in a standard sub-stream, pointed to by a header + # field. + # NOTE: there are two sizes to take into account for this stream: + # 1) Stream size is calculated according to the number of sectors + # declared in the OLE header. This allocated stream may be more than + # needed to store the actual sector indexes. 
+ # (self.csectMiniFat is the number of sectors of size self.SectorSize) + stream_size = self.csectMiniFat * self.SectorSize + # 2) Actually used size is calculated by dividing the MiniStream size + # (given by root entry size) by the size of mini sectors, *4 for + # 32 bits indexes: + nb_minisectors = (self.root.size + self.MiniSectorSize-1) / self.MiniSectorSize + used_size = nb_minisectors * 4 + debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' % + (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors)) + if used_size > stream_size: + # This is not really a problem, but may indicate a wrong implementation: + self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT') + # In any case, first read stream_size: + s = self._open(self.minifatsect, stream_size, force_FAT=True).read() + #[PL] Old code replaced by an array: + #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4)) + self.minifat = self.sect2array(s) + # Then shrink the array to used size, to avoid indexes out of MiniStream: + debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors)) + self.minifat = self.minifat[:nb_minisectors] + debug('loadminifat(): len=%d' % len(self.minifat)) + debug('\nMiniFAT:') + self.dumpfat(self.minifat) + + def getsect(self, sect): + """ + Read given sector from file on disk. + sect: sector index + returns a string containing the sector data. 
+ """ + # [PL] this original code was wrong when sectors are 4KB instead of + # 512 bytes: + #self.fp.seek(512 + self.sectorsize * sect) + #[PL]: added safety checks: + #print "getsect(%X)" % sect + try: + self.fp.seek(self.sectorsize * (sect+1)) + except: + debug('getsect(): sect=%X, seek=%d, filesize=%d' % + (sect, self.sectorsize*(sect+1), self._filesize)) + self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') + sector = self.fp.read(self.sectorsize) + if len(sector) != self.sectorsize: + debug('getsect(): sect=%X, read=%d, sectorsize=%d' % + (sect, len(sector), self.sectorsize)) + self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') + return sector + + + def loaddirectory(self, sect): + """ + Load the directory. + sect: sector index of directory stream. + """ + # The directory is stored in a standard + # substream, independent of its size. + + # open directory stream as a read-only file: + # (stream size is not known in advance) + self.directory_fp = self._open(sect) + + #[PL] to detect malformed documents and avoid DoS attacks, the maximum + # number of directory entries can be calculated: + max_entries = self.directory_fp.size / 128 + debug('loaddirectory: size=%d, max_entries=%d' % + (self.directory_fp.size, max_entries)) + + # Create list of directory entries + #self.direntries = [] + # We start with a list of "None" object + self.direntries = [None] * max_entries +## for sid in xrange(max_entries): +## entry = fp.read(128) +## if not entry: +## break +## self.direntries.append(_OleDirectoryEntry(entry, sid, self)) + # load root entry: + root_entry = self._load_direntry(0) + # Root entry is the first entry: + self.root = self.direntries[0] + # read and build all storage trees, starting from the root: + self.root.build_storage_tree() + + + def _load_direntry (self, sid): + """ + Load a directory entry from the directory. + This method should only be called once for each storage/stream when + loading the directory. 
+ sid: index of storage/stream in the directory.
+ return: a _OleDirectoryEntry object
+ raise: IOError if the entry has already been referenced.
+ """
+ # check if SID is OK:
+ if sid<0 or sid>=len(self.direntries):
+ self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
+ # check if entry was already referenced:
+ if self.direntries[sid] is not None:
+ self._raise_defect(DEFECT_INCORRECT,
+ "double reference for OLE stream/storage")
+ # if exception not raised, return the object
+ return self.direntries[sid]
+ self.directory_fp.seek(sid * 128)
+ entry = self.directory_fp.read(128)
+ self.direntries[sid] = _OleDirectoryEntry(entry, sid, self)
+ return self.direntries[sid]
+
+
+ def dumpdirectory(self):
+ """
+ Dump directory (for debugging only)
+ """
+ self.root.dump()
+
+
+ def _open(self, start, size = 0x7FFFFFFF, force_FAT=False):
+ """
+ Open a stream, either in FAT or MiniFAT according to its size.
+ (openstream helper)
+
+ start: index of first sector
+ size: size of stream (or nothing if size is unknown)
+ force_FAT: if False (default), stream will be opened in FAT or MiniFAT
+ according to size. If True, it will always be opened in FAT.
+ """ + debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' % + (start, size, str(force_FAT))) + # stream size is compared to the MiniSectorCutoff threshold: + if size < self.minisectorcutoff and not force_FAT: + # ministream object + if not self.ministream: + # load MiniFAT if it wasn't already done: + self.loadminifat() + # The first sector index of the miniFAT stream is stored in the + # root directory entry: + size_ministream = self.root.size + debug('Opening MiniStream: sect=%d, size=%d' % + (self.root.isectStart, size_ministream)) + self.ministream = self._open(self.root.isectStart, + size_ministream, force_FAT=True) + return _OleStream(self.ministream, start, size, 0, + self.minisectorsize, self.minifat, + self.ministream.size) + else: + # standard stream + return _OleStream(self.fp, start, size, 512, + self.sectorsize, self.fat, self._filesize) + + + def _list(self, files, prefix, node, streams=True, storages=False): + """ + (listdir helper) + files: list of files to fill in + prefix: current location in storage tree (list of names) + node: current node (_OleDirectoryEntry object) + streams: bool, include streams if True (True by default) - new in v0.26 + storages: bool, include storages if True (False by default) - new in v0.26 + (note: the root storage is never included) + """ + prefix = prefix + [node.name] + for entry in node.kids: + if entry.kids: + # this is a storage + if storages: + # add it to the list + files.append(prefix[1:] + [entry.name]) + # check its kids + self._list(files, prefix, entry, streams, storages) + else: + # this is a stream + if streams: + # add it to the list + files.append(prefix[1:] + [entry.name]) + + + def listdir(self, streams=True, storages=False): + """ + Return a list of streams stored in this file + + streams: bool, include streams if True (True by default) - new in v0.26 + storages: bool, include storages if True (False by default) - new in v0.26 + (note: the root storage is never included) + """ + files = [] + 
self._list(files, [], self.root, streams, storages) + return files + + + def _find(self, filename): + """ + Returns directory entry of given filename. (openstream helper) + Note: this method is case-insensitive. + + filename: path of stream in storage tree (except root entry), either: + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + return: sid of requested filename + raise IOError if file not found + """ + + # if filename is a string instead of a list, split it on slashes to + # convert to a list: + if isinstance(filename, basestring): + filename = filename.split('/') + # walk across storage tree, following given path: + node = self.root + for name in filename: + for kid in node.kids: + if kid.name.lower() == name.lower(): + break + else: + raise IOError, "file not found" + node = kid + return node.sid + + + def openstream(self, filename): + """ + Open a stream as a read-only file object (StringIO). + + filename: path of stream in storage tree (except root entry), either: + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + return: file object (read-only) + raise IOError if filename not found, or if this is not a stream. + """ + sid = self._find(filename) + entry = self.direntries[sid] + if entry.entry_type != STGTY_STREAM: + raise IOError, "this file is not a stream" + return self._open(entry.isectStart, entry.size) + + + def get_type(self, filename): + """ + Test if given filename exists as a stream or a storage in the OLE + container, and return its type. + + filename: path of stream in storage tree. 
(see openstream for syntax) + return: False if object does not exist, its entry type (>0) otherwise: + - STGTY_STREAM: a stream + - STGTY_STORAGE: a storage + - STGTY_ROOT: the root entry + """ + try: + sid = self._find(filename) + entry = self.direntries[sid] + return entry.entry_type + except: + return False + + + def getmtime(self, filename): + """ + Return modification time of a stream/storage. + + filename: path of stream/storage in storage tree. (see openstream for + syntax) + return: None if modification time is null, a python datetime object + otherwise (UTC timezone) + + new in version 0.26 + """ + sid = self._find(filename) + entry = self.direntries[sid] + return entry.getmtime() + + + def getctime(self, filename): + """ + Return creation time of a stream/storage. + + filename: path of stream/storage in storage tree. (see openstream for + syntax) + return: None if creation time is null, a python datetime object + otherwise (UTC timezone) + + new in version 0.26 + """ + sid = self._find(filename) + entry = self.direntries[sid] + return entry.getctime() + + + def exists(self, filename): + """ + Test if given filename exists as a stream or a storage in the OLE + container. + + filename: path of stream in storage tree. (see openstream for syntax) + return: True if object exist, else False. + """ + try: + sid = self._find(filename) + return True + except: + return False + + + def get_size(self, filename): + """ + Return size of a stream in the OLE container, in bytes. + + filename: path of stream in storage tree (see openstream for syntax) + return: size in bytes (long integer) + raise: IOError if file not found, TypeError if this is not a stream. + """ + sid = self._find(filename) + entry = self.direntries[sid] + if entry.entry_type != STGTY_STREAM: + #TODO: Should it return zero instead of raising an exception ? + raise TypeError, 'object is not an OLE stream' + return entry.size + + + def get_rootentry_name(self): + """ + Return root entry name. 
Should usually be 'Root Entry' or 'R' in most + implementations. + """ + return self.root.name + + + def getproperties(self, filename, convert_time=False, no_conversion=None): + """ + Return properties described in substream. + + filename: path of stream in storage tree (see openstream for syntax) + convert_time: bool, if True timestamps will be converted to Python datetime + no_conversion: None or list of int, timestamps not to be converted + (for example total editing time is not a real timestamp) + return: a dictionary of values indexed by id (integer) + """ + # make sure no_conversion is a list, just to simplify code below: + if no_conversion == None: + no_conversion = [] + # stream path as a string to report exceptions: + streampath = filename + if not isinstance(streampath, str): + streampath = '/'.join(streampath) + + fp = self.openstream(filename) + + data = {} + + try: + # header + s = fp.read(28) + clsid = _clsid(s[8:24]) + + # format id + s = fp.read(20) + fmtid = _clsid(s[:16]) + fp.seek(i32(s, 16)) + + # get section + s = "****" + fp.read(i32(fp.read(4))-4) + # number of properties: + num_props = i32(s, 4) + except: + # catch exception while parsing property header, and only raise + # a DEFECT_INCORRECT then return an empty dict, because this is not + # a fatal error when parsing the whole file + exctype, excvalue = sys.exc_info()[:2] + msg = 'Error while parsing properties header in stream %s: %s' % ( + repr(streampath), excvalue) + self._raise_defect(DEFECT_INCORRECT, msg, exctype) + return data + + for i in range(num_props): + try: + id = 0 # just in case of an exception + id = i32(s, 8+i*8) + offset = i32(s, 12+i*8) + type = i32(s, offset) + + debug ('property id=%d: type=%d offset=%X' % (id, type, offset)) + + # test for common types first (should perhaps use + # a dictionary instead?) 
+ + if type == VT_I2: # 16-bit signed integer + value = i16(s, offset+4) + if value >= 32768: + value = value - 65536 + elif type == VT_UI2: # 2-byte unsigned integer + value = i16(s, offset+4) + elif type in (VT_I4, VT_INT, VT_ERROR): + # VT_I4: 32-bit signed integer + # VT_ERROR: HRESULT, similar to 32-bit signed integer, + # see http://msdn.microsoft.com/en-us/library/cc230330.aspx + value = i32(s, offset+4) + elif type in (VT_UI4, VT_UINT): # 4-byte unsigned integer + value = i32(s, offset+4) # FIXME + elif type in (VT_BSTR, VT_LPSTR): + # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx + # size is a 32 bits integer, including the null terminator, and + # possibly trailing or embedded null chars + #TODO: if codepage is unicode, the string should be converted as such + count = i32(s, offset+4) + value = s[offset+8:offset+8+count-1] + # remove all null chars: + value = value.replace('\x00', '') + elif type == VT_BLOB: + # binary large object (BLOB) + # see http://msdn.microsoft.com/en-us/library/dd942282.aspx + count = i32(s, offset+4) + value = s[offset+8:offset+8+count] + elif type == VT_LPWSTR: + # UnicodeString + # see http://msdn.microsoft.com/en-us/library/dd942313.aspx + # "the string should NOT contain embedded or additional trailing + # null characters." + count = i32(s, offset+4) + value = _unicode(s[offset+8:offset+8+count*2]) + elif type == VT_FILETIME: + value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) + # FILETIME is a 64-bit int: "number of 100ns periods + # since Jan 1,1601". 
+ if convert_time and id not in no_conversion:
+ debug('Converting property #%d to python datetime, value=%d=%fs'
+ %(id, value, float(value)/10000000L))
+ # convert FILETIME to Python datetime.datetime
+ # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
+ _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
+ debug('timedelta days=%d' % (value/(10*1000000*3600*24)))
+ value = _FILETIME_null_date + datetime.timedelta(microseconds=value/10)
+ else:
+ # legacy code kept for backward compatibility: returns a
+ # number of seconds since Jan 1,1601
+ value = value / 10000000L # seconds
+ elif type == VT_UI1: # 1-byte unsigned integer
+ value = ord(s[offset+4])
+ elif type == VT_CLSID:
+ value = _clsid(s[offset+4:offset+20])
+ elif type == VT_CF:
+ # PropertyIdentifier or ClipboardData??
+ # see http://msdn.microsoft.com/en-us/library/dd941945.aspx
+ count = i32(s, offset+4)
+ value = s[offset+8:offset+8+count]
+ elif type == VT_BOOL:
+ # VARIANT_BOOL, 16 bits bool, 0x0000=False, 0xFFFF=True
+ # see http://msdn.microsoft.com/en-us/library/cc237864.aspx
+ value = bool(i16(s, offset+4))
+ else:
+ value = None # everything else yields "None"
+ debug ('property id=%d: type=%d not implemented in parser yet' % (id, type))
+
+ # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
+ # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
+ # see http://msdn.microsoft.com/en-us/library/dd942033.aspx
+
+ # FIXME: add support for VT_VECTOR
+ # VT_VECTOR is a 32 uint giving the number of items, followed by
+ # the items in sequence. The VT_VECTOR value is combined with the
+ # type of items, e.g. 
VT_VECTOR|VT_BSTR + # see http://msdn.microsoft.com/en-us/library/dd942011.aspx + + #print "%08x" % id, repr(value), + #print "(%s)" % VT[i32(s, offset) & 0xFFF] + + data[id] = value + except: + # catch exception while parsing each property, and only raise + # a DEFECT_INCORRECT, because parsing can go on + exctype, excvalue = sys.exc_info()[:2] + msg = 'Error while parsing property id %d in stream %s: %s' % ( + id, repr(streampath), excvalue) + self._raise_defect(DEFECT_INCORRECT, msg, exctype) + + return data + + def get_metadata(self): + """ + Parse standard properties streams, return an OleMetadata object + containing all the available metadata. + (also stored in the metadata attribute of the OleFileIO object) + + new in version 0.25 + """ + self.metadata = OleMetadata() + self.metadata.parse_properties(self) + return self.metadata + +# +# -------------------------------------------------------------------- +# This script can be used to dump the directory of any OLE2 structured +# storage file. + +if __name__ == "__main__": + + import sys + + # [PL] display quick usage info if launched from command-line + if len(sys.argv) <= 1: + print __doc__ + print """ +Launched from command line, this script parses OLE files and prints info. + +Usage: OleFileIO_PL.py [-d] [-c] [file2 ...] 
+ +Options: +-d : debug mode (display a lot of debug information, for developers only) +-c : check all streams (for debugging purposes) +""" + sys.exit() + + check_streams = False + for filename in sys.argv[1:]: +## try: + # OPTIONS: + if filename == '-d': + # option to switch debug mode on: + set_debug_mode(True) + continue + if filename == '-c': + # option to switch check streams mode on: + check_streams = True + continue + + ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT) + print "-" * 68 + print filename + print "-" * 68 + ole.dumpdirectory() + for streamname in ole.listdir(): + if streamname[-1][0] == "\005": + print streamname, ": properties" + props = ole.getproperties(streamname, convert_time=True) + props = props.items() + props.sort() + for k, v in props: + #[PL]: avoid to display too large or binary values: + if isinstance(v, basestring): + if len(v) > 50: + v = v[:50] + # quick and dirty binary check: + for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20, + 21,22,23,24,25,26,27,28,29,30,31): + if chr(c) in v: + v = '(binary data)' + break + print " ", k, v + + if check_streams: + # Read all streams to check if there are errors: + print '\nChecking streams...' 
+ for streamname in ole.listdir(): + # print name using repr() to convert binary chars to \xNN: + print '-', repr('/'.join(streamname)),'-', + st_type = ole.get_type(streamname) + if st_type == STGTY_STREAM: + print 'size %d' % ole.get_size(streamname) + # just try to read stream in memory: + ole.openstream(streamname) + else: + print 'NOT a stream : type=%d' % st_type + print '' + +## for streamname in ole.listdir(): +## # print name using repr() to convert binary chars to \xNN: +## print '-', repr('/'.join(streamname)),'-', +## print ole.getmtime(streamname) +## print '' + + print 'Modification/Creation times of all directory entries:' + for entry in ole.direntries: + if entry is not None: + print '- %s: mtime=%s ctime=%s' % (entry.name, + entry.getmtime(), entry.getctime()) + print '' + + # parse and display metadata: + meta = ole.get_metadata() + meta.dump() + print '' + #[PL] Test a few new methods: + root = ole.get_rootentry_name() + print 'Root entry name: "%s"' % root + if ole.exists('worddocument'): + print "This is a Word document." + print "type of stream 'WordDocument':", ole.get_type('worddocument') + print "size :", ole.get_size('worddocument') + if ole.exists('macros/vba'): + print "This document may contain VBA macros." 
+ + # print parsing issues: + print '\nNon-fatal issues raised during parsing:' + if ole.parsing_issues: + for exctype, msg in ole.parsing_issues: + print '- %s: %s' % (exctype.__name__, msg) + else: + print 'None' +## except IOError, v: +## print "***", "cannot read", file, "-", v From 6977b592e37c1d496fa7effebbe096af7c2e7939 Mon Sep 17 00:00:00 2001 From: Martin Panter Date: Fri, 17 Jan 2014 11:58:34 +0000 Subject: [PATCH 030/101] Port more stuff to Python 3, dropping some pre-2.6 support --- PIL/OleFileIO.py | 284 +++++++++++++++++++++-------------------------- 1 file changed, 129 insertions(+), 155 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 5007fd1d7..f258b31f6 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -171,7 +171,7 @@ __version__ = '0.26' # - create a simple OLE explorer with wxPython # FUTURE EVOLUTIONS to add write support: -# 1) add ability to write a stream back on disk from StringIO (same size, no +# 1) add ability to write a stream back on disk from BytesIO (same size, no # change in FAT/MiniFAT). # 2) rename a stream/storage if it doesn't change the RB tree # 3) use rbtree module to update the red-black tree + any rename @@ -223,8 +223,7 @@ __version__ = '0.26' import io import sys from PIL import _binary -from PIL._util import isPath -import struct, array, os.path, sys, datetime +import struct, array, os.path, datetime #[PL] Define explicitly the public API to avoid private objects in pydoc: __all__ = ['OleFileIO', 'isOleFile'] @@ -240,19 +239,13 @@ elif array.array('I').itemsize == 4: # on 64 bits platforms, integers in an array are 32 bits: UINT32 = 'I' else: - raise ValueError, 'Need to fix a bug with 32 bit arrays, please contact author...' 
+ raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...') #[PL] These workarounds were inspired from the Path module # (see http://www.jorendorff.com/articles/python/path/) #TODO: test with old Python versions -# Pre-2.3 workaround for booleans -try: - True, False -except NameError: - True, False = 1, 0 - # Pre-2.3 workaround for basestring. try: basestring @@ -271,7 +264,7 @@ KEEP_UNICODE_NAMES = False # command line to change it. DEBUG_MODE = False def debug_print(msg): - print msg + print(msg) def debug_pass(msg): pass debug = debug_pass @@ -292,15 +285,15 @@ def set_debug_mode(debug_mode): MAGIC = b'\320\317\021\340\241\261\032\341' #[PL]: added constants for Sector IDs (from AAF specifications) -MAXREGSECT = 0xFFFFFFFAL; # maximum SECT -DIFSECT = 0xFFFFFFFCL; # (-4) denotes a DIFAT sector in a FAT -FATSECT = 0xFFFFFFFDL; # (-3) denotes a FAT sector in a FAT -ENDOFCHAIN = 0xFFFFFFFEL; # (-2) end of a virtual stream chain -FREESECT = 0xFFFFFFFFL; # (-1) unallocated sector +MAXREGSECT = 0xFFFFFFFA; # maximum SECT +DIFSECT = 0xFFFFFFFC; # (-4) denotes a DIFAT sector in a FAT +FATSECT = 0xFFFFFFFD; # (-3) denotes a FAT sector in a FAT +ENDOFCHAIN = 0xFFFFFFFE; # (-2) end of a virtual stream chain +FREESECT = 0xFFFFFFFF; # (-1) unallocated sector #[PL]: added constants for Directory Entry IDs (from AAF specifications) -MAXREGSID = 0xFFFFFFFAL; # maximum directory entry ID -NOSTREAM = 0xFFFFFFFFL; # (-1) unallocated directory entry +MAXREGSID = 0xFFFFFFFA; # maximum directory entry ID +NOSTREAM = 0xFFFFFFFF; # (-1) unallocated directory entry #[PL] object types in storage (from AAF specifications) STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) @@ -348,7 +341,7 @@ DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is # impossible #[PL] add useful constants to __all__: -for key in vars().keys(): +for key in list(vars().keys()): if key.startswith('STGTY_') or key.startswith('DEFECT_'): __all__.append(key) 
@@ -380,7 +373,7 @@ def _clsid(clsid): clsid: string of length 16. """ assert len(clsid) == 16 - if clsid == "\0" * len(clsid): + if clsid == bytearray(16): return "" return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + @@ -388,48 +381,30 @@ def _clsid(clsid): -# UNICODE support for Old Python versions: +# UNICODE support: # (necessary to handle storages/streams names which use Unicode) -try: - # is Unicode supported ? - unicode +def _unicode(s, errors='replace'): + """ + Map unicode string to Latin 1. (Python with Unicode support) - def _unicode(s, errors='replace'): - """ - Map unicode string to Latin 1. (Python with Unicode support) - - s: UTF-16LE unicode string to convert to Latin-1 - errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode() - """ - #TODO: test if it OleFileIO works with Unicode strings, instead of - # converting to Latin-1. - try: - # First the string is converted to plain Unicode: - # (assuming it is encoded as UTF-16 little-endian) - u = s.decode('UTF-16LE', errors) - if bytes is not str or KEEP_UNICODE_NAMES: - return u - else: - # Second the unicode string is converted to Latin-1 - return u.encode('latin_1', errors) - except: - # there was an error during Unicode to Latin-1 conversion: - raise IOError, 'incorrect Unicode name' - -except NameError: - def _unicode(s, errors='replace'): - """ - Map unicode string to Latin 1. (Python without native Unicode support) - - s: UTF-16LE unicode string to convert to Latin-1 - errors: 'replace', 'ignore' or 'strict'. (ignored in this version) - """ - # If the unicode function does not exist, we assume this is an old - # Python version without Unicode support. - # Null bytes are simply removed (this only works with usual Latin-1 - # strings which do not contain unicode characters>256): - return filter(ord, s) + s: UTF-16LE unicode string to convert to Latin-1 + errors: 'replace', 'ignore' or 'strict'. 
+ """ + #TODO: test if it OleFileIO works with Unicode strings, instead of + # converting to Latin-1. + try: + # First the string is converted to plain Unicode: + # (assuming it is encoded as UTF-16 little-endian) + u = s.decode('UTF-16LE', errors) + if bytes is not str or KEEP_UNICODE_NAMES: + return u + else: + # Second the unicode string is converted to Latin-1 + return u.encode('latin_1', errors) + except: + # there was an error during Unicode to Latin-1 conversion: + raise IOError('incorrect Unicode name') def filetime2datetime(filetime): @@ -439,8 +414,8 @@ def filetime2datetime(filetime): # TODO: manage exception when microseconds is too large # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) - #debug('timedelta days=%d' % (filetime/(10*1000000*3600*24))) - return _FILETIME_null_date + datetime.timedelta(microseconds=filetime/10) + #debug('timedelta days=%d' % (filetime//(10*1000000*3600*24))) + return _FILETIME_null_date + datetime.timedelta(microseconds=filetime//10) @@ -588,14 +563,14 @@ class OleMetadata: """ Dump all metadata, for debugging purposes. """ - print 'Properties from SummaryInformation stream:' + print('Properties from SummaryInformation stream:') for prop in self.SUMMARY_ATTRIBS: value = getattr(self, prop) - print '- %s: %s' % (prop, repr(value)) - print 'Properties from DocumentSummaryInformation stream:' + print('- %s: %s' % (prop, repr(value))) + print('Properties from DocumentSummaryInformation stream:') for prop in self.DOCSUM_ATTRIBS: value = getattr(self, prop) - print '- %s: %s' % (prop, repr(value)) + print('- %s: %s' % (prop, repr(value))) #--- _OleStream --------------------------------------------------------------- @@ -605,7 +580,7 @@ class _OleStream(io.BytesIO): OLE2 Stream Returns a read-only file object which can be used to read - the contents of a OLE stream (instance of the StringIO class). 
+ the contents of a OLE stream (instance of the BytesIO class). To open a stream, use the openstream method in the OleFile class. This function can be used with either ordinary streams, @@ -631,7 +606,7 @@ class _OleStream(io.BytesIO): sectorsize: size of one sector fat : array/list of sector indexes (FAT or MiniFAT) filesize : size of OLE file (for debugging) - return : a StringIO instance containing the OLE stream + return : a BytesIO instance containing the OLE stream """ debug('_OleStream.__init__:') debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s' @@ -647,12 +622,12 @@ class _OleStream(io.BytesIO): # and we keep a record that size was unknown: unknown_size = True debug(' stream with UNKNOWN SIZE') - nb_sectors = (size + (sectorsize-1)) / sectorsize + nb_sectors = (size + (sectorsize-1)) // sectorsize debug('nb_sectors = %d' % nb_sectors) # This number should (at least) be less than the total number of # sectors in the given FAT: if nb_sectors > len(fat): - raise IOError, 'malformed OLE document, stream too large' + raise IOError('malformed OLE document, stream too large') # optimization(?): data is first a list of strings, and join() is called # at the end to concatenate all in one string. 
# (this may not be really useful with recent Python versions) @@ -660,7 +635,7 @@ class _OleStream(io.BytesIO): # if size is zero, then first sector index should be ENDOFCHAIN: if size == 0 and sect != ENDOFCHAIN: debug('size == 0 and sect != ENDOFCHAIN:') - raise IOError, 'incorrect OLE sector index for empty stream' + raise IOError('incorrect OLE sector index for empty stream') #[PL] A fixed-length for loop is used instead of an undefined while # loop to avoid DoS attacks: for i in xrange(nb_sectors): @@ -671,17 +646,17 @@ class _OleStream(io.BytesIO): else: # else this means that the stream is smaller than declared: debug('sect=ENDOFCHAIN before expected size') - raise IOError, 'incomplete OLE stream' + raise IOError('incomplete OLE stream') # sector index should be within FAT: if sect<0 or sect>=len(fat): debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat))) debug('i=%d / nb_sectors=%d' %(i, nb_sectors)) -## tmp_data = string.join(data, "") +## tmp_data = b"".join(data) ## f = open('test_debug.bin', 'wb') ## f.write(tmp_data) ## f.close() ## debug('data read so far: %d bytes' % len(tmp_data)) - raise IOError, 'incorrect OLE FAT, sector index out of range' + raise IOError('incorrect OLE FAT, sector index out of range') #TODO: merge this code with OleFileIO.getsect() ? 
#TODO: check if this works with 4K sectors: try: @@ -689,7 +664,7 @@ class _OleStream(io.BytesIO): except: debug('sect=%d, seek=%d, filesize=%d' % (sect, offset+sectorsize*sect, filesize)) - raise IOError, 'OLE sector index out of range' + raise IOError('OLE sector index out of range') sector_data = fp.read(sectorsize) # [PL] check if there was enough data: # Note: if sector is the last of the file, sometimes it is not a @@ -699,17 +674,17 @@ class _OleStream(io.BytesIO): debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' % (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data))) debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data))) - raise IOError, 'incomplete OLE sector' + raise IOError('incomplete OLE sector') data.append(sector_data) # jump to next sector in the FAT: try: sect = fat[sect] except IndexError: # [PL] if pointer is out of the FAT an exception is raised - raise IOError, 'incorrect OLE FAT, sector index out of range' + raise IOError('incorrect OLE FAT, sector index out of range') #[PL] Last sector should be a "end of chain" marker: if sect != ENDOFCHAIN: - raise IOError, 'incorrect last sector index in OLE stream' + raise IOError('incorrect last sector index in OLE stream') data = b"".join(data) # Data is truncated to the actual stream size: if len(data) >= size: @@ -723,8 +698,8 @@ class _OleStream(io.BytesIO): else: # read data is less than expected: debug('len(data)=%d, size=%d' % (len(data), size)) - raise IOError, 'OLE stream size is less than declared' - # when all data is read in memory, StringIO constructor is called + raise IOError('OLE stream size is less than declared') + # when all data is read in memory, BytesIO constructor is called io.BytesIO.__init__(self, data) # Then the _OleStream object can be used as a read-only file object. 
@@ -827,10 +802,10 @@ class _OleDirectoryEntry: self.sid_right, self.sid_child)) # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes - # sectors, BUT apparently some implementations set it as 0xFFFFFFFFL, 1 + # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1 # or some other value so it cannot be raised as a defect in general: if olefile.sectorsize == 512: - if sizeHigh != 0 and sizeHigh != 0xFFFFFFFFL: + if sizeHigh != 0 and sizeHigh != 0xFFFFFFFF: debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' % (olefile.sectorsize, sizeLow, sizeHigh, sizeHigh)) olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size') @@ -906,7 +881,7 @@ class _OleDirectoryEntry: self.append_kids(child.sid_left) # Check if its name is not already used (case-insensitive): name_lower = child.name.lower() - if self.kids_dict.has_key(name_lower): + if name_lower in self.kids_dict: self.olefile._raise_defect(DEFECT_INCORRECT, "Duplicate filename in OLE storage") # Then the child_sid _OleDirectoryEntry object is appended to the @@ -1059,7 +1034,7 @@ class OleFileIO: self.fp = filename else: # string-like object: filename of file on disk - #TODO: if larger than 1024 bytes, this could be the actual data => StringIO + #TODO: if larger than 1024 bytes, this could be the actual data => BytesIO self.fp = open(filename, "rb") # old code fails if filename is not a plain string: #if isPath(filename): @@ -1149,10 +1124,10 @@ class OleFileIO: ) = struct.unpack(fmt_header, header1) debug( struct.unpack(fmt_header, header1)) - if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': + if self.Sig != b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': # OLE signature should always be present self._raise_defect(DEFECT_FATAL, "incorrect OLE signature") - if self.clsid != '\x00'*16: + if self.clsid != bytearray(16): # according to AAF specs, CLSID should always be zero self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") debug( "MinorVersion = %d" % self.MinorVersion ) 
@@ -1197,7 +1172,7 @@ class OleFileIO: # calculate the number of sectors in the file # (-1 because header doesn't count) - self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1 + self.nb_sect = ( (filesize + self.SectorSize-1) // self.SectorSize) - 1 debug( "Number of sectors in the file: %d" % self.nb_sect ) # file clsid (probably never used, so we don't store it) @@ -1271,14 +1246,14 @@ class OleFileIO: DIFSECT: "DIFSECT " } nbsect = len(fat) - nlines = (nbsect+VPL-1)/VPL - print "index", + nlines = (nbsect+VPL-1)//VPL + print("index", end=" ") for i in range(VPL): - print ("%8X" % i), - print "" + print("%8X" % i, end=" ") + print() for l in range(nlines): index = l*VPL - print ("%8X:" % (firstindex+index)), + print("%8X:" % (firstindex+index), end=" ") for i in range(index, index+VPL): if i>=nbsect: break @@ -1290,8 +1265,8 @@ class OleFileIO: nom = " --->" else: nom = "%8X" % sect - print nom, - print "" + print(nom, end=" ") + print() def dumpsect(self, sector, firstindex=0): @@ -1301,21 +1276,21 @@ class OleFileIO: VPL=8 # number of values per line (8+1 * 8+1 = 81) tab = array.array(UINT32, sector) nbsect = len(tab) - nlines = (nbsect+VPL-1)/VPL - print "index", + nlines = (nbsect+VPL-1)//VPL + print("index", end=" ") for i in range(VPL): - print ("%8X" % i), - print "" + print("%8X" % i, end=" ") + print() for l in range(nlines): index = l*VPL - print ("%8X:" % (firstindex+index)), + print("%8X:" % (firstindex+index), end=" ") for i in range(index, index+VPL): if i>=nbsect: break sect = tab[i] nom = "%8X" % sect - print nom, - print "" + print(nom, end=" ") + print() def sect2array(self, sect): """ @@ -1345,7 +1320,7 @@ class OleFileIO: self.dumpsect(sect) # The FAT is a sector chain starting at the first index of itself. 
for isect in fat1: - #print "isect = %X" % isect + #print("isect = %X" % isect) if isect == ENDOFCHAIN or isect == FREESECT: # the end of the sector chain has been reached break @@ -1367,7 +1342,7 @@ class OleFileIO: # described by DIF blocks sect = header[76:512] - debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)/4) ) + debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) ) #fat = [] # [PL] FAT is an array of 32 bits unsigned ints, it's more effective # to use an array than a list in Python. @@ -1377,8 +1352,8 @@ class OleFileIO: #self.dumpfat(self.fat) ## for i in range(0, len(sect), 4): ## ix = i32(sect, i) -## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: -## if ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL: +## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF: +## if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF: ## break ## s = self.getsect(ix) ## #fat = fat + [i32(s, i) for i in range(0, len(s), 4)] @@ -1396,10 +1371,10 @@ class OleFileIO: debug( "DIFAT analysis..." 
) # We compute the necessary number of DIFAT sectors : # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) - nb_difat = (self.csectFat-109 + 126)/127 + nb_difat = (self.csectFat-109 + 126)//127 debug( "nb_difat = %d" % nb_difat ) if self.csectDif != nb_difat: - raise IOError, 'incorrect DIFAT' + raise IOError('incorrect DIFAT') isect_difat = self.sectDifStart for i in xrange(nb_difat): debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) @@ -1414,11 +1389,11 @@ class OleFileIO: # checks: if isect_difat not in [ENDOFCHAIN, FREESECT]: # last DIFAT pointer value must be ENDOFCHAIN or FREESECT - raise IOError, 'incorrect end of DIFAT' + raise IOError('incorrect end of DIFAT') ## if len(self.fat) != self.csectFat: ## # FAT should contain csectFat blocks -## print "FAT length: %d instead of %d" % (len(self.fat), self.csectFat) -## raise IOError, 'incorrect DIFAT' +## print("FAT length: %d instead of %d" % (len(self.fat), self.csectFat)) +## raise IOError('incorrect DIFAT') # since FAT is read from fixed-size sectors, it may contain more values # than the actual number of sectors in the file. 
# Keep only the relevant sector indexes: @@ -1444,7 +1419,7 @@ class OleFileIO: # 2) Actually used size is calculated by dividing the MiniStream size # (given by root entry size) by the size of mini sectors, *4 for # 32 bits indexes: - nb_minisectors = (self.root.size + self.MiniSectorSize-1) / self.MiniSectorSize + nb_minisectors = (self.root.size + self.MiniSectorSize-1) // self.MiniSectorSize used_size = nb_minisectors * 4 debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' % (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors)) @@ -1473,7 +1448,7 @@ class OleFileIO: # 512 bytes: #self.fp.seek(512 + self.sectorsize * sect) #[PL]: added safety checks: - #print "getsect(%X)" % sect + #print("getsect(%X)" % sect) try: self.fp.seek(self.sectorsize * (sect+1)) except: @@ -1502,7 +1477,7 @@ class OleFileIO: #[PL] to detect malformed documents and avoid DoS attacks, the maximum # number of directory entries can be calculated: - max_entries = self.directory_fp.size / 128 + max_entries = self.directory_fp.size // 128 debug('loaddirectory: size=%d, max_entries=%d' % (self.directory_fp.size, max_entries)) @@ -1659,7 +1634,7 @@ class OleFileIO: def openstream(self, filename): """ - Open a stream as a read-only file object (StringIO). + Open a stream as a read-only file object (BytesIO). filename: path of stream in storage tree (except root entry), either: - a string using Unix path syntax, for example: @@ -1754,7 +1729,7 @@ class OleFileIO: entry = self.direntries[sid] if entry.entry_type != STGTY_STREAM: #TODO: Should it return zero instead of raising an exception ? 
- raise TypeError, 'object is not an OLE stream' + raise TypeError('object is not an OLE stream') return entry.size @@ -1799,17 +1774,16 @@ class OleFileIO: fp.seek(i32(s, 16)) # get section - s = "****" + fp.read(i32(fp.read(4))-4) + s = b"****" + fp.read(i32(fp.read(4))-4) # number of properties: num_props = i32(s, 4) - except: + except BaseException as exc: # catch exception while parsing property header, and only raise # a DEFECT_INCORRECT then return an empty dict, because this is not # a fatal error when parsing the whole file - exctype, excvalue = sys.exc_info()[:2] msg = 'Error while parsing properties header in stream %s: %s' % ( - repr(streampath), excvalue) - self._raise_defect(DEFECT_INCORRECT, msg, exctype) + repr(streampath), exc) + self._raise_defect(DEFECT_INCORRECT, msg, type(exc)) return data for i in range(num_props): @@ -1845,7 +1819,7 @@ class OleFileIO: count = i32(s, offset+4) value = s[offset+8:offset+8+count-1] # remove all null chars: - value = value.replace('\x00', '') + value = value.replace(b'\x00', b'') elif type == VT_BLOB: # binary large object (BLOB) # see http://msdn.microsoft.com/en-us/library/dd942282.aspx @@ -1864,12 +1838,12 @@ class OleFileIO: # since Jan 1,1601". 
if convert_time and id not in no_conversion: debug('Converting property #%d to python datetime, value=%d=%fs' - %(id, value, float(value)/10000000L)) + %(id, value, float(value)/10000000)) # convert FILETIME to Python datetime.datetime # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) - debug('timedelta days=%d' % (value/(10*1000000*3600*24))) - value = _FILETIME_null_date + datetime.timedelta(microseconds=value/10) + debug('timedelta days=%d' % (value//(10*1000000*3600*24))) + value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10) else: # legacy code kept for backward compatibility: returns a # number of seconds since Jan 1,1601 @@ -1901,17 +1875,16 @@ class OleFileIO: # type of items, e.g. VT_VECTOR|VT_BSTR # see http://msdn.microsoft.com/en-us/library/dd942011.aspx - #print "%08x" % id, repr(value), - #print "(%s)" % VT[i32(s, offset) & 0xFFF] + #print("%08x" % id, repr(value), end=" ") + #print("(%s)" % VT[i32(s, offset) & 0xFFF]) data[id] = value - except: + except BaseException as exc: # catch exception while parsing each property, and only raise # a DEFECT_INCORRECT, because parsing can go on - exctype, excvalue = sys.exc_info()[:2] msg = 'Error while parsing property id %d in stream %s: %s' % ( - id, repr(streampath), excvalue) - self._raise_defect(DEFECT_INCORRECT, msg, exctype) + id, repr(streampath), exc) + self._raise_defect(DEFECT_INCORRECT, msg, type(exc)) return data @@ -1938,8 +1911,8 @@ if __name__ == "__main__": # [PL] display quick usage info if launched from command-line if len(sys.argv) <= 1: - print __doc__ - print """ + print(__doc__) + print(""" Launched from command line, this script parses OLE files and prints info. Usage: OleFileIO_PL.py [-d] [-c] [file2 ...] @@ -1947,7 +1920,7 @@ Usage: OleFileIO_PL.py [-d] [-c] [file2 ...] 
Options: -d : debug mode (display a lot of debug information, for developers only) -c : check all streams (for debugging purposes) -""" +""") sys.exit() check_streams = False @@ -1975,65 +1948,66 @@ Options: props = sorted(props.items()) for k, v in props: #[PL]: avoid to display too large or binary values: - if isinstance(v, basestring): + if isinstance(v, (basestring, bytes)): if len(v) > 50: v = v[:50] + if isinstance(v, bytes): # quick and dirty binary check: for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20, 21,22,23,24,25,26,27,28,29,30,31): - if chr(c) in v: + if c in bytearray(v): v = '(binary data)' break print(" ", k, v) if check_streams: # Read all streams to check if there are errors: - print '\nChecking streams...' + print('\nChecking streams...') for streamname in ole.listdir(): # print name using repr() to convert binary chars to \xNN: - print '-', repr('/'.join(streamname)),'-', + print('-', repr('/'.join(streamname)),'-', end=' ') st_type = ole.get_type(streamname) if st_type == STGTY_STREAM: - print 'size %d' % ole.get_size(streamname) + print('size %d' % ole.get_size(streamname)) # just try to read stream in memory: ole.openstream(streamname) else: - print 'NOT a stream : type=%d' % st_type - print '' + print('NOT a stream : type=%d' % st_type) + print() ## for streamname in ole.listdir(): ## # print name using repr() to convert binary chars to \xNN: -## print '-', repr('/'.join(streamname)),'-', -## print ole.getmtime(streamname) -## print '' +## print('-', repr('/'.join(streamname)),'-', end=' ') +## print(ole.getmtime(streamname)) +## print() - print 'Modification/Creation times of all directory entries:' + print('Modification/Creation times of all directory entries:') for entry in ole.direntries: if entry is not None: - print '- %s: mtime=%s ctime=%s' % (entry.name, - entry.getmtime(), entry.getctime()) - print '' + print('- %s: mtime=%s ctime=%s' % (entry.name, + entry.getmtime(), entry.getctime())) + print() # parse and display metadata: 
meta = ole.get_metadata() meta.dump() - print '' + print() #[PL] Test a few new methods: root = ole.get_rootentry_name() - print 'Root entry name: "%s"' % root + print('Root entry name: "%s"' % root) if ole.exists('worddocument'): - print "This is a Word document." - print "type of stream 'WordDocument':", ole.get_type('worddocument') - print "size :", ole.get_size('worddocument') + print("This is a Word document.") + print("type of stream 'WordDocument':", ole.get_type('worddocument')) + print("size :", ole.get_size('worddocument')) if ole.exists('macros/vba'): - print "This document may contain VBA macros." + print("This document may contain VBA macros.") # print parsing issues: - print '\nNon-fatal issues raised during parsing:' + print('\nNon-fatal issues raised during parsing:') if ole.parsing_issues: for exctype, msg in ole.parsing_issues: - print '- %s: %s' % (exctype.__name__, msg) + print('- %s: %s' % (exctype.__name__, msg)) else: - print 'None' + print('None') ## except IOError as v: ## print("***", "cannot read", file, "-", v) From 5143df8561ccf193afc49b8531b43c5e60e1f0ad Mon Sep 17 00:00:00 2001 From: Martin Panter Date: Sat, 18 Jan 2014 01:39:41 +0000 Subject: [PATCH 031/101] Use range() rather than xrange(), for Python 3 compatibility --- PIL/OleFileIO.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index f258b31f6..108a9693a 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -638,7 +638,7 @@ class _OleStream(io.BytesIO): raise IOError('incorrect OLE sector index for empty stream') #[PL] A fixed-length for loop is used instead of an undefined while # loop to avoid DoS attacks: - for i in xrange(nb_sectors): + for i in range(nb_sectors): # Sector index may be ENDOFCHAIN, but only if size was unknown if sect == ENDOFCHAIN: if unknown_size: @@ -1376,7 +1376,7 @@ class OleFileIO: if self.csectDif != nb_difat: raise IOError('incorrect DIFAT') isect_difat = self.sectDifStart - for i in 
xrange(nb_difat): + for i in range(nb_difat): debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) #TODO: check if corresponding FAT SID = DIFSECT sector_difat = self.getsect(isect_difat) @@ -1485,7 +1485,7 @@ class OleFileIO: #self.direntries = [] # We start with a list of "None" object self.direntries = [None] * max_entries -## for sid in xrange(max_entries): +## for sid in range(max_entries): ## entry = fp.read(128) ## if not entry: ## break From a6fd013a77b0343ddaa4a3651ad558e8219997cd Mon Sep 17 00:00:00 2001 From: Martin Panter Date: Sat, 18 Jan 2014 01:43:01 +0000 Subject: [PATCH 032/101] Implement rich comparison using @total_ordering (2.7+, 3.2+) --- PIL/OleFileIO.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 108a9693a..d13bcb0a9 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -224,6 +224,7 @@ import io import sys from PIL import _binary import struct, array, os.path, datetime +from functools import total_ordering #[PL] Define explicitly the public API to avoid private objects in pydoc: __all__ = ['OleFileIO', 'isOleFile'] @@ -706,6 +707,7 @@ class _OleStream(io.BytesIO): #--- _OleDirectoryEntry ------------------------------------------------------- +@total_ordering class _OleDirectoryEntry: """ @@ -851,7 +853,7 @@ class _OleDirectoryEntry: # in the OLE file, entries are sorted on (length, name). # for convenience, we sort them on name instead: - # (see __cmp__ method in this class) + # (see rich comparison methods in this class) self.kids.sort() @@ -899,11 +901,14 @@ class _OleDirectoryEntry: child.build_storage_tree() - def __cmp__(self, other): + def __eq__(self, other): "Compare entries by name" - return cmp(self.name, other.name) - #TODO: replace by the same function as MS implementation ? 
- # (order by name length first, then case-insensitive order) + return self.name == other.name + def __lt__(self, other): + "Compare entries by name" + return self.name < other.name + #TODO: replace by the same function as MS implementation ? + # (order by name length first, then case-insensitive order) def dump(self, tab = 0): From 704ed762290383b49895f97fa379f65029f40a16 Mon Sep 17 00:00:00 2001 From: Martin Panter Date: Sat, 18 Jan 2014 07:35:50 +0000 Subject: [PATCH 033/101] Update documentation for Python 3 --- PIL/OleFileIO-README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index 3914a11a7..89f493ef2 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ -12,7 +12,7 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. Main improvements over PIL version of OleFileIO: ------------------------------------------------ -- Better compatibility with Python 2.4 up to 2.7 +- Better compatibility with Python 2.7 (also compatible with Python 3.2+) - Support for files larger than 6.8MB - Robust: many checks to detect malformed files - Improved API @@ -57,28 +57,28 @@ Here are a few examples: ole = OleFileIO_PL.OleFileIO('myfile.doc') # Get list of streams: - print ole.listdir() + print(ole.listdir()) # Test if known streams/storages exist: if ole.exists('worddocument'): - print "This is a Word document." - print "size :", ole.get_size('worddocument') + print("This is a Word document.") + print("size :", ole.get_size('worddocument')) if ole.exists('macros/vba'): - print "This document seems to contain VBA macros." 
- + print("This document seems to contain VBA macros.") + # Extract the "Pictures" stream from a PPT file: if ole.exists('Pictures'): pics = ole.openstream('Pictures') data = pics.read() - f = open('Pictures.bin', 'w') + f = open('Pictures.bin', 'wb') f.write(data) f.close() # Extract metadata (new in v0.24) - see source code for all attributes: meta = ole.get_metadata() - print 'Author:', meta.author - print 'Title:', meta.title - print 'Creation date:', meta.create_time + print('Author:', meta.author) + print('Title:', meta.title) + print('Creation date:', meta.create_time) # print all metadata: meta.dump() @@ -87,9 +87,9 @@ Here are a few examples: # Work with a file-like object (e.g. StringIO) instead of a file on disk: data = open('myfile.doc', 'rb').read() - f = StringIO.StringIO(data) + f = io.BytesIO(data) ole = OleFileIO_PL.OleFileIO(f) - print ole.listdir() + print(ole.listdir()) ole.close() From caa609c438a9ed479f69b1749bd7a0b74874b65b Mon Sep 17 00:00:00 2001 From: Martin Panter Date: Fri, 31 Jan 2014 01:32:46 +0000 Subject: [PATCH 034/101] Do away with @functools.total_ordering to restore Python 2.6 support * Manually implement __ne__() and __lt__() * __gt__() and __ge__() not needed due to operator reflection --- PIL/OleFileIO-README.md | 2 +- PIL/OleFileIO.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/PIL/OleFileIO-README.md b/PIL/OleFileIO-README.md index 89f493ef2..f02a548d6 100644 --- a/PIL/OleFileIO-README.md +++ b/PIL/OleFileIO-README.md @@ -12,7 +12,7 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS. 
Main improvements over PIL version of OleFileIO: ------------------------------------------------ -- Better compatibility with Python 2.7 (also compatible with Python 3.2+) +- Better compatibility with Python 2.6 (also compatible with Python 3.0+) - Support for files larger than 6.8MB - Robust: many checks to detect malformed files - Improved API diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index d13bcb0a9..373c830c0 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -224,7 +224,6 @@ import io import sys from PIL import _binary import struct, array, os.path, datetime -from functools import total_ordering #[PL] Define explicitly the public API to avoid private objects in pydoc: __all__ = ['OleFileIO', 'isOleFile'] @@ -707,7 +706,6 @@ class _OleStream(io.BytesIO): #--- _OleDirectoryEntry ------------------------------------------------------- -@total_ordering class _OleDirectoryEntry: """ @@ -909,6 +907,12 @@ class _OleDirectoryEntry: return self.name < other.name #TODO: replace by the same function as MS implementation ? # (order by name length first, then case-insensitive order) + + def __ne__(self, other): + return not self.__eq__(other) + def __le__(self, other): + return self.__eq__(other) or self.__lt__(other) + # Reflected __lt__() and __le__() will be used for __gt__() and __ge__() def dump(self, tab = 0): From 99e7599ce2e670941a3289c7c998cbe0f8e2d079 Mon Sep 17 00:00:00 2001 From: Martin Panter Date: Tue, 4 Feb 2014 04:10:19 +0000 Subject: [PATCH 035/101] OleFileIO: Add MAGIC constant to __all__ This is used by other modules in Pillow. 
--- PIL/OleFileIO.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index 373c830c0..0e2723b19 100644 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -226,7 +226,7 @@ from PIL import _binary import struct, array, os.path, datetime #[PL] Define explicitly the public API to avoid private objects in pydoc: -__all__ = ['OleFileIO', 'isOleFile'] +__all__ = ['OleFileIO', 'isOleFile', 'MAGIC'] if str is not bytes: long = int From 4d321363652c244908bc5a276a3af23ea93de2be Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 4 Feb 2014 20:41:27 -0800 Subject: [PATCH 036/101] Working in py3k, not in 2.x --- Tests/images/high_ascii_chars.png | Bin 0 -> 1457 bytes Tests/test_font_pcf.py | 14 +++++++++++++- _imaging.c | 23 ++++++++++++++++------- 3 files changed, 29 insertions(+), 8 deletions(-) create mode 100644 Tests/images/high_ascii_chars.png diff --git a/Tests/images/high_ascii_chars.png b/Tests/images/high_ascii_chars.png new file mode 100644 index 0000000000000000000000000000000000000000..fc9ab8401a444b0107eb4ea1e95eb681efe90e6f GIT binary patch literal 1457 zcmV;i1y1^jP)9*u32kaq^iO6g&Bajra*#&h`}-eZs1IelhRp_ z(N46-C?pxZvW)m3=<3({ZLMx&A|WzdVg5e+^9yADQNRQx5Rb2+-2Np1FCXcJ82AA< z*WH8)qF#|za80km)t=@MM9d^ytMYTOvBNK{0{Cm+-x~fsS`DH}ZpUnJUnG07R$;y7 zcp`8CcY^&K0hR~shj%9M%a*aUfS@>r_b$=IpjmZ>(jNRMpKL0K{|e z&s#2C53TOxxCX?8``G7J5OKOQ7^iRTrfb8-#k@f@ydfP~TiNb$9lwG&M{M^#)K%E{ z!^dR7^dp4}sPDy{rBO#-erVb3<|8jSuewHk4*^O3VMHn<`fklwhkcgdIu$aahBy7Z!a~;|3cNb6R z!{fU9v30yCe1@s)A;|=FpEFWn3_#{3^;Rg$8bkmP2hD*EjCtVf9uD-r>R=oDLz-)B z)vCVAlJsEx8@7t*E;t9Ugu&S?t6tV8%&mEaMYA)ZG?H2G;4bu!<0_SX2VB6^hiBwq z33stqO22!36)lQ*TO2Q7$|RK2f`RfI&vEaZk#u4GV%(+rm2io%Fp1uQfmnDN_mAI?jH+CmA6CRz z5wg{V1#DyE%an`qXKX~=Us+ksrk4(b%I7WoGT=Yw#^q;_W)f#`))TUsRGBJzR`#bn z(x-OUJM(=5;>t{9V!!M_-a_P_q=>{9C7=6_JckFns7uT_#55utV|0)`srF;PV#oRo z(Ml=tvuuy?%RBy*Tgk)NU?{)vzWF&Yl8+ukaeawe6)eZ10rWZ6L#$2tq4LQZxtXHI zbi*bAm?OZr!ftP}fXhRIb(vbEZNrH2Y;}Hp64=>mg>aSO?!!B{hfkC$)23pQ+*V0v 
zF`v{o!J56O?;uUzHdaJVA}W6819j4|Mj8>8*Gxro++!$;Uzga9wWZGn(51^!${T!K zv5NvIbSV<=4IfRyU|(XtTK6Q?u0~Fw%#ubDACe$F%_>Ev0J22+5SE?)7^`8#{``7w z)Lkx|+Gz|ztZKUV#Anwkbnb=o&(yx#RUx=<3eN4F^S5C(-0fXB#e?N^7keFa`qai+ z2S$-H!VKt`Y|Wm&?{>b%n^fUlq2<&RzJt|5KpZO{Xxg}qObiNVYK(HZ$TPTDfnv59 z+kC<7oFvL+^2fR7l@r`mUHOW*Ks&`RU^*oX?viq=xRsq|9;1;@soM=1N8H7b1{&fk z9~1Ii$j^E)a{OfmfCoS8-B^K;N^E<5yUvMjkAXP2mRHW88}}1(ic~C;?DwRbq<$jq z!h6g*b+a3G3wE*g-LU0J%%;q5A!hABq@w;r+jb|}5at(dZ)uXtk&nRlm(RZhg4uiQ z-#KmE#oHf+N8bhCaQghdej~&EpCFn&yz_Zbv=9}=&!^!hat(f9U+>-3FFb%~_Pzhv zC`HuolE3TU5$(WwO}Y>7Y5Ro-kj?(L)So|}hECwm()O=`WcL36C=s`B2cCKq00000 LNkvXXu0mjf`Kj8w literal 0 HcmV?d00001 diff --git a/Tests/test_font_pcf.py b/Tests/test_font_pcf.py index 60e6e0e26..015885d01 100644 --- a/Tests/test_font_pcf.py +++ b/Tests/test_font_pcf.py @@ -22,10 +22,22 @@ def test_sanity(): font.save(tempname) -def test_draw(): +def xtest_draw(): font = ImageFont.load(tempname) image = Image.new("L", font.getsize(message), "white") draw = ImageDraw.Draw(image) draw.text((0, 0), message, font=font) # assert_signature(image, "7216c60f988dea43a46bb68321e3c1b03ec62aee") + +def test_high_characters(): + + message = "".join([chr(i+1) for i in range(140,232)]) + font = ImageFont.load(tempname) + image = Image.new("L", font.getsize(message), "white") + draw = ImageDraw.Draw(image) + draw.text((0, 0), message, font=font) + + compare = Image.open('Tests/images/high_ascii_chars.png') + + assert_image_equal(image, compare) diff --git a/_imaging.c b/_imaging.c index 078961da4..873617ea2 100644 --- a/_imaging.c +++ b/_imaging.c @@ -2245,24 +2245,28 @@ _font_getmask(ImagingFontObject* self, PyObject* args) Imaging im; Imaging bitmap; int x, b; + int i=0; int status; Glyph* glyph; unsigned char* text; char* mode = ""; - if (!PyArg_ParseTuple(args, "s|s:getmask", &text, &mode)) + if (!PyArg_ParseTuple(args, "es|s:getmask", "latin1", &text, &mode)){ return NULL; + } im = ImagingNew(self->bitmap->mode, textwidth(self, text), self->ysize); - if (!im) + 
if (!im) { + PyMem_Free(text); return NULL; + } b = 0; (void) ImagingFill(im, &b); b = self->baseline; - for (x = 0; *text; text++) { - glyph = &self->glyphs[*text]; + for (x = 0; text[i]; i++) { + glyph = &self->glyphs[text[i]]; bitmap = ImagingCrop( self->bitmap, glyph->sx0, glyph->sy0, glyph->sx1, glyph->sy1 @@ -2279,10 +2283,11 @@ _font_getmask(ImagingFontObject* self, PyObject* args) x = x + glyph->dx; b = b + glyph->dy; } - + PyMem_Free(text); return PyImagingNew(im); failed: + PyMem_Free(text); ImagingDelete(im); return NULL; } @@ -2291,10 +2296,14 @@ static PyObject* _font_getsize(ImagingFontObject* self, PyObject* args) { unsigned char* text; - if (!PyArg_ParseTuple(args, "s:getsize", &text)) + PyObject* retval; + if (!PyArg_ParseTuple(args, "es:getsize", "latin-1", &text)) return NULL; - return Py_BuildValue("ii", textwidth(self, text), self->ysize); + retval = Py_BuildValue("ii", textwidth(self, text), self->ysize); + PyMem_Free(text); + return retval; + } static struct PyMethodDef _font_methods[] = { From cacd63818758627f57f2a7138a734cb0d03880e4 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 4 Feb 2014 21:16:19 -0800 Subject: [PATCH 037/101] Unicode handled as Latin-1 for Py3, Strings handled for Py2. 
--- _imaging.c | 55 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/_imaging.c b/_imaging.c index 873617ea2..e941f765a 100644 --- a/_imaging.c +++ b/_imaging.c @@ -2239,6 +2239,33 @@ textwidth(ImagingFontObject* self, const unsigned char* text) return xsize; } +void _font_text_asBytes(PyObject* encoded_string, unsigned char** text){ + PyObject* bytes; + + if (PyUnicode_CheckExact(encoded_string)){ + bytes = PyUnicode_AsLatin1String(encoded_string); + if (bytes) { + *text = (unsigned char*)PyBytes_AsString(bytes); + } else { + *text = NULL; + } + } else { +#if PY_VERSION_HEX >= 0x03000000 + /* this should always be a unicode if we're in Py3.x */ + *text = NULL; +#else + /* likely case here is py2.x with an ordinary string. + but this isn't defined in Py3.x */ + if (PyString_Check(encoded_string)) { + *text = (unsigned char *)PyString_AsString(encoded_string); + } else { + *text = NULL; + } +#endif + } +} + + static PyObject* _font_getmask(ImagingFontObject* self, PyObject* args) { @@ -2249,15 +2276,22 @@ _font_getmask(ImagingFontObject* self, PyObject* args) int status; Glyph* glyph; + PyObject* encoded_string; + unsigned char* text; char* mode = ""; - if (!PyArg_ParseTuple(args, "es|s:getmask", "latin1", &text, &mode)){ + + if (!PyArg_ParseTuple(args, "O|s:getmask", &encoded_string, &mode)){ + return NULL; + } + + _font_text_asBytes(encoded_string, &text); + if (!text) { return NULL; } im = ImagingNew(self->bitmap->mode, textwidth(self, text), self->ysize); if (!im) { - PyMem_Free(text); return NULL; } @@ -2283,11 +2317,9 @@ _font_getmask(ImagingFontObject* self, PyObject* args) x = x + glyph->dx; b = b + glyph->dy; } - PyMem_Free(text); return PyImagingNew(im); failed: - PyMem_Free(text); ImagingDelete(im); return NULL; } @@ -2296,14 +2328,17 @@ static PyObject* _font_getsize(ImagingFontObject* self, PyObject* args) { unsigned char* text; - PyObject* retval; - if (!PyArg_ParseTuple(args, 
"es:getsize", "latin-1", &text)) + PyObject* encoded_string; + + if (!PyArg_ParseTuple(args, "O:getsize", &encoded_string)) return NULL; - retval = Py_BuildValue("ii", textwidth(self, text), self->ysize); - PyMem_Free(text); - return retval; - + _font_text_asBytes(encoded_string, &text); + if (!text) { + return NULL; + } + + return Py_BuildValue("ii", textwidth(self, text), self->ysize); } static struct PyMethodDef _font_methods[] = { From 540e1e2c68c750934a44b989a708d18d46bee083 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 4 Feb 2014 21:34:24 -0800 Subject: [PATCH 038/101] Added ability to render a bytes object using an old style bitmap font to make it easier to use in Py3k --- Tests/test_font_pcf.py | 14 ++++++++++---- _imaging.c | 36 +++++++++++++++++------------------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/Tests/test_font_pcf.py b/Tests/test_font_pcf.py index 015885d01..bae214e35 100644 --- a/Tests/test_font_pcf.py +++ b/Tests/test_font_pcf.py @@ -30,14 +30,20 @@ def xtest_draw(): draw.text((0, 0), message, font=font) # assert_signature(image, "7216c60f988dea43a46bb68321e3c1b03ec62aee") -def test_high_characters(): - - message = "".join([chr(i+1) for i in range(140,232)]) +def _test_high_characters(message): + font = ImageFont.load(tempname) image = Image.new("L", font.getsize(message), "white") draw = ImageDraw.Draw(image) draw.text((0, 0), message, font=font) compare = Image.open('Tests/images/high_ascii_chars.png') - assert_image_equal(image, compare) + +def test_high_characters(): + message = "".join([chr(i+1) for i in range(140,232)]) + _test_high_characters(message) + # accept bytes instances in Py3. 
+ if bytes is not str: + _test_high_characters(message.encode('latin1')) + diff --git a/_imaging.c b/_imaging.c index e941f765a..f1a181b30 100644 --- a/_imaging.c +++ b/_imaging.c @@ -2240,29 +2240,27 @@ textwidth(ImagingFontObject* self, const unsigned char* text) } void _font_text_asBytes(PyObject* encoded_string, unsigned char** text){ - PyObject* bytes; + PyObject* bytes = NULL; + + *text = NULL; if (PyUnicode_CheckExact(encoded_string)){ bytes = PyUnicode_AsLatin1String(encoded_string); - if (bytes) { - *text = (unsigned char*)PyBytes_AsString(bytes); - } else { - *text = NULL; - } - } else { -#if PY_VERSION_HEX >= 0x03000000 - /* this should always be a unicode if we're in Py3.x */ - *text = NULL; -#else - /* likely case here is py2.x with an ordinary string. - but this isn't defined in Py3.x */ - if (PyString_Check(encoded_string)) { - *text = (unsigned char *)PyString_AsString(encoded_string); - } else { - *text = NULL; - } -#endif + } else if (PyBytes_Check(encoded_string)) { + bytes = encoded_string; } + if (bytes) { + *text = (unsigned char*)PyBytes_AsString(bytes); + return; + } + +#if PY_VERSION_HEX < 0x03000000 + /* likely case here is py2.x with an ordinary string. 
+ but this isn't defined in Py3.x */ + if (PyString_Check(encoded_string)) { + *text = (unsigned char *)PyString_AsString(encoded_string); + } +#endif } From 41e124af0527f45b7a8b60d59d10c9199b7a9ae1 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 4 Feb 2014 21:45:34 -0800 Subject: [PATCH 039/101] Skip CFFI test earlier if it's not installed --- Tests/test_cffi.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Tests/test_cffi.py b/Tests/test_cffi.py index 4065a9e53..1c0d8d31e 100644 --- a/Tests/test_cffi.py +++ b/Tests/test_cffi.py @@ -1,16 +1,15 @@ from tester import * -from PIL import Image, PyAccess - -import test_image_putpixel as put -import test_image_getpixel as get - - - try: import cffi except: skip() + +from PIL import Image, PyAccess + +import test_image_putpixel as put +import test_image_getpixel as get + Image.USE_CFFI_ACCESS = True From 2867d203cfd42d071541b58c2c1a7239f414111d Mon Sep 17 00:00:00 2001 From: "Christian E. Hopps" Date: Tue, 11 Feb 2014 15:05:09 -0800 Subject: [PATCH 040/101] Handle 32bit compiled python on 64bit architecture platforma.processor() will return x86_64 on a 64 bit linux system; however, this it wrong for 32 bit compiled python. By looking at platform.architecture() first it correctly notes the 32bit compile. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0d791c444..a8ff2e762 100644 --- a/setup.py +++ b/setup.py @@ -222,7 +222,7 @@ class pil_build_ext(build_ext): _add_directory(include_dirs, "/usr/X11/include") elif sys.platform.startswith("linux"): - for platform_ in (plat.processor(), plat.architecture()[0]): + for platform_ in (plat.architecture()[0], plat.processor()): if not platform_: continue From 5ecec7db105ba48b20b1d6b8496d7caf49759243 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Sat, 15 Feb 2014 22:41:02 -0800 Subject: [PATCH 041/101] Move image to first xsize*bands bytes, rather than including padding if stride > xsize*bands, fixes #523 --- libImaging/PcxDecode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/libImaging/PcxDecode.c b/libImaging/PcxDecode.c index 04c86cb35..af282cfe4 100644 --- a/libImaging/PcxDecode.c +++ b/libImaging/PcxDecode.c @@ -57,7 +57,16 @@ ImagingPcxDecode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) } if (state->x >= state->bytes) { - + if (state->bytes % state->xsize && state->bytes > state->xsize) { + int bands = state->bytes / state->xsize; + int stride = state->bytes / bands; + int i; + for (i=1; i< bands; i++) { // note -- skipping first band + memmove(&state->buffer[i*state->xsize], + &state->buffer[i*stride], + state->xsize); + } + } /* Got a full line, unpack it */ state->shuffle((UINT8*) im->image[state->y + state->yoff] + state->xoff * im->pixelsize, state->buffer, From 36323d114054e9b4b1a963afe5687de35ad35afc Mon Sep 17 00:00:00 2001 From: wiredfool Date: Sun, 16 Feb 2014 20:58:06 -0800 Subject: [PATCH 042/101] Make ICO files work with the ImageFile.Parser interface, fixes #522 --- PIL/IcoImagePlugin.py | 4 ++++ Tests/images/python.ico | Bin 0 -> 15086 bytes Tests/test_imagefile.py | 7 +++++++ 3 files changed, 11 insertions(+) create mode 100644 Tests/images/python.ico diff --git a/PIL/IcoImagePlugin.py 
b/PIL/IcoImagePlugin.py index 82d33e383..268e93d6c 100644 --- a/PIL/IcoImagePlugin.py +++ b/PIL/IcoImagePlugin.py @@ -222,6 +222,10 @@ class IcoImageFile(ImageFile.ImageFile): self.mode = im.mode self.size = im.size + + def load_seek(self): + # Flage the ImageFile.Parser so that it just does all the decode at the end. + pass # # -------------------------------------------------------------------- diff --git a/Tests/images/python.ico b/Tests/images/python.ico new file mode 100644 index 0000000000000000000000000000000000000000..c9efc5844a2627a8474949724a2aefe4ab2baee4 GIT binary patch literal 15086 zcmcgz3tUxI_P$#EqsLfjrKaN}rF^8R7zl#$e#lE9E2sH4T54qp=Bv_FY)l=~00C>% zF&`;O=A+c|g@6JoRF;_+O)YUI9mCN{y9!+PeBZujUk}%FFE4Tbe#_&%_nf`g+Uv2` z`qsf@YGS&@bnm?;TsxRvy47UzG?`2tI@mscjPIA>yI#Gt&+aBu=mREG7uBIOT=7iM}d`t)*vk<+$7lLzev?;P$@UuUtE$9x=ZJ;KgrrPHgxHf;L zREU_9LdZFeV2(pVxNCgi_W%&(dJNPKbQg&7bOiZ=o_PKjA=Dgxg=ez*=AT>JE$`gg zj@jqdw%TE%=1U6R#J!xW!>URCi7O}gS(c9T5%?VW>Nz3wa|A&aZ}9Or_~`tNZ9ljt z%@^lQ!To*LPYXKo@tXl+#Y8_5*{AKZiKUl6=Nu!z$J3A{8nUQ4dg|tQDEGHK^6=Tr zP|K&&g2kFQ{6*@Eo>}-VCgJD*t{HRIY8mmbYjXsGkAC1o&7tjQ*tR)?#Ah?Z#3yeB ziPe(_inNzKw}$j?^JmNx1qugI{{bO;PQ4OPVzqFN9^j)BWWo2L+3!Y+FW!j|8|;YSI9)bXCj zMTsr5!o`LeAwuoNl0+Z*Jm(;3a|A`65F!9`cpkQv;rY%5sd2k<7sQGkb0fv(zlIC7 z7t1I3iNBBat~Lkm4d$CEk7yr=h1)ibJz&o9fo%5n-?)JS{IdOncdx59A zfM~N$N6vzn4cliADwBILJy>MIE|$IKYgzP?*FrqRXK&FzeBpMJNs|6c7ynOE_iI-& z)B)rVVtnFAV&JLYP`=DPLL@Ww2Kj)vZ^0mckT<9&s3V9z?*?iDYUtqF%|PzGzWi}| z@BH&ydKUd|?oxEM?2)1h(Z0Y^+B)yDxy9zwS$EC;Gy(TTf*3P8fw->?<>8LNqH~i2 zekd;+@I$5W{=QQ5{jO3xey~Dx`>H~?gNF|LD@5DEa`9k(xwtp?N}1b&ZEs;+?hE$< z8ma$?EwRjuK5iA^N39~rTq&IKFyVNmICZ5;Og&L4?%Pu?+!pR+9Pp7712LB;o%$GK1k=#{*n8zwFa#}{GU}q@i62>^?e`@{a_Cs-@+cWeNgt$dXHS6=UHv{pLR~4 zxd(JVs3oZRs9&mtBM(u~MVMv}dLDX$hpw7Uwkxnu{zpM=L5z)@kMnYV+N9nul#g}B z_gUdAv!(=^KYr7{ zEc1;3@e$5E+!LONGuIUl+=oE;h1`cjHeINF=*;*4-+ZW`T-={`rR?79%VxLK?aX(A zG#Ht@{nd5hm8CY={zf&mYII#nfx8u4%#HfgjE`)vC_ 
z9-3ueqHR!T?Qkw8ZJRwvY@HP$J_irX7kE}PXTa~#Twn2!0A0KcdrTEx;ev!xp4ZP6-feUiX)|H1k;IkV_JL#0SVlGsf8RVU=5t4DCG5 z)jo9F_Z!d0o`q@gy9yV^iM=U0JxCrLe3EeiKFPSS`t^Z=`R9k@ePvF+IKf-I|H`#| zm^`G6_ArO-J~A^FzB%ZaRfIU`qDS7@kFkEI{TV}(_ov4T2m7 zU8NpS8+`~pEJaPgoPN{lUT|&$K*{7H8R) z7B6TEdG9G(Fz`vn1zmjpXp*u8=m9*~^l*&_u0ITMQu2TtO6|jd!{w}5yMcHHnTpef zPCtyC@JYr6+5*-NYv}qaKL2uPr_67EI~R79fVKTW%=zj)WPFBC z>IZ9S+ga^AJ}*t|Pak9cD}AiK45;}0b77`r;qaDZgZQlFOuG0C8(fVwS%=zvtUf&K zPl*jGNEvLoEw;OA*+P<|uYdSu|7`ZnkE*Z91w0nNE*)cvNXR~}d zJ;;pwdH0&v2NEMqx=_Ie6Y^UhO*}lA4GvxDODi zX*kG^QxIXeaf5ZvZ6MZRtbsa#h*L18u|D+!1%g6AtWWrv-?Mh^4q~lB+=F)9AX~!o zbsglLm?~>HkH^*>oX|7(%-YBD|C86f=yJ&;g_ld*frH4LGT(B#^`1*5EpsoGwD{ue z)@JLDFS}#T`b0d(oXsCZnHdY+K%BR+hveNm+Z^Qk&2PEB-DuV|H!#?8f!Bzq7O9M z_?((I1oyV45{>gYwFn1SjU#P#K5?5+j#B z4a`Bwa5MNHiMm5C12NoqL+PtVH^>gC2<65L4C50RFu|F8h@=?Js_3{N z9Few2eJX1H;u+vvN(Nxitl@5y3|NQpljoRPPhB6xcdQF}zPZZZ{tv(N-X5TVAo>$+ zQjr%i)7r{_=a@4N);+K0sq2*(FERH=L6pNN|Gh1JO#klaM`j{lEhE;-I+nN!F$LBy zcrIeBhWn0&9$3RlTXN6?aRuVg(x-uMFfQqB=?P^^!2cMR^tMC{j(vhw=-DtXv1f8O zVyoMVLupu_Ia41HZA^boH(izx><+xRRMwxsD2d6eMcu(Vm^C2#tav73+^<43R0j4I zoXOA;XPl7?&2YxuvE|o1@DK~4ozceCaNCp=VkW*xY>)UNYiQPw#Ff~qX0I7HM-cax zGW^?yzZ>cS*baT#Q4hr92ja|?_@{#m-KhuUd-Q2T8IWgC2DddwKLj5^pw=L%gN;eS ziL5(`kFu9boS2vtu{qe1O$OKz`Bzx2Ga3G@Y{@_dVr9-`AfD|^hL*6U`)Ny%;SV`i zEDpI+(=D?i%*1hsF%#z|*1iEahLiy~g7gK{@rr*}GO!QfOong`?`xT?h`4k!WwI{QjX8R5iK85jdT=E}4E&+KGPK!qnYQ4oa8TmG#J{8r%8uX*(vA$y1>_VZ z@4z20#}H5?s4h88nD_Ec!=k=BpBar>x;`?r-)rNaJ*GXtQ6-+N^}6T_)ncmdTyT|N z@SQZ`%*%jvv-d)*nY|D8UsUgb{g>3yeNQKk?)%lG_^xxhPM-Tzl=(`rk|9{@2kHGm z*%JGNe7A-E@Nj|T-#b5<7^v2t(J*l%Ur=`BD!*Wk#2y~|SF(pj8PKrlLhH|X%?oSJ0-wxr46ZsWgqLFy1Qv26BM>f;Oc97&g_vIofimy&^f55AX{ z;Y5b{FZ9mq@z#4!>TStU55&i8da%jho_%I5Y_@A*>W~fWPf0t{`a^Z~1*2S>GO%yM zcLQZF6a7^7kM#Xar2}5aMm;(;^5FRo^ggX*(BorD2KOz;lEHt+{pmxdvB$?gBkkyB z`+|~zI`H{u%hP;^r8Y9SFW>Vb_-|R565W%1QuOE`1MCR>T4_fMw3upeE;NQOB#-uB zJPc(|Q!fKCzF^=k?6)gDcz!GUoW!}CflLR|;!C9rv?HzW&i4--&joolyT~u}xwfj= 
z>f#GvCE^)kOF9`8#;2FTtLS1b?(Gb^9b_s_kDtaqt+XQpU#M+0m}-Jms*MBb@w@8e3#w+*-mzk@%(3p2c4TlasFb4*>^P;)mLkj*rTupuW9>_v#Bk&e+WNxH;9s9#{K8q? z=_0>CeZZWCf5&y*xq2goDa89CD%UQ|cvk$ld#N~CkSTIgpOF|$ed+@3YZTTY9&XRa9M`>(pQQ|`LzdGQBz{rPTpN7_ z^FI=N*^^*C)2RFIO#Zz2(Zg~U#4gX7A6=aNZj>4Li^T}bLw})k{fuBU`XZY$r}!^i z{e~~!qhT)7AihN}g)47^^rcD-HTJtKU^)l}Cj9%FUU Date: Mon, 3 Mar 2014 16:08:09 +0400 Subject: [PATCH 043/101] fix ImageColor.getcolor in cases: - color is rgba, mode is RGB - mode is LA --- PIL/ImageColor.py | 19 ++++++++++--------- Tests/test_imagecolor.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/PIL/ImageColor.py b/PIL/ImageColor.py index c14257151..107df193a 100644 --- a/PIL/ImageColor.py +++ b/PIL/ImageColor.py @@ -117,17 +117,18 @@ def getcolor(color, mode): :return: ``(red, green, blue)`` """ # same as getrgb, but converts the result to the given mode - color = getrgb(color) - if mode == "RGB": - return color - if mode == "RGBA": - if len(color) == 3: - color = (color + (255,)) - r, g, b, a = color - return r, g, b, a + color, alpha = getrgb(color), 255 + if len(color) == 4: + color, alpha = color[0:3], color[3] + if Image.getmodebase(mode) == "L": r, g, b = color - return (r*299 + g*587 + b*114)//1000 + color = (r*299 + g*587 + b*114)//1000 + if mode[-1] == 'A': + return (color, alpha) + else: + if mode[-1] == 'A': + return color + (alpha,) return color colormap = { diff --git a/Tests/test_imagecolor.py b/Tests/test_imagecolor.py index 23f21744a..acdb84213 100644 --- a/Tests/test_imagecolor.py +++ b/Tests/test_imagecolor.py @@ -26,9 +26,25 @@ for color in list(ImageColor.colormap.keys()): assert_equal((0, 0, 0), ImageColor.getcolor("black", "RGB")) assert_equal((255, 255, 255), ImageColor.getcolor("white", "RGB")) +assert_equal((0, 255, 115), ImageColor.getcolor("rgba(0, 255, 115, 33)", "RGB")) +Image.new("RGB", (1, 1), "white") + +assert_equal((0, 0, 0, 255), ImageColor.getcolor("black", "RGBA")) +assert_equal((255, 255, 255, 255), 
ImageColor.getcolor("white", "RGBA")) +assert_equal((0, 255, 115, 33), ImageColor.getcolor("rgba(0, 255, 115, 33)", "RGBA")) +Image.new("RGBA", (1, 1), "white") assert_equal(0, ImageColor.getcolor("black", "L")) assert_equal(255, ImageColor.getcolor("white", "L")) +assert_equal(162, ImageColor.getcolor("rgba(0, 255, 115, 33)", "L")) +Image.new("L", (1, 1), "white") assert_equal(0, ImageColor.getcolor("black", "1")) assert_equal(255, ImageColor.getcolor("white", "1")) +assert_equal(162, ImageColor.getcolor("rgba(0, 255, 115, 33)", "1")) +Image.new("1", (1, 1), "white") + +assert_equal((0, 255), ImageColor.getcolor("black", "LA")) +assert_equal((255, 255), ImageColor.getcolor("white", "LA")) +assert_equal((162, 33), ImageColor.getcolor("rgba(0, 255, 115, 33)", "LA")) +Image.new("LA", (1, 1), "white") From 05cd72f4f00a08d7f0f28ebea6c7cb7bbf375dd8 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 3 Mar 2014 20:28:34 -0800 Subject: [PATCH 044/101] mixed 4/8 tabs+spaces -> 4 spaces, additional bracing --- libImaging/PcxEncode.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/libImaging/PcxEncode.c b/libImaging/PcxEncode.c index 87d599463..8c6272d44 100644 --- a/libImaging/PcxEncode.c +++ b/libImaging/PcxEncode.c @@ -5,7 +5,7 @@ * encoder for PCX data * * history: - * 99-02-07 fl created + * 99-02-07 fl created * * Copyright (c) Fredrik Lundh 1999. * Copyright (c) Secret Labs AB 1999. 
@@ -30,19 +30,18 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) ptr = buf; if (!state->state) { - /* sanity check */ if (state->xsize <= 0 || state->ysize <= 0) { state->errcode = IMAGING_CODEC_END; return 0; } - state->bytes = (state->xsize*state->bits + 7) / 8; + state->bytes = (state->xsize*state->bits + 7) / 8; state->state = FETCH; } - for (;;) + for (;;) { switch (state->state) { case FETCH: @@ -68,7 +67,6 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) /* fall through */ case ENCODE: - /* compress this line */ /* when we arrive here, "count" contains the number of @@ -78,7 +76,6 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) while (state->x < state->bytes) { if (state->count == 63) { - /* this run is full; flush it */ if (bytes < 2) return ptr - buf; @@ -93,7 +90,6 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) this = state->buffer[state->x]; if (this == state->LAST) { - /* extend the current run */ state->x++; state->count++; @@ -102,14 +98,16 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) /* start a new run */ if (state->count == 1 && (state->LAST < 0xc0)) { - if (bytes < 1) + if (bytes < 1) { return ptr - buf; + } *ptr++ = state->LAST; bytes--; } else { if (state->count > 0) { - if (bytes < 2) + if (bytes < 2) { return ptr - buf; + } *ptr++ = 0xc0 | state->count; *ptr++ = state->LAST; bytes -= 2; @@ -126,14 +124,16 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) /* end of line; flush the current run */ if (state->count == 1 && (state->LAST < 0xc0)) { - if (bytes < 1) + if (bytes < 1) { return ptr - buf; + } *ptr++ = state->LAST; bytes--; } else { if (state->count > 0) { - if (bytes < 2) + if (bytes < 2) { return ptr - buf; + } *ptr++ = 0xc0 | state->count; *ptr++ = state->LAST; bytes -= 2; @@ -143,6 +143,7 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, 
UINT8* buf, int bytes) /* read next line */ state->state = FETCH; break; - } + } } + From fe5372c867dfec8171a5737ea56b15badc7c3fe5 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 3 Mar 2014 20:29:14 -0800 Subject: [PATCH 045/101] Debug Loging --- PIL/PcxImagePlugin.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/PIL/PcxImagePlugin.py b/PIL/PcxImagePlugin.py index 42dd9be0b..d90a7cf0d 100644 --- a/PIL/PcxImagePlugin.py +++ b/PIL/PcxImagePlugin.py @@ -55,12 +55,18 @@ class PcxImageFile(ImageFile.ImageFile): bbox = i16(s,4), i16(s,6), i16(s,8)+1, i16(s,10)+1 if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]: raise SyntaxError("bad PCX image size") + if Image.DEBUG: + print ("BBox: %s %s %s %s" % bbox) + # format version = i8(s[1]) bits = i8(s[3]) planes = i8(s[65]) stride = i16(s,66) + if Image.DEBUG: + print ("PCX version %s, bits %s, planes %s, stride %s" % + (version, bits, planes, stride)) self.info["dpi"] = i16(s,12), i16(s,14) @@ -98,7 +104,9 @@ class PcxImageFile(ImageFile.ImageFile): self.size = bbox[2]-bbox[0], bbox[3]-bbox[1] bbox = (0, 0) + self.size - + if Image.DEBUG: + print ("size: %sx%s" % self.size) + self.tile = [("pcx", bbox, self.fp.tell(), (rawmode, planes * stride))] # -------------------------------------------------------------------- From b0f8f498804639768ca5327cf5a45e6a7242fc6a Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 3 Mar 2014 20:43:47 -0800 Subject: [PATCH 046/101] mixed 8 ch tabs + spaces -> spaces --- encode.c | 191 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 98 insertions(+), 93 deletions(-) diff --git a/encode.c b/encode.c index 8be44d8ec..3ceb50731 100644 --- a/encode.c +++ b/encode.c @@ -33,13 +33,13 @@ #endif /* -------------------------------------------------------------------- */ -/* Common */ +/* Common */ /* -------------------------------------------------------------------- */ typedef struct { PyObject_HEAD int (*encode)(Imaging im, ImagingCodecState state, - 
UINT8* buffer, int bytes); + UINT8* buffer, int bytes); struct ImagingCodecStateInstance state; Imaging im; PyObject* lock; @@ -58,21 +58,21 @@ PyImaging_EncoderNew(int contextsize) encoder = PyObject_New(ImagingEncoderObject, &ImagingEncoderType); if (encoder == NULL) - return NULL; + return NULL; /* Clear the encoder state */ memset(&encoder->state, 0, sizeof(encoder->state)); /* Allocate encoder context */ if (contextsize > 0) { - context = (void*) calloc(1, contextsize); - if (!context) { - Py_DECREF(encoder); - (void) PyErr_NoMemory(); - return NULL; - } + context = (void*) calloc(1, contextsize); + if (!context) { + Py_DECREF(encoder); + (void) PyErr_NoMemory(); + return NULL; + } } else - context = 0; + context = 0; /* Initialize encoder context */ encoder->state.context = context; @@ -105,14 +105,14 @@ _encode(ImagingEncoderObject* encoder, PyObject* args) int bufsize = 16384; if (!PyArg_ParseTuple(args, "|i", &bufsize)) - return NULL; + return NULL; buf = PyBytes_FromStringAndSize(NULL, bufsize); if (!buf) - return NULL; + return NULL; status = encoder->encode(encoder->im, &encoder->state, - (UINT8*) PyBytes_AsString(buf), bufsize); + (UINT8*) PyBytes_AsString(buf), bufsize); /* adjust string length to avoid slicing in encoder */ if (_PyBytes_Resize(&buf, (status > 0) ? status : 0) < 0) @@ -138,28 +138,28 @@ _encode_to_file(ImagingEncoderObject* encoder, PyObject* args) int bufsize = 16384; if (!PyArg_ParseTuple(args, "i|i", &fh, &bufsize)) - return NULL; + return NULL; /* Allocate an encoder buffer */ buf = (UINT8*) malloc(bufsize); if (!buf) - return PyErr_NoMemory(); + return PyErr_NoMemory(); ImagingSectionEnter(&cookie); do { - /* This replaces the inner loop in the ImageFile _save - function. */ + /* This replaces the inner loop in the ImageFile _save + function. 
*/ - status = encoder->encode(encoder->im, &encoder->state, buf, bufsize); + status = encoder->encode(encoder->im, &encoder->state, buf, bufsize); - if (status > 0) - if (write(fh, buf, status) < 0) { + if (status > 0) + if (write(fh, buf, status) < 0) { ImagingSectionLeave(&cookie); - free(buf); - return PyErr_SetFromErrno(PyExc_IOError); - } + free(buf); + return PyErr_SetFromErrno(PyExc_IOError); + } } while (encoder->state.errcode == 0); @@ -186,39 +186,39 @@ _setimage(ImagingEncoderObject* encoder, PyObject* args) /* FIXME: should publish the ImagingType descriptor */ if (!PyArg_ParseTuple(args, "O|(iiii)", &op, &x0, &y0, &x1, &y1)) - return NULL; + return NULL; im = PyImaging_AsImaging(op); if (!im) - return NULL; + return NULL; encoder->im = im; state = &encoder->state; if (x0 == 0 && x1 == 0) { - state->xsize = im->xsize; - state->ysize = im->ysize; + state->xsize = im->xsize; + state->ysize = im->ysize; } else { - state->xoff = x0; - state->yoff = y0; - state->xsize = x1 - x0; - state->ysize = y1 - y0; + state->xoff = x0; + state->yoff = y0; + state->xsize = x1 - x0; + state->ysize = y1 - y0; } if (state->xsize <= 0 || - state->xsize + state->xoff > im->xsize || - state->ysize <= 0 || - state->ysize + state->yoff > im->ysize) { - PyErr_SetString(PyExc_SystemError, "tile cannot extend outside image"); - return NULL; + state->xsize + state->xoff > im->xsize || + state->ysize <= 0 || + state->ysize + state->yoff > im->ysize) { + PyErr_SetString(PyExc_SystemError, "tile cannot extend outside image"); + return NULL; } /* Allocate memory buffer (if bits field is set) */ if (state->bits > 0) { - state->bytes = (state->bits * state->xsize+7)/8; - state->buffer = (UINT8*) malloc(state->bytes); - if (!state->buffer) - return PyErr_NoMemory(); + state->bytes = (state->bits * state->xsize+7)/8; + state->buffer = (UINT8*) malloc(state->bytes); + if (!state->buffer) + return PyErr_NoMemory(); } /* Keep a reference to the image object, to make sure it doesn't @@ -239,13 
+239,13 @@ static struct PyMethodDef methods[] = { }; static PyTypeObject ImagingEncoderType = { - PyVarObject_HEAD_INIT(NULL, 0) - "ImagingEncoder", /*tp_name*/ - sizeof(ImagingEncoderObject), /*tp_size*/ - 0, /*tp_itemsize*/ - /* methods */ - (destructor)_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ + PyVarObject_HEAD_INIT(NULL, 0) + "ImagingEncoder", /*tp_name*/ + sizeof(ImagingEncoderObject), /*tp_size*/ + 0, /*tp_itemsize*/ + /* methods */ + (destructor)_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ 0, /*tp_compare*/ @@ -283,9 +283,9 @@ get_packer(ImagingEncoderObject* encoder, const char* mode, pack = ImagingFindPacker(mode, rawmode, &bits); if (!pack) { - Py_DECREF(encoder); - PyErr_SetString(PyExc_SystemError, "unknown raw mode"); - return -1; + Py_DECREF(encoder); + PyErr_SetString(PyExc_SystemError, "unknown raw mode"); + return -1; } encoder->state.shuffle = pack; @@ -296,7 +296,7 @@ get_packer(ImagingEncoderObject* encoder, const char* mode, /* -------------------------------------------------------------------- */ -/* EPS */ +/* EPS */ /* -------------------------------------------------------------------- */ PyObject* @@ -306,7 +306,7 @@ PyImaging_EpsEncoderNew(PyObject* self, PyObject* args) encoder = PyImaging_EncoderNew(0); if (encoder == NULL) - return NULL; + return NULL; encoder->encode = ImagingEpsEncode; @@ -315,7 +315,7 @@ PyImaging_EpsEncoderNew(PyObject* self, PyObject* args) /* -------------------------------------------------------------------- */ -/* GIF */ +/* GIF */ /* -------------------------------------------------------------------- */ PyObject* @@ -328,14 +328,14 @@ PyImaging_GifEncoderNew(PyObject* self, PyObject* args) int bits = 8; int interlace = 0; if (!PyArg_ParseTuple(args, "ss|ii", &mode, &rawmode, &bits, &interlace)) - return NULL; + return NULL; encoder = PyImaging_EncoderNew(sizeof(GIFENCODERSTATE)); if (encoder == NULL) - return NULL; + return NULL; if (get_packer(encoder, mode, rawmode) < 
0) - return NULL; + return NULL; encoder->encode = ImagingGifEncode; @@ -347,7 +347,7 @@ PyImaging_GifEncoderNew(PyObject* self, PyObject* args) /* -------------------------------------------------------------------- */ -/* PCX */ +/* PCX */ /* -------------------------------------------------------------------- */ PyObject* @@ -358,15 +358,19 @@ PyImaging_PcxEncoderNew(PyObject* self, PyObject* args) char *mode; char *rawmode; int bits = 8; - if (!PyArg_ParseTuple(args, "ss|ii", &mode, &rawmode, &bits)) - return NULL; + + if (!PyArg_ParseTuple(args, "ss|ii", &mode, &rawmode, &bits)) { + return NULL; + } encoder = PyImaging_EncoderNew(0); - if (encoder == NULL) - return NULL; + if (encoder == NULL) { + return NULL; + } - if (get_packer(encoder, mode, rawmode) < 0) - return NULL; + if (get_packer(encoder, mode, rawmode) < 0) { + return NULL; + } encoder->encode = ImagingPcxEncode; @@ -375,7 +379,7 @@ PyImaging_PcxEncoderNew(PyObject* self, PyObject* args) /* -------------------------------------------------------------------- */ -/* RAW */ +/* RAW */ /* -------------------------------------------------------------------- */ PyObject* @@ -389,14 +393,14 @@ PyImaging_RawEncoderNew(PyObject* self, PyObject* args) int ystep = 1; if (!PyArg_ParseTuple(args, "ss|ii", &mode, &rawmode, &stride, &ystep)) - return NULL; + return NULL; encoder = PyImaging_EncoderNew(0); if (encoder == NULL) - return NULL; + return NULL; if (get_packer(encoder, mode, rawmode) < 0) - return NULL; + return NULL; encoder->encode = ImagingRawEncode; @@ -408,7 +412,7 @@ PyImaging_RawEncoderNew(PyObject* self, PyObject* args) /* -------------------------------------------------------------------- */ -/* XBM */ +/* XBM */ /* -------------------------------------------------------------------- */ PyObject* @@ -418,10 +422,10 @@ PyImaging_XbmEncoderNew(PyObject* self, PyObject* args) encoder = PyImaging_EncoderNew(0); if (encoder == NULL) - return NULL; + return NULL; if (get_packer(encoder, "1", "1;R") 
< 0) - return NULL; + return NULL; encoder->encode = ImagingXbmEncode; @@ -430,7 +434,7 @@ PyImaging_XbmEncoderNew(PyObject* self, PyObject* args) /* -------------------------------------------------------------------- */ -/* ZIP */ +/* ZIP */ /* -------------------------------------------------------------------- */ #ifdef HAVE_LIBZ @@ -469,16 +473,16 @@ PyImaging_ZipEncoderNew(PyObject* self, PyObject* args) encoder = PyImaging_EncoderNew(sizeof(ZIPSTATE)); if (encoder == NULL) - return NULL; + return NULL; if (get_packer(encoder, mode, rawmode) < 0) - return NULL; + return NULL; encoder->encode = ImagingZipEncode; if (rawmode[0] == 'P') - /* disable filtering */ - ((ZIPSTATE*)encoder->state.context)->mode = ZIP_PNG_PALETTE; + /* disable filtering */ + ((ZIPSTATE*)encoder->state.context)->mode = ZIP_PNG_PALETTE; ((ZIPSTATE*)encoder->state.context)->optimize = optimize; ((ZIPSTATE*)encoder->state.context)->compress_level = compress_level; @@ -492,7 +496,7 @@ PyImaging_ZipEncoderNew(PyObject* self, PyObject* args) /* -------------------------------------------------------------------- */ -/* JPEG */ +/* JPEG */ /* -------------------------------------------------------------------- */ #ifdef HAVE_LIBJPEG @@ -500,15 +504,15 @@ PyImaging_ZipEncoderNew(PyObject* self, PyObject* args) /* We better define this encoder last in this file, so the following undef's won't mess things up for the Imaging library proper. 
*/ -#undef HAVE_PROTOTYPES -#undef HAVE_STDDEF_H -#undef HAVE_STDLIB_H -#undef UINT8 -#undef UINT16 -#undef UINT32 -#undef INT8 -#undef INT16 -#undef INT32 +#undef HAVE_PROTOTYPES +#undef HAVE_STDDEF_H +#undef HAVE_STDLIB_H +#undef UINT8 +#undef UINT16 +#undef UINT32 +#undef INT8 +#undef INT16 +#undef INT32 #include "Jpeg.h" @@ -601,14 +605,14 @@ PyImaging_JpegEncoderNew(PyObject* self, PyObject* args) &progressive, &smooth, &optimize, &streamtype, &xdpi, &ydpi, &subsampling, &qtables, &extra, &extra_size, &rawExif, &rawExifLen)) - return NULL; + return NULL; encoder = PyImaging_EncoderNew(sizeof(JPEGENCODERSTATE)); if (encoder == NULL) - return NULL; + return NULL; if (get_packer(encoder, mode, rawmode) < 0) - return NULL; + return NULL; qarrays = get_qtables_arrays(qtables); @@ -718,11 +722,11 @@ PyImaging_LibTiffEncoderNew(PyObject* self, PyObject* args) return NULL; } - // While failes on 64 bit machines, complains that pos is an int instead of a Py_ssize_t - // while (PyDict_Next(dir, &pos, &key, &value)) { - for (pos=0;pos Date: Mon, 3 Mar 2014 23:03:00 -0800 Subject: [PATCH 047/101] Tests for issue #523 --- Tests/test_file_pcx.py | 43 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/Tests/test_file_pcx.py b/Tests/test_file_pcx.py index 73d358229..cb785ec54 100644 --- a/Tests/test_file_pcx.py +++ b/Tests/test_file_pcx.py @@ -2,29 +2,30 @@ from tester import * from PIL import Image + +def _roundtrip(im): + f = tempfile("temp.pcx") + im.save(f) + im2 = Image.open(f) + + assert_equal(im2.mode, im.mode) + assert_equal(im2.size, im.size) + assert_equal(im2.format, "PCX") + assert_image_equal(im2, im) + def test_sanity(): + for mode in ('1', 'L', 'P', 'RGB'): + _roundtrip(lena(mode)) - file = tempfile("temp.pcx") - - lena("1").save(file) - - im = Image.open(file) - im.load() - assert_equal(im.mode, "1") - assert_equal(im.size, (128, 128)) - assert_equal(im.format, "PCX") - - lena("1").save(file) - im = 
Image.open(file) - - lena("L").save(file) - im = Image.open(file) - - lena("P").save(file) - im = Image.open(file) - - lena("RGB").save(file) - im = Image.open(file) +def test_odd(): + # see issue #523, odd sized images should have a stride that's even. + # not that imagemagick or gimp write pcx that way. + # we were not handling properly. + for mode in ('1', 'L', 'P', 'RGB'): + # larger, odd sized images are better here to ensure that + # we handle interrupted scan lines properly. + _roundtrip(lena(mode).resize((511,511))) + def test_pil184(): # Check reading of files where xmin/xmax is not zero. From 89cb3c7e85fdd98b54f0f93fdee36806dc86993e Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 3 Mar 2014 23:03:36 -0800 Subject: [PATCH 048/101] Use even stride when saving odd sized .pcx files --- PIL/PcxImagePlugin.py | 10 +++ libImaging/PcxEncode.c | 149 ++++++++++++++++++++++++++--------------- 2 files changed, 104 insertions(+), 55 deletions(-) diff --git a/PIL/PcxImagePlugin.py b/PIL/PcxImagePlugin.py index d90a7cf0d..2496af676 100644 --- a/PIL/PcxImagePlugin.py +++ b/PIL/PcxImagePlugin.py @@ -134,6 +134,16 @@ def _save(im, fp, filename, check=0): # bytes per plane stride = (im.size[0] * bits + 7) // 8 + # stride should be even + stride = stride + (stride % 2) + # Stride needs to be kept in sync with the PcxEncode.c version. + # Ideally it should be passed in in the state, but the bytes value + # gets overwritten. + + + if Image.DEBUG: + print ("PcxImagePlugin._save: xwidth: %d, bits: %d, stride: %d" % ( + im.size[0], bits, stride)) # under windows, we could determine the current screen size with # "Image.core.display_mode()[1]", but I think that's overkill... 
diff --git a/libImaging/PcxEncode.c b/libImaging/PcxEncode.c index 8c6272d44..c1f64a33d 100644 --- a/libImaging/PcxEncode.c +++ b/libImaging/PcxEncode.c @@ -26,6 +26,12 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) { UINT8* ptr; int this; + int bytes_per_line = 0; + int padding = 0; + int stride = 0; + int bpp = 0; + int planes = 1; + int i; ptr = buf; @@ -35,12 +41,25 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) state->errcode = IMAGING_CODEC_END; return 0; } - - state->bytes = (state->xsize*state->bits + 7) / 8; state->state = FETCH; - } + bpp = state->bits; + if (state->bits == 24){ + planes = 3; + bpp = 8; + } + + bytes_per_line = (state->xsize*bpp + 7) / 8; + /* The stride here needs to be kept in sync with the version in + PcxImagePlugin.py. If it's not, the header and the body of the + image will be out of sync and bad things will happen on decode. + */ + stride = bytes_per_line + (bytes_per_line % 2); + + padding = stride - bytes_per_line; + + for (;;) { switch (state->state) { @@ -72,74 +91,94 @@ ImagingPcxEncode(Imaging im, ImagingCodecState state, UINT8* buf, int bytes) /* when we arrive here, "count" contains the number of bytes having the value of "LAST" that we've already seen */ + while (state->x < planes * bytes_per_line) { + /* If we're encoding an odd width file, and we've + got more than one plane, we need to pad each + color row with padding bytes at the end. Since + The pixels are stored RRRRRGGGGGBBBBB, so we need + to have the padding be RRRRRPGGGGGPBBBBBP. 
Hence + the double loop + */ + while (state->x % bytes_per_line) { - while (state->x < state->bytes) { - - if (state->count == 63) { - /* this run is full; flush it */ - if (bytes < 2) - return ptr - buf; - *ptr++ = 0xff; - *ptr++ = state->LAST; - bytes -= 2; - - state->count = 0; - - } - - this = state->buffer[state->x]; - - if (this == state->LAST) { - /* extend the current run */ - state->x++; - state->count++; - - } else { - - /* start a new run */ - if (state->count == 1 && (state->LAST < 0xc0)) { - if (bytes < 1) { + if (state->count == 63) { + /* this run is full; flush it */ + if (bytes < 2) return ptr - buf; - } + *ptr++ = 0xff; *ptr++ = state->LAST; - bytes--; + bytes -= 2; + + state->count = 0; + + } + + this = state->buffer[state->x]; + + if (this == state->LAST) { + /* extend the current run */ + state->x++; + state->count++; + } else { - if (state->count > 0) { - if (bytes < 2) { + /* start a new run */ + if (state->count == 1 && (state->LAST < 0xc0)) { + if (bytes < 1) { return ptr - buf; } - *ptr++ = 0xc0 | state->count; *ptr++ = state->LAST; - bytes -= 2; + bytes--; + } else { + if (state->count > 0) { + if (bytes < 2) { + return ptr - buf; + } + *ptr++ = 0xc0 | state->count; + *ptr++ = state->LAST; + bytes -= 2; + } } + + state->LAST = this; + state->count = 1; + + state->x++; + } - - state->LAST = this; - state->count = 1; - - state->x++; - } - } - /* end of line; flush the current run */ - if (state->count == 1 && (state->LAST < 0xc0)) { - if (bytes < 1) { - return ptr - buf; - } - *ptr++ = state->LAST; - bytes--; - } else { - if (state->count > 0) { - if (bytes < 2) { + /* end of line; flush the current run */ + if (state->count == 1 && (state->LAST < 0xc0)) { + if (bytes < 1 + padding) { return ptr - buf; } - *ptr++ = 0xc0 | state->count; *ptr++ = state->LAST; - bytes -= 2; + bytes--; + } else { + if (state->count > 0) { + if (bytes < 2 + padding) { + return ptr - buf; + } + *ptr++ = 0xc0 | state->count; + *ptr++ = state->LAST; + bytes -= 2; + 
} + } + if (bytes < padding) { + return ptr - buf; + } + /* add the padding */ + for (i=0;ix < planes * bytes_per_line) { + state->count = 1; + state->LAST = state->buffer[state->x]; + state->x++; } } - /* read next line */ state->state = FETCH; break; From ad5609d2001ef3486420ebd1964938c1de3ded9c Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 3 Mar 2014 23:09:35 -0800 Subject: [PATCH 049/101] updated changes --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index ae917acb1..b3838b562 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (2014-04-01 est.) ------------------ +- Fixed opening and saving odd sized .pcx files + [wiredfool] + - Fixed DOS with invalid palette size or invalid image size in BMP file [wiredfool] From 93a8bc9bd3e3b7e44df4a0b44ce9fc798d7b0b17 Mon Sep 17 00:00:00 2001 From: David Schmidt Date: Wed, 4 Dec 2013 15:07:36 +0100 Subject: [PATCH 050/101] * fix palette handling for converted gifs * fix gif optimization * better auto convert paramter for gif save --- PIL/GifImagePlugin.py | 12 ++++++++---- PIL/ImagePalette.py | 5 +++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/PIL/GifImagePlugin.py b/PIL/GifImagePlugin.py index aed525824..8882f9beb 100644 --- a/PIL/GifImagePlugin.py +++ b/PIL/GifImagePlugin.py @@ -237,7 +237,10 @@ def _save(im, fp, filename): # convert on the fly (EXPERIMENTAL -- I'm not sure PIL # should automatically convert images on save...) 
if Image.getmodebase(im.mode) == "RGB": - imOut = im.convert("P") + palette_size = 256 + if im.palette: + palette_size = len(im.palette.getdata()[1]) // 3 + imOut = im.convert("P", palette=1, colors=palette_size) rawmode = "P" else: imOut = im.convert("L") @@ -248,9 +251,7 @@ def _save(im, fp, filename): palette = im.encoderinfo["palette"] except KeyError: palette = None - if im.palette: - # use existing if possible - palette = im.palette.getdata()[1] + im.encoderinfo["optimize"] = im.encoderinfo.get("optimize", True) header, usedPaletteColors = getheader(imOut, palette, im.encoderinfo) for s in header: @@ -391,6 +392,9 @@ def getheader(im, palette=None, info=None): for i in range(len(imageBytes)): imageBytes[i] = newPositions[imageBytes[i]] im.frombytes(bytes(imageBytes)) + newPaletteBytes = paletteBytes + (768 - len(paletteBytes)) * b'\x00' + im.putpalette(newPaletteBytes) + im.palette = ImagePalette.ImagePalette("RGB", palette = paletteBytes, size = len(paletteBytes)) if not paletteBytes: paletteBytes = sourcePalette diff --git a/PIL/ImagePalette.py b/PIL/ImagePalette.py index 61affdb19..d5b9d04eb 100644 --- a/PIL/ImagePalette.py +++ b/PIL/ImagePalette.py @@ -23,13 +23,14 @@ from PIL import Image, ImageColor class ImagePalette: "Color palette for palette mapped images" - def __init__(self, mode = "RGB", palette = None): + def __init__(self, mode = "RGB", palette = None, size = 0): self.mode = mode self.rawmode = None # if set, palette contains raw data self.palette = palette or list(range(256))*len(self.mode) self.colors = {} self.dirty = None - if len(self.mode)*256 != len(self.palette): + if ((size == 0 and len(self.mode)*256 != len(self.palette)) or + (size != 0 and size != len(self.palette))): raise ValueError("wrong palette size") def getdata(self): From 232c175bd92320100f891d0d9874cf8230a19b1b Mon Sep 17 00:00:00 2001 From: David Schmidt Date: Wed, 5 Feb 2014 12:49:06 +0100 Subject: [PATCH 051/101] fixes #513 --- PIL/PngImagePlugin.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/PIL/PngImagePlugin.py b/PIL/PngImagePlugin.py index a6038d9f2..30fdfb8ac 100644 --- a/PIL/PngImagePlugin.py +++ b/PIL/PngImagePlugin.py @@ -505,7 +505,7 @@ def _save(im, fp, filename, chunk=putchunk, check=0): else: # check palette contents if im.palette: - colors = len(im.palette.getdata()[1])//3 + colors = max(len(im.im.getpalette("RGB"))//3, 256) else: colors = 256 From 6457eed2cbedfd785df8579933d0ced0f6729df2 Mon Sep 17 00:00:00 2001 From: David Schmidt Date: Wed, 5 Feb 2014 13:49:08 +0100 Subject: [PATCH 052/101] overwrite redundant Image palette with new ImageCore Palette after quantize, fixes #513 --- PIL/Image.py | 5 ++++- PIL/PngImagePlugin.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/PIL/Image.py b/PIL/Image.py index e71e2ad5f..442393a9e 100644 --- a/PIL/Image.py +++ b/PIL/Image.py @@ -734,7 +734,10 @@ class Image: if mode == "P" and palette == ADAPTIVE: im = self.im.quantize(colors) - return self._new(im) + new = self._new(im) + from PIL import ImagePalette + new.palette = ImagePalette.raw("RGB", new.im.getpalette("RGB")) + return new # colorspace conversion if dither is None: diff --git a/PIL/PngImagePlugin.py b/PIL/PngImagePlugin.py index 30fdfb8ac..d72a9c141 100644 --- a/PIL/PngImagePlugin.py +++ b/PIL/PngImagePlugin.py @@ -505,7 +505,7 @@ def _save(im, fp, filename, chunk=putchunk, check=0): else: # check palette contents if im.palette: - colors = max(len(im.im.getpalette("RGB"))//3, 256) + colors = max(min(len(im.palette.getdata()[1])//3, 256), 2) else: colors = 256 From a77ee2d8c56c07ebf97adde4677d821cb7229ae0 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 4 Mar 2014 21:43:11 -0800 Subject: [PATCH 053/101] Fix test failure when optimizing the palette on mode L gifs --- PIL/GifImagePlugin.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/PIL/GifImagePlugin.py b/PIL/GifImagePlugin.py index 8882f9beb..c6d449425 100644 --- a/PIL/GifImagePlugin.py +++ 
b/PIL/GifImagePlugin.py @@ -252,6 +252,12 @@ def _save(im, fp, filename): except KeyError: palette = None im.encoderinfo["optimize"] = im.encoderinfo.get("optimize", True) + if im.encoderinfo["optimize"]: + # When the mode is L, and we optimize, we end up with + # im.mode == P and rawmode = L, which fails. + # If we're optimizing the palette, we're going to be + # in a rawmode of P anyway. + rawmode = 'P' header, usedPaletteColors = getheader(imOut, palette, im.encoderinfo) for s in header: From ae5bcb8e84ba26383dcedeac41f6cf833bcde199 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 4 Mar 2014 22:02:03 -0800 Subject: [PATCH 054/101] Test for issue #513 --- Tests/test_file_gif.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Tests/test_file_gif.py b/Tests/test_file_gif.py index 3a6478e2a..7c4428a44 100644 --- a/Tests/test_file_gif.py +++ b/Tests/test_file_gif.py @@ -46,3 +46,20 @@ def test_roundtrip2(): assert_image_similar(reread.convert('RGB'), lena(), 50) +def test_palette_handling(): + # see https://github.com/python-imaging/Pillow/issues/513 + + im = Image.open('Images/lena.gif') + im = im.convert('RGB') + + im = im.resize((100,100), Image.ANTIALIAS) + im2 = im.convert('P', palette=Image.ADAPTIVE, colors=256) + + f = tempfile('temp.gif') + im2.save(f, optimize=True) + + reloaded = Image.open(f) + + assert_image_similar(im, reloaded.convert('RGB'), 10) + + From 1706e59d1c3ef8f54581948fb1d17b3399455e05 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 4 Mar 2014 22:02:22 -0800 Subject: [PATCH 055/101] cleanup tempfile --- Tests/test_file_gif.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/test_file_gif.py b/Tests/test_file_gif.py index 7c4428a44..2dcd904ff 100644 --- a/Tests/test_file_gif.py +++ b/Tests/test_file_gif.py @@ -38,7 +38,7 @@ def test_roundtrip(): def test_roundtrip2(): #see https://github.com/python-imaging/Pillow/issues/403 - out = 'temp.gif'#tempfile('temp.gif') + out = tempfile('temp.gif') im 
= Image.open('Images/lena.gif') im2 = im.copy() im2.save(out) From 226fd2e8a8f0b1a4f5ce44537ea29bbe17b94ecc Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 4 Mar 2014 22:29:27 -0800 Subject: [PATCH 056/101] Changes updated --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index ae917acb1..b06e1eca5 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (2014-04-01 est.) ------------------ +- Fixed palette handling when converting from mode P->RGB->P + [d_schmidt] + - Fixed DOS with invalid palette size or invalid image size in BMP file [wiredfool] From 0fc225c358a45419b547271ca2b6ad27d544d673 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 4 Mar 2014 22:29:55 -0800 Subject: [PATCH 057/101] test for issue #434 --- Tests/images/test.colors.gif | Bin 0 -> 17855 bytes Tests/test_file_gif.py | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 Tests/images/test.colors.gif diff --git a/Tests/images/test.colors.gif b/Tests/images/test.colors.gif new file mode 100644 index 0000000000000000000000000000000000000000..0faf96760bd07e176449d278325fc3e2feeb2a63 GIT binary patch literal 17855 zcmV)6K*+yGNk%v~VJ8D#0`mX>|Ns900093000030|Nj60{{a60|No)@N4wzw00000 z00000000000000000000EC2ui04D=p0ssX5Fvv-(y*TU5yZ>M)j$~<`XsWJk>%MR- z&vb3yxCPw1-Pb=_a5j_`V@Fu>wRAOGPgZm_RaH}0RQ9wrbxYrzcuX#v&*-#z&2GEj z@VIdFX%FE2n&d<=%(v`D9)kN1u z*+|<;-Avz3;ZWmJ<g(+7?(gvN^7Hid_V@Vt`uqI-{)(fFh*W=w?ydkP&&wCK+uMUyIBiuCAFrck5ebZStlRjE{EUfrrRt3s||iFyr7w&&Qg zXs5zli|{9jnF8XZLpL6o3MN0}CEZxUk{Fh!YR)n+m{y$B-jSo=my2 z<;$2QGrp9zv)WYv42T|0y0q!js8g$6U3zmX00d&oo=v;9?c2C>>!$5m2LJ@Xg9{%{ zytwh>$deE6EfB!o=g^}|7r;EI;_KM6YcEjU$n@{v!&6_iy}bGJ+*f5@&%V8T&73{U z_t}cI{rmXyw_dfozyJRL^8U7pas(D=;DO7nBHn@w;+4c-=tU@@VGA{A;e~M(0pWxk zcBtWdAciO+WLeB-B7Fqur{an%j;3LNFvcjOY#JhH^Z2R}hh9wwV`F zX~wB0K~L6cXJl1Ksb`8;_UWgK1aT?optB(r=AnoRhm@O)-UX+ekRCP=n~hdFoaj7d+Ul}4{=RA}qq622X|2yfi>tJm?%M0N?EMO?XTlOIu6@QLYp#FFGOI3u 
z&Q@DywAjW;t-J4D8?T(&a;vXp-hP`_xZ?(lTDj*Ij9a?vCTv{0^cD&4y-Lp8u*3F3 zjAp+3W{l@2{(?KIz#yMSu)!o3N|3@RFHF$HWIk*$kP@?uq029Gh_S{R_uDa90EZm% z$R(GIaLOq&EVG3)*DPVoM>m+X(&*ic^S(Oo3|7xR3k-D71`kcN!bUgkW5rP0EQHuI zlU?@GbWS~W)mCqXb=Ko<&Gornf4y$lXoL6k+S@VRx8G_5exB5BZwq(aR?ST}v3B2$ zY~Fe^%lF{^(*14toJ3&Gpyrz^hIrzK6zW>gx>B29Sdhvd;j{Km? zFMnwB)Z1nK^@2^GzN65y-@8HD10PEFTzwC|smG6x_sQl<82a8fm-WdnJ)I+k|7hxSHmYVZ)H7v9;SdOy0fLxK@4Og0)?nWPjRt}JozFR^RvV; z!pMncL{JppcSYV+@q>l5T^8#IqC4VICwkoDefsD}00AR9h&)wA&w3sRoVgPx;Qq*| zOnvS!djs{=It`i=U?S8s+FYn@y6Mp3Ky-EzT^mKiX;CU-G^6h+C_6p+QPPA|qyjQX zNxyVb&!x0%D~)JNvG`J$wnwJbq-jk>b5oqssG&PmPERrSQ>Xgb;R%urSp$tF)SCtvWtpzkJGu!G`!Pac9iS?=}^ZL=f;_~Co0?7PNueayzM)4{=3lK?zFdJ z`z^$RyV~KZw7ACI*kT#G6Xjx)x$<;wNuk>`>E4SHO<{{dFl$2V0u{U2eN1}WQWfrY z*RhXf4S2UHUWbzRXy%RAe1W3gOtAN@A&e`9;2U4$cG0`R+%GZvThIT>EWp^Ju5aCI zRR(9*!A68|DkaQH3VYMSEW@z0G#p(H57xuR1#t^SEJ+eq(!|F^@x)ZDE6%2s$Fzd+ zY-Kz_8t>7@e#G%Bb=)u@Yl_9?b@69~99p~D_n`UJ50b~IWK1%dUQW)IkNX+sX;GQ3 zM8=a;C6ZZFTY{2w-$G<)=zIUnbj`5pr{^lCMD<^O_*_&YM#8MGaWcOZyD8lobk?%J;)D?FJ`bFDyGbRDaypqqPv_a+vi5Pky{B`}(cNDV?yK)l>x(9QL=)f8Wq&&I0jhjhGvA=j zA6Da)82m}Y#=lzySq zcSB@)qJj|ihacmoLHoyl$8mla5_`8JfaOGhwPb)Qgn%XzelY}n`PU_ehjS8$Gyte- z**AO`m`fT6K^zz?^|ydFg?}MfeqiH&(RPBamV(x|g3#20_49)A;eo0|gZXEJ0)&I( zrh~j@ffeP0=LCe>BZMBKfHJ6GMi_laxHn43a!fdFPRLSFxKHl~e|05)^Jh(=afQMJ zf+2VtTi8KeICW!qQ9Ss6b?8!MXodqog`Gi!f6{=*5{H&EhkBQXM0JNbh=^LXhi2%9 z{}O|2D2O!lfDmX>hDdt}R&dXzh)c+b{%qxl@CS)#0f^y~g{w!1Ht|_S7;d!~dAGQhuTG7^7s{LNPkwy zhJ&|@N%xFrRF3u5j`yaHZm|gm=0h9uigTn8@OWI`H&e?fgUv`N{D>&)I5yW9Th&O1 z!N`f-Cu|LwR1dj;5lKekIF4uKj~WM%80Q`4sFEBfg}<|n!*h@Lczg#q^aXIhTZUm>rgwt+<$s*^%c4nGYtJ zVWp3fHJCj0luwwKwpW&~rI|10m{o>`^@wFPX_McGlW;hKm4%t#QJ0A^jHou0E+?DS zMw|86jaEXLGJ=}Oq?%&Ln#t#y!UdaCW}F^}oN1_>Rl=Of!kd2On@2*Nc*mK*=$Qx! 
zn)xQ0Ax4^dWtwBU|J;ZSGlG@X1%;nOgKomG-HA z3ff`}3Tz?jks}InC5l@odRi&UTPtdoEn0x)36RouRVtM;8k#jqc}e^Wn9I%0v!UxQj< zg}R`JN`@@?kNz>LaE%Ick6L7rT4R$sVU>DfmkOhqYKNPelARiKpE`7*s%E2lWTjeS zr^w|Zl_x@5a*XT4f=zY3+ndY{AUcg5;x$LeRv znq)maUMstz6cvaQ3af7OrSFu3|c^E?TVzsIIcc zuA=6yZU(P{Ca(xbuY4D+bc(M)daetZr4&Syw>g6pTA^PmNCf+H2K#vSs)FU3hztvl z4vQ83N^1ZMXaY-V7u$FlD|#E-gB{z5AIpa!>p~D4u}oF6l6JDDma?F?vc-q6hN`eq z8nchc{;$L~vc`t9q^7gD#PK$hR`*w-5)o?IySZN4U(UxEP7J)`+=Q$+$+@r_BesA}6{3M!6Jcx!?Ao z6==7rinn&^xmXFh;3v8?N4gScx+aIZ@us@y$hy4Bxo8QyYAL%8S+h3VrE*80mzbdo zWVky>4FVaa9P7G>D!h*wy;@P5G_j>&G_kr_u@9xY0|&f_IgbNoU@hsqO{lLM=(t7x zN4rPoy*LTJnIygz$GbqO1VX94V935M=)PG8zguU!B!|12XTK>6uW!GI#a;7Gp(vcY??p#c?? z>FdC;3Bku{!Y9nVT6V!*m%;o8!_}j}cKN}08N#eO!X%uo#0!$f8?ngyA1)kx((AHF zY_2rSsWyC_IGmk2Ji$E7M?PG5Kn#3BtVUbBpGK^h2T7#<8-xJdb^`o%xmlq(sSrDf ziB&wcSM0G_?5b(ppk3UbU#z`h%z$Hzc?Rry2)uKFETe1OnQd&PZ#;!@40!%ee4A0+ zN@YxUjVyK@OpTFzo|BBGgZ!X{T$_gsSuFg9Q!G`VYNNl_sitB$keyTWvk4Ki_G_0%6fXrBAUv{ z3C*e3$~8#M<`>I}_sE_)%0kM`jOxua3eMXp&b%kh4j9bBe3Hbh#n_y=+C07PEUfUX zrt)}l|YS;LcJTW+<$8=$!(pG`diF#Ey9km#7w->Bzo7A*wK~9 z(D{7Ea1_e!*wr;X*hyU2OYF?vQ`UjV*v^>NtJv41{MQ|NxiAor-!BQ`^UC1R3gEMV9gFa2 z=cg&>&q?PGZs$y}zeFysB)+$7j;(%{CDCNZoz2h-n5;;oCKuA`(*f~+o<9xhj70S1|l=Kieb zs?Fzq3hHip>r9&Ky}9dj+3OBO<-jWJ7ESCWZtS6p?0~B5aLVlA>Fk;5>IWg_68>C=AcP!Q~SZrF!i3MY)hQ@z43t?(7qV9J>;4$ z$gm!}>R!C_t-|#!g7D6x@?P%rE(P{J<5&7N@%@Lj?u`lo(Jk;v|(>h6?E^Mq>icZ>7ut@9?$^Udw) z{R;H9D)gpG^pQ8%WL@>TJW8y_&3k^d#>;M-pg1h_R324 z%xd=`j`xrp<9@C2@6DG!Uy3l_oun_tlns`-ta&GK{zB9jYnis zxnvT49{@r+rBGEu{S=(M#smwH;zflO3MsC!Iw?WPEVAW9??sP?@7xG07`-88v3H z)z2fMe*r}{LYOb#zKT*|Uj7rCr-tRol0bW5w0Wly6^=gYi3dQ<;E4IVO+fH8Kcau=s zkH7K!3vC4S5~M~w_8NQxJ_u!ykG|++!_PYX21KSm10V6OHvu)=u)q%KF;BtKkaN&M z)goMNLJD8A@Iu-$6j2urOJwB3%|g5pM-p}H4nY(XRB=W2T68fn7-MW~Mj9ov@yA*j_gl^3WsdKYBIP+{{PCn_dQz|?ieRR)Acl7g5(gGFqv@;9kNliq9YID(} z8nv{kc{2V56F_1rU6oZRTzwS`P5~wJ)51a(6(Ukgt>{!#H&S)hGa$8f(g2Z7L|Io) z%rz!Yc^%8wU%?4hSaXOiHlt&mB?H<_nLSBdJzKT4&oHM=aaU_|!ZurPw&j*BMZbL$ zJ9IBqcU_WT#YfhD6~PqXIyZHd8dbdR(Eqjsxph-zGV3 
zx&FTfTQH=@&Sq(|c|beuCDvxU3#!Gn`b)ZpAz0(Zuhe^QO2qaX8^8nCVQ|8yE9&s6 z-L_kvTII%4;JQaIBXS`prwFttjNeNPUX$w${?~Lnjz=B?<(J>2dFSK1J^FE`r~ZoGk zu!xx|mT-7y?28s($i*(6;ET=+WAw(Dz%u5Ejde*Q6e74st+?@maEzl)z)(Z9tPg}x z6QK@gk;gazGAn%q&kBn-G(q-{kcE5@B1=(6+}-hgkmQmip+d>W70i@&v7ixQ_diay zXD6N{pduHkHr+WAaaMfWDqVugKVGt9nH=9H3pvNyfwFU=q#h}$y!UG6~1b-8u6|&@7KMg=fiXT5~J7a^)qo*+p&!&YNiiC+Nifc_VUW#+=+k zXX(@#iB`4~6R?b>3F2wCc|NV4hr6eU@);0d_A`I~ykJh= z{or&rB{k_z(`ikgQVys96e_)l$}OW_45{H!s)?AYLZ|vOrBU76RLxh_Y+Y4YS=HB8 zv(we#d^LzB|*_WILnTUlewYbzK){7v}z1zkwD_qID8!7iHSY zp!QaBO=ZqlyDW>c6o#=X=nG`KB-+Nvwr;}hUVa;`%VKtPy6uH)-38mR9oI$4rO9T6bD1b{LD{zc zn5|W2=bYIwSH=)8wU%FY^~h&ED{?R0Og}mclxJXI8I+?u%PH?b&reFTn)BmlpaI%r zgJ$%~ex~H}>Nr<+<`tfsGwHSPnNgS?nWh6qUJ@(r&<;#=J4yX1Q?mxuY^t=SYr$&B z%^HPpHY|?+DQZsLT2Q$*6j0>L3RTw#*s2tE3yHnjErSo)e^Pdtn5~#y)2!FN-r=-; zn{Cr|*~sY5TdiH$Z6tl0N%4I)5~4jDHh;v`nu7I>Cp_o$Hu*U0Ev~%J5^kZ~TT13G zF}sa-ZT>nOlkVOk!3%2eG|{`>(i=Cq^X-v+{(BALUY7VUDgH)_#}eZ{LAa?p?(2{D zBjkHfx1mek(~}P}<(yo3Pg-sfg-0gl{FwRa`VDFd2VCI8b)2JB(DRr%d*iqiGP&W$ za9Q8nt|MQ!vDLY4F~NI9P>;yZhxKwx51l|-Z!Ont?scf0gztP7w+!0%Ssa`***?;f$ z+sF4IgZ~(}PZ;?lZhkJKzpUz~D*I{a{)&OW`jS6mpuY;TKa#^gZPP!z;=i-xI6@~Jyec9)RsT2w?G7!IP9T945UMBxI~<| zMYJ#({H!$uqey(VNo0pX?2tpx#&kHvXW7M5>c!oI z#;ub^MWe5k#j^al#@<%cPNIeqBXp%=D6vwth$clS1;Uj@Z{Dvf?IvmWwU;9Ek z#K_6ZJ%5uk>eEP+PGEG-s)SC%NX@39PLHxqkiyQS(oVt(O-me2L@Q0!J54Af zPxuN>;oQ6K%R#&}1QjG2?kq^Xgg)^6J@M=n>9i{IoGJ9IDfQ$k_OvXOL^^;gP$E>w zJ4?{CTF|0u(4%_Lw~Elsa!*}^Pg;{t>!Z&Bvril1P^|J$tpZWO64BHm(SS73VMEdM zQ_=QfQNeN1wSv*Ml2OW{(cStg-4jsnj7AH!LJTc64dp8!^(!F-EFwKEBdsnQok$$D zHXYqR9xb>ib*w3ctSXhPE6uGe)vqnh$u2FtFKxguy|^(Atuh_0GcBz&b*?uxy#N)> z0Uf{oCPl+1#kD$}tvju)JiVfCD%4e2%`MWL9Bx*1F5e-&7y`%+KVD z0;Md;Wt2X%G{XsH#!!R7*HYDZgVrCD))%AJ7_(L_<5pFCS9X<5dA%-r9Xoq9GkguQ zq*TiOEQJ3AC~b99e?3TVoknm?JaHXYyerrIBu-gqSbv#V(gfJ@Gu4c&*mAts&CC8+ zjSWdA@YuFt%GzvKS0vf{Guf16r`*&{ec;XB)Y#`qSS!<4EaTTU!&XC#*sw%eOpMbH zR9V+!*>QE*`PxtK16ekBSoE^l1jO0B)Y;rTP?wR~-=x~uAlg2Y+2y3!dgR&-WZR!a 
z*?WXq>?2#PG+U%pzI6RQqt&-i-87}0OQz*Tr;S0nT|c`mKD@Ozv|Tl|?M}FTO1CA$ z$el{Cea?b4!o*!Xp+z&RT|TT0n!)|Xz*WrAjYqjn$iwZw&F#F--LuA(w#QY{%Dv4| zjm%~o*d5DUEIimedtFNV+|Uc%tr6JL^~}>9$iki1)qTL$g+1U+wb;eCzWxQdzkRjI zWyRWU&dUu?-c`fr)xPNEHQ}{7;!OjsotrjYT9J*<)ZNJ7^I0F2wDA?U^7XjtmAUKX zx9pwA?fuT~HPG+nL*G5M`c1g$UAWmT)&FJE+ojN!eBYOhU*4QwXGYCvo;M6;O4 z4%-2C#H41ql%BwquE3T?#gO(>hlX5;HsnLLhD5IDtz2q1Y-%{X=3C8XO5Nr)E^9-c zXnDzMpWEsZbZbHU>TaBA(2VKz6ztg?X*oUW9@Xopdu3Se>FRlE?Tu=tB2{cgW^Jx& z%)RM+%xO_%Ye4jCN`&iVmFrxk>sq#JiN0#AB)%~wV=#c_Gw$kk1nhS-?BpD4+Es0< z=In*^Y+3|uNfd2UwC%h!?PW#nW9IG6R%|f^Ze#uwZc-#}TqNynMec4@ZlLaF#C}@m zrc>y?M(MUi>Q=@G<7PS5Z3bp;@Ad5r4exXuZ)7a*Z8UEG>}UX0@5^Ry0d{XFh3|co z?`)*+*R*bl#qNLAZm8z&qHfmj?o|M9NC96*+WtoaXI})TSOuSK2G{KNe(-0FaKfDM zeys3_yl{}t@R8kc|K;xz2Jvup?bl{xA2w(Hv~P#SZ-~|JxW??c_VE6Oag3GmMds@p z9Ok{vXX3%_?d0*&_VLpO@)zGt`DN^~Gjb&)@Zv;q(+z7#b@J?U(wHuI=O^Zu5_a*;&xbX9V=VeN=^s;sIs&J+1E=!@$8?+4bk^ncyY=+5h4rtL^%O7V+tn_O7IM#I$yq#`dJu_AZBWB1UwJ<#p%v zb;SktyhT@H26XRH_oij{S$B6phWC_}_Yty&scX-mh{`iR3l8I;LidS8NXJUhQU5?jGk55mK4^cxOQ9`GA zkw@K>hhK1KWsTp^m#@y4chH%i(VDN(n>YHLSKjV6+?BWEmWSS;KhL56(4sHWqd(Hh zK6!WD`3dfMX#ROAoq7hX`WU@>EX{f?O?s?f`VaQ{8wPuM7JD)!`wlhxAw_#NReLsN z`?Poa8iso>mV1+?dpouJ7sY!k)q6e#{7W`O~rgF)qFkWd{E_kQLXg4?r^il>xY4MNK}0@Wqn3<{aN*VnwI_4 zrv1BSZOg=ckhFKk<$X{7_5ET6{=o+QYZrZyCVf>l{pXH)@zs4iW&T)o{%eK)!-o8J zCjQm-Z$m}>R^@(0_5NW6|9+)@u_k|iHvis!dSBN56lQ-;b^mIGe+UqA(rRx7VFyq? 
z7>XlVnkSkT8rQln9B%+t+c%!;5<9CuFenW5I)fuJsa!If&L=cFD9Sp3D>kd$a=YFy zIDBmZ4CXUBtzNU+?l;^}1Q6=>J3g=9^ZWik!5qRMVWHt6Vxr`;YbWJHLHFf^1C-!IbHn+fZHF&1#D|xy3HO#I0Iy+XbFMPZ_aB;o;{V}ut{=Ref zf2eQa1_nWw%@Zhq3L^<5=n%y?a}p;)G3cTl#*AU!$?NFx4ZV9JNBYUf?GY{FGAXB0EwmCRRvu({ zawS6ZYlp02wSMYqHe%4U=Y|?>i;?5fxbvDqtvk|W)x7=4j1{ZXEWw#zU)b#1@-N_U zXw@=qC3uG0$m-zAt;{Q=-OR-B>g}BDs^Nu*6IXjYSX=0uqe-J$-1v1xwvuPhqHOs# zNV=ML2ma&i`L{~GsSj^u4b$}Lu)|rdHm=Dv?C48O)2_Y|DemmKKk@GV$2aishCnap z*4#NJKnu6T-k7QRNEh8pVU zA#VPO=pBhA{&pgaq{ukqbIA==GsCpq}QSt+Iz?-(C*At%1iB{5>yp%p8LY5n5?kzz z_8}XYvdhi|qO-d}8|_}GX?ti{-4>)ILhtHeZJ>*SyB3Dywu+s(S?ZQ`Su*<0I9Ra(E7h{}} z#v5NYu*b849P*bVlZ-*jPo~Vt%Jv40`QZ1kUhAE@ z-F>qz6X2Sc!+Gbld=5GrqK{ru+@?!Q_ui4O2wX<=l;PDC{uIHRG>2t2hHC} zvvJg9k~JNIO-CYgSA6VaZ2ARA-BnSLa2V(4$|<*V7Ac*9DJJEPsm*P6GjQLGUpU!@ zPsQoeYW#$eKjQh1x>j)=-{36h#qL%tSTF&VhQcpk_O0 z^*(B|kWOu+AuH*FP70273e=((?HT|y>I#_VPNo~9Y0hkVFq|GprxQvk6Ibffi@H>4 zMD3hW1BO(ODV1GJZI4r}0oADXw1YpzCL zWmK&Q+L@`gPOPnzYrO>9(8%@@JQeM8g%(}EP8Va<#nN^0lwBch z*D&2}6LTwS*0Tatx0?fOZ*lQk;Py_q!{r_kjmwDk_T{|`imz1XdsfL-ti1j@FOSl@ zrS+DGz+*A+yg=LB@P!v?<5jDJ#|2>zN%%(;CXj{w(qYLVxU3$aZ+-3igb*WZ#2qTJ zh)z796lVp)ifS=zCL@}rQvA0-2b z$tFVX09SlGDL+=s&ik-Xu`DMoYlqAK@iK6nj1wq-ip<3`^Z3xLQ8ibI&Dn7?kOC}V zMaWq-c>bTAnRn;{>KUGV&XJ#c6Gzj2<8+x_JvJ*BOVk;P^lbp$9aZ-X(~6cg*I=D#Pq!4*QN(Y4vw&-XVG-5D zoxz}82xeLW8*hp3-=Z;SY&0S}*<&DesXsIWXZJ?X(LMyLnUif9UfY7}R^qlL*==tx zyE3}wlDW@qm1#S-+L&99`O4>OZ2SHjhw(0Dz&k)aZwnUO;qK3=$sLG%>w6IC7R|e# zm~ir)NxmDJ0=+xW?&6`-{!O;8j_Et1`?_F$Zim-9@cAnIAjiDQ zqPHsP0h#=BjXU)KE^ft7*T~_|-t)Ak`co0-_rS(h=uDlKxSf2)-H#Hkb+O5(?Ur3X+5dqJ;g8 z0{+ns2o4(rQq>I>Sq?VS4hm8a-UaU&R}jXC2YOf$!rT#ln*aSD1J_RBBS*wqkS=>Q8;5Suwp#`qqGU*{t;v7VdFV!;|Ox2 zZh2!-fa4W)pC(3-G!_vxN`fqUg*wt8JKCT-N>?Dp6fas5G#1i5%FzoZQXY=d9xfm{ z1|O>Y-(T_mmisANL8j6{Vp2jL(n4}lL&nlWc3?zG-$YX6X$@piWh5_c9I00oe2_-NQ zWi=V4Iw2)PDrKHAWturju zIiY1gsij1*B}uwvBc3Em?pFINXe!{(WXh+=1!UBOroZ?sb;9Lrg;IURtaZK5hqC*XHFsKQ^g+I5#Jw* z9CSXIbiNgJCY5#G6m~Y1c8XPlY+QPR)q38Qa~|9D)F)&Z=3%lCXDTK=)?mw$_`zR-C%moO)M_Uge9zn~)+L 
zkzS;cw#}bnSDYbuto-X5__F11sSfr}gq_S6~Uf87`Sf-|BqH5ft z(p;mu-=p5ss7_d^R@kYkSgT&$s_x~k^4+UuU92+PtZqkt`sXIzDvIT5kELpcMd<@R zX()DTH-0Llg(`{Fs);G;BCfUDs{#n4rS-cwB zy~<#{X6MD~VZQoZ!*1QfZdt^>*~DJj_(^HPw&TGXVafvJ!tUh9x?RY&*~kXkAuj1- zZp+3xWX7uJ&WdEmj$X_H-pu}4&F)&su3E*m+NNIP%3@*5=H$yZ<!u^{%Bb)TZ?R%x-})!%ik~O4talc!iZ*X@CT@H!SSl^!qx`zFogFU~=(#BDF3NwCjRu+-rd)lp{j zN~8u4CI`QT2P@tFGF|_M-~R%Q37ee-k0$)KFlM6bt;KNHWp6!-uz`0V?T|CSo5$tr9~O z6LY8&Tjm^FX&tZR9bcd#XW$-Z-WuCo0H@{-6KD?y6#^3`(F`v}ZgL8C@(MC?BU@Z3 zZ=N8_XDK`BDW|e3*90rCQ5iUH8aVK?b-*m|&%mlM2HWkKxMwmq@iK4jC8Ozq&cQDe zuYL+MyMnOJeX{dCGTlaV>msPYm2<~+=N}U?tQxYTBC_CdGvRr2{nfJTwJ{OqGQsY0 zD!*_ExpU>g^X19&20k+-i?j54F**mSItwd1%l@iApB@o2ra%KBJ^$V)Hzz(Tvp#RI z6eHb3D<3TT;6#%lIA^Xwv+xpsacPFJ5@vEwx-t)jG!&+>*j4oX+4C%BG&FCt3wQMG z!L$>~G#!q#9on=I;xs?%bV2j<54-eJ0<|0pwIRy#EZ;ESTr`X-HJMiOZcekFI(1k= zH6coMD-v}lPV!ct^b@CaLWA)_>uW<3>qm262Oz8J(KOu|H5FpDqH49BaR(Q%CV>{)=a7YvgMGVrx4lW5*&=qZMTLu4LnBX)|nG z^J{LmTvt&bo1(4du(z$>~jBPb0g$)_g-}CV{mg5?^<_pbLw%! z>~@1}cSq%Sb7Xi+;dn#lbpva4kL!Jx>v~gdd&}&5k7aznj|=&a15}U` z_mCTRkvH{-qivFt=aR=}lMm(olT&$=7t@rZca^L6mD_ff>+P0HXqOM?mse$&oB5c# zPMPa>dOK~1I=Y({Ztr@v+ZtvL~rAA2S7cJFtWMri;5DD>SuJueJX&w)1CQ z+w~A}`)tCy&%Qd(w)(&aJ1O7(yZz=nEARUnXuCZEd~piA*b+R@8hphgJVq;gAv3)7 z&UO5~d38$sbyB?HT0GfuI&G6XFY~k7fIJwAym^v5dYXLrqCDYZ{K;#4R&)G1dp!QO zIf=?Vh0?r*+B_WNJmPjZs$PV>$GRK#ioRb2pa(k9yXi%^`ynDFPwXmv*>&N=ySdmaChl{ zf^w(+eTF{Ak^bwie(e6E{_7`P;om;)=f3XmKJWLw@BcpV2fy$SKk*m8@gG0(C%^J9 zKl3-g^FKfIN5Aw>KlN9?^G|2zv|2Xx4AzA1UYGyz-GNS@b?QmYy>ee9947W-0nP4 zd7`ORR?E3gbzAhR-9o%ytk|RFc06ZO2ox$wPUEoBgibG9Sc_FUdZ$t{H8_Kg)jMqx zdzR13pxeD`r| z({fb9N72&}rv6hhR+eznl?=%fRnvB_2$Xh`wQrWU*b9|-(YQ2MI5nfHc3BnDOqqDv z&=)p1PV_T-h%!^w=K4;z@{#lf*_s-Qovs3W$T>=QuT|aE$ z$a$;wP~5kJ{xqI^IKQG9|iv5;Ict7z<;~ zayJtWjH&Y_FdeOO8l#7c$`UYXXx1Eh>Dp7Ew^X({S8gdU5?gU99JTaY$D35mfh8!? 
zWl>BnqKyqZ^^eju{@|K~2MDZLrgiyY+S1b?N-lDl*c;aKgWAByxB(uiSmZ}PRelYm zs|PaU{vb^qgPCVXY}LMYBVDc4)*EGD0S{@Ec^WjG%&Pt>oISc15nr_}cT0ShZ|<%Z z7fYtg8MwpSYejPmE?BidX`z>i;;dJ?P3mogM@~kZRrG@xrARHka6D}A=z)qwzFzrr z<0Fs5HC*)_anw%7I+uS8k?g2jPBU-{3DB$a2E435sFLfgk6k`8hd~PC?0bk$v# i-FDr57v6Z~otNHv?Y$S@eD&Rz-+ul57vO0k00296)luI7 literal 0 HcmV?d00001 diff --git a/Tests/test_file_gif.py b/Tests/test_file_gif.py index 2dcd904ff..4318e178e 100644 --- a/Tests/test_file_gif.py +++ b/Tests/test_file_gif.py @@ -46,6 +46,7 @@ def test_roundtrip2(): assert_image_similar(reread.convert('RGB'), lena(), 50) + def test_palette_handling(): # see https://github.com/python-imaging/Pillow/issues/513 @@ -62,4 +63,25 @@ def test_palette_handling(): assert_image_similar(im, reloaded.convert('RGB'), 10) +def test_palette_434(): + # see https://github.com/python-imaging/Pillow/issues/434 + + def roundtrip(im, *args, **kwargs): + out = tempfile('temp.gif') + im.save(out, *args, **kwargs) + reloaded = Image.open(out) + + return [im, reloaded] + + orig = "Tests/images/test.colors.gif" + im = Image.open(orig) + + assert_image_equal(*roundtrip(im)) + assert_image_equal(*roundtrip(im, optimize=True)) + + im = im.convert("RGB") + # check automatic P conversion + reloaded = roundtrip(im)[1].convert('RGB') + assert_image_equal(im, reloaded) + From 1e331e3e6a40141ca8eee4f5da9f74e895423b66 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Fri, 14 Mar 2014 15:56:41 -0700 Subject: [PATCH 058/101] Removed tempfile.mktemp, fixes CVE-2014-1932 CVE-2014-1933, debian bug #737059 --- PIL/EpsImagePlugin.py | 6 ++++-- PIL/Image.py | 9 ++++++--- PIL/IptcImagePlugin.py | 4 ++-- PIL/JpegImagePlugin.py | 12 ++++++++---- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/PIL/EpsImagePlugin.py b/PIL/EpsImagePlugin.py index 94f3e27f4..88686343a 100644 --- a/PIL/EpsImagePlugin.py +++ b/PIL/EpsImagePlugin.py @@ -67,8 +67,10 @@ def Ghostscript(tile, size, fp, scale=1): import tempfile, os, subprocess - outfile = 
tempfile.mktemp() - infile = tempfile.mktemp() + out_fd, outfile = tempfile.mkstemp() + os.close(out_fd) + in_fd, infile = tempfile.mkstemp() + os.close(in_fd) with open(infile, 'wb') as f: fp.seek(offset) diff --git a/PIL/Image.py b/PIL/Image.py index b93ce24a4..0d8a235eb 100644 --- a/PIL/Image.py +++ b/PIL/Image.py @@ -504,14 +504,17 @@ class Image: self.readonly = 0 def _dump(self, file=None, format=None): - import tempfile + import tempfile, os if not file: - file = tempfile.mktemp() + f, file = tempfile.mkstemp(format or '') + os.close(f) + self.load() if not format or format == "PPM": self.im.save_ppm(file) else: - file = file + "." + format + if file.endswith(format): + file = file + "." + format self.save(file, format) return file diff --git a/PIL/IptcImagePlugin.py b/PIL/IptcImagePlugin.py index 157b73509..104153002 100644 --- a/PIL/IptcImagePlugin.py +++ b/PIL/IptcImagePlugin.py @@ -172,8 +172,8 @@ class IptcImageFile(ImageFile.ImageFile): self.fp.seek(offset) # Copy image data to temporary file - outfile = tempfile.mktemp() - o = open(outfile, "wb") + o_fd, outfile = tempfile.mkstemp(text=False) + o = os.fdopen(o_fd) if encoding == "raw": # To simplify access to the extracted file, # prepend a PPM header diff --git a/PIL/JpegImagePlugin.py b/PIL/JpegImagePlugin.py index 9563f9723..07a09232c 100644 --- a/PIL/JpegImagePlugin.py +++ b/PIL/JpegImagePlugin.py @@ -344,13 +344,17 @@ class JpegImageFile(ImageFile.ImageFile): # ALTERNATIVE: handle JPEGs via the IJG command line utilities import tempfile, os - file = tempfile.mktemp() - os.system("djpeg %s >%s" % (self.filename, file)) + f, path = tempfile.mkstemp() + os.close(f) + if os.path.exists(self.filename): + os.system("djpeg '%s' >'%s'" % (self.filename, path)) + else: + raise ValueError("Invalid Filename") try: - self.im = Image.core.open_ppm(file) + self.im = Image.core.open_ppm(path) finally: - try: os.unlink(file) + try: os.unlink(path) except: pass self.mode = self.im.mode From 
8f9e4854704b52fe9a8c812363ca5bb9f9518b6a Mon Sep 17 00:00:00 2001 From: wiredfool Date: Fri, 14 Mar 2014 16:56:07 -0700 Subject: [PATCH 059/101] Updated Changes.rst [ci skip] --- CHANGES.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index abdf73b33..328772d7b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -45,6 +45,12 @@ Changelog (Pillow) - Prefer homebrew freetype over X11 freetype (but still allow both) [dmckeone] + +2.3.1 (2014-03-14) +------------------ + +- Fix insecure use of tempfile.mktemp (CVE-2014-1932 CVE-2014-1933) + [wiredfool]  2.3.0 (2014-01-01) ------------------ From 94ec5a8cdae954df3551f91c04df7fc8890d7d5c Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 08:28:02 -0400 Subject: [PATCH 060/101] Clean up --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 328772d7b..389766456 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -50,7 +50,7 @@ Changelog (Pillow) ------------------ - Fix insecure use of tempfile.mktemp (CVE-2014-1932 CVE-2014-1933) - [wiredfool]  + [wiredfool] 2.3.0 (2014-01-01) ------------------ From 320308c4ce3fbaea5854be364e6bba810baa19a6 Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 08:28:38 -0400 Subject: [PATCH 061/101] Add history --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 389766456..79a5aff8e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (unreleased) ------------------ +- Skip CFFI test earlier if it's not installed + [wiredfool] + - Fixed saving mode P image as a PNG with transparency = palette color 0 [d-schmidt] From 5f5c35813eabb76cc896ad911590a0368e1b8875 Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 08:30:34 -0400 Subject: [PATCH 062/101] Add history --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 79a5aff8e..29397ad7d 100644 
--- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (unreleased) ------------------ +- Fix support for characters >128 using .pcf or .pil fonts in Py3k. Fixes #505 + [wiredfool] + - Skip CFFI test earlier if it's not installed [wiredfool] From 2ce0d7e7d5b72770a032a11611878f53ca042d45 Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 08:45:33 -0400 Subject: [PATCH 063/101] Fix manifest --- MANIFEST.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index 3769b645e..3dba3ea47 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -27,8 +27,10 @@ recursive-include Sane README recursive-include Scripts *.py recursive-include Scripts README recursive-include Tests *.bin +recursive-include Tests *.bmp recursive-include Tests *.eps recursive-include Tests *.gnuplot +recursive-include Tests *.html recursive-include Tests *.icm recursive-include Tests *.jpg recursive-include Tests *.pcf From 7dbca485ae691a71afbc80a52d1f6ac01a2d484d Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 09:40:25 -0400 Subject: [PATCH 064/101] Fix error(s) --- CHANGES.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.rst b/CHANGES.rst index f6d2c3807..d36ca4ce8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -8,6 +8,7 @@ Changelog (Pillow) [wiredfool] - Skip CFFI test earlier if it's not installed + [wiredfool] - Fixed opening and saving odd sized .pcx files, fixes #523 [wiredfool] From beb80ee0bb2f4aa1aad3ce2d2b77404c4dde7181 Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 10:12:18 -0400 Subject: [PATCH 065/101] Add history --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 6a9712da3..9befc32c7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (unreleased) ------------------ +- Handle 32bit compiled python on 64bit architecture + [choppsv1] + - Fix support for characters >128 using .pcf or .pil fonts in Py3k. 
Fixes #505 [wiredfool] From 55a8de3b367a19e3f5f362ba4dcb8a88fd65f4d7 Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 10:14:35 -0400 Subject: [PATCH 066/101] Add history --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 9befc32c7..cb0820ad6 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (unreleased) ------------------ +- Make ICO files work with the ImageFile.Parser interface, fixes #522 + [wiredfool] + - Handle 32bit compiled python on 64bit architecture [choppsv1] From b769d06de675a5cb014daa2b8edfd66b0da88bda Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 10:21:49 -0400 Subject: [PATCH 067/101] Add history --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index cb0820ad6..b3cf969d6 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (unreleased) ------------------ +- Fix ImageColor.getcolor + [homm] + - Make ICO files work with the ImageFile.Parser interface, fixes #522 [wiredfool] From 11ab4018dfb6342745b8e4f1f6621f0ac4c1ca3d Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 10:41:53 -0400 Subject: [PATCH 068/101] Add history --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index b3cf969d6..2f9a50efe 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (unreleased) ------------------ +- Merge from Philippe Lagadec’s OleFileIO_PL fork + [vadmium] + - Fix ImageColor.getcolor [homm] From d2dfe3b13737a7c5648717cf05a24bdb718059cf Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 10:49:39 -0400 Subject: [PATCH 069/101] Fix manifest --- MANIFEST.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index 3dba3ea47..09f265250 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -19,6 +19,7 @@ recursive-include Images *.psd recursive-include Images 
*.tar recursive-include Images *.webp recursive-include Images *.xpm +recursive-include PIL *.md recursive-include Sane *.c recursive-include Sane *.py recursive-include Sane *.txt @@ -29,9 +30,11 @@ recursive-include Scripts README recursive-include Tests *.bin recursive-include Tests *.bmp recursive-include Tests *.eps +recursive-include Tests *.gif recursive-include Tests *.gnuplot recursive-include Tests *.html recursive-include Tests *.icm +recursive-include Tests *.ico recursive-include Tests *.jpg recursive-include Tests *.pcf recursive-include Tests *.pcx From 1c4dc75fabb34cac3c18d7aa0c0e9b74fd740cd9 Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 10:50:58 -0400 Subject: [PATCH 070/101] Avoid conflicting _expand functions in PIL & MINGW fixes #538 --- CHANGES.rst | 3 +++ _imaging.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 2f9a50efe..bb4131151 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (unreleased) ------------------ +- Avoid conflicting _expand functions in PIL & MINGW, fixes #538 + [aclark] + - Merge from Philippe Lagadec’s OleFileIO_PL fork [vadmium] diff --git a/_imaging.c b/_imaging.c index f1a181b30..215c56bf6 100644 --- a/_imaging.c +++ b/_imaging.c @@ -846,7 +846,7 @@ _crop(ImagingObject* self, PyObject* args) } static PyObject* -_expand(ImagingObject* self, PyObject* args) +_expand_image(ImagingObject* self, PyObject* args) { int x, y; int mode = 0; @@ -2996,7 +2996,7 @@ static struct PyMethodDef methods[] = { {"crackcode", (PyCFunction)_crackcode, 1}, #endif {"crop", (PyCFunction)_crop, 1}, - {"expand", (PyCFunction)_expand, 1}, + {"expand", (PyCFunction)_expand_image, 1}, {"filter", (PyCFunction)_filter, 1}, {"histogram", (PyCFunction)_histogram, 1}, #ifdef WITH_MODEFILTER From 45ce1e04a7cf659c6ca905bcb0b2593a3b0af16e Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 5 Mar 2014 22:20:37 -0800 Subject: [PATCH 071/101] 
Docstring/comment fixes [skip ci] ref #534 --- PIL/ImageColor.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/PIL/ImageColor.py b/PIL/ImageColor.py index 107df193a..98a241bb0 100644 --- a/PIL/ImageColor.py +++ b/PIL/ImageColor.py @@ -20,15 +20,6 @@ from PIL import Image import re - -## -# Convert color string to RGB tuple. -# -# @param color A CSS3-style colour string. -# @return An RGB-tuple. -# @exception ValueError If the color string could not be interpreted -# as an RGB value. - def getrgb(color): """ Convert a color string to an RGB tuple. If the string cannot be parsed, @@ -37,7 +28,7 @@ def getrgb(color): .. versionadded:: 1.1.4 :param color: A color string - :return: ``(red, green, blue)`` + :return: ``(red, green, blue[, alpha])`` """ try: rgb = colormap[color] @@ -114,7 +105,7 @@ def getcolor(color, mode): .. versionadded:: 1.1.4 :param color: A color string - :return: ``(red, green, blue)`` + :return: ``(graylevel [, alpha]) or (red, green, blue[, alpha])`` """ # same as getrgb, but converts the result to the given mode color, alpha = getrgb(color), 255 From ed36893ca029785c9c8cd557caa5581a435c327f Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 17 Mar 2014 13:28:06 -0700 Subject: [PATCH 072/101] Clarified Test Validity --- Tests/test_imagecolor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Tests/test_imagecolor.py b/Tests/test_imagecolor.py index acdb84213..c67c20255 100644 --- a/Tests/test_imagecolor.py +++ b/Tests/test_imagecolor.py @@ -41,7 +41,11 @@ Image.new("L", (1, 1), "white") assert_equal(0, ImageColor.getcolor("black", "1")) assert_equal(255, ImageColor.getcolor("white", "1")) +# The following test is wrong, but is current behavior +# The correct result should be 255 due to the mode 1 assert_equal(162, ImageColor.getcolor("rgba(0, 255, 115, 33)", "1")) +# Correct behavior +# assert_equal(255, ImageColor.getcolor("rgba(0, 255, 115, 33)", "1")) Image.new("1", (1, 1), "white") assert_equal((0, 
255), ImageColor.getcolor("black", "LA")) From 203f14bb4c8e1a5cbe3bbe91d3bfd00d31fc2b90 Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Mon, 17 Mar 2014 16:57:03 -0400 Subject: [PATCH 073/101] Wording --- docs/index.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 52a054e22..36f600c85 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,7 +1,7 @@ -Pillow: a modern fork of PIL -============================ +Pillow +====== -Pillow is the "friendly" PIL fork by Alex Clark and Contributors. PIL is the +Pillow is the 'friendly' PIL fork by Alex Clark and Contributors. PIL is the Python Imaging Library by Fredrik Lundh and Contributors. .. image:: https://travis-ci.org/python-imaging/Pillow.png @@ -15,7 +15,7 @@ Python Imaging Library by Fredrik Lundh and Contributors. :target: https://pypi.python.org/pypi/Pillow/ :alt: Number of PyPI downloads -To start using Pillow, read the :doc:`installation +To start using Pillow, please read the :doc:`installation instructions `. 
If you can't find the information you need, try the old `PIL Handbook`_, but be From 1adf30b70103e3c4fad5ecafca0042a994de0dee Mon Sep 17 00:00:00 2001 From: cgohlke Date: Thu, 20 Mar 2014 01:16:31 -0700 Subject: [PATCH 074/101] TST: fix failing BMP tests on Windows --- Tests/test_bmp_reference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Tests/test_bmp_reference.py b/Tests/test_bmp_reference.py index 0e03df79a..99818229f 100644 --- a/Tests/test_bmp_reference.py +++ b/Tests/test_bmp_reference.py @@ -3,7 +3,7 @@ from tester import * from PIL import Image import os -base = 'Tests/images/bmp/' +base = os.path.join('Tests', 'images', 'bmp') def get_files(d, ext='.bmp'): @@ -78,9 +78,9 @@ def test_good(): except Exception as msg: # there are three here that are unsupported: - unsupported = ('Tests/images/bmp/g/rgb32bf.bmp', - 'Tests/images/bmp/g/pal8rle.bmp', - 'Tests/images/bmp/g/pal4rle.bmp') + unsupported = (os.path.join(base, 'g', 'rgb32bf.bmp'), + os.path.join(base, 'g', 'pal8rle.bmp'), + os.path.join(base, 'g', 'pal4rle.bmp')) if f not in unsupported: assert_true(False, "Unsupported Image %s: %s" %(f,msg)) From 324a6fb4e7342d424a05c9a884ea168bf3c53449 Mon Sep 17 00:00:00 2001 From: cgohlke Date: Thu, 20 Mar 2014 01:32:09 -0700 Subject: [PATCH 075/101] Update platform support Pillow 2.3.0dev passes all tests on Windows 8.1 Pro --- docs/installation.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/installation.rst b/docs/installation.rst index ff08dee17..828db9057 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -216,4 +216,6 @@ current versions of Linux, OS X, and Windows. 
+----------------------------------+-------------+------------------------------+------------------------------+-----------------------+ | Windows 8 Pro |Yes | 2.6,2.7,3.2,3.3,3.4a3 | 2.2.0 |x86,x86-64 | +----------------------------------+-------------+------------------------------+------------------------------+-----------------------+ +| Windows 8.1 Pro |Yes | 2.6,2.7,3.2,3.3,3.4 | 2.3.0 |x86,x86-64 | ++----------------------------------+-------------+------------------------------+------------------------------+-----------------------+ From bfb00173f1dc216dc9e6727b7c5138f8bd13ed13 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Fri, 21 Mar 2014 16:19:51 -0700 Subject: [PATCH 076/101] Error recovery for wheel install, #562 --- docs/installation.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/installation.rst b/docs/installation.rst index ff08dee17..565167226 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -172,6 +172,15 @@ Python Wheels $ pip install --use-wheel Pillow +If the above does not work, it's likely because we haven't uploaded a +wheel for the latest version of Pillow. 
In that case, try pinning it +to a specific version: + +:: + + $ pip install --use-wheel Pillow==2.3.0 + + Platform support ---------------- From 2bbc7f0c7dd22ea21031e2cbf0f0d5806acb6b0c Mon Sep 17 00:00:00 2001 From: hugovk Date: Mon, 24 Mar 2014 16:29:09 +0200 Subject: [PATCH 077/101] Add pypy as an allowed failure --- .travis.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.travis.yml b/.travis.yml index 472f8a9fa..cedafc772 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ python: - 2.7 - 3.2 - 3.3 + - "pypy" install: - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev libwebp-dev python-qt4 ghostscript libffi-dev" @@ -20,3 +21,7 @@ script: - python setup.py build_ext --inplace - python selftest.py - python Tests/run.py + +matrix: + allow_failures: + - python: "pypy" From 61975dd8ad117fc5f8442b6c167f327ced708dd7 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 24 Mar 2014 17:49:19 +0100 Subject: [PATCH 078/101] more detailed error messages from Image.py --- .gitignore | 3 +++ PIL/Image.py | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index a2a3dc417..0a642e562 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ dist .tox *.so docs/_build + +# Vim cruft +.*.swp diff --git a/PIL/Image.py b/PIL/Image.py index 75e7efc75..7713fe7d9 100644 --- a/PIL/Image.py +++ b/PIL/Image.py @@ -1965,7 +1965,7 @@ def fromarray(obj, mode=None): else: ndmax = 4 if ndim > ndmax: - raise ValueError("Too many dimensions.") + raise ValueError("Too many dimensions: %d > %d." 
% (ndim, ndmax)) size = shape[1], shape[0] if strides is not None: @@ -2018,7 +2018,7 @@ def open(fp, mode="r"): """ if mode != "r": - raise ValueError("bad mode") + raise ValueError("bad mode %r" % mode) if isPath(fp): filename = fp @@ -2054,7 +2054,8 @@ def open(fp, mode="r"): #traceback.print_exc() pass - raise IOError("cannot identify image file") + raise IOError("cannot identify image file %r" + % (filename if filename else fp)) # # Image processing. From d7e5b3fd57891ecc453fc172490331a7f4ce8c31 Mon Sep 17 00:00:00 2001 From: hugovk Date: Tue, 25 Mar 2014 17:46:00 +0200 Subject: [PATCH 079/101] Possible fix for pypy Idea from http://www.tismer.com/pypy/irc-logs/pypy/pypy.2013-11-18.log.html --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index cedafc772..8f654f15e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ python: install: - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev libwebp-dev python-qt4 ghostscript libffi-dev" - "pip install cffi" + - "sudo python -c 'import Tkinter'" script: From 1f0e5563891af965091c5d85fa9afc575b07a60b Mon Sep 17 00:00:00 2001 From: hugovk Date: Tue, 25 Mar 2014 18:24:01 +0200 Subject: [PATCH 080/101] Possible fix for pypy Idea from https://github.com/cpbotha/nvpy/issues/19#issuecomment-9732016 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8f654f15e..1cd9e35a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ python: install: - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev libwebp-dev python-qt4 ghostscript libffi-dev" - "pip install cffi" - - "sudo python -c 'import Tkinter'" + - "apt-get install python-tk" script: From 526482b9658e5957f5f1874d637e14e7f54ba852 Mon Sep 17 00:00:00 2001 From: hugovk Date: Tue, 25 Mar 2014 18:39:07 +0200 Subject: [PATCH 081/101] Forgot sudo --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml 
index 1cd9e35a9..fc7b5e6a5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ python: install: - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev libwebp-dev python-qt4 ghostscript libffi-dev" - "pip install cffi" - - "apt-get install python-tk" + - "sudo apt-get install python-tk" script: From 86ab02073645823f0dd700c17b8a151c4b19e4a3 Mon Sep 17 00:00:00 2001 From: Alex Clark Date: Tue, 25 Mar 2014 17:56:11 -0400 Subject: [PATCH 082/101] Add history --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index bb4131151..38349c2d1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (unreleased) ------------------ +- Add more detailed error messages to Image.py + [larsmans] + - Avoid conflicting _expand functions in PIL & MINGW, fixes #538 [aclark] From edc3215e3454829e2f5741bcec831117a8643ea5 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 09:12:51 -0700 Subject: [PATCH 083/101] Disabling poor performing test on pypy --- Tests/test_image_point.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Tests/test_image_point.py b/Tests/test_image_point.py index c70556f6a..34233f80e 100644 --- a/Tests/test_image_point.py +++ b/Tests/test_image_point.py @@ -2,6 +2,11 @@ from tester import * from PIL import Image +if hasattr(sys, 'pypy_version_info'): + # This takes _forever_ on pypy. 
Open Bug, + # see https://github.com/python-imaging/Pillow/issues/484 + skip() + def test_sanity(): im = lena() From d97244181d9648c9e5273735508388ea4f827af1 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 09:13:49 -0700 Subject: [PATCH 084/101] Adding current webp dependency to travis --- .travis.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index fc7b5e6a5..85c0e0704 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,10 +12,15 @@ python: - "pypy" install: - - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev libwebp-dev python-qt4 ghostscript libffi-dev" + - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev python-qt4 ghostscript libffi-dev" - "pip install cffi" + # enables pypy to compile - "sudo apt-get install python-tk" + # webp UNDONE -- refactor + - "wget 'https://webp.googlecode.com/files/libwebp-0.4.0.tar.gz' -O /tmp/libwebp-0.4.0.tar.gz" + - "tar -C /tmp -xvzf /tmp/libwebp-0.4.0.tar.gz" + - "cd /tmp/libwebp-0.4.0 && ./configure --enable-libwebpmux --enable-libwebpdemux && make && sudo make install" script: - python setup.py clean From c4cedd50eb92d0d918f2c5d01cbb4b84890ca952 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 09:29:47 -0700 Subject: [PATCH 085/101] Leave CWD in the right place --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 85c0e0704..2fcf8bca4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,9 @@ install: # webp UNDONE -- refactor - "wget 'https://webp.googlecode.com/files/libwebp-0.4.0.tar.gz' -O /tmp/libwebp-0.4.0.tar.gz" - "tar -C /tmp -xvzf /tmp/libwebp-0.4.0.tar.gz" - - "cd /tmp/libwebp-0.4.0 && ./configure --enable-libwebpmux --enable-libwebpdemux && make && sudo make install" + - "pushd /tmp/libwebp-0.4.0 && ./configure --enable-libwebpmux --enable-libwebpdemux && make && sudo make install && popd" + + script: - python setup.py clean From 
b97d220b2c702567cac21311223bb3bee6bbda5f Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 09:53:45 -0700 Subject: [PATCH 086/101] Set prefix to not require ld preload on travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2fcf8bca4..f3d82dfee 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ install: # webp UNDONE -- refactor - "wget 'https://webp.googlecode.com/files/libwebp-0.4.0.tar.gz' -O /tmp/libwebp-0.4.0.tar.gz" - "tar -C /tmp -xvzf /tmp/libwebp-0.4.0.tar.gz" - - "pushd /tmp/libwebp-0.4.0 && ./configure --enable-libwebpmux --enable-libwebpdemux && make && sudo make install && popd" + - "pushd /tmp/libwebp-0.4.0 && ./configure --prefix=/usr --enable-libwebpmux --enable-libwebpdemux && make && sudo make install && popd" From e0d42d0e027532209e18b50864649389fdb584df Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 09:54:22 -0700 Subject: [PATCH 087/101] Speed this up for now --- .travis.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index f3d82dfee..24d0ad23f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,17 +5,17 @@ virtualenv: system_site_packages: true python: - - 2.6 +# - 2.6 - 2.7 - - 3.2 - - 3.3 - - "pypy" +# - 3.2 +# - 3.3 +# - "pypy" install: - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev python-qt4 ghostscript libffi-dev" - "pip install cffi" # enables pypy to compile - - "sudo apt-get install python-tk" + # - "sudo apt-get install python-tk" # webp UNDONE -- refactor - "wget 'https://webp.googlecode.com/files/libwebp-0.4.0.tar.gz' -O /tmp/libwebp-0.4.0.tar.gz" @@ -30,6 +30,6 @@ script: - python selftest.py - python Tests/run.py -matrix: - allow_failures: - - python: "pypy" +#matrix: +# allow_failures: +# - python: "pypy" From 89507d16e35f70b607a28e6b55d7a02975546694 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 10:49:48 -0700 Subject: [PATCH 
088/101] added jpeg2k build --- .travis.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 24d0ad23f..2bf671dbe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ python: # - "pypy" install: - - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev python-qt4 ghostscript libffi-dev" + - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev python-qt4 ghostscript libffi-dev cmake" - "pip install cffi" # enables pypy to compile # - "sudo apt-get install python-tk" @@ -22,7 +22,9 @@ install: - "tar -C /tmp -xvzf /tmp/libwebp-0.4.0.tar.gz" - "pushd /tmp/libwebp-0.4.0 && ./configure --prefix=/usr --enable-libwebpmux --enable-libwebpdemux && make && sudo make install && popd" - + - "wget 'https://openjpeg.googlecode.com/files/openjpeg-2.0.0.tar.gz' -O /tmp/openjpeg-2.0.0.tar.gz" + - "tar -C /tmp -xvzf /tmp/openjpeg-2.0.0.tar.gz" + - "pushd /tmp/openjpeg-2.0.0 && cmake . && make && sudo DESTDIR=/usr make install && popd" script: - python setup.py clean From 9e5d597b4d7d70e1b161b29472a1f688f984d7ab Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 11:31:06 -0700 Subject: [PATCH 089/101] use cmake to set prefix, instead of DESTDIR env variable --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2bf671dbe..df423844b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,7 +24,7 @@ install: - "wget 'https://openjpeg.googlecode.com/files/openjpeg-2.0.0.tar.gz' -O /tmp/openjpeg-2.0.0.tar.gz" - "tar -C /tmp -xvzf /tmp/openjpeg-2.0.0.tar.gz" - - "pushd /tmp/openjpeg-2.0.0 && cmake . && make && sudo DESTDIR=/usr make install && popd" + - "pushd /tmp/openjpeg-2.0.0 && cmake -DCMAKE_INSTALL_PREFIX=/usr . 
&& make && sudo make install && popd" script: - python setup.py clean From 7e80ce9ce049dafbb558d95664f5d796350395f7 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 12:04:54 -0700 Subject: [PATCH 090/101] Reenabling, timing --- .travis.yml | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index df423844b..28b9cbf24 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,12 +4,15 @@ language: python virtualenv: system_site_packages: true +notifications: + irc: "chat.freenode.net#pil" + python: -# - 2.6 + - 2.6 - 2.7 -# - 3.2 -# - 3.3 -# - "pypy" + - 3.2 + - 3.3 + - "pypy" install: - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev python-qt4 ghostscript libffi-dev cmake" @@ -18,13 +21,19 @@ install: # - "sudo apt-get install python-tk" # webp UNDONE -- refactor + - date - "wget 'https://webp.googlecode.com/files/libwebp-0.4.0.tar.gz' -O /tmp/libwebp-0.4.0.tar.gz" + - date - "tar -C /tmp -xvzf /tmp/libwebp-0.4.0.tar.gz" - "pushd /tmp/libwebp-0.4.0 && ./configure --prefix=/usr --enable-libwebpmux --enable-libwebpdemux && make && sudo make install && popd" + - date + # openjpeg - "wget 'https://openjpeg.googlecode.com/files/openjpeg-2.0.0.tar.gz' -O /tmp/openjpeg-2.0.0.tar.gz" - "tar -C /tmp -xvzf /tmp/openjpeg-2.0.0.tar.gz" + - date - "pushd /tmp/openjpeg-2.0.0 && cmake -DCMAKE_INSTALL_PREFIX=/usr . 
&& make && sudo make install && popd" + - date script: - python setup.py clean @@ -32,6 +41,6 @@ script: - python selftest.py - python Tests/run.py -#matrix: -# allow_failures: -# - python: "pypy" +matrix: + allow_failures: + - python: "pypy" From d4ecef390fc2b2b4c0b5ff6a31e072b546f14f68 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 13:12:20 -0700 Subject: [PATCH 091/101] Try catching the tk error in pypy --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a8ff2e762..aa9238383 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,8 @@ def _read(file): try: import _tkinter -except ImportError: +except (ImportError, OSError): + # pypy emits an oserror _tkinter = None From 31b84f41bca0dc35efcdb18fcceeb15be02163f2 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 13:22:37 -0700 Subject: [PATCH 092/101] refactored out dependencies --- .travis.yml | 15 +++------------ depends/install_openjpeg.sh | 18 ++++++++++++++++++ depends/install_webp.sh | 18 ++++++++++++++++++ 3 files changed, 39 insertions(+), 12 deletions(-) create mode 100755 depends/install_openjpeg.sh create mode 100755 depends/install_webp.sh diff --git a/.travis.yml b/.travis.yml index 28b9cbf24..9c0d2fb28 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,20 +20,11 @@ install: # enables pypy to compile # - "sudo apt-get install python-tk" - # webp UNDONE -- refactor - - date - - "wget 'https://webp.googlecode.com/files/libwebp-0.4.0.tar.gz' -O /tmp/libwebp-0.4.0.tar.gz" - - date - - "tar -C /tmp -xvzf /tmp/libwebp-0.4.0.tar.gz" - - "pushd /tmp/libwebp-0.4.0 && ./configure --prefix=/usr --enable-libwebpmux --enable-libwebpdemux && make && sudo make install && popd" - - date + # webp + - pushd depends && ./install_web[.sh && popd # openjpeg - - "wget 'https://openjpeg.googlecode.com/files/openjpeg-2.0.0.tar.gz' -O /tmp/openjpeg-2.0.0.tar.gz" - - "tar -C /tmp -xvzf /tmp/openjpeg-2.0.0.tar.gz" - - date - - "pushd 
/tmp/openjpeg-2.0.0 && cmake -DCMAKE_INSTALL_PREFIX=/usr . && make && sudo make install && popd" - - date + - pushd depends && ./install_openjpeg.sh && popd script: - python setup.py clean diff --git a/depends/install_openjpeg.sh b/depends/install_openjpeg.sh new file mode 100755 index 000000000..bd6b83e3b --- /dev/null +++ b/depends/install_openjpeg.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# install openjpeg + + +if [ ! -f openjpeg-2.0.0.tar.gz ]; then + wget 'https://openjpeg.googlecode.com/files/openjpeg-2.0.0.tar.gz' +fi + +rm -r openjpeg-2.0.0 +tar -xvzf openjpeg-2.0.0.tar.gz + + +pushd openjpeg-2.0.0 + +cmake -DCMAKE_INSTALL_PREFIX=/usr . && make && sudo make install + +popd + diff --git a/depends/install_webp.sh b/depends/install_webp.sh new file mode 100755 index 000000000..5f5963712 --- /dev/null +++ b/depends/install_webp.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# install webp + + +if [ ! -f libwebp-0.4.0.tar.gz ]; then + wget 'https://webp.googlecode.com/files/libwebp-0.4.0.tar.gz' +fi + +rm -r libwebp-0.4.0 +tar -xvzf libwebp-0.4.0.tar.gz + + +pushd libwebp-0.4.0 + +./configure --prefix=/usr --enable-libwebpmux --enable-libwebpdemux && make && sudo make install + +popd + From 6e8c8bf9d49e6e7583d3d4db63e7dd4fbd0d2277 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 13:28:03 -0700 Subject: [PATCH 093/101] Emacs cruft in .gitignore [ci skip] --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 0a642e562..f16a1f9a8 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,8 @@ docs/_build # Vim cruft .*.swp + +#emacs +*~ +\#*# +.#* From 55ea6c01b1975f77dff9252c68f45849e18822bb Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 13:31:20 -0700 Subject: [PATCH 094/101] OSError catching in the test as well --- Tests/test_imagetk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/test_imagetk.py b/Tests/test_imagetk.py index 5c39c9283..b30971e8f 100644 --- a/Tests/test_imagetk.py +++ 
b/Tests/test_imagetk.py @@ -3,7 +3,7 @@ from tester import * from PIL import Image try: from PIL import ImageTk -except ImportError as v: +except (OSError, ImportError) as v: skip(v) success() From b91326b19505a27a963fed8b6fcc336f49d5ff5e Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 13:46:00 -0700 Subject: [PATCH 095/101] Tpyo in script path --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9c0d2fb28..aeba15d44 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ install: # - "sudo apt-get install python-tk" # webp - - pushd depends && ./install_web[.sh && popd + - pushd depends && ./install_web.sh && popd # openjpeg - pushd depends && ./install_openjpeg.sh && popd From 8ffcf97070953be991ee6730236593c864e7ba28 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 14:11:41 -0700 Subject: [PATCH 096/101] Correct resolution of typo --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index aeba15d44..bbff6391b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ install: # - "sudo apt-get install python-tk" # webp - - pushd depends && ./install_web.sh && popd + - pushd depends && ./install_webp.sh && popd # openjpeg - pushd depends && ./install_openjpeg.sh && popd From 33dda2ef613d27898015e1a55fccaed4cb50f5c8 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Wed, 26 Mar 2014 14:37:38 -0700 Subject: [PATCH 097/101] Removing python-tk install [skip ci] --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index bbff6391b..d68de0b32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,8 +17,6 @@ python: install: - "sudo apt-get -qq install libfreetype6-dev liblcms2-dev python-qt4 ghostscript libffi-dev cmake" - "pip install cffi" - # enables pypy to compile - # - "sudo apt-get install python-tk" # webp - pushd depends && ./install_webp.sh && popd From 
c7af2bf5b04f292d34018749056f9aadba996d2c Mon Sep 17 00:00:00 2001 From: wiredfool Date: Thu, 27 Mar 2014 16:39:58 -0700 Subject: [PATCH 098/101] Test for #577 --- Tests/test_file_jpeg.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Tests/test_file_jpeg.py b/Tests/test_file_jpeg.py index 07a7c9f96..61db7da6c 100644 --- a/Tests/test_file_jpeg.py +++ b/Tests/test_file_jpeg.py @@ -203,3 +203,9 @@ def test_exif(): im = Image.open("Tests/images/pil_sample_rgb.jpg") info = im._getexif() assert_equal(info[305], 'Adobe Photoshop CS Macintosh') + + +def test_quality_keep(): + im = Image.open("Images/lena.jpg") + f = tempfile('temp.jpg') + assert_no_exception(lambda: im.save(f, quality='keep')) From e07b0d8ac9bd5401f2733eb79b94c45e9bf74c8a Mon Sep 17 00:00:00 2001 From: wiredfool Date: Thu, 27 Mar 2014 16:40:44 -0700 Subject: [PATCH 099/101] don't use xrange, has_key, fixes #577 --- PIL/JpegImagePlugin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PIL/JpegImagePlugin.py b/PIL/JpegImagePlugin.py index 07a09232c..da52006ca 100644 --- a/PIL/JpegImagePlugin.py +++ b/PIL/JpegImagePlugin.py @@ -442,7 +442,7 @@ samplings = { } def convert_dict_qtables(qtables): - qtables = [qtables[key] for key in xrange(len(qtables)) if qtables.has_key(key)] + qtables = [qtables[key] for key in range(len(qtables)) if key in qtables] for idx, table in enumerate(qtables): qtables[idx] = [table[i] for i in zigzag_index] return qtables @@ -504,7 +504,7 @@ def _save(im, fp, filename): except ValueError: raise ValueError("Invalid quantization table") else: - qtables = [lines[s:s+64] for s in xrange(0, len(lines), 64)] + qtables = [lines[s:s+64] for s in range(0, len(lines), 64)] if isinstance(qtables, (tuple, list, dict)): if isinstance(qtables, dict): qtables = convert_dict_qtables(qtables) From 5a4808d2d2b50e876359aba45b7347f9711e4f5e Mon Sep 17 00:00:00 2001 From: wiredfool Date: Thu, 27 Mar 2014 16:44:58 -0700 Subject: [PATCH 100/101] test cleanup: don't mask 
file builtin, data is unused --- Tests/test_file_jpeg.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/Tests/test_file_jpeg.py b/Tests/test_file_jpeg.py index 61db7da6c..095cad359 100644 --- a/Tests/test_file_jpeg.py +++ b/Tests/test_file_jpeg.py @@ -10,9 +10,7 @@ codecs = dir(Image.core) if "jpeg_encoder" not in codecs or "jpeg_decoder" not in codecs: skip("jpeg support not available") -# sample jpeg stream -file = "Images/lena.jpg" -data = open(file, "rb").read() +test_file = "Images/lena.jpg" def roundtrip(im, **options): out = BytesIO() @@ -30,7 +28,7 @@ def test_sanity(): # internal version number assert_match(Image.core.jpeglib_version, "\d+\.\d+$") - im = Image.open(file) + im = Image.open(test_file) im.load() assert_equal(im.mode, "RGB") assert_equal(im.size, (128, 128)) @@ -40,7 +38,7 @@ def test_sanity(): def test_app(): # Test APP/COM reader (@PIL135) - im = Image.open(file) + im = Image.open(test_file) assert_equal(im.applist[0], ("APP0", b"JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00")) assert_equal(im.applist[1], ("COM", b"Python Imaging Library")) @@ -49,8 +47,8 @@ def test_app(): def test_cmyk(): # Test CMYK handling. Thanks to Tim and Charlie for test data, # Michael for getting me to look one more time. - file = "Tests/images/pil_sample_cmyk.jpg" - im = Image.open(file) + f = "Tests/images/pil_sample_cmyk.jpg" + im = Image.open(f) # the source image has red pixels in the upper left corner. c, m, y, k = [x / 255.0 for x in im.getpixel((0, 0))] assert_true(c == 0.0 and m > 0.8 and y > 0.8 and k == 0.0) @@ -66,7 +64,7 @@ def test_cmyk(): def test_dpi(): def test(xdpi, ydpi=None): - im = Image.open(file) + im = Image.open(test_file) im = roundtrip(im, dpi=(xdpi, ydpi or xdpi)) return im.info.get("dpi") assert_equal(test(72), (72, 72)) @@ -80,9 +78,9 @@ def test_icc(): icc_profile = im1.info["icc_profile"] assert_equal(len(icc_profile), 3144) # Roundtrip via physical file. 
- file = tempfile("temp.jpg") - im1.save(file, icc_profile=icc_profile) - im2 = Image.open(file) + f = tempfile("temp.jpg") + im1.save(f, icc_profile=icc_profile) + im2 = Image.open(f) assert_equal(im2.info.get("icc_profile"), icc_profile) # Roundtrip via memory buffer. im1 = roundtrip(lena()) From af4424ebf19538ebdc40ee47b70d106acb9f565e Mon Sep 17 00:00:00 2001 From: wiredfool Date: Thu, 27 Mar 2014 20:54:35 -0700 Subject: [PATCH 101/101] Update CHANGES.rst --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 51c47dd29..837bf9456 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,9 @@ Changelog (Pillow) 2.4.0 (unreleased) ------------------ +- Fixes for Jpeg encoding in Python 3, fixes #577 + [wiredfool] + - Added support for JPEG 2000 [al45tair]