From 1dd3bef61507847375772ffd207e8451fa5c897e Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Sun, 10 May 2015 23:55:33 +1000 Subject: [PATCH] Upgraded OleFileIO to 0.42b --- PIL/OleFileIO.py | 941 +++++++++++++++++++++++++++++++---------------- 1 file changed, 616 insertions(+), 325 deletions(-) diff --git a/PIL/OleFileIO.py b/PIL/OleFileIO.py index c804dd454..d787e59ed 100755 --- a/PIL/OleFileIO.py +++ b/PIL/OleFileIO.py @@ -1,47 +1,70 @@ #!/usr/bin/env python -## OleFileIO_PL: -## Module to read Microsoft OLE2 files (also called Structured Storage or -## Microsoft Compound Document File Format), such as Microsoft Office -## documents, Image Composer and FlashPix files, Outlook messages, ... -## This version is compatible with Python 2.6+ and 3.x -## version 0.30 2014-02-04 Philippe Lagadec - http://www.decalage.info - -## Project website: http://www.decalage.info/python/olefileio - -## Improved version of the OleFileIO module from PIL library v1.1.6 -## See: http://www.pythonware.com/products/pil/index.htm - -## The Python Imaging Library (PIL) is - -## Copyright (c) 1997-2005 by Secret Labs AB -## Copyright (c) 1995-2005 by Fredrik Lundh - -## OleFileIO_PL changes are Copyright (c) 2005-2014 by Philippe Lagadec - -## See source code and LICENSE.txt for information on usage and redistribution. - -## WARNING: THIS IS (STILL) WORK IN PROGRESS. +# olefile (formerly OleFileIO_PL) version 0.42 2015-01-25 +# +# Module to read/write Microsoft OLE2 files (also called Structured Storage or +# Microsoft Compound Document File Format), such as Microsoft Office 97-2003 +# documents, Image Composer and FlashPix files, Outlook messages, ... +# This version is compatible with Python 2.6+ and 3.x +# +# Project website: http://www.decalage.info/olefile +# +# olefile is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info) +# +# olefile is based on the OleFileIO module from the PIL library v1.1.6 +# See: http://www.pythonware.com/products/pil/index.htm +# +# The Python Imaging Library (PIL) is +# Copyright (c) 1997-2005 by Secret Labs AB +# Copyright (c) 1995-2005 by Fredrik Lundh +# +# See source code and LICENSE.txt for information on usage and redistribution. -# Starting with OleFileIO_PL v0.30, only Python 2.6+ and 3.x is supported +# Since OleFileIO_PL v0.30, only Python 2.6+ and 3.x is supported # This import enables print() as a function rather than a keyword # (main requirement to be compatible with Python 3.x) # The comment on the line below should be printed on Python 2.5 or older: -from __future__ import print_function # This version of OleFileIO_PL requires Python 2.6+ or 3.x. +from __future__ import print_function # This version of olefile requires Python 2.6+ or 3.x. -__author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" -__date__ = "2014-02-04" -__version__ = '0.30' +__author__ = "Philippe Lagadec" +__date__ = "2015-01-25" +__version__ = '0.42b' #--- LICENSE ------------------------------------------------------------------ -# OleFileIO_PL is an improved version of the OleFileIO module from the -# Python Imaging Library (PIL). - -# OleFileIO_PL changes are Copyright (c) 2005-2014 by Philippe Lagadec +# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2015 Philippe Lagadec +# (http://www.decalage.info) # +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# ---------- +# PIL License: +# +# olefile is based on source code from the OleFileIO module of the Python +# Imaging Library (PIL) published by Fredrik Lundh under the following license: + # The Python Imaging Library (PIL) is # Copyright (c) 1997-2005 by Secret Labs AB # Copyright (c) 1995-2005 by Fredrik Lundh @@ -67,7 +90,7 @@ __version__ = '0.30' # PERFORMANCE OF THIS SOFTWARE. #----------------------------------------------------------------------------- -# CHANGELOG: (only OleFileIO_PL changes compared to PIL 1.1.6) +# CHANGELOG: (only olefile/OleFileIO_PL changes compared to PIL 1.1.6) # 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility # (all changes flagged with [PL]) # 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise @@ -142,10 +165,29 @@ __version__ = '0.30' # 2014-02-04 v0.30 PL: - upgraded code to support Python 3.x by Martin Panter # - several fixes for Python 2.6 (xrange, MAGIC) # - reused i32 from Pillow's _binary +# 2014-07-18 v0.31 - preliminary support for 4K sectors +# 2014-07-27 v0.31 PL: - a few improvements in OleFileIO.open (header parsing) +# - Fixed loadfat for large files with 4K sectors (issue #3) +# 2014-07-30 v0.32 PL: - added write_sect to write sectors to disk +# - added write_mode option to OleFileIO.__init__ and open +# 2014-07-31 PL: - fixed padding in write_sect for Python 3, added checks +# - added write_stream to write a stream to disk +# 2014-09-26 v0.40 PL: - renamed OleFileIO_PL to olefile +# 2014-11-09 NE: - added support for Jython (Niko Ehrenfeuchter) +# 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE +# data in a string buffer and file-like objects. +# 2014-11-21 PL: - updated comments according to Pillow's commits +# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1 +# to UTF-8 on Python 2.x (Unicode on Python 3.x) +# - added path_encoding option to override the default +# - fixed a bug in _list when a storage is empty #----------------------------------------------------------------------------- # TODO (for version 1.0): -# + isOleFile should accept file-like objects like open +# + get rid of print statements, to simplify Python 2.x and 3.x support +# + add is_stream and is_storage +# + remove leading and trailing slashes where a path is used +# + add functions path_list2str and path_str2list # + fix how all the methods handle unicode str and/or bytes as arguments # + add path attrib to _OleDirEntry, set it once and for all in init or # append_kids (then listdir/_list can be simplified) @@ -177,30 +219,16 @@ __version__ = '0.30' # - move all debug code (and maybe dump methods) to a separate module, with # a class which inherits OleFileIO ? # - fix docstrings to follow epydoc format -# - add support for 4K sectors ? # - add support for big endian byte order ? # - create a simple OLE explorer with wxPython # FUTURE EVOLUTIONS to add write support: -# 1) add ability to write a stream back on disk from BytesIO (same size, no -# change in FAT/MiniFAT). -# 2) rename a stream/storage if it doesn't change the RB tree -# 3) use rbtree module to update the red-black tree + any rename -# 4) remove a stream/storage: free sectors in FAT/MiniFAT -# 5) allocate new sectors in FAT/MiniFAT -# 6) create new storage/stream -#----------------------------------------------------------------------------- +# see issue #6 on Bitbucket: +# https://bitbucket.org/decalage/olefileio_pl/issue/6/improve-olefileio_pl-to-write-ole-files + +#----------------------------------------------------------------------------- +# NOTES from PIL 1.1.6: -# -# THIS IS WORK IN PROGRESS -# -# The Python Imaging Library -# $Id$ -# -# stuff to deal with OLE2 Structured Storage files. this module is -# used by PIL to read Image Composer and FlashPix files, but can also -# be used to read other files of this type. -# # History: # 1997-01-20 fl Created # 1997-01-22 fl Fixed 64-bit portability quirk @@ -222,25 +250,19 @@ __version__ = '0.30' # "If this document and functionality of the Software conflict, # the actual functionality of the Software represents the correct # functionality" -- Microsoft, in the OLE format specification -# -# Copyright (c) Secret Labs AB 1997. -# Copyright (c) Fredrik Lundh 1997. -# -# See the README file for information on usage and redistribution. -# #------------------------------------------------------------------------------ import io import sys -import struct -import array -import os.path -import datetime +import struct, array, os.path, datetime + +#=== COMPATIBILITY WORKAROUNDS ================================================ #[PL] Define explicitly the public API to avoid private objects in pydoc: -__all__ = ['OleFileIO', 'isOleFile', 'MAGIC'] +#TODO: add more +# __all__ = ['OleFileIO', 'isOleFile', 'MAGIC'] # For Python 3.x, need to redefine long as int: if str is not bytes: @@ -261,39 +283,66 @@ if array.array('L').itemsize == 4: elif array.array('I').itemsize == 4: # on 64 bits platforms, integers in an array are 32 bits: UINT32 = 'I' +elif array.array('i').itemsize == 4: + # On 64 bit Jython, signed integers ('i') are the only way to store our 32 + # bit values in an array in a *somewhat* reasonable way, as the otherwise + # perfectly suited 'H' (unsigned int, 32 bits) results in a completely + # unusable behaviour. This is most likely caused by the fact that Java + # doesn't have unsigned values, and thus Jython's "array" implementation, + # which is based on "jarray", doesn't have them either. + # NOTE: to trick Jython into converting the values it would normally + # interpret as "signed" into "unsigned", a binary-and operation with + # 0xFFFFFFFF can be used. This way it is possible to use the same comparing + # operations on all platforms / implementations. The corresponding code + # lines are flagged with a 'JYTHON-WORKAROUND' tag below. + UINT32 = 'i' else: raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...') #[PL] These workarounds were inspired from the Path module # (see http://www.jorendorff.com/articles/python/path/) +#TODO: test with old Python versions + +# Pre-2.3 workaround for basestring. try: basestring except NameError: - basestring = str + try: + # is Unicode supported (Python >2.0 or >1.6 ?) + basestring = (str, unicode) + except NameError: + basestring = str #[PL] Experimental setting: if True, OLE filenames will be kept in Unicode # if False (default PIL behaviour), all filenames are converted to Latin-1. -KEEP_UNICODE_NAMES = False +KEEP_UNICODE_NAMES = True + +if sys.version_info[0] < 3: + # On Python 2.x, the default encoding for path names is UTF-8: + DEFAULT_PATH_ENCODING = 'utf-8' +else: + # On Python 3.x, the default encoding for path names is Unicode (None): + DEFAULT_PATH_ENCODING = None + + +#=== DEBUGGING =============================================================== + +#TODO: replace this by proper logging #[PL] DEBUG display mode: False by default, use set_debug_mode() or "-d" on # command line to change it. DEBUG_MODE = False - - def debug_print(msg): print(msg) - - def debug_pass(msg): pass debug = debug_pass - def set_debug_mode(debug_mode): """ Set debug mode on or off, to control display of debugging messages. - mode: True or False + :param mode: True or False """ global DEBUG_MODE, debug DEBUG_MODE = debug_mode @@ -302,26 +351,30 @@ def set_debug_mode(debug_mode): else: debug = debug_pass + +#=== CONSTANTS =============================================================== + +# magic bytes that should be at the beginning of every OLE file: MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1' -# [PL]: added constants for Sector IDs (from AAF specifications) -MAXREGSECT = 0xFFFFFFFA; # maximum SECT -DIFSECT = 0xFFFFFFFC; # (-4) denotes a DIFAT sector in a FAT -FATSECT = 0xFFFFFFFD; # (-3) denotes a FAT sector in a FAT -ENDOFCHAIN = 0xFFFFFFFE; # (-2) end of a virtual stream chain -FREESECT = 0xFFFFFFFF; # (-1) unallocated sector +#[PL]: added constants for Sector IDs (from AAF specifications) +MAXREGSECT = 0xFFFFFFFA # (-6) maximum SECT +DIFSECT = 0xFFFFFFFC # (-4) denotes a DIFAT sector in a FAT +FATSECT = 0xFFFFFFFD # (-3) denotes a FAT sector in a FAT +ENDOFCHAIN = 0xFFFFFFFE # (-2) end of a virtual stream chain +FREESECT = 0xFFFFFFFF # (-1) unallocated sector -# [PL]: added constants for Directory Entry IDs (from AAF specifications) -MAXREGSID = 0xFFFFFFFA; # maximum directory entry ID -NOSTREAM = 0xFFFFFFFF; # (-1) unallocated directory entry +#[PL]: added constants for Directory Entry IDs (from AAF specifications) +MAXREGSID = 0xFFFFFFFA # (-6) maximum directory entry ID +NOSTREAM = 0xFFFFFFFF # (-1) unallocated directory entry -# [PL] object types in storage (from AAF specifications) -STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) -STGTY_STORAGE = 1 # element is a storage object -STGTY_STREAM = 2 # element is a stream object -STGTY_LOCKBYTES = 3 # element is an ILockBytes object -STGTY_PROPERTY = 4 # element is an IPropertyStorage object -STGTY_ROOT = 5 # element is a root storage +#[PL] object types in storage (from AAF specifications) +STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) +STGTY_STORAGE = 1 # element is a storage object +STGTY_STREAM = 2 # element is a stream object +STGTY_LOCKBYTES = 3 # element is an ILockBytes object +STGTY_PROPERTY = 4 # element is an IPropertyStorage object +STGTY_ROOT = 5 # element is a root storage # @@ -353,30 +406,52 @@ WORD_CLSID = "00020900-0000-0000-C000-000000000046" #TODO: check Excel, PPT, ... #[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect() -DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect -DEFECT_POTENTIAL = 20 # a potential defect -DEFECT_INCORRECT = 30 # an error according to specifications, but parsing - # can go on -DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is - # impossible +DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect +DEFECT_POTENTIAL = 20 # a potential defect +DEFECT_INCORRECT = 30 # an error according to specifications, but parsing + # can go on +DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is + # impossible + +# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes +# (this is used in isOleFile and OleFile.open) +MINIMAL_OLEFILE_SIZE = 1536 #[PL] add useful constants to __all__: -for key in list(vars().keys()): - if key.startswith('STGTY_') or key.startswith('DEFECT_'): - __all__.append(key) +# for key in list(vars().keys()): +# if key.startswith('STGTY_') or key.startswith('DEFECT_'): +# __all__.append(key) -#--- FUNCTIONS ---------------------------------------------------------------- +#=== FUNCTIONS =============================================================== def isOleFile (filename): """ - Test if file is an OLE container (according to its header). + Test if a file is an OLE container (according to the magic bytes in its header). + + :param filename: string-like or file-like object, OLE file to parse + + - if filename is a string smaller than 1536 bytes, it is the path + of the file to open. (bytes or unicode string) + - if filename is a string longer than 1535 bytes, it is parsed + as the content of an OLE file in memory. (bytes type only) + - if filename is a file-like object (with read and seek methods), + it is parsed as-is. - :param filename: file name or path (str, unicode) :returns: True if OLE, False otherwise. """ - f = open(filename, 'rb') - header = f.read(len(MAGIC)) + # check if filename is a string-like or file-like object: + if hasattr(filename, 'read'): + # file-like object: use it directly + header = filename.read(len(MAGIC)) + # just in case, seek back to start of file: + filename.seek(0) + elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE: + # filename is a bytes string containing the OLE file to be parsed: + header = filename[:len(MAGIC)] + else: + # string-like object: filename of file on disk + header = open(filename, 'rb').read(len(MAGIC)) if header == MAGIC: return True else: @@ -434,41 +509,17 @@ def _clsid(clsid): tuple(map(i8, clsid[8:16])))) -# UNICODE support: -# (necessary to handle storages/streams names which use Unicode) - -def _unicode(s, errors='replace'): - """ - Map unicode string to Latin 1. (Python with Unicode support) - - :param s: UTF-16LE unicode string to convert to Latin-1 - :param errors: 'replace', 'ignore' or 'strict'. - """ - #TODO: test if it OleFileIO works with Unicode strings, instead of - # converting to Latin-1. - try: - # First the string is converted to plain Unicode: - # (assuming it is encoded as UTF-16 little-endian) - u = s.decode('UTF-16LE', errors) - if bytes is not str or KEEP_UNICODE_NAMES: - return u - else: - # Second the unicode string is converted to Latin-1 - return u.encode('latin_1', errors) - except: - # there was an error during Unicode to Latin-1 conversion: - raise IOError('incorrect Unicode name') - def filetime2datetime(filetime): - """ - convert FILETIME (64 bits int) to Python datetime.datetime - """ - # TODO: manage exception when microseconds is too large - # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ - _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) - #debug('timedelta days=%d' % (filetime//(10*1000000*3600*24))) - return _FILETIME_null_date + datetime.timedelta(microseconds=filetime//10) + """ + convert FILETIME (64 bits int) to Python datetime.datetime + """ + # TODO: manage exception when microseconds is too large + # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ + _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) + #debug('timedelta days=%d' % (filetime//(10*1000000*3600*24))) + return _FILETIME_null_date + datetime.timedelta(microseconds=filetime//10) + #=== CLASSES ================================================================== @@ -578,6 +629,7 @@ class OleMetadata: self.language = None self.doc_version = None + def parse_properties(self, olefile): """ Parse standard properties of an OLE file, from the streams @@ -639,6 +691,7 @@ class _OleStream(io.BytesIO): fat table arguments. Attributes: + - size: actual size of data stream, after it was opened. """ @@ -650,18 +703,18 @@ class _OleStream(io.BytesIO): """ Constructor for _OleStream class. - :param fp : file object, the OLE container or the MiniFAT stream - :param sect : sector index of first sector in the stream - :param size : total size of the stream - :param offset : offset in bytes for the first FAT or MiniFAT sector + :param fp: file object, the OLE container or the MiniFAT stream + :param sect: sector index of first sector in the stream + :param size: total size of the stream + :param offset: offset in bytes for the first FAT or MiniFAT sector :param sectorsize: size of one sector - :param fat : array/list of sector indexes (FAT or MiniFAT) - :param filesize : size of OLE file (for debugging) - :returns : a BytesIO instance containing the OLE stream + :param fat: array/list of sector indexes (FAT or MiniFAT) + :param filesize: size of OLE file (for debugging) + :returns: a BytesIO instance containing the OLE stream """ debug('_OleStream.__init__:') debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s' - %(sect, sect, size, offset, sectorsize, len(fat), repr(fp))) + %(sect,sect,size,offset,sectorsize,len(fat), repr(fp))) #[PL] To detect malformed documents with FAT loops, we compute the # expected number of sectors in the stream: unknown_size = False @@ -729,7 +782,7 @@ class _OleStream(io.BytesIO): data.append(sector_data) # jump to next sector in the FAT: try: - sect = fat[sect] + sect = fat[sect] & 0xFFFFFFFF # JYTHON-WORKAROUND except IndexError: # [PL] if pointer is out of the FAT an exception is raised raise IOError('incorrect OLE FAT, sector index out of range') @@ -787,6 +840,7 @@ class _OleDirectoryEntry: DIRENTRY_SIZE = 128 assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE + def __init__(self, entry, sid, olefile): """ Constructor for an _OleDirectoryEntry object. @@ -842,8 +896,11 @@ class _OleDirectoryEntry: namelength = 64 # only characters without ending null char are kept: name = name[:(namelength-2)] - # name is converted from unicode to Latin-1: - self.name = _unicode(name) + #TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1) + #TODO: check if the name does not contain forbidden characters: + # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'." + # name is converted from UTF-16LE to the path encoding specified in the OleFileIO: + self.name = olefile._decode_utf16_str(name) debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) debug(' - type: %d' % self.entry_type) @@ -879,6 +936,8 @@ class _OleDirectoryEntry: minifat = False olefile._check_duplicate_stream(self.isectStart, minifat) + + def build_storage_tree(self): """ Read and build the red-black tree attached to this _OleDirectoryEntry @@ -902,15 +961,16 @@ class _OleDirectoryEntry: # (see rich comparison methods in this class) self.kids.sort() + def append_kids(self, child_sid): """ Walk through red-black tree of children of this directory entry to add all of them to the kids list. (recursive method) - child_sid : index of child directory entry to use, or None when called - first time for the root. (only used during recursion) + :param child_sid : index of child directory entry to use, or None when called + first time for the root. (only used during recursion) """ - # [PL] this method was added to use simple recursion instead of a complex + #[PL] this method was added to use simple recursion instead of a complex # algorithm. # if this is not a storage or a leaf of the tree, nothing to do: if child_sid == NOSTREAM: @@ -945,6 +1005,7 @@ class _OleDirectoryEntry: # Afterwards build kid's own tree if it's also a storage: child.build_storage_tree() + def __eq__(self, other): "Compare entries by name" return self.name == other.name @@ -964,6 +1025,7 @@ class _OleDirectoryEntry: #TODO: replace by the same function as MS implementation ? # (order by name length first, then case-insensitive order) + def dump(self, tab = 0): "Dump this entry, and all its subentries (for debug purposes only)" TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", @@ -978,12 +1040,13 @@ class _OleDirectoryEntry: for kid in self.kids: kid.dump(tab + 2) + def getmtime(self): """ Return modification time of a directory entry. :returns: None if modification time is null, a python datetime object - otherwise (UTC timezone) + otherwise (UTC timezone) new in version 0.26 """ @@ -991,12 +1054,13 @@ class _OleDirectoryEntry: return None return filetime2datetime(self.modifyTime) + def getctime(self): """ Return creation time of a directory entry. :returns: None if modification time is null, a python datetime object - otherwise (UTC timezone) + otherwise (UTC timezone) new in version 0.26 """ @@ -1012,8 +1076,7 @@ class OleFileIO: OLE container object This class encapsulates the interface to an OLE 2 structured - storage file. Use the :py:meth:`~PIL.OleFileIO.OleFileIO.listdir` and - :py:meth:`~PIL.OleFileIO.OleFileIO.openstream` methods to + storage file. Use the listdir and openstream methods to access the contents of this file. Object names are given as a list of strings, one for each subentry @@ -1037,22 +1100,47 @@ class OleFileIO: TIFF files). """ - def __init__(self, filename = None, raise_defects=DEFECT_FATAL): + def __init__(self, filename=None, raise_defects=DEFECT_FATAL, + write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING): """ - Constructor for OleFileIO class. + Constructor for the OleFileIO class. :param filename: file to open. + + - if filename is a string smaller than 1536 bytes, it is the path + of the file to open. (bytes or unicode string) + - if filename is a string longer than 1535 bytes, it is parsed + as the content of an OLE file in memory. (bytes type only) + - if filename is a file-like object (with read, seek and tell methods), + it is parsed as-is. + :param raise_defects: minimal level for defects to be raised as exceptions. - (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a - security-oriented application, see source code for details) + (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a + security-oriented application, see source code for details) + + :param write_mode: bool, if True the file is opened in read/write mode instead + of read-only by default. + + :param debug: bool, set debug mode + + :param path_encoding: None or str, name of the codec to use for path + names (streams and storages), or None for Unicode. + Unicode by default on Python 3+, UTF-8 on Python 2.x. + (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41) """ + set_debug_mode(debug) # minimal level for defects to be raised as exceptions: self._raise_defects_level = raise_defects # list of defects/issues not raised as exceptions: # tuples of (exception type, message) self.parsing_issues = [] + self.write_mode = write_mode + self.path_encoding = path_encoding + self._filesize = None + self.fp = None if filename: - self.open(filename) + self.open(filename, write_mode=write_mode) + def _raise_defect(self, defect_level, message, exception_type=IOError): """ @@ -1061,10 +1149,12 @@ class OleFileIO: for the OleFileIO object. :param defect_level: defect level, possible values are: - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect - DEFECT_POTENTIAL : a potential defect - DEFECT_INCORRECT : an error according to specifications, but parsing can go on - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible + + - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect + - DEFECT_POTENTIAL : a potential defect + - DEFECT_INCORRECT : an error according to specifications, but parsing can go on + - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible + :param message: string describing the defect, used with raised exception. :param exception_type: exception class to be raised, IOError by default """ @@ -1075,31 +1165,70 @@ class OleFileIO: # just record the issue, no exception raised: self.parsing_issues.append((exception_type, message)) - def open(self, filename): - """ - Open an OLE2 file. - Reads the header, FAT and directory. - :param filename: string-like or file-like object + def _decode_utf16_str(self, utf16_str, errors='replace'): """ + Decode a string encoded in UTF-16 LE format, as found in the OLE + directory or in property streams. Return a string encoded + according to the path_encoding specified for the OleFileIO object. + + :param utf16_str: bytes string encoded in UTF-16 LE format + :param errors: str, see python documentation for str.decode() + :return: str, encoded according to path_encoding + """ + unicode_str = utf16_str.decode('UTF-16LE', errors) + if self.path_encoding: + # an encoding has been specified for path names: + return unicode_str.encode(self.path_encoding, errors) + else: + # path_encoding=None, return the Unicode string as-is: + return unicode_str + + + def open(self, filename, write_mode=False): + """ + Open an OLE2 file in read-only or read/write mode. + Read and parse the header, FAT and directory. + + :param filename: string-like or file-like object, OLE file to parse + + - if filename is a string smaller than 1536 bytes, it is the path + of the file to open. (bytes or unicode string) + - if filename is a string longer than 1535 bytes, it is parsed + as the content of an OLE file in memory. (bytes type only) + - if filename is a file-like object (with read, seek and tell methods), + it is parsed as-is. + + :param write_mode: bool, if True the file is opened in read/write mode instead + of read-only by default. (ignored if filename is not a path) + """ + self.write_mode = write_mode #[PL] check if filename is a string-like or file-like object: # (it is better to check for a read() method) if hasattr(filename, 'read'): - # file-like object + #TODO: also check seek and tell methods? + # file-like object: use it directly self.fp = filename + elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE: + # filename is a bytes string containing the OLE file to be parsed: + # convert it to BytesIO + self.fp = io.BytesIO(filename) else: # string-like object: filename of file on disk - #TODO: if larger than 1024 bytes, this could be the actual data => BytesIO - self.fp = open(filename, "rb") - # old code fails if filename is not a plain string: - #if isinstance(filename, (bytes, basestring)): - # self.fp = open(filename, "rb") - #else: - # self.fp = filename + if self.write_mode: + # open file in mode 'read with update, binary' + # According to https://docs.python.org/2/library/functions.html#open + # 'w' would truncate the file, 'a' may only append on some Unixes + mode = 'r+b' + else: + # read-only mode by default + mode = 'rb' + self.fp = open(filename, mode) # obtain the filesize by using seek and tell, which should work on most # file-like objects: #TODO: do it above, using getsize with filename when possible? #TODO: fix code to fail with clear exception when filesize cannot be obtained + filesize=0 self.fp.seek(0, os.SEEK_END) try: filesize = self.fp.tell() @@ -1177,7 +1306,7 @@ class OleFileIO: self.sectDifStart, self.csectDif ) = struct.unpack(fmt_header, header1) - debug(struct.unpack(fmt_header, header1)) + debug( struct.unpack(fmt_header, header1)) if self.Sig != MAGIC: # OLE signature should always be present @@ -1196,6 +1325,7 @@ class OleFileIO: # For now only common little-endian documents are handled correctly self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") # TODO: add big-endian support for documents created on Mac ? + # But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE. self.SectorSize = 2**self.SectorShift debug( "SectorSize = %d" % self.SectorSize ) if self.SectorSize not in [512, 4096]: @@ -1210,28 +1340,44 @@ class OleFileIO: if self.Reserved != 0 or self.Reserved1 != 0: self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)") debug( "csectDir = %d" % self.csectDir ) + # Number of directory sectors (only allowed if DllVersion != 3) if self.SectorSize==512 and self.csectDir!=0: self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header") debug( "csectFat = %d" % self.csectFat ) + # csectFat = number of FAT sectors in the file debug( "sectDirStart = %X" % self.sectDirStart ) + # sectDirStart = 1st sector containing the directory debug( "signature = %d" % self.signature ) # Signature should be zero, BUT some implementations do not follow this # rule => only a potential defect: + # (according to MS-CFB, may be != 0 for applications supporting file + # transactions) if self.signature != 0: self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)") - debug("MiniSectorCutoff = %d" % self.MiniSectorCutoff) - debug("MiniFatStart = %X" % self.MiniFatStart) - debug("csectMiniFat = %d" % self.csectMiniFat) - debug("sectDifStart = %X" % self.sectDifStart) - debug("csectDif = %d" % self.csectDif) + debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff ) + # MS-CFB: This integer field MUST be set to 0x00001000. This field + # specifies the maximum size of a user-defined data stream allocated + # from the mini FAT and mini stream, and that cutoff is 4096 bytes. + # Any user-defined data stream larger than or equal to this cutoff size + # must be allocated as normal sectors from the FAT. + if self.MiniSectorCutoff != 0x1000: + self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorCutoff in OLE header") + debug( "MiniFatStart = %X" % self.MiniFatStart ) + debug( "csectMiniFat = %d" % self.csectMiniFat ) + debug( "sectDifStart = %X" % self.sectDifStart ) + debug( "csectDif = %d" % self.csectDif ) # calculate the number of sectors in the file # (-1 because header doesn't count) self.nb_sect = ( (filesize + self.SectorSize-1) // self.SectorSize) - 1 debug( "Number of sectors in the file: %d" % self.nb_sect ) + #TODO: change this test, because an OLE file MAY contain other data + # after the last sector. - # file clsid (probably never used, so we don't store it) - #clsid = _clsid(header[8:24]) + # file clsid + self.clsid = _clsid(header[8:24]) + + #TODO: remove redundant attributes, and fix the code which uses them? self.sectorsize = self.SectorSize #1 << i16(header, 30) self.minisectorsize = self.MiniSectorSize #1 << i16(header, 32) self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56) @@ -1254,19 +1400,22 @@ class OleFileIO: self.ministream = None self.minifatsect = self.MiniFatStart #i32(header, 60) + def close(self): """ close the OLE file, to release the file object """ self.fp.close() + def _check_duplicate_stream(self, first_sect, minifat=False): """ Checks if a stream has not been already referenced elsewhere. This method should only be called once for each known stream, and only if stream size is not null. - :param first_sect: index of first sector of the stream in FAT - :param minifat: if True, stream is located in the MiniFAT, else in the FAT + + :param first_sect: int, index of first sector of the stream in FAT + :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT """ if minifat: debug('_check_duplicate_stream: sect=%d in MiniFAT' % first_sect) @@ -1284,13 +1433,14 @@ class OleFileIO: else: used_streams.append(first_sect) + def dumpfat(self, fat, firstindex=0): "Displays a part of FAT in human-readable form for debugging purpose" # [PL] added only for debug if not DEBUG_MODE: return # dictionary to convert special FAT values in human-readable strings - VPL=8 # valeurs par ligne (8+1 * 8+1 = 81) + VPL = 8 # values per line (8+1 * 8+1 = 81) fatnames = { FREESECT: "..free..", ENDOFCHAIN: "[ END. ]", @@ -1310,22 +1460,26 @@ class OleFileIO: if i>=nbsect: break sect = fat[i] - if sect in fatnames: - nom = fatnames[sect] + aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND + if aux in fatnames: + name = fatnames[aux] else: if sect == i+1: - nom = " --->" + name = " --->" else: - nom = "%8X" % sect - print(nom, end=" ") + name = "%8X" % sect + print(name, end=" ") print() + def dumpsect(self, sector, firstindex=0): "Displays a sector in a human-readable form, for debugging purpose." if not DEBUG_MODE: return VPL=8 # number of values per line (8+1 * 8+1 = 81) tab = array.array(UINT32, sector) + if sys.byteorder == 'big': + tab.byteswap() nbsect = len(tab) nlines = (nbsect+VPL-1)//VPL print("index", end=" ") @@ -1339,8 +1493,8 @@ class OleFileIO: if i>=nbsect: break sect = tab[i] - nom = "%8X" % sect - print(nom, end=" ") + name = "%8X" % sect + print(name, end=" ") print() def sect2array(self, sect): @@ -1354,6 +1508,7 @@ class OleFileIO: a.byteswap() return a + def loadfat_sect(self, sect): """ Adds the indexes of the given sector to the FAT @@ -1371,9 +1526,11 @@ class OleFileIO: self.dumpsect(sect) # The FAT is a sector chain starting at the first index of itself. for isect in fat1: - #print("isect = %X" % isect) + isect = isect & 0xFFFFFFFF # JYTHON-WORKAROUND + debug("isect = %X" % isect) if isect == ENDOFCHAIN or isect == FREESECT: # the end of the sector chain has been reached + debug("found end of sector chain") break # read the FAT sector s = self.getsect(isect) @@ -1383,13 +1540,15 @@ class OleFileIO: self.fat = self.fat + nextfat return isect + def loadfat(self, header): """ Load the FAT table. """ - # The header contains a sector numbers - # for the first 109 FAT sectors. Additional sectors are - # described by DIF blocks + # The 1st sector of the file contains sector numbers for the first 109 + # FAT sectors, right after the header which is 76 bytes long. + # (always 109, whatever the sector size: 512 bytes = 76+4*109) + # Additional sectors are described by DIF blocks sect = header[76:512] debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) ) @@ -1418,24 +1577,27 @@ class OleFileIO: if self.sectDifStart >= self.nb_sect: # initial DIFAT block index must be valid self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') - debug("DIFAT analysis...") + debug( "DIFAT analysis..." ) # We compute the necessary number of DIFAT sectors : - # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) - nb_difat = (self.csectFat-109 + 126)//127 - debug("nb_difat = %d" % nb_difat) + # Number of pointers per DIFAT sector = (sectorsize/4)-1 + # (-1 because the last pointer is the next DIFAT sector number) + nb_difat_sectors = (self.sectorsize//4)-1 + # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) + nb_difat = (self.csectFat-109 + nb_difat_sectors-1)//nb_difat_sectors + debug( "nb_difat = %d" % nb_difat ) if self.csectDif != nb_difat: raise IOError('incorrect DIFAT') isect_difat = self.sectDifStart for i in iterrange(nb_difat): - debug("DIFAT block %d, sector %X" % (i, isect_difat)) + debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) #TODO: check if corresponding FAT SID = DIFSECT sector_difat = self.getsect(isect_difat) difat = self.sect2array(sector_difat) self.dumpsect(sector_difat) - self.loadfat_sect(difat[:127]) + self.loadfat_sect(difat[:nb_difat_sectors]) # last DIFAT pointer is next DIFAT sector: - isect_difat = difat[127] - debug("next DIFAT sector: %X" % isect_difat) + isect_difat = difat[nb_difat_sectors] + debug( "next DIFAT sector: %X" % isect_difat ) # checks: if isect_difat not in [ENDOFCHAIN, FREESECT]: # last DIFAT pointer value must be ENDOFCHAIN or FREESECT @@ -1453,6 +1615,7 @@ class OleFileIO: debug('\nFAT:') self.dumpfat(self.fat) + def loadminifat(self): """ Load the MiniFAT table. @@ -1491,10 +1654,16 @@ class OleFileIO: """ Read given sector from file on disk. - :param sect: sector index + :param sect: int, sector index :returns: a string containing the sector data. """ - # [PL] this original code was wrong when sectors are 4KB instead of + # From [MS-CFB]: A sector number can be converted into a byte offset + # into the file by using the following formula: + # (sector number + 1) x Sector Size. + # This implies that sector #0 of the file begins at byte offset Sector + # Size, not at 0. + + # [PL] the original code in PIL was wrong when sectors are 4KB instead of # 512 bytes: #self.fp.seek(512 + self.sectorsize * sect) #[PL]: added safety checks: @@ -1512,6 +1681,34 @@ class OleFileIO: self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') return sector + + def write_sect(self, sect, data, padding=b'\x00'): + """ + Write given sector to file on disk. + + :param sect: int, sector index + :param data: bytes, sector data + :param padding: single byte, padding character if data < sector size + """ + if not isinstance(data, bytes): + raise TypeError("write_sect: data must be a bytes string") + if not isinstance(padding, bytes) or len(padding)!=1: + raise TypeError("write_sect: padding must be a bytes string of 1 char") + #TODO: we could allow padding=None for no padding at all + try: + self.fp.seek(self.sectorsize * (sect+1)) + except: + debug('write_sect(): sect=%X, seek=%d, filesize=%d' % + (sect, self.sectorsize*(sect+1), self._filesize)) + self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') + if len(data) < self.sectorsize: + # add padding + data += padding * (self.sectorsize - len(data)) + elif len(data) < self.sectorsize: + raise ValueError("Data is larger than sector size") + self.fp.write(data) + + def loaddirectory(self, sect): """ Load the directory. @@ -1541,12 +1738,13 @@ class OleFileIO: ## break ## self.direntries.append(_OleDirectoryEntry(entry, sid, self)) # load root entry: - self._load_direntry(0) + root_entry = self._load_direntry(0) # Root entry is the first entry: self.root = self.direntries[0] # read and build all storage trees, starting from the root: self.root.build_storage_tree() + def _load_direntry (self, sid): """ Load a directory entry from the directory. @@ -1555,6 +1753,7 @@ class OleFileIO: :param sid: index of storage/stream in the directory. :returns: a _OleDirectoryEntry object + :exception IOError: if the entry has always been referenced. """ # check if SID is OK: @@ -1571,12 +1770,14 @@ class OleFileIO: self.direntries[sid] = _OleDirectoryEntry(entry, sid, self) return self.direntries[sid] + def dumpdirectory(self): """ Dump directory (for debugging only) """ self.root.dump() + def _open(self, start, size = 0x7FFFFFFF, force_FAT=False): """ Open a stream, either in FAT or MiniFAT according to its size. @@ -1585,7 +1786,7 @@ class OleFileIO: :param start: index of first sector :param size: size of stream (or nothing if size is unknown) :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT - according to size. If True, it will always be opened in FAT. + according to size. If True, it will always be opened in FAT. """ debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' % (start, size, str(force_FAT))) @@ -1602,51 +1803,60 @@ class OleFileIO: (self.root.isectStart, size_ministream)) self.ministream = self._open(self.root.isectStart, size_ministream, force_FAT=True) - return _OleStream(self.ministream, start, size, 0, - self.minisectorsize, self.minifat, - self.ministream.size) + return _OleStream(fp=self.ministream, sect=start, size=size, + offset=0, sectorsize=self.minisectorsize, + fat=self.minifat, filesize=self.ministream.size) else: # standard stream - return _OleStream(self.fp, start, size, 512, - self.sectorsize, self.fat, self._filesize) + return _OleStream(fp=self.fp, sect=start, size=size, + offset=self.sectorsize, + sectorsize=self.sectorsize, fat=self.fat, + filesize=self._filesize) + def _list(self, files, prefix, node, streams=True, storages=False): """ - (listdir helper) + listdir helper + :param files: list of files to fill in :param prefix: current location in storage tree (list of names) :param node: current node (_OleDirectoryEntry object) :param streams: bool, include streams if True (True by default) - new in v0.26 :param storages: bool, include storages if True (False by default) - new in v0.26 - (note: the root storage is never included) + (note: the root storage is never included) """ prefix = prefix + [node.name] for entry in node.kids: - if entry.kids: + if entry.entry_type == STGTY_STORAGE: # this is a storage if storages: # add it to the list files.append(prefix[1:] + [entry.name]) # check its kids self._list(files, prefix, entry, streams, storages) - else: + elif entry.entry_type == STGTY_STREAM: # this is a stream if streams: # add it to the list files.append(prefix[1:] + [entry.name]) + else: + self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an entry which is not a stream nor a storage.') + def listdir(self, streams=True, storages=False): """ - Return a list of streams stored in this file + Return a list of streams and/or storages stored in this file :param streams: bool, include streams if True (True by default) - new in v0.26 :param storages: bool, include storages if True (False by default) - new in v0.26 (note: the root storage is never included) + :returns: list of stream and/or storage paths """ files = [] self._list(files, [], self.root, streams, storages) return files + def _find(self, filename): """ Returns directory entry of given filename. (openstream helper) @@ -1656,10 +1866,11 @@ class OleFileIO: - a string using Unix path syntax, for example: 'storage_1/storage_1.2/stream' - - a list of storage filenames, path to the desired stream/storage. + - or a list of storage filenames, path to the desired stream/storage. Example: ['storage_1', 'storage_1.2', 'stream'] + :returns: sid of requested filename - raise IOError if file not found + :exception IOError: if file not found """ # if filename is a string instead of a list, split it on slashes to @@ -1677,15 +1888,17 @@ class OleFileIO: node = kid return node.sid + def openstream(self, filename): """ Open a stream as a read-only file object (BytesIO). + Note: filename is case-insensitive. :param filename: path of stream in storage tree (except root entry), either: - a string using Unix path syntax, for example: 'storage_1/storage_1.2/stream' - - a list of storage filenames, path to the desired stream/storage. + - or a list of storage filenames, path to the desired stream/storage. Example: ['storage_1', 'storage_1.2', 'stream'] :returns: file object (read-only) @@ -1697,6 +1910,68 @@ class OleFileIO: raise IOError("this file is not a stream") return self._open(entry.isectStart, entry.size) + + def write_stream(self, stream_name, data): + """ + Write a stream to disk. For now, it is only possible to replace an + existing stream by data of the same size. + + :param stream_name: path of stream in storage tree (except root entry), either: + + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - or a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + + :param data: bytes, data to be written, must be the same size as the original + stream. + """ + if not isinstance(data, bytes): + raise TypeError("write_stream: data must be a bytes string") + sid = self._find(stream_name) + entry = self.direntries[sid] + if entry.entry_type != STGTY_STREAM: + raise IOError("this is not a stream") + size = entry.size + if size != len(data): + raise ValueError("write_stream: data must be the same size as the existing stream") + if size < self.minisectorcutoff: + raise NotImplementedError("Writing a stream in MiniFAT is not implemented yet") + sect = entry.isectStart + # number of sectors to write + nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize + debug('nb_sectors = %d' % nb_sectors) + for i in range(nb_sectors): +## try: +## self.fp.seek(offset + self.sectorsize * sect) +## except: +## debug('sect=%d, seek=%d' % +## (sect, offset+self.sectorsize*sect)) +## raise IOError('OLE sector index out of range') + # extract one sector from data, the last one being smaller: + if i<(nb_sectors-1): + data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize] + #TODO: comment this if it works + assert(len(data_sector)==self.sectorsize) + else: + data_sector = data [i*self.sectorsize:] + #TODO: comment this if it works + debug('write_stream: size=%d sectorsize=%d data_sector=%d size%%sectorsize=%d' + % (size, self.sectorsize, len(data_sector), size % self.sectorsize)) + assert(len(data_sector) % self.sectorsize==size % self.sectorsize) + self.write_sect(sect, data_sector) +## self.fp.write(data_sector) + # jump to next sector in the FAT: + try: + sect = self.fat[sect] + except IndexError: + # [PL] if pointer is out of the FAT an exception is raised + raise IOError('incorrect OLE FAT, sector index out of range') + #[PL] Last sector should be a "end of chain" marker: + if sect != ENDOFCHAIN: + raise IOError('incorrect last sector index in OLE stream') + + def get_type(self, filename): """ Test if given filename exists as a stream or a storage in the OLE @@ -1716,6 +1991,7 @@ class OleFileIO: except: return False + def getmtime(self, filename): """ Return modification time of a stream/storage. @@ -1731,6 +2007,7 @@ class OleFileIO: entry = self.direntries[sid] return entry.getmtime() + def getctime(self, filename): """ Return creation time of a stream/storage. @@ -1746,20 +2023,23 @@ class OleFileIO: entry = self.direntries[sid] return entry.getctime() + def exists(self, filename): """ Test if given filename exists as a stream or a storage in the OLE container. + Note: filename is case-insensitive. :param filename: path of stream in storage tree. (see openstream for syntax) :returns: True if object exist, else False. """ try: - self._find(filename) + sid = self._find(filename) return True except: return False + def get_size(self, filename): """ Return size of a stream in the OLE container, in bytes. @@ -1767,7 +2047,7 @@ class OleFileIO: :param filename: path of stream in storage tree (see openstream for syntax) :returns: size in bytes (long integer) :exception IOError: if file not found - :exception TypeError: if this is not a stream + :exception TypeError: if this is not a stream. """ sid = self._find(filename) entry = self.direntries[sid] @@ -1776,6 +2056,7 @@ class OleFileIO: raise TypeError('object is not an OLE stream') return entry.size + def get_rootentry_name(self): """ Return root entry name. Should usually be 'Root Entry' or 'R' in most @@ -1783,6 +2064,7 @@ class OleFileIO: """ return self.root.name + def getproperties(self, filename, convert_time=False, no_conversion=None): """ Return properties described in substream. @@ -1791,10 +2073,12 @@ class OleFileIO: :param convert_time: bool, if True timestamps will be converted to Python datetime :param no_conversion: None or list of int, timestamps not to be converted (for example total editing time is not a real timestamp) + :returns: a dictionary of values indexed by id (integer) """ + #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx # make sure no_conversion is a list, just to simplify code below: - if no_conversion is None: + if no_conversion == None: no_conversion = [] # stream path as a string to report exceptions: streampath = filename @@ -1808,11 +2092,11 @@ class OleFileIO: try: # header s = fp.read(28) - # clsid = _clsid(s[8:24]) + clsid = _clsid(s[8:24]) # format id s = fp.read(20) - # fmtid = _clsid(s[:16]) + fmtid = _clsid(s[:16]) fp.seek(i32(s, 16)) # get section @@ -1830,34 +2114,34 @@ class OleFileIO: for i in range(num_props): try: - id = 0 # just in case of an exception + id = 0 # just in case of an exception id = i32(s, 8+i*8) offset = i32(s, 12+i*8) type = i32(s, offset) - debug('property id=%d: type=%d offset=%X' % (id, type, offset)) + debug ('property id=%d: type=%d offset=%X' % (id, type, offset)) # test for common types first (should perhaps use # a dictionary instead?) - if type == VT_I2: # 16-bit signed integer + if type == VT_I2: # 16-bit signed integer value = i16(s, offset+4) if value >= 32768: value = value - 65536 - elif type == VT_UI2: # 2-byte unsigned integer + elif type == VT_UI2: # 2-byte unsigned integer value = i16(s, offset+4) elif type in (VT_I4, VT_INT, VT_ERROR): # VT_I4: 32-bit signed integer # VT_ERROR: HRESULT, similar to 32-bit signed integer, # see http://msdn.microsoft.com/en-us/library/cc230330.aspx value = i32(s, offset+4) - elif type in (VT_UI4, VT_UINT): # 4-byte unsigned integer - value = i32(s, offset+4) # FIXME + elif type in (VT_UI4, VT_UINT): # 4-byte unsigned integer + value = i32(s, offset+4) # FIXME elif type in (VT_BSTR, VT_LPSTR): # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx # size is a 32 bits integer, including the null terminator, and # possibly trailing or embedded null chars - # TODO: if codepage is unicode, the string should be converted as such + #TODO: if codepage is unicode, the string should be converted as such count = i32(s, offset+4) value = s[offset+8:offset+8+count-1] # remove all null chars: @@ -1873,9 +2157,9 @@ class OleFileIO: # "the string should NOT contain embedded or additional trailing # null characters." count = i32(s, offset+4) - value = _unicode(s[offset+8:offset+8+count*2]) + value = self._decode_utf16_str(s[offset+8:offset+8+count*2]) elif type == VT_FILETIME: - value = long(i32(s, offset+4)) + (long(i32(s, offset+8)) << 32) + value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) # FILETIME is a 64-bit int: "number of 100ns periods # since Jan 1,1601". if convert_time and id not in no_conversion: @@ -1889,8 +2173,8 @@ class OleFileIO: else: # legacy code kept for backward compatibility: returns a # number of seconds since Jan 1,1601 - value = value // 10000000 # seconds - elif type == VT_UI1: # 1-byte unsigned integer + value = value // 10000000 # seconds + elif type == VT_UI1: # 1-byte unsigned integer value = i8(s[offset+4]) elif type == VT_CLSID: value = _clsid(s[offset+4:offset+20]) @@ -1904,8 +2188,8 @@ class OleFileIO: # see http://msdn.microsoft.com/en-us/library/cc237864.aspx value = bool(i16(s, offset+4)) else: - value = None # everything else yields "None" - debug('property id=%d: type=%d not implemented in parser yet' % (id, type)) + value = None # everything else yields "None" + debug ('property id=%d: type=%d not implemented in parser yet' % (id, type)) # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE, # VT_DECIMAL, VT_I1, VT_I8, VT_UI8, @@ -1917,8 +2201,8 @@ class OleFileIO: # type of items, e.g. VT_VECTOR|VT_BSTR # see http://msdn.microsoft.com/en-us/library/dd942011.aspx - # print("%08x" % id, repr(value), end=" ") - # print("(%s)" % VT[i32(s, offset) & 0xFFF]) + #print("%08x" % id, repr(value), end=" ") + #print("(%s)" % VT[i32(s, offset) & 0xFFF]) data[id] = value except BaseException as exc: @@ -1949,105 +2233,112 @@ class OleFileIO: if __name__ == "__main__": + import sys + # [PL] display quick usage info if launched from command-line if len(sys.argv) <= 1: - print(__doc__) - print(""" -Launched from command line, this script parses OLE files and prints info. + print('olefile version %s %s - %s' % (__version__, __date__, __author__)) + print( +""" +Launched from the command line, this script parses OLE files and prints info. -Usage: OleFileIO_PL.py [-d] [-c] [file2 ...] +Usage: olefile.py [-d] [-c] [file2 ...] Options: --d : debug mode (display a lot of debug information, for developers only) +-d : debug mode (displays a lot of debug information, for developers only) -c : check all streams (for debugging purposes) + +For more information, see http://www.decalage.info/olefile """) sys.exit() check_streams = False for filename in sys.argv[1:]: - # try: - # OPTIONS: - if filename == '-d': - # option to switch debug mode on: - set_debug_mode(True) - continue - if filename == '-c': - # option to switch check streams mode on: - check_streams = True - continue +## try: + # OPTIONS: + if filename == '-d': + # option to switch debug mode on: + set_debug_mode(True) + continue + if filename == '-c': + # option to switch check streams mode on: + check_streams = True + continue - ole = OleFileIO(filename) #, raise_defects=DEFECT_INCORRECT) - print("-" * 68) - print(filename) - print("-" * 68) - ole.dumpdirectory() - for streamname in ole.listdir(): - if streamname[-1][0] == "\005": - print(streamname, ": properties") - props = ole.getproperties(streamname, convert_time=True) - props = sorted(props.items()) - for k, v in props: - #[PL]: avoid to display too large or binary values: - if isinstance(v, (basestring, bytes)): - if len(v) > 50: - v = v[:50] - if isinstance(v, bytes): - # quick and dirty binary check: - for c in (1, 2, 3, 4, 5, 6, 7, 11, 12, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31): - if c in bytearray(v): - v = '(binary data)' - break - print(" ", k, v) - - if check_streams: - # Read all streams to check if there are errors: - print('\nChecking streams...') + ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT) + print("-" * 68) + print(filename) + print("-" * 68) + ole.dumpdirectory() for streamname in ole.listdir(): - # print name using repr() to convert binary chars to \xNN: - print('-', repr('/'.join(streamname)), '-', end=' ') - st_type = ole.get_type(streamname) - if st_type == STGTY_STREAM: - print('size %d' % ole.get_size(streamname)) - # just try to read stream in memory: - ole.openstream(streamname) - else: - print('NOT a stream : type=%d' % st_type) + if streamname[-1][0] == "\005": + print(streamname, ": properties") + props = ole.getproperties(streamname, convert_time=True) + props = sorted(props.items()) + for k, v in props: + #[PL]: avoid to display too large or binary values: + if isinstance(v, (basestring, bytes)): + if len(v) > 50: + v = v[:50] + if isinstance(v, bytes): + # quick and dirty binary check: + for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20, + 21,22,23,24,25,26,27,28,29,30,31): + if c in bytearray(v): + v = '(binary data)' + break + print(" ", k, v) + + if check_streams: + # Read all streams to check if there are errors: + print('\nChecking streams...') + for streamname in ole.listdir(): + # print name using repr() to convert binary chars to \xNN: + print('-', repr('/'.join(streamname)),'-', end=' ') + st_type = ole.get_type(streamname) + if st_type == STGTY_STREAM: + print('size %d' % ole.get_size(streamname)) + # just try to read stream in memory: + ole.openstream(streamname) + else: + print('NOT a stream : type=%d' % st_type) + print() + +## for streamname in ole.listdir(): +## # print name using repr() to convert binary chars to \xNN: +## print('-', repr('/'.join(streamname)),'-', end=' ') +## print(ole.getmtime(streamname)) +## print() + + print('Modification/Creation times of all directory entries:') + for entry in ole.direntries: + if entry is not None: + print('- %s: mtime=%s ctime=%s' % (entry.name, + entry.getmtime(), entry.getctime())) print() -## for streamname in ole.listdir(): -## # print name using repr() to convert binary chars to \xNN: -## print('-', repr('/'.join(streamname)),'-', end=' ') -## print(ole.getmtime(streamname)) -## print() + # parse and display metadata: + meta = ole.get_metadata() + meta.dump() + print() + #[PL] Test a few new methods: + root = ole.get_rootentry_name() + print('Root entry name: "%s"' % root) + if ole.exists('worddocument'): + print("This is a Word document.") + print("type of stream 'WordDocument':", ole.get_type('worddocument')) + print("size :", ole.get_size('worddocument')) + if ole.exists('macros/vba'): + print("This document may contain VBA macros.") - print('Modification/Creation times of all directory entries:') - for entry in ole.direntries: - if entry is not None: - print('- %s: mtime=%s ctime=%s' % (entry.name, - entry.getmtime(), entry.getctime())) - print() - - # parse and display metadata: - meta = ole.get_metadata() - meta.dump() - print() - # [PL] Test a few new methods: - root = ole.get_rootentry_name() - print('Root entry name: "%s"' % root) - if ole.exists('worddocument'): - print("This is a Word document.") - print("type of stream 'WordDocument':", ole.get_type('worddocument')) - print("size :", ole.get_size('worddocument')) - if ole.exists('macros/vba'): - print("This document may contain VBA macros.") - - # print parsing issues: - print('\nNon-fatal issues raised during parsing:') - if ole.parsing_issues: - for exctype, msg in ole.parsing_issues: - print('- %s: %s' % (exctype.__name__, msg)) - else: - print('None') + # print parsing issues: + print('\nNon-fatal issues raised during parsing:') + if ole.parsing_issues: + for exctype, msg in ole.parsing_issues: + print('- %s: %s' % (exctype.__name__, msg)) + else: + print('None') ## except IOError as v: ## print("***", "cannot read", file, "-", v) + +# this code was developed while listening to The Wedding Present "Sea Monsters" \ No newline at end of file