- new class OleMetadata to parse standard properties

- added get_metadata method to OleFileIO
This commit is contained in:
decalage 2013-05-05 23:52:20 +02:00 committed by Martin Panter
parent 25158fe8b1
commit d5166fd97e

View File

@ -6,7 +6,7 @@ OleFileIO_PL:
Microsoft Compound Document File Format), such as Microsoft Office Microsoft Compound Document File Format), such as Microsoft Office
documents, Image Composer and FlashPix files, Outlook messages, ... documents, Image Composer and FlashPix files, Outlook messages, ...
version 0.24 2013-05-03 Philippe Lagadec - http://www.decalage.info version 0.24 2013-05-05 Philippe Lagadec - http://www.decalage.info
Project website: http://www.decalage.info/python/olefileio Project website: http://www.decalage.info/python/olefileio
@ -24,7 +24,7 @@ WARNING: THIS IS (STILL) WORK IN PROGRESS.
""" """
__author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)" __author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)"
__date__ = "2013-05-03" __date__ = "2013-05-05"
__version__ = '0.24' __version__ = '0.24'
#--- LICENSE ------------------------------------------------------------------ #--- LICENSE ------------------------------------------------------------------
@ -110,9 +110,11 @@ __version__ = '0.24'
# (https://bitbucket.org/decalage/olefileio_pl/issue/7) # (https://bitbucket.org/decalage/olefileio_pl/issue/7)
# - added close method to OleFileIO (fixed issue #2) # - added close method to OleFileIO (fixed issue #2)
# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) # 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr)
# 2013-05-03 v0.24 PL: - getproperties: added conversion from filetime to python # 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python
# datetime # datetime
# - main: displays properties with date format # - main: displays properties with date format
# - new class OleMetadata to parse standard properties
# - added get_metadata method
#----------------------------------------------------------------------------- #-----------------------------------------------------------------------------
@ -428,6 +430,107 @@ except NameError:
#=== CLASSES ================================================================== #=== CLASSES ==================================================================
class OleMetadata:
    """
    Class to parse and store metadata from the standard property streams
    of an OLE file.

    Every name listed in SUMMARY_ATTRIBS and DOCSUM_ATTRIBS is available as
    an attribute of the instance. An attribute is None until
    parse_properties() finds the corresponding property in the file.

    References for SummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd942545.aspx
    - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - http://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/summary-information-stream/
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html
    """

    # attribute names for SummaryInformation stream properties
    # (the position in the list gives the property id: first entry is id 1):
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments',
        'template', 'last_saved_by', 'revision_number', 'total_edit_time',
        'last_printed', 'create_time', 'last_saved_time', 'num_pages',
        'num_words', 'num_chars', 'thumbnail', 'creating_application',
        'security']

    # attribute names for DocumentSummaryInformation stream properties
    # (the position in the list gives the property id: first entry is id 1):
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs',
        'slides', 'notes', 'hidden_slides', 'mm_clips',
        'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
        'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
        'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
        'content_type', 'content_status', 'language', 'doc_version']

    def __init__(self):
        """
        Create an empty metadata object: every attribute named in
        SUMMARY_ATTRIBS and DOCSUM_ATTRIBS is set to None, so that dump()
        and attribute access work even before parse_properties() is called.
        """
        self._reset_attributes()

    def _reset_attributes(self):
        """
        Set all the metadata attributes of both streams to None.
        """
        for attrib in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)

    def _parse_stream(self, olefile, stream_name, attribs):
        """
        Read one property stream, if present, into this object's attributes.

        olefile: object providing exists() and getproperties()
        stream_name: name of the property stream to read
        attribs: attribute names, ordered by property id starting at 1
        """
        if not olefile.exists(stream_name):
            # stream is absent: the attributes keep their None value
            return
        # get properties from the stream:
        props = olefile.getproperties(stream_name, convert_time=True)
        # store them into this object's attributes:
        for index, attrib in enumerate(attribs):
            # property ids are consecutive and start at 0x01
            setattr(self, attrib, props.get(index + 1, None))

    def parse_properties(self, olefile):
        """
        Parse standard properties of an OLE file, from the streams
        "\\x05SummaryInformation" and "\\x05DocumentSummaryInformation".

        olefile: object providing exists(name) and
                 getproperties(name, convert_time=...), such as OleFileIO

        All attributes are first reset to None, so that values from a
        previous call never survive when a stream or property is missing.
        """
        self._reset_attributes()
        self._parse_stream(olefile, "\x05SummaryInformation",
            self.SUMMARY_ATTRIBS)
        self._parse_stream(olefile, "\x05DocumentSummaryInformation",
            self.DOCSUM_ATTRIBS)

    def dump(self):
        """
        Print all the metadata attributes to stdout, one per line,
        grouped by property stream.
        """
        # print(...) with a single argument gives the same output whether
        # print is a statement or a function.
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            value = getattr(self, prop)
            print('- %s: %s' % (prop, value))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            value = getattr(self, prop)
            print('- %s: %s' % (prop, value))
#--- _OleStream --------------------------------------------------------------- #--- _OleStream ---------------------------------------------------------------
class _OleStream(StringIO.StringIO): class _OleStream(StringIO.StringIO):
@ -1598,6 +1701,16 @@ class OleFileIO:
return data return data
def get_metadata(self):
    """
    Build an OleMetadata object from the standard property streams of
    this file and return it.

    The same object is also kept in the metadata attribute of this
    OleFileIO object; each call replaces it with a newly parsed one.
    """
    # the attribute is assigned before parsing, so it is already set
    # while parse_properties() runs
    self.metadata = meta = OleMetadata()
    meta.parse_properties(self)
    return meta
# #
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# This script can be used to dump the directory of any OLE2 structured # This script can be used to dump the directory of any OLE2 structured
@ -1673,6 +1786,10 @@ Options:
print 'NOT a stream : type=%d' % st_type print 'NOT a stream : type=%d' % st_type
print '' print ''
# parse and display metadata:
meta = ole.get_metadata()
meta.dump()
print ''
#[PL] Test a few new methods: #[PL] Test a few new methods:
root = ole.get_rootentry_name() root = ole.get_rootentry_name()
print 'Root entry name: "%s"' % root print 'Root entry name: "%s"' % root