diff --git a/thirdparty/__init__.py b/thirdparty/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/thirdparty/ansistrm/__init__.py b/thirdparty/ansistrm/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/thirdparty/ansistrm/__init__.py~ b/thirdparty/ansistrm/__init__.py~
new file mode 100644
index 000000000..72630d2e8
--- /dev/null
+++ b/thirdparty/ansistrm/__init__.py~
@@ -0,0 +1,8 @@
+#!/usr/bin/env python
+
+"""
+Copyright (c) 2006-2012 sqlmap developers (http://sqlmap.org/)
+See the file 'doc/COPYING' for copying permission
+"""
+
+pass
diff --git a/thirdparty/ansistrm/ansistrm.py b/thirdparty/ansistrm/ansistrm.py
new file mode 100644
index 000000000..62e03b37d
--- /dev/null
+++ b/thirdparty/ansistrm/ansistrm.py
@@ -0,0 +1,150 @@
+#
+# Copyright (C) 2010-2012 Vinay Sajip. All rights reserved. Licensed under the new BSD license.
+#
+import logging
+import os
+import re
+
+class ColorizingStreamHandler(logging.StreamHandler):
+    """Stream handler that colours each formatted record by its level.
+
+    ``format`` wraps the formatted text in an ANSI colour sequence picked
+    from ``level_map``; ``emit`` writes it to ``self.stream``.  When the
+    stream reports itself as a terminal the text goes through
+    ``output_colorized``, which on Windows replaces the ANSI sequences
+    with console text-attribute calls.
+    """
+
+    # color names to indices
+    color_map = {
+        'black': 0,
+        'red': 1,
+        'green': 2,
+        'yellow': 3,
+        'blue': 4,
+        'magenta': 5,
+        'cyan': 6,
+        'white': 7,
+    }
+
+    # levels to (background, foreground, bold/intense); the table is the
+    # same on every platform, so it is defined once instead of per OS.
+    level_map = {
+        logging.DEBUG: (None, 'blue', False),
+        logging.INFO: (None, 'green', False),
+        logging.WARNING: (None, 'yellow', False),
+        logging.ERROR: (None, 'red', False),
+        logging.CRITICAL: ('red', 'white', False)
+    }
+    csi = '\x1b['
+    reset = '\x1b[0m'
+
+    @property
+    def is_tty(self):
+        """Truthy when the stream has an ``isatty`` method that returns true."""
+        isatty = getattr(self.stream, 'isatty', None)
+        return isatty and isatty()
+
+    def emit(self, record):
+        """Format ``record``, write it followed by the terminator and flush.
+
+        KeyboardInterrupt and SystemExit are re-raised; any other
+        exception is passed to ``handleError``.
+        """
+        try:
+            message = self.format(record)
+            stream = self.stream
+
+            if not self.is_tty:
+                stream.write(message)
+            else:
+                self.output_colorized(message)
+            # Handlers without a 'terminator' attribute fall back to '\n'.
+            stream.write(getattr(self, 'terminator', '\n'))
+
+            self.flush()
+        except (KeyboardInterrupt, SystemExit):
+            raise
+        except Exception:
+            self.handleError(record)
+
+    if os.name != 'nt':
+        def output_colorized(self, message):
+            """Write ``message`` as is, ANSI sequences included."""
+            self.stream.write(message)
+    else:
+        # One capture group: the ';'-separated numeric parameters of an
+        # "ESC [ ... m" sequence.
+        ansi_esc = re.compile(r'\x1b\[((?:\d+)(?:;(?:\d+))*)m')
+
+        # ANSI colour index -> Windows console attribute bits.
+        nt_color_map = {
+            0: 0x00,    # black
+            1: 0x04,    # red
+            2: 0x02,    # green
+            3: 0x06,    # yellow
+            4: 0x01,    # blue
+            5: 0x05,    # magenta
+            6: 0x03,    # cyan
+            7: 0x07,    # white
+        }
+
+        def output_colorized(self, message):
+            """Write ``message``, turning ANSI colour sequences into
+            SetConsoleTextAttribute calls.
+
+            The sequences are always removed from the written text; the
+            console calls are only made when the stream's file
+            descriptor is 1 or 2.
+            """
+            import ctypes
+
+            write = self.stream.write
+            h = None
+            fd = getattr(self.stream, 'fileno', None)
+
+            if fd is not None:
+                fd = fd()
+
+                if fd in (1, 2): # stdout or stderr
+                    # -11 and -12 are the values passed for fd 1 and fd 2.
+                    h = ctypes.windll.kernel32.GetStdHandle(-10 - fd)
+
+            # split() with one capture group alternates plain text (even
+            # positions) with the captured parameter strings (odd ones).
+            for position, part in enumerate(self.ansi_esc.split(message)):
+                if position % 2 == 0:
+                    if part:
+                        write(part)
+                elif h is not None:
+                    color = 0
+
+                    for p in [int(value) for value in part.split(';')]:
+                        if 40 <= p <= 47:
+                            color |= self.nt_color_map[p - 40] << 4
+                        elif 30 <= p <= 37:
+                            color |= self.nt_color_map[p - 30]
+                        elif p == 1:
+                            color |= 0x08 # foreground intensity on
+                        elif p == 0: # reset to default color
+                            color = 0x07
+                        else:
+                            pass # error condition ignored
+
+                    ctypes.windll.kernel32.SetConsoleTextAttribute(h, color)
+
+    def colorize(self, message, record):
+        """Return ``message`` wrapped in the colour sequence for the
+        record's level.
+
+        Leading whitespace stays outside the coloured span.  The message
+        is returned unchanged when it is empty or when the level has no
+        entry in ``level_map``.
+        """
+        if record.levelno in self.level_map:
+            bg, fg, bold = self.level_map[record.levelno]
+            params = []
+
+            if bg in self.color_map:
+                params.append(str(self.color_map[bg] + 40))
+
+            if fg in self.color_map:
+                params.append(str(self.color_map[fg] + 30))
+
+            if bold:
+                params.append('1')
+
+            if params and message:
+                if message.lstrip() != message:
+                    # The message starts with whitespace, so the first
+                    # whitespace run found is the leading one.
+                    prefix = re.search(r"\s+", message).group(0)
+                    message = message[len(prefix):]
+                else:
+                    prefix = ""
+
+                message = "%s%s" % (prefix, ''.join((self.csi, ';'.join(params),
+                                                     'm', message, self.reset)))
+
+        return message
+
+    def format(self, record):
+        """Format ``record`` with the parent class, then colorize it."""
+        message = logging.StreamHandler.format(self, record)
+        return self.colorize(message, record)
diff --git a/thirdparty/ansistrm/ansistrm.py~ b/thirdparty/ansistrm/ansistrm.py~
new file mode 100644
index 000000000..97cdae649
--- /dev/null
+++ b/thirdparty/ansistrm/ansistrm.py~
@@ -0,0 +1,150 @@
+#
+# Copyright (C) 2010-2012 Vinay Sajip. All rights reserved. Licensed under the new BSD license.
+#
+import logging
+import os
+import re
+
+class ColorizingStreamHandler(logging.StreamHandler):
+    """Stream handler that colours each formatted record by its level.
+
+    ``format`` wraps the formatted text in an ANSI colour sequence picked
+    from ``level_map``; ``emit`` writes it to ``self.stream``.  When the
+    stream reports itself as a terminal the text goes through
+    ``output_colorized``, which on Windows replaces the ANSI sequences
+    with console text-attribute calls.
+    """
+
+    # color names to indices
+    color_map = {
+        'black': 0,
+        'red': 1,
+        'green': 2,
+        'yellow': 3,
+        'blue': 4,
+        'magenta': 5,
+        'cyan': 6,
+        'white': 7,
+    }
+
+    # levels to (background, foreground, bold/intense); the table is the
+    # same on every platform, so it is defined once instead of per OS.
+    level_map = {
+        logging.DEBUG: (None, 'blue', False),
+        logging.INFO: (None, 'green', False),
+        logging.WARNING: (None, 'yellow', False),
+        logging.ERROR: (None, 'red', False),
+        logging.CRITICAL: ('red', 'white', False)
+    }
+    csi = '\x1b['
+    reset = '\x1b[0m'
+
+    @property
+    def is_tty(self):
+        """Truthy when the stream has an ``isatty`` method that returns true."""
+        isatty = getattr(self.stream, 'isatty', None)
+        return isatty and isatty()
+
+    def emit(self, record):
+        """Format ``record``, write it followed by the terminator and flush.
+
+        KeyboardInterrupt and SystemExit are re-raised; any other
+        exception is passed to ``handleError``.
+        """
+        try:
+            message = self.format(record)
+            stream = self.stream
+
+            if not self.is_tty:
+                stream.write(message)
+            else:
+                self.output_colorized(message)
+            # Handlers without a 'terminator' attribute fall back to '\n'.
+            stream.write(getattr(self, 'terminator', '\n'))
+
+            self.flush()
+        except (KeyboardInterrupt, SystemExit):
+            raise
+        except Exception:
+            self.handleError(record)
+
+    if os.name != 'nt':
+        def output_colorized(self, message):
+            """Write ``message`` as is, ANSI sequences included."""
+            self.stream.write(message)
+    else:
+        # One capture group: the ';'-separated numeric parameters of an
+        # "ESC [ ... m" sequence.
+        ansi_esc = re.compile(r'\x1b\[((?:\d+)(?:;(?:\d+))*)m')
+
+        # ANSI colour index -> Windows console attribute bits.
+        nt_color_map = {
+            0: 0x00,    # black
+            1: 0x04,    # red
+            2: 0x02,    # green
+            3: 0x06,    # yellow
+            4: 0x01,    # blue
+            5: 0x05,    # magenta
+            6: 0x03,    # cyan
+            7: 0x07,    # white
+        }
+
+        def output_colorized(self, message):
+            """Write ``message``, turning ANSI colour sequences into
+            SetConsoleTextAttribute calls.
+
+            The sequences are always removed from the written text; the
+            console calls are only made when the stream's file
+            descriptor is 1 or 2.
+            """
+            import ctypes
+
+            write = self.stream.write
+            h = None
+            fd = getattr(self.stream, 'fileno', None)
+
+            if fd is not None:
+                fd = fd()
+
+                if fd in (1, 2): # stdout or stderr
+                    # -11 and -12 are the values passed for fd 1 and fd 2.
+                    h = ctypes.windll.kernel32.GetStdHandle(-10 - fd)
+
+            # split() with one capture group alternates plain text (even
+            # positions) with the captured parameter strings (odd ones).
+            for position, part in enumerate(self.ansi_esc.split(message)):
+                if position % 2 == 0:
+                    if part:
+                        write(part)
+                elif h is not None:
+                    color = 0
+
+                    for p in [int(value) for value in part.split(';')]:
+                        if 40 <= p <= 47:
+                            color |= self.nt_color_map[p - 40] << 4
+                        elif 30 <= p <= 37:
+                            color |= self.nt_color_map[p - 30]
+                        elif p == 1:
+                            color |= 0x08 # foreground intensity on
+                        elif p == 0: # reset to default color
+                            color = 0x07
+                        else:
+                            pass # error condition ignored
+
+                    ctypes.windll.kernel32.SetConsoleTextAttribute(h, color)
+
+    def colorize(self, message, record):
+        """Return ``message`` wrapped in the colour sequence for the
+        record's level.
+
+        Leading whitespace stays outside the coloured span.  The message
+        is returned unchanged when it is empty or when the level has no
+        entry in ``level_map``.
+        """
+        if record.levelno in self.level_map:
+            bg, fg, bold = self.level_map[record.levelno]
+            params = []
+
+            if bg in self.color_map:
+                params.append(str(self.color_map[bg] + 40))
+
+            if fg in self.color_map:
+                params.append(str(self.color_map[fg] + 30))
+
+            if bold:
+                params.append('1')
+
+            # An empty message is left alone so that it does not turn
+            # into a string made only of escape sequences.
+            if params and message:
+                if message.lstrip() != message:
+                    # The message starts with whitespace, so the first
+                    # whitespace run found is the leading one.
+                    prefix = re.search(r"\s+", message).group(0)
+                    message = message[len(prefix):]
+                else:
+                    prefix = ""
+
+                message = "%s%s" % (prefix, ''.join((self.csi, ';'.join(params),
+                                                     'm', message, self.reset)))
+
+        return message
+
+    def format(self, record):
+        """Format ``record`` with the parent class, then colorize it."""
+        message = logging.StreamHandler.format(self, record)
+        return self.colorize(message, record)
diff --git a/thirdparty/beautifulsoup/__init__.py b/thirdparty/beautifulsoup/__init__.py
new file mode 100644
index 000000000..7954a3d0a
--- /dev/null
+++ b/thirdparty/beautifulsoup/__init__.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2004-2010, Leonard Richardson
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# * Neither the name of the the Beautiful Soup Consortium and All
+# Night Kosher Bakery nor the names of its contributors may be
+# used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+#
+
+pass
diff --git a/thirdparty/beautifulsoup/beautifulsoup.py b/thirdparty/beautifulsoup/beautifulsoup.py
new file mode 100644
index 000000000..cde92ee11
--- /dev/null
+++ b/thirdparty/beautifulsoup/beautifulsoup.py
@@ -0,0 +1,2014 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses a (possibly invalid) XML or HTML document into a
+tree representation. It provides methods and Pythonic idioms that make
+it easy to navigate, search, and modify the tree.
+
+A well-formed XML/HTML document yields a well-formed data
+structure. An ill-formed XML/HTML document yields a correspondingly
+ill-formed data structure. If your document is only locally
+well-formed, you can use this library to find and process the
+well-formed part of it.
+
+Beautiful Soup works with Python 2.2 and up. It has no external
+dependencies, but you'll have more success at converting data to UTF-8
+if you also install these three packages:
+
+* chardet, for auto-detecting character encodings
+ http://chardet.feedparser.org/
+* cjkcodecs and iconv_codec, which add more encodings to the ones supported
+ by stock Python.
+ http://cjkpython.i18n.org/
+
+Beautiful Soup defines classes for two main parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+ language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+ or invalid. This class has web browser-like heuristics for
+ obtaining a sensible parse tree in the face of common HTML errors.
+
+Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
+the encoding of an HTML or XML document, and converting it to
+Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/documentation.html
+
+Here, have some legalese:
+
+Copyright (c) 2004-2010, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the the Beautiful Soup Consortium and All
+ Night Kosher Bakery nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "3.2.0"
+__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
+__license__ = "New-style BSD"
+
+from sgmllib import SGMLParser, SGMLParseError
+import codecs
+import markupbase
+import types
+import re
+import sgmllib
+try:
+ from htmlentitydefs import name2codepoint
+except ImportError:
+ name2codepoint = {}
+try:
+ set
+except NameError:
+ from sets import Set as set
+
+# These hacks make Beautiful Soup able to parse XML with namespaces:
+# both replacement patterns accept ':' inside a name, so a prefixed
+# name such as "ns:tag" is matched as one name.
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
+
+# Default value of the 'encoding' argument of the __str__ methods below.
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+
+def _match_css_class(str):
+    """Build a RE to match the given CSS class.
+
+    The compiled pattern matches a whitespace-separated class attribute
+    value in which ``str`` appears as a whole word.  The class name is
+    escaped, so characters such as '.' or '+' are matched literally
+    instead of being read as regular-expression syntax.
+
+    NOTE: the parameter name shadows the builtin ``str``; it is kept
+    unchanged for backward compatibility.
+    """
+    return re.compile(r"(^|.*\s)%s($|\s)" % re.escape(str))
+
+# First, the classes that represent markup elements.
+
+class PageElement(object):
+    """Contains the navigational information for some part of the page
+    (either a tag or a piece of text)"""
+
+    def setup(self, parent=None, previous=None):
+        """Sets up the initial relations between this element and
+        other elements."""
+        self.parent = parent
+        self.previous = previous
+        self.next = None
+        self.previousSibling = None
+        self.nextSibling = None
+        if self.parent and self.parent.contents:
+            # The parent's current last child becomes our previous sibling.
+            self.previousSibling = self.parent.contents[-1]
+            self.previousSibling.nextSibling = self
+
+    def replaceWith(self, replaceWith):
+        """Puts 'replaceWith' in the place this element has in its parent.
+
+        This element is extracted first.  If the replacement is one of
+        this element's siblings, insert() accounts for the index shift
+        caused by moving it, so no adjustment is made here.
+        """
+        oldParent = self.parent
+        myIndex = self.parent.index(self)
+        self.extract()
+        oldParent.insert(myIndex, replaceWith)
+
+    def replaceWithChildren(self):
+        """Replaces this element with its own children, keeping their order."""
+        myParent = self.parent
+        myIndex = self.parent.index(self)
+        self.extract()
+        # Inserting at the same index in reverse order leaves the
+        # children in their original order.
+        reversedChildren = list(self.contents)
+        reversedChildren.reverse()
+        for child in reversedChildren:
+            myParent.insert(myIndex, child)
+
+    def extract(self):
+        """Destructively rips this element out of the tree."""
+        if self.parent:
+            try:
+                del self.parent.contents[self.parent.index(self)]
+            except ValueError:
+                pass
+
+        #Find the two elements that would be next to each other if
+        #this element (and any children) hadn't been parsed. Connect
+        #the two.
+        lastChild = self._lastRecursiveChild()
+        nextElement = lastChild.next
+
+        if self.previous:
+            self.previous.next = nextElement
+        if nextElement:
+            nextElement.previous = self.previous
+        self.previous = None
+        lastChild.next = None
+
+        self.parent = None
+        if self.previousSibling:
+            self.previousSibling.nextSibling = self.nextSibling
+        if self.nextSibling:
+            self.nextSibling.previousSibling = self.previousSibling
+        self.previousSibling = self.nextSibling = None
+        return self
+
+    def _lastRecursiveChild(self):
+        "Finds the last element beneath this object to be parsed."
+        lastChild = self
+        while hasattr(lastChild, 'contents') and lastChild.contents:
+            lastChild = lastChild.contents[-1]
+        return lastChild
+
+    def insert(self, position, newChild):
+        """Inserts 'newChild' into this element's contents at 'position'.
+
+        Plain strings are wrapped in NavigableString.  A child that is
+        already part of a tree is extracted from it first.  'position'
+        is clamped to the current number of children.
+        """
+        if isinstance(newChild, basestring) \
+            and not isinstance(newChild, NavigableString):
+            newChild = NavigableString(newChild)
+
+        position = min(position, len(self.contents))
+        if hasattr(newChild, 'parent') and newChild.parent is not None:
+            # We're 'inserting' an element that's already one
+            # of this object's children.
+            if newChild.parent is self:
+                index = self.index(newChild)
+                if index < position:
+                    # The child currently sits before the target
+                    # position, so extracting it shifts everything
+                    # after it, the target included, down by one.
+                    position = position - 1
+            newChild.extract()
+
+        newChild.parent = self
+        previousChild = None
+        if position == 0:
+            newChild.previousSibling = None
+            newChild.previous = self
+        else:
+            previousChild = self.contents[position-1]
+            newChild.previousSibling = previousChild
+            newChild.previousSibling.nextSibling = newChild
+            newChild.previous = previousChild._lastRecursiveChild()
+        if newChild.previous:
+            newChild.previous.next = newChild
+
+        newChildsLastElement = newChild._lastRecursiveChild()
+
+        if position >= len(self.contents):
+            newChild.nextSibling = None
+
+            # Walk up the tree until an ancestor with a next sibling is
+            # found; that sibling follows the new child in parse order.
+            parent = self
+            parentsNextSibling = None
+            while not parentsNextSibling:
+                parentsNextSibling = parent.nextSibling
+                parent = parent.parent
+                if not parent: # This is the last element in the document.
+                    break
+            if parentsNextSibling:
+                newChildsLastElement.next = parentsNextSibling
+            else:
+                newChildsLastElement.next = None
+        else:
+            nextChild = self.contents[position]
+            newChild.nextSibling = nextChild
+            if newChild.nextSibling:
+                newChild.nextSibling.previousSibling = newChild
+            newChildsLastElement.next = nextChild
+
+        if newChildsLastElement.next:
+            newChildsLastElement.next.previous = newChildsLastElement
+        self.contents.insert(position, newChild)
+
+    def append(self, tag):
+        """Appends the given tag to the contents of this tag."""
+        self.insert(len(self.contents), tag)
+
+    def findNext(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the first item that matches the given criteria and
+        appears after this Tag in the document."""
+        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
+
+    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
+                    **kwargs):
+        """Returns all items that match the given criteria and appear
+        after this Tag in the document."""
+        return self._findAll(name, attrs, text, limit, self.nextGenerator,
+                             **kwargs)
+
+    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the closest sibling to this Tag that matches the
+        given criteria and appears after this Tag in the document."""
+        return self._findOne(self.findNextSiblings, name, attrs, text,
+                             **kwargs)
+
+    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
+                         **kwargs):
+        """Returns the siblings of this Tag that match the given
+        criteria and appear after this Tag in the document."""
+        return self._findAll(name, attrs, text, limit,
+                             self.nextSiblingGenerator, **kwargs)
+    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+
+    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the first item that matches the given criteria and
+        appears before this Tag in the document."""
+        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
+
+    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
+                        **kwargs):
+        """Returns all items that match the given criteria and appear
+        before this Tag in the document."""
+        return self._findAll(name, attrs, text, limit, self.previousGenerator,
+                             **kwargs)
+    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+
+    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the closest sibling to this Tag that matches the
+        given criteria and appears before this Tag in the document."""
+        return self._findOne(self.findPreviousSiblings, name, attrs, text,
+                             **kwargs)
+
+    def findPreviousSiblings(self, name=None, attrs={}, text=None,
+                             limit=None, **kwargs):
+        """Returns the siblings of this Tag that match the given
+        criteria and appear before this Tag in the document."""
+        return self._findAll(name, attrs, text, limit,
+                             self.previousSiblingGenerator, **kwargs)
+    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+
+    def findParent(self, name=None, attrs={}, **kwargs):
+        """Returns the closest parent of this Tag that matches the given
+        criteria."""
+        # NOTE: We can't use _findOne because findParents takes a different
+        # set of arguments.
+        r = None
+        # Keyword criteria are forwarded so they take part in the match.
+        l = self.findParents(name, attrs, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+
+    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
+        """Returns the parents of this Tag that match the given
+        criteria."""
+
+        return self._findAll(name, attrs, None, limit, self.parentGenerator,
+                             **kwargs)
+    fetchParents = findParents # Compatibility with pre-3.x
+
+    #These methods do the real heavy lifting.
+
+    def _findOne(self, method, name, attrs, text, **kwargs):
+        """Calls 'method' with a limit of 1 and returns the first
+        result, or None when there is none."""
+        r = None
+        l = method(name, attrs, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+
+    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
+        "Iterates over a generator looking for things that match."
+
+        if isinstance(name, SoupStrainer):
+            strainer = name
+        # (Possibly) special case some findAll*(...) searches
+        elif text is None and not limit and not attrs and not kwargs:
+            # findAll*(True)
+            if name is True:
+                return [element for element in generator()
+                        if isinstance(element, Tag)]
+            # findAll*('tag-name')
+            elif isinstance(name, basestring):
+                return [element for element in generator()
+                        if isinstance(element, Tag) and
+                        element.name == name]
+            else:
+                strainer = SoupStrainer(name, attrs, text, **kwargs)
+        # Build a SoupStrainer
+        else:
+            strainer = SoupStrainer(name, attrs, text, **kwargs)
+        results = ResultSet(strainer)
+        for i in generator():
+            # The generators end by yielding None; falsy items are skipped.
+            if i:
+                found = strainer.search(i)
+                if found:
+                    results.append(found)
+                    if limit and len(results) >= limit:
+                        break
+        return results
+
+    #These Generators can be used to navigate starting from both
+    #NavigableStrings and Tags.
+    #Each one yields the elements after/before this one and finishes
+    #by yielding None once the chain runs out.
+    def nextGenerator(self):
+        i = self
+        while i is not None:
+            i = i.next
+            yield i
+
+    def nextSiblingGenerator(self):
+        i = self
+        while i is not None:
+            i = i.nextSibling
+            yield i
+
+    def previousGenerator(self):
+        i = self
+        while i is not None:
+            i = i.previous
+            yield i
+
+    def previousSiblingGenerator(self):
+        i = self
+        while i is not None:
+            i = i.previousSibling
+            yield i
+
+    def parentGenerator(self):
+        i = self
+        while i is not None:
+            i = i.parent
+            yield i
+
+    # Utility methods
+    def substituteEncoding(self, str, encoding=None):
+        """Replaces the %SOUP-ENCODING% placeholder in 'str' with
+        'encoding' ("utf-8" when no encoding is given)."""
+        encoding = encoding or "utf-8"
+        return str.replace("%SOUP-ENCODING%", encoding)
+
+    def toEncoding(self, s, encoding=None):
+        """Encodes an object to a string in some encoding, or to Unicode
+        when no encoding is given."""
+        if isinstance(s, unicode):
+            if encoding:
+                s = s.encode(encoding)
+        elif isinstance(s, str):
+            if encoding:
+                s = s.encode(encoding)
+            else:
+                s = unicode(s)
+        else:
+            # Neither kind of string: go through str() or unicode().
+            if encoding:
+                s = self.toEncoding(str(s), encoding)
+            else:
+                s = unicode(s)
+        return s
+
+class NavigableString(unicode, PageElement):
+    """A piece of text in the tree: a unicode string that also carries
+    the navigation attributes of PageElement."""
+
+    def __new__(cls, value):
+        """Create a new NavigableString.
+
+        When unpickling a NavigableString, this method is called with
+        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+        passed in to the superclass's __new__ or the superclass won't know
+        how to handle non-ASCII characters.
+        """
+        if not isinstance(value, unicode):
+            return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+        return unicode.__new__(cls, value)
+
+    def __getnewargs__(self):
+        """Return the constructor arguments used to rebuild this string."""
+        encoded = NavigableString.__str__(self)
+        return (encoded,)
+
+    def __getattr__(self, attr):
+        """text.string gives you text. This is for backwards
+        compatibility for Navigable*String, but for CData* it lets you
+        get the string without the CData wrapper."""
+        if attr != 'string':
+            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
+        return self
+
+    def __unicode__(self):
+        """Return the default byte rendering decoded back to unicode."""
+        rendered = str(self)
+        return rendered.decode(DEFAULT_OUTPUT_ENCODING)
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        """Return the text encoded in 'encoding', or this object itself
+        when 'encoding' is empty or None."""
+        if not encoding:
+            return self
+        return self.encode(encoding)
+
+class CData(NavigableString):
+    """A string that is rendered inside a CDATA section."""
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        """Return the text wrapped in the CDATA delimiters."""
+        # The format string needs a %s placeholder: an empty format
+        # string raises TypeError as soon as an argument is supplied.
+        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+
+class ProcessingInstruction(NavigableString):
+    """A string that is rendered as a processing instruction."""
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        """Return the text between '<?' and '?>', with any
+        %SOUP-ENCODING% placeholder replaced by the output encoding."""
+        output = self
+        if "%SOUP-ENCODING%" in output:
+            output = self.substituteEncoding(output, encoding)
+        # The opening '<?' pairs with the closing '?>'.
+        return "<?%s?>" % self.toEncoding(output, encoding)
+
+class Comment(NavigableString):
+    """A string that is rendered as a markup comment."""
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        """Return the text wrapped in the comment delimiters."""
+        # The format string needs a %s placeholder: an empty format
+        # string raises TypeError as soon as an argument is supplied.
+        return "<!--%s-->" % NavigableString.__str__(self, encoding)
+
+class Declaration(NavigableString):
+    """A string that is rendered as a markup declaration."""
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        """Return the text wrapped in '<!' and '>'."""
+        # The format string needs a %s placeholder: an empty format
+        # string raises TypeError as soon as an argument is supplied.
+        return "<!%s>" % NavigableString.__str__(self, encoding)
+
+class Tag(PageElement):
+
+ """Represents a found HTML tag with its attributes and contents."""
+
+    def _invert(h):
+        """Return a new dict that maps each value of 'h' back to its key.
+
+        Called as a plain function while the class body runs, so it
+        takes no 'self'.
+        """
+        return dict([(v, k) for k, v in h.items()])
+
+    # XML entity name -> the character it stands for.
+    XML_ENTITIES_TO_SPECIAL_CHARS = {
+        "apos": "'",
+        "quot": '"',
+        "amp": "&",
+        "lt": "<",
+        "gt": ">",
+    }
+
+    # The reverse table: special character -> XML entity name.
+    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
+
+    def _convertEntities(self, match):
+        """Used in a call to re.sub to replace HTML, XML, and numeric
+        entities with the appropriate Unicode characters. If HTML
+        entities are being converted, any unrecognized entities are
+        escaped.
+
+        'match' is a regular-expression match whose first group is the
+        entity text without the leading '&' and trailing ';'.
+        """
+        x = match.group(1)
+        if self.convertHTMLEntities and x in name2codepoint:
+            return unichr(name2codepoint[x])
+        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
+            if self.convertXMLEntities:
+                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+            else:
+                return u'&%s;' % x
+        elif len(x) > 0 and x[0] == '#':
+            # Handle numeric entities
+            if len(x) > 1 and x[1] == 'x':
+                return unichr(int(x[2:], 16))
+            else:
+                return unichr(int(x[1:]))
+
+        elif self.escapeUnrecognizedEntities:
+            # Escape the ampersand itself so the unknown entity is
+            # output as literal text.
+            return u'&amp;%s;' % x
+        else:
+            return u'&%s;' % x
+
+    def __init__(self, parser, name, attrs=None, parent=None,
+                 previous=None):
+        """Basic constructor.
+
+        'attrs' may be None, a dict, or a sequence of (key, value)
+        pairs.  Entity-handling flags and the self-closing status are
+        copied from 'parser'.
+        """
+
+        # We don't actually store the parser object: that lets extracted
+        # chunks be garbage-collected
+        self.parserClass = parser.__class__
+        self.isSelfClosing = parser.isSelfClosingTag(name)
+        self.name = name
+        if attrs is None:
+            attrs = []
+        elif isinstance(attrs, dict):
+            attrs = attrs.items()
+        self.attrs = attrs
+        self.contents = []
+        self.setup(parent, previous)
+        self.hidden = False
+        self.containsSubstitutions = False
+        self.convertHTMLEntities = parser.convertHTMLEntities
+        self.convertXMLEntities = parser.convertXMLEntities
+        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
+
+        # Convert any HTML, XML, or numeric entities in the attribute values.
+        # This runs last because _convertEntities reads the flags set above.
+        entityPattern = "&(#\d+|#x[0-9a-fA-F]+|\w+);"
+        self.attrs = [(k, re.sub(entityPattern, self._convertEntities, val))
+                      for k, val in self.attrs]
+
+    def getString(self):
+        """Return the only child when it is a NavigableString.
+
+        Returns None when the tag has no children, more than one child,
+        or a single child that is not a NavigableString.
+        """
+        if len(self.contents) != 1:
+            return None
+        onlyChild = self.contents[0]
+        if isinstance(onlyChild, NavigableString):
+            return onlyChild
+        return None
+
+    def setString(self, string):
+        """Replace the contents of the tag with a string"""
+        # Drop every current child before appending the new text.
+        self.clear()
+        self.append(string)
+
+    # tag.string reads through getString and assigns through setString.
+    string = property(getString, setString)
+
+    def getText(self, separator=u""):
+        """Return the stripped text of every string beneath this tag,
+        joined with 'separator'.  A tag without children gives u""."""
+        if not self.contents:
+            return u""
+        # The element following this tag's last descendant ends the walk.
+        stopNode = self._lastRecursiveChild().next
+        pieces = []
+        node = self.contents[0]
+        while node is not stopNode:
+            if isinstance(node, NavigableString):
+                pieces.append(node.strip())
+            node = node.next
+        return separator.join(pieces)
+
+    # Read-only shortcut: tag.text is tag.getText().
+    text = property(getText)
+
+    def get(self, key, default=None):
+        """Returns the value of the 'key' attribute for the tag, or
+        the value given for 'default' if it doesn't have that
+        attribute."""
+        attributes = self._getAttrMap()
+        return attributes.get(key, default)
+
+    def clear(self):
+        """Extract all children."""
+        # Work on a copy: extract() removes the child from self.contents.
+        for child in list(self.contents):
+            child.extract()
+
+    def index(self, element):
+        """Return the position of 'element' in this tag's contents.
+
+        Children are compared by identity, not equality.  Raises
+        ValueError when 'element' is not a child of this tag.
+        """
+        position = 0
+        for child in self.contents:
+            if child is element:
+                return position
+            position += 1
+        raise ValueError("Tag.index: element not in tag")
+
+    def has_key(self, key):
+        """Tell whether the tag has an attribute named 'key'."""
+        attributes = self._getAttrMap()
+        return attributes.has_key(key)
+
+    def __getitem__(self, key):
+        """tag[key] returns the value of the 'key' attribute for the tag,
+        and throws an exception if it's not there."""
+        attributes = self._getAttrMap()
+        return attributes[key]
+
+    def __iter__(self):
+        "Iterating over a tag iterates over its contents."
+        children = self.contents
+        return iter(children)
+
+    def __len__(self):
+        "The length of a tag is the length of its list of contents."
+        children = self.contents
+        return len(children)
+
+    def __contains__(self, x):
+        "'x in tag' looks for x among the tag's direct children."
+        children = self.contents
+        return x in children
+
+    def __nonzero__(self):
+        "A tag is non-None even if it has no contents."
+        # Without this, __len__ would make a childless tag falsy.
+        return True
+
+    def __setitem__(self, key, value):
+        """Setting tag[key] sets the value of the 'key' attribute for the
+        tag."""
+        self._getAttrMap()
+        self.attrMap[key] = value
+        replaced = False
+        # Every existing pair with this key is rewritten, not only the first.
+        for position, pair in enumerate(self.attrs):
+            if pair[0] == key:
+                self.attrs[position] = (key, value)
+                replaced = True
+        if not replaced:
+            self.attrs.append((key, value))
+        self._getAttrMap()[key] = value
+
+    def __delitem__(self, key):
+        "Deleting tag[key] deletes all 'key' attributes for the tag."
+        # Bad HTML can define the same attribute multiple times, so every
+        # matching pair is dropped.  The list is rebuilt in place rather
+        # than removing items while iterating over it, which would skip
+        # the element that follows each removed one.
+        self.attrs[:] = [item for item in self.attrs if item[0] != key]
+        self._getAttrMap()
+        if key in self.attrMap:
+            del self.attrMap[key]
+
+    def __call__(self, *args, **kwargs):
+        """Calling a tag like a function is the same as calling its
+        findAll() method. Eg. tag('a') returns a list of all the A tags
+        found within this tag."""
+        return self.findAll(*args, **kwargs)
+
+    def __getattr__(self, tag):
+        """Treat an unknown attribute as a search for a child tag.
+
+        'fooTag' looks for a tag named 'foo'; any other name that does
+        not start with two underscores is looked up as given.  Names
+        starting with two underscores raise AttributeError.
+        """
+        if len(tag) > 3 and tag.endswith('Tag'):
+            return self.find(tag[:-3])
+        if not tag.startswith('__'):
+            return self.find(tag)
+        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
+
+    def __eq__(self, other):
+        """Returns true iff this tag has the same name, the same attributes,
+        and the same contents (recursively) as the given tag.
+
+        NOTE: right now this will return false if two tags have the
+        same attributes in a different order. Should this be fixed?"""
+        if other is self:
+            return True
+        # Anything that does not look like a tag cannot be equal.
+        for required in ('name', 'attrs', 'contents'):
+            if not hasattr(other, required):
+                return False
+        if self.name != other.name:
+            return False
+        if self.attrs != other.attrs:
+            return False
+        if len(self) != len(other):
+            return False
+        for position, child in enumerate(self.contents):
+            if child != other.contents[position]:
+                return False
+        return True
+
+    def __ne__(self, other):
+        """Returns true iff this tag is not identical to the other tag,
+        as defined in __eq__."""
+        if self == other:
+            return False
+        return True
+
+    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        """Renders this tag as a string."""
+        rendered = self.__str__(encoding)
+        return rendered
+
+    def __unicode__(self):
+        """Renders this tag with no encoding, i.e. as Unicode."""
+        rendered = self.__str__(None)
+        return rendered
+
+    # Matches an angle bracket, or an ampersand that does not start a
+    # numeric, hexadecimal or named entity.
+    BARE_AMPERSAND_OR_BRACKET = re.compile(
+        "([<>]|"
+        + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+        + ")")
+
+    def _sub_entity(self, x):
+        """Used with a regular expression to substitute the
+        appropriate XML entity for an XML special character."""
+        # Only the first character of the match is looked up.
+        special = x.group(0)[0]
+        entity = self.XML_SPECIAL_CHARS_TO_ENTITIES[special]
+        return "&" + entity + ";"
+
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ """Returns a string or Unicode representation of this tag and
+ its contents. To get Unicode, pass None for encoding.
+
+ NOTE: since Python's HTML parser consumes whitespace, this
+ method is not certain to reproduce the whitespace present in
+ the original string."""
+
+ encodedName = self.toEncoding(self.name, encoding)
+
+ attrs = []
+ if self.attrs:
+ for key, val in self.attrs:
+ fmt = '%s="%s"'
+ if isinstance(val, basestring):
+ if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
+ val = self.substituteEncoding(val, encoding)
+
+ # The attribute value either:
+ #
+ # * Contains no embedded double quotes or single quotes.
+ # No problem: we enclose it in double quotes.
+ # * Contains embedded single quotes. No problem:
+ # double quotes work here too.
+ # * Contains embedded double quotes. No problem:
+ # we enclose it in single quotes.
+ # * Embeds both single _and_ double quotes. This
+ # can't happen naturally, but it can happen if
+ # you modify an attribute value after parsing
+ # the document. Now we have a bit of a
+ # problem. We solve it by enclosing the
+ # attribute in single quotes, and escaping any
+ # embedded single quotes to XML entities.
+ if '"' in val:
+ fmt = "%s='%s'"
+ if "'" in val:
+ # TODO: replace with apos when
+ # appropriate.
+ val = val.replace("'", "&squot;")
+
+ # Now we're okay w/r/t quotes. But the attribute
+ # value might also contain angle brackets, or
+ # ampersands that aren't part of entities. We need
+ # to escape those to XML entities too.
+ val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+
+ attrs.append(fmt % (self.toEncoding(key, encoding),
+ self.toEncoding(val, encoding)))
+ close = ''
+ closeTag = ''
+ if self.isSelfClosing:
+ close = ' /'
+ else:
+ closeTag = '%s>' % encodedName
+
+ indentTag, indentContents = 0, 0
+ if prettyPrint:
+ indentTag = indentLevel
+ space = (' ' * (indentTag-1))
+ indentContents = indentTag + 1
+ contents = self.renderContents(encoding, prettyPrint, indentContents)
+ if self.hidden:
+ s = contents
+ else:
+ s = []
+ attributeString = ''
+ if attrs:
+ attributeString = ' ' + ' '.join(attrs)
+ if prettyPrint:
+ s.append(space)
+ s.append('<%s%s%s>' % (encodedName, attributeString, close))
+ if prettyPrint:
+ s.append("\n")
+ s.append(contents)
+ if prettyPrint and contents and contents[-1] != "\n":
+ s.append("\n")
+ if prettyPrint and closeTag:
+ s.append(space)
+ s.append(closeTag)
+ if prettyPrint and closeTag and self.nextSibling:
+ s.append("\n")
+ s = ''.join(s)
+ return s
+
+ def decompose(self):
+ """Recursively destroys the contents of this tree."""
+ self.extract()
+ if len(self.contents) == 0:
+ return
+ current = self.contents[0]
+ while current is not None:
+ next = current.next
+ if isinstance(current, Tag):
+ del current.contents[:]
+ current.parent = None
+ current.previous = None
+ current.previousSibling = None
+ current.next = None
+ current.nextSibling = None
+ current = next
+
+ def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return self.__str__(encoding, True)
+
+ def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ """Renders the contents of this tag as a string in the given
+ encoding. If encoding is None, returns a Unicode string.."""
+ s=[]
+ for c in self:
+ text = None
+ if isinstance(c, NavigableString):
+ text = c.__str__(encoding)
+ elif isinstance(c, Tag):
+ s.append(c.__str__(encoding, prettyPrint, indentLevel))
+ if text and prettyPrint:
+ text = text.strip()
+ if text:
+ if prettyPrint:
+ s.append(" " * (indentLevel-1))
+ s.append(text)
+ if prettyPrint:
+ s.append("\n")
+ return ''.join(s)
+
+ #Soup methods
+
+ def find(self, name=None, attrs={}, recursive=True, text=None,
+ **kwargs):
+ """Return only the first child of this Tag matching the given
+ criteria."""
+ r = None
+ l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
+ if l:
+ r = l[0]
+ return r
+ findChild = find
+
+ def findAll(self, name=None, attrs={}, recursive=True, text=None,
+ limit=None, **kwargs):
+ """Extracts a list of Tag objects that match the given
+ criteria. You can specify the name of the Tag and any
+ attributes you want the Tag to have.
+
+ The value of a key-value pair in the 'attrs' map can be a
+ string, a list of strings, a regular expression object, or a
+ callable that takes a string and returns whether or not the
+ string matches for some custom definition of 'matches'. The
+ same is true of the tag name."""
+ generator = self.recursiveChildGenerator
+ if not recursive:
+ generator = self.childGenerator
+ return self._findAll(name, attrs, text, limit, generator, **kwargs)
+ findChildren = findAll
+
+ # Pre-3.x compatibility methods: old names bound to the current search methods.
+ first = find  # alias of find()
+ fetch = findAll  # alias of findAll()
+
+ def fetchText(self, text=None, recursive=True, limit=None):
+ return self.findAll(text=text, recursive=recursive, limit=limit)
+
+ def firstText(self, text=None, recursive=True):
+ return self.find(text=text, recursive=recursive)
+
+ #Private methods
+
+ def _getAttrMap(self):
+ """Initializes a map representation of this tag's attributes,
+ if not already initialized."""
+ if not getattr(self, 'attrMap'):
+ self.attrMap = {}
+ for (key, value) in self.attrs:
+ self.attrMap[key] = value
+ return self.attrMap
+
+ #Generator methods
+ def childGenerator(self):
+ # Just use the iterator from the contents
+ return iter(self.contents)
+
+ def recursiveChildGenerator(self):
+ if not len(self.contents):
+ raise StopIteration
+ stopNode = self._lastRecursiveChild().next
+ current = self.contents[0]
+ while current is not stopNode:
+ yield current
+ current = current.next
+
+
+# Next, a couple classes to represent queries and their results.
+class SoupStrainer:  # holds a name matcher, an attribute-matcher map and a text matcher
+ """Encapsulates a number of ways of matching a markup element (tag or
+ text)."""
+
+ def __init__(self, name=None, attrs={}, text=None, **kwargs):  # the shared default {} is never mutated: attrs is copied before update()
+ self.name = name
+ if isinstance(attrs, basestring):  # a bare string is shorthand for a CSS class criterion
+ kwargs['class'] = _match_css_class(attrs)
+ attrs = None
+ if kwargs:  # keyword arguments are additional attribute criteria
+ if attrs:
+ attrs = attrs.copy()  # work on a copy so the caller's map is left alone
+ attrs.update(kwargs)
+ else:
+ attrs = kwargs
+ self.attrs = attrs  # NOTE(review): stays None if attrs=None is passed with no kwargs; searchTag calls self.attrs.items()
+ self.text = text
+
+ def __str__(self):  # the text matcher if truthy, else "name|attrs"
+ if self.text:
+ return self.text
+ else:
+ return "%s|%s" % (self.name, self.attrs)
+
+ def searchTag(self, markupName=None, markupAttrs={}):  # returns the matched Tag or name, or None when nothing matches
+ found = None
+ markup = None
+ if isinstance(markupName, Tag):  # called with a Tag object instead of a (name, attrs) pair
+ markup = markupName
+ markupAttrs = markup  # the Tag itself is then used for attribute lookups
+ callFunctionWithTagData = callable(self.name) \
+ and not isinstance(markupName, Tag)
+
+ if (not self.name) \
+ or callFunctionWithTagData \
+ or (markup and self._matches(markup, self.name)) \
+ or (not markup and self._matches(markupName, self.name)):
+ if callFunctionWithTagData:
+ match = self.name(markupName, markupAttrs)  # a callable name matcher receives the raw name and attributes
+ else:
+ match = True
+ markupAttrMap = None  # built on first use, only when there are attribute criteria
+ for attr, matchAgainst in self.attrs.items():
+ if not markupAttrMap:
+ if hasattr(markupAttrs, 'get'):  # already offers dict-style lookup
+ markupAttrMap = markupAttrs
+ else:
+ markupAttrMap = {}  # turn the list of (key, value) pairs into a dict
+ for k,v in markupAttrs:
+ markupAttrMap[k] = v
+ attrValue = markupAttrMap.get(attr)
+ if not self._matches(attrValue, matchAgainst):  # every attribute criterion has to match
+ match = False
+ break
+ if match:
+ if markup:
+ found = markup
+ else:
+ found = markupName
+ return found
+
+ def search(self, markup):  # dispatches on the kind of markup: iterable, Tag, or string
+ #print 'looking for %s in %s' % (self, markup)
+ found = None
+ # If given a list of items, scan it for a text element that
+ # matches.
+ if hasattr(markup, "__iter__") \
+ and not isinstance(markup, Tag):
+ for element in markup:
+ if isinstance(element, NavigableString) \
+ and self.search(element):
+ found = element  # first matching text element wins
+ break
+ # If it's a Tag, make sure its name or attributes match.
+ # Don't bother with Tags if we're searching for text.
+ elif isinstance(markup, Tag):
+ if not self.text:
+ found = self.searchTag(markup)
+ # If it's text, make sure the text matches.
+ elif isinstance(markup, NavigableString) or \
+ isinstance(markup, basestring):
+ if self._matches(markup, self.text):
+ found = markup
+ else:
+ raise Exception, "I don't know how to match against a %s" \
+ % markup.__class__
+ return found
+
+ def _matches(self, markup, matchAgainst):  # tests one value against one matcher (True, callable, regexp, list-like or plain value)
+ #print "Matching %s against %s" % (markup, matchAgainst)
+ result = False
+ if matchAgainst is True:  # True means "any value that is not None"
+ result = markup is not None
+ elif callable(matchAgainst):
+ result = matchAgainst(markup)
+ else:
+ #Custom match methods take the tag as an argument, but all
+ #other ways of matching match the tag name as a string.
+ if isinstance(markup, Tag):
+ markup = markup.name
+ if markup and not isinstance(markup, basestring):
+ markup = unicode(markup)
+ #Now we know that markup is either a string, or None.
+ if hasattr(matchAgainst, 'match'):
+ # It's a regexp object.
+ result = markup and matchAgainst.search(markup)
+ elif hasattr(matchAgainst, '__iter__'): # list-like
+ result = markup in matchAgainst
+ elif hasattr(matchAgainst, 'items'):  # NOTE(review): markup is a string or None here, so has_key looks wrong - confirm
+ result = markup.has_key(matchAgainst)
+ elif matchAgainst and isinstance(markup, basestring):  # coerce the matcher to markup's string type before comparing
+ if isinstance(markup, unicode):
+ matchAgainst = unicode(matchAgainst)
+ else:
+ matchAgainst = str(matchAgainst)
+
+ if not result:  # fall back to plain equality
+ result = matchAgainst == markup
+ return result
+
+class ResultSet(list):
+ """A ResultSet is just a list that keeps track of the SoupStrainer
+ that created it."""
+ def __init__(self, source):
+ list.__init__([])
+ self.source = source
+
+# Now, some helper functions.
+
+def buildTagMap(default, *args):
+ """Turns a list of maps, lists, or scalars into a single map.
+ Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
+ NESTING_RESET_TAGS maps out of lists and partial maps."""
+ built = {}
+ for portion in args:
+ if hasattr(portion, 'items'):
+ #It's a map. Merge it.
+ for k,v in portion.items():
+ built[k] = v
+ elif hasattr(portion, '__iter__'): # is a list
+ #It's a list. Map each item to the default.
+ for k in portion:
+ built[k] = default
+ else:
+ #It's a scalar. Map it to the default.
+ built[portion] = default
+ return built
+
+# Now, the parser classes.
+
+class BeautifulStoneSoup(Tag, SGMLParser):
+
+ """This class contains the basic parser and search code. It defines
+ a parser that knows nothing about tag behavior except for the
+ following:
+
+ You can't close a tag without closing all the tags it encloses.
+ That is, "
(No space between name of closing tag and tag close)
+ (Extraneous whitespace in declaration)
+
+ You can pass in a custom list of (RE object, replace method)
+ tuples to get Beautiful Soup to scrub your input the way you
+ want."""
+
+ self.parseOnlyThese = parseOnlyThese
+ self.fromEncoding = fromEncoding
+ self.smartQuotesTo = smartQuotesTo
+ self.convertEntities = convertEntities
+ # Set the rules for how we'll deal with the entities we
+ # encounter
+ if self.convertEntities:
+ # It doesn't make sense to convert encoded characters to
+ # entities even while you're converting entities to Unicode.
+ # Just convert it all to Unicode.
+ self.smartQuotesTo = None
+ if convertEntities == self.HTML_ENTITIES:
+ self.convertXMLEntities = False
+ self.convertHTMLEntities = True
+ self.escapeUnrecognizedEntities = True
+ elif convertEntities == self.XHTML_ENTITIES:
+ self.convertXMLEntities = True
+ self.convertHTMLEntities = True
+ self.escapeUnrecognizedEntities = False
+ elif convertEntities == self.XML_ENTITIES:
+ self.convertXMLEntities = True
+ self.convertHTMLEntities = False
+ self.escapeUnrecognizedEntities = False
+ else:
+ self.convertXMLEntities = False
+ self.convertHTMLEntities = False
+ self.escapeUnrecognizedEntities = False
+
+ self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
+ SGMLParser.__init__(self)
+
+ if hasattr(markup, 'read'): # It's a file-type object.
+ markup = markup.read()
+ self.markup = markup
+ self.markupMassage = markupMassage
+ try:
+ self._feed(isHTML=isHTML)
+ except StopParsing:
+ pass
+ self.markup = None # The markup can now be GCed
+
+ def convert_charref(self, name):
+ """This method fixes a bug in Python's SGMLParser."""
+ try:
+ n = int(name)
+ except ValueError:
+ return
+ if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+ return
+ return self.convert_codepoint(n)
+
+ def _feed(self, inDocumentEncoding=None, isHTML=False):  # converts self.markup to Unicode, applies the massage rules, then runs the SGML parser over it
+ # Convert the document to Unicode.
+ markup = self.markup
+ if isinstance(markup, unicode):
+ if not hasattr(self, 'originalEncoding'):
+ self.originalEncoding = None  # already Unicode: no source encoding to record
+ else:
+ dammit = UnicodeDammit\
+ (markup, [self.fromEncoding, inDocumentEncoding],
+ smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
+ markup = dammit.unicode
+ self.originalEncoding = dammit.originalEncoding
+ self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
+ if markup:
+ if self.markupMassage:
+ if not hasattr(self.markupMassage, "__iter__"):  # a non-iterable truthy value selects the default rules
+ self.markupMassage = self.MARKUP_MASSAGE
+ for fix, m in self.markupMassage:  # each rule is a (compiled regexp, replacement) pair
+ markup = fix.sub(m, markup)
+ # TODO: We get rid of markupMassage so that the
+ # soup object can be deepcopied later on. Some
+ # Python installations can't copy regexes. If anyone
+ # was relying on the existence of markupMassage, this
+ # might cause problems.
+ del(self.markupMassage)
+ self.reset()  # reinitialise the parser and tree state before feeding
+
+ SGMLParser.feed(self, markup)
+ # Close out any unfinished strings and close all the open tags.
+ self.endData()
+ while self.currentTag.name != self.ROOT_TAG_NAME:  # pop until only the root tag is current
+ self.popTag()
+
+ def __getattr__(self, methodName):
+ """This method routes method call requests to either the SGMLParser
+ superclass or the Tag superclass, depending on the method name."""
+ #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
+
+ if methodName.startswith('start_') or methodName.startswith('end_') \
+ or methodName.startswith('do_'):
+ return SGMLParser.__getattr__(self, methodName)
+ elif not methodName.startswith('__'):
+ return Tag.__getattr__(self, methodName)
+ else:
+ raise AttributeError
+
+ def isSelfClosingTag(self, name):
+ """Returns true iff the given string is the name of a
+ self-closing tag according to this parser."""
+ return self.SELF_CLOSING_TAGS.has_key(name) \
+ or self.instanceSelfClosingTags.has_key(name)
+
+ def reset(self):  # reinitialises this object as the root tag and clears all parsing state
+ Tag.__init__(self, self, self.ROOT_TAG_NAME)  # the soup object is itself the root tag
+ self.hidden = 1  # Tag.__str__ renders a hidden tag as its contents only
+ SGMLParser.reset(self)
+ self.currentData = []  # text fragments collected since the last endData()
+ self.currentTag = None
+ self.tagStack = []  # currently open tags, innermost last
+ self.quoteStack = []  # names of the QUOTE_TAGS currently open
+ self.pushTag(self)  # the root is the first entry on the stack
+
+ def popTag(self):
+ tag = self.tagStack.pop()
+
+ #print "Pop", tag.name
+ if self.tagStack:
+ self.currentTag = self.tagStack[-1]
+ return self.currentTag
+
+ def pushTag(self, tag):
+ #print "Push", tag.name
+ if self.currentTag:
+ self.currentTag.contents.append(tag)
+ self.tagStack.append(tag)
+ self.currentTag = self.tagStack[-1]
+
+ def endData(self, containerClass=NavigableString):  # turns the collected text fragments into one containerClass node appended to the current tag
+ if self.currentData:
+ currentData = u''.join(self.currentData)
+ if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and  # text made up only of the characters in STRIP_ASCII_SPACES ...
+ not set([tag.name for tag in self.tagStack]).intersection(  # ... and no open tag is whitespace-preserving
+ self.PRESERVE_WHITESPACE_TAGS)):
+ if '\n' in currentData:  # collapse the run to a single newline or a single space
+ currentData = '\n'
+ else:
+ currentData = ' '
+ self.currentData = []
+ if self.parseOnlyThese and len(self.tagStack) <= 1 and \
+ (not self.parseOnlyThese.text or \
+ not self.parseOnlyThese.search(currentData)):
+ return  # with a strainer, text outside any kept tag is dropped unless the strainer's text matcher accepts it
+ o = containerClass(currentData)
+ o.setup(self.currentTag, self.previous)
+ if self.previous:  # link the new node after the previously created one
+ self.previous.next = o
+ self.previous = o
+ self.currentTag.contents.append(o)
+
+
+ def _popToTag(self, name, inclusivePop=True):  # returns the current tag after the last pop, or None if nothing was popped
+ """Pops the tag stack up to and including the most recent
+ instance of the given tag. If inclusivePop is false, pops the tag
+ stack up to but *not* including the most recent instance of
+ the given tag."""
+ #print "Popping to %s" % name
+ if name == self.ROOT_TAG_NAME:  # the root tag is never popped
+ return
+
+ numPops = 0
+ mostRecentTag = None
+ for i in xrange(len(self.tagStack)-1, 0, -1):  # scan from the innermost tag outwards; index 0 is not examined
+ if name == self.tagStack[i].name:
+ numPops = len(self.tagStack)-i  # number of entries from the match to the top of the stack
+ break
+ if not inclusivePop:
+ numPops = numPops - 1  # leave the matching tag itself on the stack
+
+ for i in xrange(0, numPops):
+ mostRecentTag = self.popTag()
+ return mostRecentTag
+
+ def _smartPop(self, name):  # decides which open tag a new start tag implicitly closes, then pops to it
+
+ """We need to pop up to the previous tag of this type, unless
+ one of this tag's nesting reset triggers comes between this
+ tag and the previous tag of this type, OR unless this tag is a
+ generic nesting trigger and another generic nesting trigger
+ comes between this tag and the previous tag of this type.
+
+ Examples:
+ <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
+ <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
+ <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
+
+ <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
+ <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
+ <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+ """
+
+ nestingResetTriggers = self.NESTABLE_TAGS.get(name)
+ isNestable = nestingResetTriggers != None  # a tag is nestable when NESTABLE_TAGS has a non-None entry for it
+ isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+ popTo = None
+ inclusive = True
+ for i in xrange(len(self.tagStack)-1, 0, -1):  # walk the open tags from innermost outwards; index 0 is not examined
+ p = self.tagStack[i]
+ if (not p or p.name == name) and not isNestable:
+ #Non-nestable tags get popped to the top or to their
+ #last occurrence.
+ popTo = name
+ break
+ if (nestingResetTriggers is not None
+ and p.name in nestingResetTriggers) \
+ or (nestingResetTriggers is None and isResetNesting
+ and self.RESET_NESTING_TAGS.has_key(p.name)):
+
+ #If we encounter one of the nesting reset triggers
+ #peculiar to this tag, or we encounter another tag
+ #that causes nesting to reset, pop up to but not
+ #including that tag.
+ popTo = p.name
+ inclusive = False
+ break
+ p = p.parent
+ if popTo:
+ self._popToTag(popTo, inclusive)
+
+ def unknown_starttag(self, name, attrs, selfClosing=0):  # handles a start tag: creates a Tag, links it into the tree and pushes it; returns the Tag when one is created
+ #print "Start tag %s: %s" % (name, attrs)
+ if self.quoteStack:
+ #This is not a real tag.
+ #print "<%s> is not real!" % name
+ attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
+ self.handle_data('<%s%s>' % (name, attrs))  # inside a quote tag the markup is re-emitted as text
+ return
+ self.endData()  # flush text collected before this tag
+
+ if not self.isSelfClosingTag(name) and not selfClosing:
+ self._smartPop(name)  # implicitly close whatever this tag cannot nest inside
+
+ if self.parseOnlyThese and len(self.tagStack) <= 1 \
+ and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
+ return  # with a strainer, a top-level tag it does not accept is skipped
+
+ tag = Tag(self, name, attrs, self.currentTag, self.previous)
+ if self.previous:  # link the new tag after the previously created node
+ self.previous.next = tag
+ self.previous = tag
+ self.pushTag(tag)
+ if selfClosing or self.isSelfClosingTag(name):
+ self.popTag()  # a self-closing tag is closed straight away
+ if name in self.QUOTE_TAGS:
+ #print "Beginning quote (%s)" % name
+ self.quoteStack.append(name)
+ self.literal = 1  # NOTE(review): presumably puts SGMLParser into literal mode - confirm against sgmllib
+ return tag
+
+ def unknown_endtag(self, name):
+ #print "End tag %s" % name
+ if self.quoteStack and self.quoteStack[-1] != name:
+ #This is not a real end tag.
+ #print "%s> is not real!" % name
+ self.handle_data('%s>' % name)
+ return
+ self.endData()
+ self._popToTag(name)
+ if self.quoteStack and self.quoteStack[-1] == name:
+ self.quoteStack.pop()
+ self.literal = (len(self.quoteStack) > 0)
+
+ def handle_data(self, data):
+ self.currentData.append(data)
+
+ def _toStringSubclass(self, text, subclass):
+ """Adds a certain piece of text to the tree as a NavigableString
+ subclass."""
+ self.endData()
+ self.handle_data(text)
+ self.endData(subclass)
+
+ def handle_pi(self, text):
+ """Handle a processing instruction as a ProcessingInstruction
+ object, possibly one with a %SOUP-ENCODING% slot into which an
+ encoding will be plugged later."""
+ if text[:3] == "xml":
+ text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+ self._toStringSubclass(text, ProcessingInstruction)
+
+ def handle_comment(self, text):
+ "Handle comments as Comment objects."
+ self._toStringSubclass(text, Comment)
+
+ def handle_charref(self, ref):
+ "Handle character references as data."
+ if self.convertEntities:
+ data = unichr(int(ref))
+ else:
+ data = '%s;' % ref
+ self.handle_data(data)
+
+ def handle_entityref(self, ref):
+ """Handle entity references as data, possibly converting known
+ HTML and/or XML entity references to the corresponding Unicode
+ characters."""
+ data = None
+ if self.convertHTMLEntities:
+ try:
+ data = unichr(name2codepoint[ref])
+ except KeyError:
+ pass
+
+ if not data and self.convertXMLEntities:
+ data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+ if not data and self.convertHTMLEntities and \
+ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # TODO: We've got a problem here. We're told this is
+ # an entity reference, but it's not an XML entity
+ # reference or an HTML entity reference. Nonetheless,
+ # the logical thing to do is to pass it through as an
+ # unrecognized entity reference.
+ #
+ # Except: when the input is "&carol;" this function
+ # will be called with input "carol". When the input is
+ # "AT&T", this function will be called with input
+ # "T". We have no way of knowing whether a semicolon
+ # was present originally, so we don't know whether
+ # this is an unknown entity or just a misplaced
+ # ampersand.
+ #
+ # The more common case is a misplaced ampersand, so I
+ # escape the ampersand and omit the trailing semicolon.
+ data = "&%s" % ref
+ if not data:
+ # This case is different from the one above, because we
+ # haven't already gone through a supposedly comprehensive
+ # mapping of entities to Unicode characters. We might not
+ # have gone through any mapping at all. So the chances are
+ # very high that this is a real entity, and not a
+ # misplaced ampersand.
+ data = "&%s;" % ref
+ self.handle_data(data)
+
+ def handle_decl(self, data):
+ "Handle DOCTYPEs and the like as Declaration objects."
+ self._toStringSubclass(data, Declaration)
+
+ def parse_declaration(self, i):
+ """Treat a bogus SGML declaration as raw data. Treat a CDATA
+ declaration as a CData object."""
+ j = None
+ if self.rawdata[i:i+9] == '', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k+3
+ self._toStringSubclass(data, CData)
+ else:
+ try:
+ j = SGMLParser.parse_declaration(self, i)
+ except SGMLParseError:
+ toHandle = self.rawdata[i:]
+ self.handle_data(toHandle)
+ j = i + len(toHandle)
+ return j
+
+class BeautifulSoup(BeautifulStoneSoup):
+
+ """This parser knows the following facts about HTML:
+
+ * Some tags have no closing tag and should be interpreted as being
+ closed as soon as they are encountered.
+
+ * The text inside some tags (ie. 'script') may contain tags which
+ are not really part of the document and which should be parsed
+ as text, not tags. If you want to parse the text as tags, you can
+ always fetch it and parse it explicitly.
+
+ * Tag nesting rules:
+
+ Most tags can't be nested at all. For instance, the occurance of
+ a tag should implicitly close the previous tag. + + Para1 Para2 + should be transformed into: + Para1 Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a tag should _not_ implicitly close the previous +tag. + + Alice said:Bob said:Blah + should NOT be transformed into: + Alice said:Bob said:Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a |