diff --git a/extra/clientform/__init__.py b/extra/clientform/__init__.py
new file mode 100644
index 000000000..d79a05bda
--- /dev/null
+++ b/extra/clientform/__init__.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+#
+# Copyright 2007-2008 David McNab
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+pass
diff --git a/extra/clientform/clientform.py b/extra/clientform/clientform.py
new file mode 100644
index 000000000..a622de7b6
--- /dev/null
+++ b/extra/clientform/clientform.py
@@ -0,0 +1,3401 @@
+"""HTML form handling for web clients.
+
+ClientForm is a Python module for handling HTML forms on the client
+side, useful for parsing HTML forms, filling them in and returning the
+completed forms to the server. It has developed from a port of Gisle
+Aas' Perl module HTML::Form, from the libwww-perl library, but the
+interface is not the same.
+
+The most useful docstring is the one for HTMLForm.
+
+RFC 1866: HTML 2.0
+RFC 1867: Form-based File Upload in HTML
+RFC 2388: Returning Values from Forms: multipart/form-data
+HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
+HTML 4.01 Specification, W3C Recommendation 24 December 1999
+
+
+Copyright 2002-2007 John J. Lee
+Copyright 2005 Gary Poster
+Copyright 2005 Zope Corporation
+Copyright 1998-2000 Gisle Aas.
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+# XXX
+# Remove parser testing hack
+# safeUrl()-ize action
+# Switch to unicode throughout (would be 0.3.x)
+# See Wichert Akkerman's 2004-01-22 message to c.l.py.
+# Add charset parameter to Content-type headers? How to find value??
+# Add some more functional tests
+# Especially single and multiple file upload on the internet.
+# Does file upload work when name is missing? Sourceforge tracker form
+# doesn't like it. Check standards, and test with Apache. Test
+# binary upload with Apache.
+# mailto submission & enctype text/plain
+# I'm not going to fix this unless somebody tells me what real servers
+# that want this encoding actually expect: If enctype is
+# application/x-www-form-urlencoded and there's a FILE control present.
+# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
+# 17.13.2), but I send "name=" ATM. What about multiple file upload??
+
+# Would be nice, but I'm not going to do it myself:
+# -------------------------------------------------
+# Maybe a 0.4.x?
+# Replace by_label etc. with moniker / selector concept. Allows, eg.,
+# a choice between selection by value / id / label / element
+# contents. Or choice between matching labels exactly or by
+# substring. Etc.
+# Remove deprecated methods.
+# ...what else?
+# Work on DOMForm.
+# XForms? Don't know if there's a need here.
+
+# Public names of this module, i.e. what "from <module> import *" exports.
+# The BeautifulSoup-based parser names are appended further down, but only
+# when BeautifulSoup can be imported.
+__all__ = ['AmbiguityError', 'CheckboxControl', 'Control',
+ 'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm',
+ 'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl',
+ 'Item', 'ItemCountError', 'ItemNotFoundError', 'Label',
+ 'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile',
+ 'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl',
+ 'RadioControl', 'ScalarControl', 'SelectControl',
+ 'SubmitButtonControl', 'SubmitControl', 'TextControl',
+ 'TextareaControl', 'XHTMLCompatibleFormParser']
+
+# Compatibility shims: evaluating the bare name raises NameError on
+# interpreters that lack it, in which case a stand-in is defined here.
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+# Same probe for the bool() builtin; the fallback maps any truthy value to
+# True and anything else to False.
+try: bool
+except NameError:
+ def bool(expr):
+ if expr: return True
+ else: return False
+
+try:
+ import logging
+ import inspect
+except ImportError:
+ def debug(msg, *args, **kwds):
+ pass
+else:
+ _logger = logging.getLogger("ClientForm")
+ OPTIMIZATION_HACK = True
+
+ def debug(msg, *args, **kwds):
+ if OPTIMIZATION_HACK:
+ return
+
+ caller_name = inspect.stack()[1][3]
+ extended_msg = '%%s %s' % msg
+ extended_args = (caller_name,)+args
+ debug = _logger.debug(extended_msg, *extended_args, **kwds)
+
+ def _show_debug_messages():
+ global OPTIMIZATION_HACK
+ OPTIMIZATION_HACK = False
+ _logger.setLevel(logging.DEBUG)
+ handler = logging.StreamHandler(sys.stdout)
+ handler.setLevel(logging.DEBUG)
+ _logger.addHandler(handler)
+
+import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
+ htmlentitydefs, re, random
+from cStringIO import StringIO
+
+import sgmllib
+# monkeypatch to fix http://www.python.org/sf/803422 :-(
+sgmllib.charref = re.compile("(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
+
+# HTMLParser.HTMLParser is recent, so live without it if it's not available
+# (also, sgmllib.SGMLParser is much more tolerant of bad HTML)
+try:
+ import HTMLParser
+except ImportError:
+ HAVE_MODULE_HTMLPARSER = False
+else:
+ HAVE_MODULE_HTMLPARSER = True
+
+try:
+ import warnings
+except ImportError:
+ def deprecation(message, stack_offset=0):
+ pass
+else:
+ def deprecation(message, stack_offset=0):
+ warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset)
+
+# version string of this module
+VERSION = "0.2.10"
+
+CHUNK = 1024 # size of chunks fed to parser, in bytes
+
+# default value of the "encoding" argument accepted by the parser classes
+# and the Parse* functions in this module
+DEFAULT_ENCODING = "latin-1"
+
+class Missing: pass
+
+_compress_re = re.compile(r"\s+")
+def compress_text(text): return _compress_re.sub(" ", text.strip())
+
+def normalize_line_endings(text):
+ return re.sub(r"(?:(?
+ w = MimeWriter(f)
+ ...call w.addheader(key, value) 0 or more times...
+
+ followed by either:
+
+ f = w.startbody(content_type)
+ ...call f.write(data) for body data...
+
+ or:
+
+ w.startmultipartbody(subtype)
+ for each part:
+ subwriter = w.nextpart()
+ ...use the subwriter's methods to create the subpart...
+ w.lastpart()
+
+ The subwriter is another MimeWriter instance, and should be
+ treated in the same way as the toplevel MimeWriter. This way,
+ writing recursive body parts is easy.
+
+ Warning: don't forget to call lastpart()!
+
+ XXX There should be more state so calls made in the wrong order
+ are detected.
+
+ Some special cases:
+
+ - startbody() just returns the file passed to the constructor;
+ but don't use this knowledge, as it may be changed.
+
+ - startmultipartbody() actually returns a file as well;
+ this can be used to write the initial 'if you can read this your
+ mailer is not MIME-aware' message.
+
+ - If you call flushheaders(), the headers accumulated so far are
+ written out (and forgotten); this is useful if you don't need a
+ body part at all, e.g. for a subpart of type message/rfc822
+ that's (mis)used to store some header-like information.
+
+ - Passing a keyword argument 'prefix=' to addheader(),
+ start*body() affects where the header is inserted; 0 means
+ append at the end, 1 means insert at the start; default is
+ append for addheader(), but insert for start*body(), which use
+ it to determine where the Content-type header goes.
+
+ """
+
+ def __init__(self, fp, http_hdrs=None):
+ self._http_hdrs = http_hdrs
+ self._fp = fp
+ self._headers = []
+ self._boundary = []
+ self._first_part = True
+
+ def addheader(self, key, value, prefix=0,
+ add_to_http_hdrs=0):
+ """
+ prefix is ignored if add_to_http_hdrs is true.
+ """
+ lines = value.split("\r\n")
+ while lines and not lines[-1]: del lines[-1]
+ while lines and not lines[0]: del lines[0]
+ if add_to_http_hdrs:
+ value = "".join(lines)
+ # 2.2 urllib2 doesn't normalize header case
+ self._http_hdrs.append((key.capitalize(), value))
+ else:
+ for i in range(1, len(lines)):
+ lines[i] = " " + lines[i].strip()
+ value = "\r\n".join(lines) + "\r\n"
+ line = key.title() + ": " + value
+ if prefix:
+ self._headers.insert(0, line)
+ else:
+ self._headers.append(line)
+
+ def flushheaders(self):
+ self._fp.writelines(self._headers)
+ self._headers = []
+
+ def startbody(self, ctype=None, plist=[], prefix=1,
+ add_to_http_hdrs=0, content_type=1):
+ """
+ prefix is ignored if add_to_http_hdrs is true.
+ """
+ if content_type and ctype:
+ for name, value in plist:
+ ctype = ctype + ';\r\n %s=%s' % (name, value)
+ self.addheader("Content-Type", ctype, prefix=prefix,
+ add_to_http_hdrs=add_to_http_hdrs)
+ self.flushheaders()
+ if not add_to_http_hdrs: self._fp.write("\r\n")
+ self._first_part = True
+ return self._fp
+
+ def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1,
+ add_to_http_hdrs=0, content_type=1):
+ boundary = boundary or choose_boundary()
+ self._boundary.append(boundary)
+ return self.startbody("multipart/" + subtype,
+ [("boundary", boundary)] + plist,
+ prefix=prefix,
+ add_to_http_hdrs=add_to_http_hdrs,
+ content_type=content_type)
+
+ def nextpart(self):
+ boundary = self._boundary[-1]
+ if self._first_part:
+ self._first_part = False
+ else:
+ self._fp.write("\r\n")
+ self._fp.write("--" + boundary + "\r\n")
+ return self.__class__(self._fp)
+
+ def lastpart(self):
+ if self._first_part:
+ self.nextpart()
+ boundary = self._boundary.pop()
+ self._fp.write("\r\n--" + boundary + "--\r\n")
+
+
+class LocateError(ValueError): pass
+class AmbiguityError(LocateError): pass
+class ControlNotFoundError(LocateError): pass
+class ItemNotFoundError(LocateError): pass
+
+class ItemCountError(ValueError): pass
+
+# for backwards compatibility, ParseError derives from exceptions that were
+# raised by versions of ClientForm <= 0.2.5
+if HAVE_MODULE_HTMLPARSER:
+ SGMLLIB_PARSEERROR = sgmllib.SGMLParseError
+ class ParseError(sgmllib.SGMLParseError,
+ HTMLParser.HTMLParseError,
+ ):
+ pass
+else:
+ if hasattr(sgmllib, "SGMLParseError"):
+ SGMLLIB_PARSEERROR = sgmllib.SGMLParseError
+ class ParseError(sgmllib.SGMLParseError):
+ pass
+ else:
+ SGMLLIB_PARSEERROR = RuntimeError
+ class ParseError(RuntimeError):
+ pass
+
+
+class _AbstractFormParser:
+ """forms attribute contains HTMLForm instances on completion."""
+ # thanks to Moshe Zadka for an example of sgmllib/htmllib usage
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+ if entitydefs is None:
+ entitydefs = get_entitydefs()
+ self._entitydefs = entitydefs
+ self._encoding = encoding
+
+ self.base = None
+ self.forms = []
+ self.labels = []
+ self._current_label = None
+ self._current_form = None
+ self._select = None
+ self._optgroup = None
+ self._option = None
+ self._textarea = None
+
+ # forms[0] will contain all controls that are outside of any form
+ # self._global_form is an alias for self.forms[0]
+ self._global_form = None
+ self.start_form([])
+ self.end_form()
+ self._current_form = self._global_form = self.forms[0]
+
+ def do_base(self, attrs):
+ debug("%s", attrs)
+ for key, value in attrs:
+ if key == "href":
+ self.base = self.unescape_attr_if_required(value)
+
+ def end_body(self):
+ debug("")
+ if self._current_label is not None:
+ self.end_label()
+ if self._current_form is not self._global_form:
+ self.end_form()
+
+ def start_form(self, attrs):
+ debug("%s", attrs)
+ if self._current_form is not self._global_form:
+ raise ParseError("nested FORMs")
+ name = None
+ action = None
+ enctype = "application/x-www-form-urlencoded"
+ method = "GET"
+ d = {}
+ for key, value in attrs:
+ if key == "name":
+ name = self.unescape_attr_if_required(value)
+ elif key == "action":
+ action = self.unescape_attr_if_required(value)
+ elif key == "method":
+ method = self.unescape_attr_if_required(value.upper())
+ elif key == "enctype":
+ enctype = self.unescape_attr_if_required(value.lower())
+ d[key] = self.unescape_attr_if_required(value)
+ controls = []
+ self._current_form = (name, action, method, enctype), d, controls
+
+ def end_form(self):
+ debug("")
+ if self._current_label is not None:
+ self.end_label()
+ if self._current_form is self._global_form:
+ raise ParseError("end of FORM before start")
+ self.forms.append(self._current_form)
+ self._current_form = self._global_form
+
+ def start_select(self, attrs):
+ debug("%s", attrs)
+ if self._select is not None:
+ raise ParseError("nested SELECTs")
+ if self._textarea is not None:
+ raise ParseError("SELECT inside TEXTAREA")
+ d = {}
+ for key, val in attrs:
+ d[key] = self.unescape_attr_if_required(val)
+
+ self._select = d
+ self._add_label(d)
+
+ self._append_select_control({"__select": d})
+
+ def end_select(self):
+ debug("")
+ if self._select is None:
+ raise ParseError("end of SELECT before start")
+
+ if self._option is not None:
+ self._end_option()
+
+ self._select = None
+
+ def start_optgroup(self, attrs):
+ debug("%s", attrs)
+ if self._select is None:
+ raise ParseError("OPTGROUP outside of SELECT")
+ d = {}
+ for key, val in attrs:
+ d[key] = self.unescape_attr_if_required(val)
+
+ self._optgroup = d
+
+ def end_optgroup(self):
+ debug("")
+ if self._optgroup is None:
+ raise ParseError("end of OPTGROUP before start")
+ self._optgroup = None
+
+ def _start_option(self, attrs):
+ debug("%s", attrs)
+ if self._select is None:
+ raise ParseError("OPTION outside of SELECT")
+ if self._option is not None:
+ self._end_option()
+
+ d = {}
+ for key, val in attrs:
+ d[key] = self.unescape_attr_if_required(val)
+
+ self._option = {}
+ self._option.update(d)
+ if (self._optgroup and self._optgroup.has_key("disabled") and
+ not self._option.has_key("disabled")):
+ self._option["disabled"] = None
+
+ def _end_option(self):
+ debug("")
+ if self._option is None:
+ raise ParseError("end of OPTION before start")
+
+ contents = self._option.get("contents", "").strip()
+ self._option["contents"] = contents
+ if not self._option.has_key("value"):
+ self._option["value"] = contents
+ if not self._option.has_key("label"):
+ self._option["label"] = contents
+ # stuff dict of SELECT HTML attrs into a special private key
+ # (gets deleted again later)
+ self._option["__select"] = self._select
+ self._append_select_control(self._option)
+ self._option = None
+
+ def _append_select_control(self, attrs):
+ debug("%s", attrs)
+ controls = self._current_form[2]
+ name = self._select.get("name")
+ controls.append(("select", name, attrs))
+
+ def start_textarea(self, attrs):
+ debug("%s", attrs)
+ if self._textarea is not None:
+ raise ParseError("nested TEXTAREAs")
+ if self._select is not None:
+ raise ParseError("TEXTAREA inside SELECT")
+ d = {}
+ for key, val in attrs:
+ d[key] = self.unescape_attr_if_required(val)
+ self._add_label(d)
+
+ self._textarea = d
+
+ def end_textarea(self):
+ debug("")
+ if self._textarea is None:
+ raise ParseError("end of TEXTAREA before start")
+ controls = self._current_form[2]
+ name = self._textarea.get("name")
+ controls.append(("textarea", name, self._textarea))
+ self._textarea = None
+
+ def start_label(self, attrs):
+ debug("%s", attrs)
+ if self._current_label:
+ self.end_label()
+ d = {}
+ for key, val in attrs:
+ d[key] = self.unescape_attr_if_required(val)
+ taken = bool(d.get("for")) # empty id is invalid
+ d["__text"] = ""
+ d["__taken"] = taken
+ if taken:
+ self.labels.append(d)
+ self._current_label = d
+
+ def end_label(self):
+ debug("")
+ label = self._current_label
+ if label is None:
+ # something is ugly in the HTML, but we're ignoring it
+ return
+ self._current_label = None
+ # if it is staying around, it is True in all cases
+ del label["__taken"]
+
+ def _add_label(self, d):
+ #debug("%s", d)
+ if self._current_label is not None:
+ if not self._current_label["__taken"]:
+ self._current_label["__taken"] = True
+ d["__label"] = self._current_label
+
+ def handle_data(self, data):
+ debug("%s", data)
+
+ if self._option is not None:
+ # self._option is a dictionary of the OPTION element's HTML
+ # attributes, but it has two special keys, one of which is the
+ # special "contents" key contains text between OPTION tags (the
+ # other is the "__select" key: see the end_option method)
+ map = self._option
+ key = "contents"
+ elif self._textarea is not None:
+ map = self._textarea
+ key = "value"
+ data = normalize_line_endings(data)
+ # not if within option or textarea
+ elif self._current_label is not None:
+ map = self._current_label
+ key = "__text"
+ else:
+ return
+
+ if data and not map.has_key(key):
+ # according to
+ # http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break
+ # immediately after start tags or immediately before end tags must
+ # be ignored, but real browsers only ignore a line break after a
+ # start tag, so we'll do that.
+ if data[0:2] == "\r\n":
+ data = data[2:]
+ elif data[0:1] in ["\n", "\r"]:
+ data = data[1:]
+ map[key] = data
+ else:
+ map[key] = map[key] + data
+
+ def do_button(self, attrs):
+ debug("%s", attrs)
+ d = {}
+ d["type"] = "submit" # default
+ for key, val in attrs:
+ d[key] = self.unescape_attr_if_required(val)
+ controls = self._current_form[2]
+
+ type = d["type"]
+ name = d.get("name")
+ # we don't want to lose information, so use a type string that
+ # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
+ # e.g. type for BUTTON/RESET is "resetbutton"
+ # (type for INPUT/RESET is "reset")
+ type = type+"button"
+ self._add_label(d)
+ controls.append((type, name, d))
+
+ def do_input(self, attrs):
+ debug("%s", attrs)
+ d = {}
+ d["type"] = "text" # default
+ for key, val in attrs:
+ d[key] = self.unescape_attr_if_required(val)
+ controls = self._current_form[2]
+
+ type = d["type"]
+ name = d.get("name")
+ self._add_label(d)
+ controls.append((type, name, d))
+
+ def do_isindex(self, attrs):
+ debug("%s", attrs)
+ d = {}
+ for key, val in attrs:
+ d[key] = self.unescape_attr_if_required(val)
+ controls = self._current_form[2]
+
+ self._add_label(d)
+ # isindex doesn't have type or name HTML attributes
+ controls.append(("isindex", None, d))
+
+ def handle_entityref(self, name):
+ #debug("%s", name)
+ self.handle_data(unescape(
+ '&%s;' % name, self._entitydefs, self._encoding))
+
+ def handle_charref(self, name):
+ #debug("%s", name)
+ self.handle_data(unescape_charref(name, self._encoding))
+
+ def unescape_attr(self, name):
+ #debug("%s", name)
+ return unescape(name, self._entitydefs, self._encoding)
+
+ def unescape_attrs(self, attrs):
+ #debug("%s", attrs)
+ escaped_attrs = {}
+ for key, val in attrs.items():
+ try:
+ val.items
+ except AttributeError:
+ escaped_attrs[key] = self.unescape_attr(val)
+ else:
+ # e.g. "__select" -- yuck!
+ escaped_attrs[key] = self.unescape_attrs(val)
+ return escaped_attrs
+
+ def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)
+ def unknown_charref(self, ref): self.handle_data("%s;" % ref)
+
+
+if not HAVE_MODULE_HTMLPARSER:
+ class XHTMLCompatibleFormParser:
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+ raise ValueError("HTMLParser could not be imported")
+else:
+ class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
+ """Good for XHTML, bad for tolerance of incorrect HTML."""
+ # thanks to Michael Howitz for this!
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+ HTMLParser.HTMLParser.__init__(self)
+ _AbstractFormParser.__init__(self, entitydefs, encoding)
+
+ def feed(self, data):
+ try:
+ HTMLParser.HTMLParser.feed(self, data)
+ except HTMLParser.HTMLParseError, exc:
+ raise ParseError(exc)
+
+ def start_option(self, attrs):
+ _AbstractFormParser._start_option(self, attrs)
+
+ def end_option(self):
+ _AbstractFormParser._end_option(self)
+
+ def handle_starttag(self, tag, attrs):
+ try:
+ method = getattr(self, "start_" + tag)
+ except AttributeError:
+ try:
+ method = getattr(self, "do_" + tag)
+ except AttributeError:
+ pass # unknown tag
+ else:
+ method(attrs)
+ else:
+ method(attrs)
+
+ def handle_endtag(self, tag):
+ try:
+ method = getattr(self, "end_" + tag)
+ except AttributeError:
+ pass # unknown tag
+ else:
+ method()
+
+ def unescape(self, name):
+ # Use the entitydefs passed into constructor, not
+ # HTMLParser.HTMLParser's entitydefs.
+ return self.unescape_attr(name)
+
+ def unescape_attr_if_required(self, name):
+ return name # HTMLParser.HTMLParser already did it
+ def unescape_attrs_if_required(self, attrs):
+ return attrs # ditto
+
+ def close(self):
+ HTMLParser.HTMLParser.close(self)
+ self.end_body()
+
+
+class _AbstractSgmllibParser(_AbstractFormParser):
+
+ def do_option(self, attrs):
+ _AbstractFormParser._start_option(self, attrs)
+
+ if sys.version_info[:2] >= (2,5):
+ # we override this attr to decode hex charrefs
+ entity_or_charref = re.compile(
+ '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)')
+ def convert_entityref(self, name):
+ return unescape("&%s;" % name, self._entitydefs, self._encoding)
+ def convert_charref(self, name):
+ return unescape_charref("%s" % name, self._encoding)
+ def unescape_attr_if_required(self, name):
+ return name # sgmllib already did it
+ def unescape_attrs_if_required(self, attrs):
+ return attrs # ditto
+ else:
+ def unescape_attr_if_required(self, name):
+ return self.unescape_attr(name)
+ def unescape_attrs_if_required(self, attrs):
+ return self.unescape_attrs(attrs)
+
+
+class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
+ """Good for tolerance of incorrect HTML, bad for XHTML."""
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+ sgmllib.SGMLParser.__init__(self)
+ _AbstractFormParser.__init__(self, entitydefs, encoding)
+
+ def feed(self, data):
+ try:
+ sgmllib.SGMLParser.feed(self, data)
+ except SGMLLIB_PARSEERROR, exc:
+ raise ParseError(exc)
+
+ def close(self):
+ sgmllib.SGMLParser.close(self)
+ self.end_body()
+
+
+# sigh, must support mechanize by allowing dynamic creation of classes based on
+# its bundled copy of BeautifulSoup (which was necessary because of dependency
+# problems)
+
+def _create_bs_classes(bs,
+ icbinbs,
+ ):
+ class _AbstractBSFormParser(_AbstractSgmllibParser):
+ bs_base_class = None
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+ _AbstractFormParser.__init__(self, entitydefs, encoding)
+ self.bs_base_class.__init__(self)
+ def handle_data(self, data):
+ _AbstractFormParser.handle_data(self, data)
+ self.bs_base_class.handle_data(self, data)
+ def feed(self, data):
+ try:
+ self.bs_base_class.feed(self, data)
+ except SGMLLIB_PARSEERROR, exc:
+ raise ParseError(exc)
+ def close(self):
+ self.bs_base_class.close(self)
+ self.end_body()
+
+ class RobustFormParser(_AbstractBSFormParser, bs):
+ """Tries to be highly tolerant of incorrect HTML."""
+ pass
+ RobustFormParser.bs_base_class = bs
+ class NestingRobustFormParser(_AbstractBSFormParser, icbinbs):
+ """Tries to be highly tolerant of incorrect HTML.
+
+ Different from RobustFormParser in that it more often guesses nesting
+ above missing end tags (see BeautifulSoup docs).
+
+ """
+ pass
+ NestingRobustFormParser.bs_base_class = icbinbs
+
+ return RobustFormParser, NestingRobustFormParser
+
+try:
+ if sys.version_info[:2] < (2, 2):
+ raise ImportError # BeautifulSoup uses generators
+ import BeautifulSoup
+except ImportError:
+ pass
+else:
+ RobustFormParser, NestingRobustFormParser = _create_bs_classes(
+ BeautifulSoup.BeautifulSoup, BeautifulSoup.ICantBelieveItsBeautifulSoup
+ )
+ __all__ += ['RobustFormParser', 'NestingRobustFormParser']
+
+
+#FormParser = XHTMLCompatibleFormParser # testing hack
+#FormParser = RobustFormParser # testing hack
+
+
+def ParseResponseEx(response,
+ select_default=False,
+ form_parser_class=FormParser,
+ request_class=urllib2.Request,
+ entitydefs=None,
+ encoding=DEFAULT_ENCODING,
+
+ # private
+ _urljoin=urlparse.urljoin,
+ _urlparse=urlparse.urlparse,
+ _urlunparse=urlparse.urlunparse,
+ ):
+ """Identical to ParseResponse, except that:
+
+ 1. The returned list contains an extra item. The first form in the list
+ contains all controls not contained in any FORM element.
+
+ 2. The arguments ignore_errors and backwards_compat have been removed.
+
+ 3. Backwards-compatibility mode (backwards_compat=True) is not available.
+ """
+ return _ParseFileEx(response, response.geturl(),
+ select_default,
+ False,
+ form_parser_class,
+ request_class,
+ entitydefs,
+ False,
+ encoding,
+ _urljoin=_urljoin,
+ _urlparse=_urlparse,
+ _urlunparse=_urlunparse,
+ )
+
+def ParseFileEx(file, base_uri,
+ select_default=False,
+ form_parser_class=FormParser,
+ request_class=urllib2.Request,
+ entitydefs=None,
+ encoding=DEFAULT_ENCODING,
+
+ # private
+ _urljoin=urlparse.urljoin,
+ _urlparse=urlparse.urlparse,
+ _urlunparse=urlparse.urlunparse,
+ ):
+ """Identical to ParseFile, except that:
+
+ 1. The returned list contains an extra item. The first form in the list
+ contains all controls not contained in any FORM element.
+
+ 2. The arguments ignore_errors and backwards_compat have been removed.
+
+ 3. Backwards-compatibility mode (backwards_compat=True) is not available.
+ """
+ return _ParseFileEx(file, base_uri,
+ select_default,
+ False,
+ form_parser_class,
+ request_class,
+ entitydefs,
+ False,
+ encoding,
+ _urljoin=_urljoin,
+ _urlparse=_urlparse,
+ _urlunparse=_urlunparse,
+ )
+
+def ParseResponse(response, *args, **kwds):
+ """Parse HTTP response and return a list of HTMLForm instances.
+
+ The return value of urllib2.urlopen can be conveniently passed to this
+ function as the response parameter.
+
+ ClientForm.ParseError is raised on parse errors.
+
+ response: file-like object (supporting read() method) with a method
+ geturl(), returning the URI of the HTTP response
+ select_default: for multiple-selection SELECT controls and RADIO controls,
+ pick the first item as the default if none are selected in the HTML
+ form_parser_class: class to instantiate and use to pass
+ request_class: class to return from .click() method (default is
+ urllib2.Request)
+ entitydefs: mapping like {"&": "&", ...} containing HTML entity
+ definitions (a sensible default is used)
+ encoding: character encoding used for encoding numeric character references
+ when matching link text. ClientForm does not attempt to find the encoding
+ in a META HTTP-EQUIV attribute in the document itself (mechanize, for
+ example, does do that and will pass the correct value to ClientForm using
+ this parameter).
+
+ backwards_compat: boolean that determines whether the returned HTMLForm
+ objects are backwards-compatible with old code. If backwards_compat is
+ true:
+
+ - ClientForm 0.1 code will continue to work as before.
+
+ - Label searches that do not specify a nr (number or count) will always
+ get the first match, even if other controls match. If
+ backwards_compat is False, label searches that have ambiguous results
+ will raise an AmbiguityError.
+
+ - Item label matching is done by strict string comparison rather than
+ substring matching.
+
+ - De-selecting individual list items is allowed even if the Item is
+ disabled.
+
+ The backwards_compat argument will be deprecated in a future release.
+
+ Pass a true value for select_default if you want the behaviour specified by
+ RFC 1866 (the HTML 2.0 standard), which is to select the first item in a
+ RADIO or multiple-selection SELECT control if none were selected in the
+ HTML. Most browsers (including Microsoft Internet Explorer (IE) and
+ Netscape Navigator) instead leave all items unselected in these cases. The
+ W3C HTML 4.0 standard leaves this behaviour undefined in the case of
+ multiple-selection SELECT controls, but insists that at least one RADIO
+ button should be checked at all times, in contradiction to browser
+ behaviour.
+
+ There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses
+ HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
+ sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.
+ Note that HTMLParser is only available in Python 2.2 and later. You can
+ pass your own class in here as a hack to work around bad HTML, but at your
+ own risk: there is no well-defined interface.
+
+ """
+ return _ParseFileEx(response, response.geturl(), *args, **kwds)[1:]
+
+def ParseFile(file, base_uri, *args, **kwds):
+ """Parse HTML and return a list of HTMLForm instances.
+
+ ClientForm.ParseError is raised on parse errors.
+
+ file: file-like object (supporting read() method) containing HTML with zero
+ or more forms to be parsed
+ base_uri: the URI of the document (note that the base URI used to submit
+ the form will be that given in the BASE element if present, not that of
+ the document)
+
+ For the other arguments and further details, see ParseResponse.__doc__.
+
+ """
+ return _ParseFileEx(file, base_uri, *args, **kwds)[1:]
+
+def _ParseFileEx(file, base_uri,
+ select_default=False,
+ ignore_errors=False,
+ form_parser_class=FormParser,
+ request_class=urllib2.Request,
+ entitydefs=None,
+ backwards_compat=True,
+ encoding=DEFAULT_ENCODING,
+ _urljoin=urlparse.urljoin,
+ _urlparse=urlparse.urlparse,
+ _urlunparse=urlparse.urlunparse,
+ ):
+ if backwards_compat:
+ deprecation("operating in backwards-compatibility mode", 1)
+ fp = form_parser_class(entitydefs, encoding)
+ while 1:
+ data = file.read(CHUNK)
+ try:
+ fp.feed(data)
+ except ParseError, e:
+ e.base_uri = base_uri
+ raise
+ if len(data) != CHUNK: break
+ fp.close()
+ if fp.base is not None:
+ # HTML BASE element takes precedence over document URI
+ base_uri = fp.base
+ labels = [] # Label(label) for label in fp.labels]
+ id_to_labels = {}
+ for l in fp.labels:
+ label = Label(l)
+ labels.append(label)
+ for_id = l["for"]
+ coll = id_to_labels.get(for_id)
+ if coll is None:
+ id_to_labels[for_id] = [label]
+ else:
+ coll.append(label)
+ forms = []
+ for (name, action, method, enctype), attrs, controls in fp.forms:
+ if action is None:
+ action = base_uri
+ else:
+ action = _urljoin(base_uri, action)
+ # would be nice to make HTMLForm class (form builder) pluggable
+ form = HTMLForm(
+ action, method, enctype, name, attrs, request_class,
+ forms, labels, id_to_labels, backwards_compat)
+ form._urlparse = _urlparse
+ form._urlunparse = _urlunparse
+ for ii in range(len(controls)):
+ type, name, attrs = controls[ii]
+ # index=ii*10 allows ImageControl to return multiple ordered pairs
+ form.new_control(
+ type, name, attrs, select_default=select_default, index=ii*10)
+ forms.append(form)
+ for form in forms:
+ form.fixup()
+ return forms
+
+
+class Label:
+ def __init__(self, attrs):
+ self.id = attrs.get("for")
+ self._text = attrs.get("__text").strip()
+ self._ctext = compress_text(self._text)
+ self.attrs = attrs
+ self._backwards_compat = False # maintained by HTMLForm
+
+ def __getattr__(self, name):
+ if name == "text":
+ if self._backwards_compat:
+ return self._text
+ else:
+ return self._ctext
+ return getattr(Label, name)
+
+ def __setattr__(self, name, value):
+ if name == "text":
+ # don't see any need for this, so make it read-only
+ raise AttributeError("text attribute is read-only")
+ self.__dict__[name] = value
+
+ def __str__(self):
+ return "