minor update

This commit is contained in:
Miroslav Stampar 2011-01-01 19:07:40 +00:00
parent ef27fd5ea1
commit 0e815177c8

View File

@@ -1111,11 +1111,11 @@ def sanitizeAsciiString(subject):
else:
return None
def getFilteredPageContent(page):
def getFilteredPageContent(page, onlyText=True):
retVal = page
if isinstance(page, basestring):
retVal = re.sub(r"(?s)<script.+?</script>|<style.+?</style>|<[^>]+>|\t|\n|\r", " ", page)
retVal = re.sub(r"(?s)<script.+?</script>|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page)
while retVal.find(" ") != -1:
retVal = retVal.replace(" ", " ")