minor update

This commit is contained in:
Miroslav Stampar 2011-11-13 10:38:27 +00:00
parent bbb7e1562d
commit 76fb6ba666

View File

@@ -1586,7 +1586,7 @@ def getFilteredPageContent(page, onlyText=True):
# only if the page's charset has been successfully identified
if isinstance(page, unicode):
-retVal = re.sub(r"(?s)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page)
+retVal = re.sub(r"(?s)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page, flags=re.I)
while retVal.find("  ") != -1:
retVal = retVal.replace("  ", " ")