Revert of the revert (it is a good idea to keep it like this because of problems with, e.g., --text-only and binary content)

This commit is contained in:
Miroslav Stampar 2011-06-09 07:53:31 +00:00
parent 8ec4bc9d9d
commit af5fe457bd

View File

@@ -1399,7 +1399,8 @@ def sanitizeAsciiString(subject):
def getFilteredPageContent(page, onlyText=True): def getFilteredPageContent(page, onlyText=True):
retVal = page retVal = page
if isinstance(page, basestring): # only if the page's charset has been successfully identified
if isinstance(page, unicode):
retVal = re.sub(r"(?s)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page) retVal = re.sub(r"(?s)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page)
while retVal.find(" ") != -1: while retVal.find(" ") != -1:
@@ -1412,7 +1413,8 @@ def getFilteredPageContent(page, onlyText=True):
def getPageTextWordsSet(page): def getPageTextWordsSet(page):
retVal = None retVal = None
if isinstance(page, basestring): # only if the page's charset has been successfully identified
if isinstance(page, unicode):
page = getFilteredPageContent(page) page = getFilteredPageContent(page)
retVal = set(re.findall(r"\w+", page)) retVal = set(re.findall(r"\w+", page))