Revert of the last commit; I have to think about it.

This commit is contained in:
Miroslav Stampar 2011-06-09 06:32:53 +00:00
parent 9c093d91f2
commit 8ec4bc9d9d

View File

@@ -1399,8 +1399,7 @@ def sanitizeAsciiString(subject):
def getFilteredPageContent(page, onlyText=True): def getFilteredPageContent(page, onlyText=True):
retVal = page retVal = page
# only if the page's charset had been successfully identified if isinstance(page, basestring):
if isinstance(page, unicode):
retVal = re.sub(r"(?s)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page) retVal = re.sub(r"(?s)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page)
while retVal.find(" ") != -1: while retVal.find(" ") != -1:
@@ -1413,8 +1412,7 @@ def getFilteredPageContent(page, onlyText=True):
def getPageTextWordsSet(page): def getPageTextWordsSet(page):
retVal = None retVal = None
# only if the page's charset had been successfully identified if isinstance(page, basestring):
if isinstance(page, unicode):
page = getFilteredPageContent(page) page = getFilteredPageContent(page)
retVal = set(re.findall(r"\w+", page)) retVal = set(re.findall(r"\w+", page))