minor update

This commit is contained in:
Miroslav Stampar 2010-10-12 20:01:59 +00:00
parent d2ec132469
commit 9a08f7feb8

View File

@@ -1129,6 +1129,8 @@ def getFilteredPageContent(page):
retVal = page retVal = page
if isinstance(page, basestring): if isinstance(page, basestring):
retVal = re.sub(r"(?s)<script.+?</script>|<style.+?</style>|<[^>]+>|\t|\n|\r", " ", page) retVal = re.sub(r"(?s)<script.+?</script>|<style.+?</style>|<[^>]+>|\t|\n|\r", " ", page)
while retVal.find("  ") != -1:
retVal = retVal.replace("  ", " ")
return retVal return retVal
def getPageTextWordsSet(page): def getPageTextWordsSet(page):