From af5fe457bdbae5bad3dd6380d4cc379fc7e50d3f Mon Sep 17 00:00:00 2001
From: Miroslav Stampar
Date: Thu, 9 Jun 2011 07:53:31 +0000
Subject: [PATCH] revert of the revert (it's a good idea to have it like this because of problems with e.g. --text-only and binary content)

---
 lib/core/common.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/core/common.py b/lib/core/common.py
index e096fcb35..3b8d42a95 100644
--- a/lib/core/common.py
+++ b/lib/core/common.py
@@ -1399,7 +1399,8 @@ def sanitizeAsciiString(subject):
 def getFilteredPageContent(page, onlyText=True):
     retVal = page
 
-    if isinstance(page, basestring):
+    # only if the page's charset has been successfully identified
+    if isinstance(page, unicode):
         retVal = re.sub(r"(?s)||%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page)
 
         while retVal.find(" ") != -1:
@@ -1412,7 +1413,8 @@ def getFilteredPageContent(page, onlyText=True):
 def getPageTextWordsSet(page):
     retVal = None
 
-    if isinstance(page, basestring):
+    # only if the page's charset has been successfully identified
+    if isinstance(page, unicode):
         page = getFilteredPageContent(page)
         retVal = set(re.findall(r"\w+", page))
 