Implementation for an Issue #283

Miroslav Stampar 2012-12-06 11:57:57 +01:00
parent ab67344448
commit baccbd6f48
2 changed files with 13 additions and 2 deletions


@@ -3110,6 +3110,17 @@ def findPageForms(content, url, raise_=False, addToTargets=False):
     if addToTargets and retVal:
         for target in retVal:
             url = target[0]
+            # flag to know if we are dealing with the same target host
+            _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (response.geturl(), url)))
+            if conf.scope:
+                if not re.search(conf.scope, url, re.I):
+                    continue
+            elif not _:
+                continue
             kb.targets.add(target)

     return retVal
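
The check added here keeps a discovered form target only if it either matches the user-supplied scope regular expression or lives on the same host as the page the forms were parsed from. Below is a minimal standalone sketch of that same-host comparison; it is not sqlmap code, the name same_target_host is illustrative, and urllib.parse stands in for the Python 2 urlparse module used in the diff.

# Hedged sketch (not sqlmap code): the same-host test performed by the
# reduce(...) expression above, written as a plain helper function.
from urllib.parse import urlparse

def same_target_host(url, reference_url):
    # Compare only the hostnames, ignoring any explicit port component.
    hosts = [urlparse(u).netloc.split(':')[0] for u in (url, reference_url)]
    return hosts[0] == hosts[1]

print(same_target_host("http://example.com/login.php", "http://example.com:8080/"))  # True
print(same_target_host("http://other.example.net/x", "http://example.com/"))         # False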


@@ -70,12 +70,12 @@ class Crawler(object):
             url = urlparse.urljoin(conf.url, tag.get("href"))
             # flag to know if we are dealing with the same target host
-            target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url]))
+            _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))
             if conf.scope:
                 if not re.search(conf.scope, url, re.I):
                     continue
-            elif not target:
+            elif not _:
                 continue
             if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
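
In the crawler the same decision is applied to every harvested link: an explicit scope regex, when given, takes precedence over the same-host flag; otherwise off-host links are skipped. A hedged sketch of that filtering rule follows; should_keep, same_host and scope_regex are illustrative names standing in for the diff's conf.scope and reduce() result, not part of sqlmap.

# Hedged sketch (not sqlmap code) of the crawl-filtering rule shown above.
import re

def should_keep(url, same_host, scope_regex=None):
    # A user-supplied scope regex takes precedence over the host comparison.
    if scope_regex:
        return re.search(scope_regex, url, re.I) is not None
    # Without a scope regex, only links on the original target host survive.
    return same_host

print(should_keep("http://example.com/page.php", True))                            # True
print(should_keep("http://other.net/page.php", False))                             # False
print(should_keep("http://other.net/page.php", False, scope_regex=r"other\.net"))  # True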