From 4d1fa5596bcb6b11cf41b8de7c879f474678cf60 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 20 Jun 2011 12:37:51 +0000 Subject: [PATCH] added support for --scope in --crawl mode --- lib/core/option.py | 2 +- lib/utils/crawler.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/lib/core/option.py b/lib/core/option.py index abb9efe5c..a1f29a842 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -192,7 +192,7 @@ def __feedTargetsDict(reqFile, addedTargetUrls): continue if conf.scope: - getPostReq &= re.search(conf.scope, url) is not None + getPostReq &= re.search(conf.scope, url, re.I) is not None if getPostReq: if not kb.targetUrls or url not in addedTargetUrls: diff --git a/lib/utils/crawler.py b/lib/utils/crawler.py index 9dece4627..266b4e91f 100644 --- a/lib/utils/crawler.py +++ b/lib/utils/crawler.py @@ -58,14 +58,21 @@ class Crawler: for tag in soup('a'): if tag.get("href"): url = urlparse.urljoin(conf.url, tag.get("href")) + # flag to know if we are dealing with the same target host target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url])) - if target: - kb.locks.outputs.acquire() - threadData.shared.deeper.add(url) - if re.search(r"(.*?)\?(.+)", url): - threadData.shared.outputs.add(url) - kb.locks.outputs.release() + + if conf.scope: + if not re.search(conf.scope, url, re.I): + continue + elif not target: + continue + + kb.locks.outputs.acquire() + threadData.shared.deeper.add(url) + if re.search(r"(.*?)\?(.+)", url): + threadData.shared.outputs.add(url) + kb.locks.outputs.release() threadData.shared.deeper = set() threadData.shared.unprocessed = set([conf.url])