mirror of
https://github.com/sqlmapproject/sqlmap.git
synced 2025-02-16 19:40:37 +03:00
Fix for Issue #324 (crawling when HTML is not well-formed)
This commit is contained in:
parent
127b880577
commit
cb91729913
|
@ -64,10 +64,21 @@ class Crawler(object):
|
|||
|
||||
if isinstance(content, unicode):
|
||||
try:
|
||||
match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
|
||||
if match:
|
||||
content = "<html>%s</html>" % match.group(1)
|
||||
|
||||
soup = BeautifulSoup(content)
|
||||
for tag in soup('a'):
|
||||
if tag.get("href"):
|
||||
url = urlparse.urljoin(conf.url, tag.get("href"))
|
||||
tags = soup('a')
|
||||
|
||||
if not tags:
|
||||
tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
|
||||
|
||||
for tag in tags:
|
||||
href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
|
||||
|
||||
if href:
|
||||
url = urlparse.urljoin(conf.url, href)
|
||||
|
||||
# flag to know if we are dealing with the same target host
|
||||
_ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))
|
||||
|
|
Loading…
Reference in New Issue
Block a user