Fix for Issue #324 (crawling when HTML is not well-formed)

2025-07-22 05:59:46 +03:00 · 2012-12-27 20:55:37 +01:00 · 2012-12-27 20:55:37 +01:00 · cb91729913
commit cb91729913
parent 127b880577
1 changed files with 14 additions and 3 deletions
--- a/lib/utils/crawler.py
+++ b/lib/utils/crawler.py
@@ -64,10 +64,21 @@ class Crawler(object):

                    if isinstance(content, unicode):
                        try:
+                            match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
+                            if match:
+                                content = "<html>%s</html>" % match.group(1)
+
                            soup = BeautifulSoup(content)
-                            for tag in soup('a'):
-                                if tag.get("href"):
-                                    url = urlparse.urljoin(conf.url, tag.get("href"))
+                            tags = soup('a')
+
+                            if not tags:
+                                tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
+
+                            for tag in tags:
+                                href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
+
+                                if href:
+                                    url = urlparse.urljoin(conf.url, href)

                                    # flag to know if we are dealing with the same target host
                                    _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))