From bdc724cb4627657aedb27646e77972b34417babb Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Tue, 20 Dec 2011 10:34:28 +0000
Subject: [PATCH] minor bug fix (crawler: ignore UnicodeEncodeError on non-HTML content)

---
 doc/THANKS           |  3 +++
 lib/utils/crawler.py | 47 +++++++++++++++++++++++---------------------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/doc/THANKS b/doc/THANKS
index 764889f28..2a42b36ce 100644
--- a/doc/THANKS
+++ b/doc/THANKS
@@ -106,6 +106,9 @@ Alessandro Curio <alessandro.curio@gmail.com>
 Alessio Dalla Piazza <alessio.dallapiazza@gmail.com>
     for reporting a couple of bugs
 
+Sherif El-Deeb <archeldeeb@gmail.com>
+    for reporting a minor bug
+
 Stefano Di Paola <stefano.dipaola@wisec.it>
     for suggesting good features
 
diff --git a/lib/utils/crawler.py b/lib/utils/crawler.py
index 20861964a..e25165238 100644
--- a/lib/utils/crawler.py
+++ b/lib/utils/crawler.py
@@ -72,29 +72,32 @@ class Crawler:
                         break
 
                     if isinstance(content, unicode):
-                        soup = BeautifulSoup(content)
-                        for tag in soup('a'):
-                            if tag.get("href"):
-                                url = urlparse.urljoin(conf.url, tag.get("href"))
-
-                                # flag to know if we are dealing with the same target host
-                                target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url]))
-
-                                if conf.scope:
-                                    if not re.search(conf.scope, url, re.I):
+                        try:
+                            soup = BeautifulSoup(content)
+                            for tag in soup('a'):
+                                if tag.get("href"):
+                                    url = urlparse.urljoin(conf.url, tag.get("href"))
+    
+                                    # flag to know if we are dealing with the same target host
+                                    target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url]))
+    
+                                    if conf.scope:
+                                        if not re.search(conf.scope, url, re.I):
+                                            continue
+                                    elif not target:
                                         continue
-                                elif not target:
-                                    continue
-
-                                if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
-                                    kb.locks.outputs.acquire()
-                                    threadData.shared.deeper.add(url)
-                                    if re.search(r"(.*?)\?(.+)", url):
-                                        threadData.shared.outputs.add(url)
-                                    kb.locks.outputs.release()
-
-                        if conf.forms:
-                            findPageForms(content, current, False, True)
+    
+                                    if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
+                                        kb.locks.outputs.acquire()
+                                        threadData.shared.deeper.add(url)
+                                        if re.search(r"(.*?)\?(.+)", url):
+                                            threadData.shared.outputs.add(url)
+                                        kb.locks.outputs.release()
+                        except UnicodeEncodeError: # for non-HTML files
+                            pass
+                        finally:
+                            if conf.forms:
+                                findPageForms(content, current, False, True)
 
                     if conf.verbose in (1, 2):
                         kb.locks.ioLock.acquire()