From 9f4a32ca2b31e1571397998b85a7aa96d74bf012 Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Tue, 20 Jan 2015 10:03:35 +0100
Subject: [PATCH] Automatically checking for sitemap existence in case of
 --crawl

---
 lib/utils/crawler.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/lib/utils/crawler.py b/lib/utils/crawler.py
index 28b4d5790..1d7033d26 100644
--- a/lib/utils/crawler.py
+++ b/lib/utils/crawler.py
@@ -28,6 +28,7 @@ from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
 from lib.core.settings import UNICODE_ENCODING
 from lib.core.threads import getCurrentThreadData
 from lib.core.threads import runThreads
+from lib.parse.sitemap import parseSitemap
 from lib.request.connect import Connect as Request
 from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
 from thirdparty.oset.pyoset import oset
@@ -116,6 +117,26 @@ def crawl(target):
         threadData.shared.deeper = set()
         threadData.shared.unprocessed = set([target])
 
+        if not conf.sitemapUrl:
+            message = "do you want to check for the existence of "
+            message += "site's sitemap(.xml) [Y/n] "
+            test = readInput(message, default="Y")
+            if test[0] not in ("n", "N"):
+                items = None
+                url = "%s://%s/sitemap.xml" % (conf.scheme, conf.hostname)
+                try:
+                    items = parseSitemap(url)
+                except:
+                    pass
+                finally:
+                    if items:
+                        for item in items:
+                            if re.search(r"(.*?)\?(.+)", item):
+                                threadData.shared.value.add(item)
+                        if conf.crawlDepth > 1:
+                            threadData.shared.unprocessed.update(items)
+                    logger.info("%s links found" % ("no" if not items else len(items)))
+
         infoMsg = "starting crawler"
         if conf.bulkFile:
             infoMsg += " for target URL '%s'" % target