Automatically checking for sitemap existence in case of --crawl

Miroslav Stampar 2015-01-20 10:03:35 +01:00
parent a603002acd
commit 9f4a32ca2b


@@ -28,6 +28,7 @@ from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
 from lib.core.settings import UNICODE_ENCODING
 from lib.core.threads import getCurrentThreadData
 from lib.core.threads import runThreads
+from lib.parse.sitemap import parseSitemap
 from lib.request.connect import Connect as Request
 from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
 from thirdparty.oset.pyoset import oset
@@ -116,6 +117,26 @@ def crawl(target):
     threadData.shared.deeper = set()
     threadData.shared.unprocessed = set([target])
+    if not conf.sitemapUrl:
+        message = "do you want to check for the existence of "
+        message += "site's sitemap(.xml) [Y/n] "
+        test = readInput(message, default="Y")
+        if test[0] not in ("n", "N"):
+            items = None
+            url = "%s://%s/sitemap.xml" % (conf.scheme, conf.hostname)
+            try:
+                items = parseSitemap(url)
+            except:
+                pass
+            finally:
+                if items:
+                    for item in items:
+                        if re.search(r"(.*?)\?(.+)", item):
+                            threadData.shared.value.add(item)
+                    if conf.crawlDepth > 1:
+                        threadData.shared.unprocessed.update(items)
+                logger.info("%s links found" % ("no" if not items else len(items)))
     infoMsg = "starting crawler"
     if conf.bulkFile:
         infoMsg += " for target URL '%s'" % target