Automatically checking for sitemap existence in case of --crawl

Miroslav Stampar 2015-01-20 10:03:35 +01:00
parent a603002acd
commit 9f4a32ca2b


@@ -28,6 +28,7 @@ from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
 from lib.core.settings import UNICODE_ENCODING
 from lib.core.threads import getCurrentThreadData
 from lib.core.threads import runThreads
+from lib.parse.sitemap import parseSitemap
 from lib.request.connect import Connect as Request
 from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
 from thirdparty.oset.pyoset import oset
@@ -116,6 +117,26 @@ def crawl(target):
     threadData.shared.deeper = set()
     threadData.shared.unprocessed = set([target])
+    if not conf.sitemapUrl:
+        message = "do you want to check for the existence of "
+        message += "site's sitemap(.xml) [Y/n] "
+        test = readInput(message, default="Y")
+        if test[0] not in ("n", "N"):
+            items = None
+            url = "%s://%s/sitemap.xml" % (conf.scheme, conf.hostname)
+            try:
+                items = parseSitemap(url)
+            except:
+                pass
+            finally:
+                if items:
+                    for item in items:
+                        if re.search(r"(.*?)\?(.+)", item):
+                            threadData.shared.value.add(item)
+                    if conf.crawlDepth > 1:
+                        threadData.shared.unprocessed.update(items)
+                logger.info("%s links found" % ("no" if not items else len(items)))
     infoMsg = "starting crawler"
     if conf.bulkFile:
         infoMsg += " for target URL '%s'" % target