2017-01-03 20:17:57 +03:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2017-04-07 16:54:25 +03:00
|
|
|
# Using the `regex` module here turns out to be important: it avoids
# pathological back-tracking. See Issue #957
|
2017-04-07 16:47:36 +03:00
|
|
|
import regex
|
2017-01-03 20:17:57 +03:00
|
|
|
|
2017-03-05 01:13:11 +03:00
|
|
|
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
|
|
|
# A few minor mods to this regex to account for use cases represented in test_urls
|
|
|
|
_URL_PATTERN = (
|
|
|
|
r"^"
|
|
|
|
# in order to support the prefix tokenization (see prefix test cases in test_urls).
|
|
|
|
r"(?=[\w])"
|
|
|
|
# protocol identifier
|
|
|
|
r"(?:(?:https?|ftp|mailto)://)?"
|
|
|
|
# user:pass authentication
|
|
|
|
r"(?:\S+(?::\S*)?@)?"
|
|
|
|
r"(?:"
|
|
|
|
# IP address exclusion
|
|
|
|
# private & local networks
|
|
|
|
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
|
|
|
|
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
|
|
|
|
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
|
|
|
|
# IP address dotted notation octets
|
|
|
|
# excludes loopback network 0.0.0.0
|
|
|
|
# excludes reserved space >= 224.0.0.0
|
|
|
|
# excludes network & broadcast addresses
|
|
|
|
# (first & last IP address of each class)
|
2017-04-07 16:54:25 +03:00
|
|
|
# MH: Do we really need this? Seems excessive, and seems to have caused
|
|
|
|
# Issue #957
|
2017-03-05 01:13:11 +03:00
|
|
|
r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
|
|
|
|
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
|
|
|
|
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
|
|
|
|
r"|"
|
|
|
|
# host name
|
2017-10-11 12:24:00 +03:00
|
|
|
r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"
|
2017-03-05 01:13:11 +03:00
|
|
|
# domain name
|
2017-10-11 12:24:00 +03:00
|
|
|
r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*"
|
2017-03-05 01:13:11 +03:00
|
|
|
# TLD identifier
|
2017-10-11 12:24:00 +03:00
|
|
|
r"(?:\.(?:[a-z]{2,}))"
|
2017-03-05 01:13:11 +03:00
|
|
|
r")"
|
|
|
|
# port number
|
|
|
|
r"(?::\d{2,5})?"
|
|
|
|
# resource path
|
|
|
|
r"(?:/\S*)?"
|
|
|
|
# query parameters
|
|
|
|
r"\??(:?\S*)?"
|
|
|
|
# in order to support the suffix tokenization (see suffix test cases in test_urls),
|
|
|
|
r"(?<=[\w/])"
|
|
|
|
r"$"
|
|
|
|
).strip()
|
2017-01-03 20:17:57 +03:00
|
|
|
|
2017-04-07 16:47:36 +03:00
|
|
|
# Compile the URL pattern once at import time and expose only the bound
# ``match`` method of the compiled pattern as the module's public callable.
TOKEN_MATCH = regex.compile(_URL_PATTERN, flags=regex.UNICODE).match

# Only the matcher is public; the raw pattern string stays module-private.
__all__ = ["TOKEN_MATCH"]
|