"""URL pattern used to recognise a whole string as a single URL token.

The module exposes ``TOKEN_MATCH``, the bound ``match`` method of the
compiled URL regex.  The pattern is anchored with ``^`` and ``$``, so
``TOKEN_MATCH(text)`` returns a match object only when all of ``text``
looks like a URL, and ``None`` otherwise.
"""
from __future__ import unicode_literals

import re

# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# A few minor modifications were made to account for the use cases
# represented in test_urls.
#
# The pattern contains no capturing groups: every group is either
# non-capturing or a lookaround assertion.
_URL_PATTERN = (
    r"^"
    # The first character must be a word character, so that leading
    # punctuation is left for prefix tokenization (see the prefix test
    # cases in test_urls).
    r"(?=[\w])"
    # protocol identifier (optional)
    r"(?:(?:https?|ftp|mailto)://)?"
    # user:pass authentication (optional)
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier (letters only, at least two of them)
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?"
    # query parameters: an optional "?" followed by any run of
    # non-whitespace characters.  The group is non-capturing; it was
    # previously spelled "(:?" by mistake, which created a stray capture
    # group without changing which strings are accepted.
    r"\??(?:\S*)?"
    # The last character must be a word character or "/", so that trailing
    # punctuation is left for suffix tokenization (see the suffix test
    # cases in test_urls).
    r"(?<=[\w/])"
    r"$"
)

# Bound ``match`` method of the compiled pattern; call it with a string.
TOKEN_MATCH = re.compile(_URL_PATTERN, re.UNICODE).match

__all__ = ['TOKEN_MATCH']