Fix and improve URL pattern (#4882)

* match domains with more labels than `hostname.domain.tld`, such as
  `www.foo.co.uk`
* expand the set of characters allowed in domain names, but match only
  lowercase TLDs, so that a string like "this.That" isn't matched as a URL
  and can still be split on the period as an infix (relevant for at least
  English, German, and Tatar)
This commit is contained in:
adrianeboyd 2020-01-06 14:58:30 +01:00 committed by Matthew Honnibal
parent a1b22e90cd
commit de69bc6509
2 changed files with 31 additions and 14 deletions

View File

@ -3,14 +3,18 @@ from __future__ import unicode_literals
import re import re
from .char_classes import ALPHA_LOWER
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# A few minor mods to this regex to account for use cases represented in test_urls # and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
# A few mods to this regex to account for use cases represented in test_urls
URL_PATTERN = ( URL_PATTERN = (
# fmt: off
r"^" r"^"
# protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) # protocol identifier (mods: make optional and expand schemes)
# (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
r"(?:(?:[\w\+\-\.]{2,})://)?" r"(?:(?:[\w\+\-\.]{2,})://)?"
# mailto:user or user:pass authentication # mailto:user or user:pass authentication
r"(?:\S+(?::\S*)?@)?" r"(?:\S+(?::\S*)?@)?"
@ -31,18 +35,27 @@ URL_PATTERN = (
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
r"|" r"|"
# host name # host & domain names
r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)" # mods: match is case-sensitive, so include [A-Z]
# domain name "(?:"
r"(?:\.(?:[a-z0-9])(?:[a-z0-9\-])*[a-z0-9])?" "(?:"
"[A-Za-z0-9\u00a1-\uffff]"
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
")?"
"[A-Za-z0-9\u00a1-\uffff]\."
")+"
# TLD identifier # TLD identifier
r"(?:\.(?:[a-z]{2,}))" # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match
# strings like "lower.Upper", which can be split on "." by infixes in some
# languages
r"(?:[" + ALPHA_LOWER + "]{2,63})"
r")" r")"
# port number # port number
r"(?::\d{2,5})?" r"(?::\d{2,5})?"
# resource path # resource path
r"(?:[/?#]\S*)?" r"(?:[/?#]\S*)?"
r"$" r"$"
# fmt: on
).strip() ).strip()
TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match

View File

@ -20,6 +20,7 @@ URLS_FULL = URLS_BASIC + [
# URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex # URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex
URLS_SHOULD_MATCH = [ URLS_SHOULD_MATCH = [
"http://foo.com/blah_blah", "http://foo.com/blah_blah",
"http://BlahBlah.com/Blah_Blah",
"http://foo.com/blah_blah/", "http://foo.com/blah_blah/",
"http://www.example.com/wpstyle/?p=364", "http://www.example.com/wpstyle/?p=364",
"https://www.example.com/foo/?bar=baz&inga=42&quux", "https://www.example.com/foo/?bar=baz&inga=42&quux",
@ -57,14 +58,17 @@ URLS_SHOULD_MATCH = [
), ),
"http://foo.com/blah_blah_(wikipedia)", "http://foo.com/blah_blah_(wikipedia)",
"http://foo.com/blah_blah_(wikipedia)_(again)", "http://foo.com/blah_blah_(wikipedia)_(again)",
pytest.param("http://⌘.ws", marks=pytest.mark.xfail()), "http://www.foo.co.uk",
pytest.param("http://⌘.ws/", marks=pytest.mark.xfail()), "http://www.foo.co.uk/",
pytest.param("http://☺.damowmow.com/", marks=pytest.mark.xfail()), "http://www.foo.co.uk/blah/blah",
pytest.param("http://✪df.ws/123", marks=pytest.mark.xfail()), "http://⌘.ws",
pytest.param("http://➡.ws/䨹", marks=pytest.mark.xfail()), "http://⌘.ws/",
pytest.param("http://مثال.إختبار", marks=pytest.mark.xfail()), "http://☺.damowmow.com/",
"http://✪df.ws/123",
"http://➡.ws/䨹",
"http://مثال.إختبار",
pytest.param("http://例子.测试", marks=pytest.mark.xfail()), pytest.param("http://例子.测试", marks=pytest.mark.xfail()),
pytest.param("http://उदाहरण.परीक्षा", marks=pytest.mark.xfail()), "http://उदाहरण.परीक्षा",
] ]
URLS_SHOULD_NOT_MATCH = [ URLS_SHOULD_NOT_MATCH = [