mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Fix and improve URL pattern (#4882)
* match domains longer than `hostname.domain.tld` like `www.foo.co.uk`
* expand allowed characters in domain names while only matching lowercase TLDs so that "this.That" isn't matched as a URL and can be split on the period as an infix (relevant for at least English, German, and Tatar)
This commit is contained in:
parent
a1b22e90cd
commit
de69bc6509
|
@ -3,14 +3,18 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from .char_classes import ALPHA_LOWER
|
||||||
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
|
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
|
||||||
|
|
||||||
|
|
||||||
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
||||||
# A few minor mods to this regex to account for use cases represented in test_urls
|
# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
|
||||||
|
# A few mods to this regex to account for use cases represented in test_urls
|
||||||
URL_PATTERN = (
|
URL_PATTERN = (
|
||||||
|
# fmt: off
|
||||||
r"^"
|
r"^"
|
||||||
# protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
|
# protocol identifier (mods: make optional and expand schemes)
|
||||||
|
# (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
|
||||||
r"(?:(?:[\w\+\-\.]{2,})://)?"
|
r"(?:(?:[\w\+\-\.]{2,})://)?"
|
||||||
# mailto:user or user:pass authentication
|
# mailto:user or user:pass authentication
|
||||||
r"(?:\S+(?::\S*)?@)?"
|
r"(?:\S+(?::\S*)?@)?"
|
||||||
|
@ -31,18 +35,27 @@ URL_PATTERN = (
|
||||||
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
|
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
|
||||||
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
|
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
|
||||||
r"|"
|
r"|"
|
||||||
# host name
|
# host & domain names
|
||||||
r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"
|
# mods: match is case-sensitive, so include [A-Z]
|
||||||
# domain name
|
"(?:"
|
||||||
r"(?:\.(?:[a-z0-9])(?:[a-z0-9\-])*[a-z0-9])?"
|
"(?:"
|
||||||
|
"[A-Za-z0-9\u00a1-\uffff]"
|
||||||
|
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
|
||||||
|
")?"
|
||||||
|
"[A-Za-z0-9\u00a1-\uffff]\."
|
||||||
|
")+"
|
||||||
# TLD identifier
|
# TLD identifier
|
||||||
r"(?:\.(?:[a-z]{2,}))"
|
# mods: use ALPHA_LOWER instead of a wider range so that this doesn't match
|
||||||
|
# strings like "lower.Upper", which can be split on "." by infixes in some
|
||||||
|
# languages
|
||||||
|
r"(?:[" + ALPHA_LOWER + "]{2,63})"
|
||||||
r")"
|
r")"
|
||||||
# port number
|
# port number
|
||||||
r"(?::\d{2,5})?"
|
r"(?::\d{2,5})?"
|
||||||
# resource path
|
# resource path
|
||||||
r"(?:[/?#]\S*)?"
|
r"(?:[/?#]\S*)?"
|
||||||
r"$"
|
r"$"
|
||||||
|
# fmt: on
|
||||||
).strip()
|
).strip()
|
||||||
|
|
||||||
TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match
|
TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match
|
||||||
|
|
|
@ -20,6 +20,7 @@ URLS_FULL = URLS_BASIC + [
|
||||||
# URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex
|
# URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex
|
||||||
URLS_SHOULD_MATCH = [
|
URLS_SHOULD_MATCH = [
|
||||||
"http://foo.com/blah_blah",
|
"http://foo.com/blah_blah",
|
||||||
|
"http://BlahBlah.com/Blah_Blah",
|
||||||
"http://foo.com/blah_blah/",
|
"http://foo.com/blah_blah/",
|
||||||
"http://www.example.com/wpstyle/?p=364",
|
"http://www.example.com/wpstyle/?p=364",
|
||||||
"https://www.example.com/foo/?bar=baz&inga=42&quux",
|
"https://www.example.com/foo/?bar=baz&inga=42&quux",
|
||||||
|
@ -57,14 +58,17 @@ URLS_SHOULD_MATCH = [
|
||||||
),
|
),
|
||||||
"http://foo.com/blah_blah_(wikipedia)",
|
"http://foo.com/blah_blah_(wikipedia)",
|
||||||
"http://foo.com/blah_blah_(wikipedia)_(again)",
|
"http://foo.com/blah_blah_(wikipedia)_(again)",
|
||||||
pytest.param("http://⌘.ws", marks=pytest.mark.xfail()),
|
"http://www.foo.co.uk",
|
||||||
pytest.param("http://⌘.ws/", marks=pytest.mark.xfail()),
|
"http://www.foo.co.uk/",
|
||||||
pytest.param("http://☺.damowmow.com/", marks=pytest.mark.xfail()),
|
"http://www.foo.co.uk/blah/blah",
|
||||||
pytest.param("http://✪df.ws/123", marks=pytest.mark.xfail()),
|
"http://⌘.ws",
|
||||||
pytest.param("http://➡.ws/䨹", marks=pytest.mark.xfail()),
|
"http://⌘.ws/",
|
||||||
pytest.param("http://مثال.إختبار", marks=pytest.mark.xfail()),
|
"http://☺.damowmow.com/",
|
||||||
|
"http://✪df.ws/123",
|
||||||
|
"http://➡.ws/䨹",
|
||||||
|
"http://مثال.إختبار",
|
||||||
pytest.param("http://例子.测试", marks=pytest.mark.xfail()),
|
pytest.param("http://例子.测试", marks=pytest.mark.xfail()),
|
||||||
pytest.param("http://उदाहरण.परीक्षा", marks=pytest.mark.xfail()),
|
"http://उदाहरण.परीक्षा",
|
||||||
]
|
]
|
||||||
|
|
||||||
URLS_SHOULD_NOT_MATCH = [
|
URLS_SHOULD_NOT_MATCH = [
|
||||||
|
|
Loading…
Reference in New Issue
Block a user