From de69bc65098169a3f7098353d790a21b06d01f04 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 6 Jan 2020 14:58:30 +0100 Subject: [PATCH] Fix and improve URL pattern (#4882) * match domains longer than `hostname.domain.tld` like `www.foo.co.uk` * expand allowed characters in domain names while only matching lowercase TLDs so that "this.That" isn't matched as a URL and can be split on the period as an infix (relevant for at least English, German, and Tatar) --- spacy/lang/tokenizer_exceptions.py | 27 ++++++++++++++++++++------- spacy/tests/tokenizer/test_urls.py | 18 +++++++++++------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 57771cca4..2c0fc9cf7 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -3,14 +3,18 @@ from __future__ import unicode_literals import re +from .char_classes import ALPHA_LOWER from ..symbols import ORTH, POS, TAG, LEMMA, SPACE # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex -# A few minor mods to this regex to account for use cases represented in test_urls +# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License) +# A few mods to this regex to account for use cases represented in test_urls URL_PATTERN = ( + # fmt: off r"^" - # protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) + # protocol identifier (mods: make optional and expand schemes) + # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) r"(?:(?:[\w\+\-\.]{2,})://)?" # mailto:user or user:pass authentication r"(?:\S+(?::\S*)?@)?" @@ -31,18 +35,27 @@ URL_PATTERN = ( r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"|" - # host name - r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)" - # domain name - r"(?:\.(?:[a-z0-9])(?:[a-z0-9\-])*[a-z0-9])?" + # host & domain names + # mods: match is case-sensitive, so include [A-Z] + "(?:" + "(?:" + "[A-Za-z0-9\u00a1-\uffff]" + "[A-Za-z0-9\u00a1-\uffff_-]{0,62}" + ")?" + "[A-Za-z0-9\u00a1-\uffff]\." + ")+" # TLD identifier - r"(?:\.(?:[a-z]{2,}))" + # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match + # strings like "lower.Upper", which can be split on "." by infixes in some + # languages + r"(?:[" + ALPHA_LOWER + "]{2,63})" r")" # port number r"(?::\d{2,5})?" # resource path r"(?:[/?#]\S*)?" r"$" + # fmt: on ).strip() TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 21e1819b7..ef99484ee 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -20,6 +20,7 @@ URLS_FULL = URLS_BASIC + [ # URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex URLS_SHOULD_MATCH = [ "http://foo.com/blah_blah", + "http://BlahBlah.com/Blah_Blah", "http://foo.com/blah_blah/", "http://www.example.com/wpstyle/?p=364", "https://www.example.com/foo/?bar=baz&inga=42&quux", @@ -57,14 +58,17 @@ URLS_SHOULD_MATCH = [ ), "http://foo.com/blah_blah_(wikipedia)", "http://foo.com/blah_blah_(wikipedia)_(again)", - pytest.param("http://⌘.ws", marks=pytest.mark.xfail()), - pytest.param("http://⌘.ws/", marks=pytest.mark.xfail()), - pytest.param("http://☺.damowmow.com/", marks=pytest.mark.xfail()), - pytest.param("http://✪df.ws/123", marks=pytest.mark.xfail()), - pytest.param("http://➡.ws/䨹", marks=pytest.mark.xfail()), - pytest.param("http://مثال.إختبار", marks=pytest.mark.xfail()), + "http://www.foo.co.uk", + "http://www.foo.co.uk/", + "http://www.foo.co.uk/blah/blah", + "http://⌘.ws", + "http://⌘.ws/", + "http://☺.damowmow.com/", + "http://✪df.ws/123", + "http://➡.ws/䨹", + "http://مثال.إختبار", pytest.param("http://例子.测试", marks=pytest.mark.xfail()), - pytest.param("http://उदाहरण.परीक्षा", marks=pytest.mark.xfail()), + "http://उदाहरण.परीक्षा", ] URLS_SHOULD_NOT_MATCH = [