Improve URL_PATTERN and handling in tokenizer (#4374)

* Move prefix and suffix detection for URL_PATTERN

Move prefix and suffix detection for `URL_PATTERN` into the tokenizer.
Remove associated lookahead and lookbehind from `URL_PATTERN`.

Fix Hungarian tokenization to account for the modified prefix and suffix
handling.

* Match a wider range of URI schemes
adrianeboyd 2019-10-05 13:00:09 +02:00 committed by Matthew Honnibal
parent e65dffd80b
commit cbc2cee2c8
4 changed files with 15 additions and 17 deletions
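
For illustration, a minimal sketch of the intended behavior (hypothetical usage, assuming a spaCy build that includes this commit; the blank English pipeline is just a convenient way to get a default tokenizer): surrounding punctuation is split off before the URL match is applied to the remainder.

    from spacy.lang.en import English

    nlp = English()
    # The trailing ")" is found by the suffix rules, so token_match no longer
    # short-circuits; the suffix is stripped and the bare URL matches in full.
    print([t.text for t in nlp("(see https://example.com)")])
    # expected: ['(', 'see', 'https://example.com', ')']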

spacy/lang/hu/punctuation.py

@@ -10,10 +10,10 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
+_units = UNITS.replace("%", "")
 
 _prefixes = (
-    [r"\+"]
-    + LIST_PUNCT
+    LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
     r"(?<=[0-9])\+",
     r"(?<=°[FfCcKk])\.",
     r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-    r"(?<=[0-9])(?:{u})".format(u=UNITS),
+    r"(?<=[0-9])(?:{u})".format(u=_units),
     r"(?<=[{al}{e}{q}(?:{c})])\.".format(
         al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
     ),
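
The `_units` change can be checked on its own. A small sketch (assuming `UNITS` from `spacy.lang.char_classes`, and approximating how spaCy anchors suffix rules with a trailing `$`): with `%` removed from the alternation, a percent sign after a digit is no longer treated as a unit suffix, while real units still are.

    import re

    from spacy.lang.char_classes import UNITS

    _units = UNITS.replace("%", "")
    suffix_re = re.compile(r"(?<=[0-9])(?:{u})$".format(u=_units))
    print(suffix_re.search("10km"))  # matches 'km', which is still split off
    print(suffix_re.search("10%"))   # None: '%' stays attached to the number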

spacy/lang/tokenizer_exceptions.py

@@ -10,11 +10,9 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 # A few minor mods to this regex to account for use cases represented in test_urls
 URL_PATTERN = (
     r"^"
-    # in order to support the prefix tokenization (see prefix test cases in test_urls).
-    r"(?=[\w])"
-    # protocol identifier
-    r"(?:(?:https?|ftp|mailto)://)?"
-    # user:pass authentication
+    # protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
+    r"(?:(?:[\w\+\-\.]{2,})://)?"
+    # mailto:user or user:pass authentication
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
@@ -43,11 +41,7 @@
     # port number
     r"(?::\d{2,5})?"
     # resource path
-    r"(?:/\S*)?"
-    # query parameters
-    r"\??(:?\S*)?"
-    # in order to support the suffix tokenization (see suffix test cases in test_urls),
-    r"(?<=[\w/])"
+    r"(?:[/?#]\S*)?"
     r"$"
 ).strip()
 
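The widened protocol identifier is easy to exercise standalone. A sketch isolating just the scheme fragment of `URL_PATTERN` (with its optional `?` dropped so that a scheme is required to match):

    import re

    scheme_re = re.compile(r"^(?:[\w\+\-\.]{2,})://")
    for url in ["https://x.com", "svn+ssh://host/path",
                "chrome-extension://abc", "x://host"]:
        print(url, bool(scheme_re.match(url)))
    # the first three match; 'x://host' fails the {2,} two-character minimum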

spacy/tests/tokenizer/test_urls.py

@@ -12,6 +12,7 @@ URLS_BASIC = [
 
 URLS_FULL = URLS_BASIC + [
     "mailto:foo-bar@baz-co.com",
+    "mailto:foo-bar@baz-co.com?subject=hi",
     "www.google.com?q=google",
     "http://foo.com/blah_(wikipedia)#cite-1",
 ]
@@ -45,6 +46,10 @@ URLS_SHOULD_MATCH = [
     "http://a.b-c.de",
     "http://223.255.255.254",
     "http://a.b--c.de/",  # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
+    "ssh://login@server.com:12345/repository.git",
+    "svn+ssh://user@ssh.yourdomain.com/path",
+    pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
+    pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
     pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
     pytest.param(
         "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
@@ -81,7 +86,6 @@ URLS_SHOULD_NOT_MATCH = [
     "http:// shouldfail.com",
     ":// should fail",
     "http://foo.bar/foo(bar)baz quux",
-    "ftps://foo.bar/",
     "http://-error-.invalid/",
     "http://a.b-.co",
     "http://0.0.0.0",

spacy/tokenizer.pyx

@@ -227,7 +227,9 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
-            if self.token_match and self.token_match(string):
+            if self.token_match and self.token_match(string) \
+                    and not self.find_prefix(string) \
+                    and not self.find_suffix(string):
                 break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
@@ -243,8 +245,6 @@
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     has_special[0] = 1
                     break
-            if self.token_match and self.token_match(string):
-                break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
                 suffix = string[-suf_len:]
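
The effect of the reordering is easier to see in plain Python. A rough paraphrase of the affix loop (a simplified sketch, not the real Cython, which also handles special cases, caching, and the vocab): `token_match` now wins only when no prefix or suffix rule also applies, so affixes are peeled off first and the shrinking remainder is re-checked on every iteration.

    def split_affixes(string, token_match, find_prefix, find_suffix):
        # find_prefix/find_suffix return the length of the match, or 0.
        prefixes, suffixes = [], []
        last_size = -1
        while string and len(string) != last_size:
            last_size = len(string)
            # Accept the whole string as one token only if no affix rule
            # applies to it as well.
            if (token_match(string) and not find_prefix(string)
                    and not find_suffix(string)):
                break
            pre_len = find_prefix(string)
            if pre_len:
                prefixes.append(string[:pre_len])
                string = string[pre_len:]
            suf_len = find_suffix(string) if string else 0
            if suf_len:
                suffixes.insert(0, string[-suf_len:])
                string = string[:-suf_len]
        return prefixes, string, suffixes

With affix helpers built from a language's punctuation rules, "(https://example.com)" comes out as (['('], 'https://example.com', [')']): the parentheses are stripped over two iterations before the bare URL finally satisfies the token_match check.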