Rename to url_match

Rename to `url_match` and update docs.
Adriane Boyd 2020-05-22 12:41:03 +02:00
parent 730fa493a4
commit e4a1b5dab1
7 changed files with 51 additions and 44 deletions

@@ -59,7 +59,7 @@ URL_PATTERN = (
).strip()
TOKEN_MATCH = None
TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
BASE_EXCEPTIONS = {}
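
The renamed `URL_MATCH` is simply the bound `.match` method of the compiled `URL_PATTERN`, so it returns a match object for URL-like strings and `None` otherwise. A minimal sketch, assuming this is spaCy's `spacy.lang.tokenizer_exceptions` module; the example strings are illustrative:

```python
from spacy.lang.tokenizer_exceptions import URL_MATCH

# URL_MATCH is the .match method of the compiled, anchored URL_PATTERN regex.
print(URL_MATCH("http://example.com") is not None)   # expected: True
print(URL_MATCH("this is not a url") is not None)    # expected: False
```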

@@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG, NORM
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
from .lang.norm_exceptions import BASE_NORMS
from .lang.tag_map import TAG_MAP
from .tokens import Doc
@@ -89,7 +89,7 @@ class BaseDefaults(object):
def create_tokenizer(cls, nlp=None):
rules = cls.tokenizer_exceptions
token_match = cls.token_match
token_match_with_affixes = cls.token_match_with_affixes
url_match = cls.url_match
prefix_search = (
util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None
)
@@ -107,12 +107,12 @@ class BaseDefaults(object):
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match,
token_match_with_affixes=token_match_with_affixes,
url_match=url_match,
)
pipe_names = ["tagger", "parser", "ner"]
token_match = TOKEN_MATCH
token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES
url_match = URL_MATCH
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
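
Since `url_match` is now a class attribute on `BaseDefaults`, a language subclass can override it and `create_tokenizer()` passes it through to the `Tokenizer`. A hedged sketch; the custom regex and the `English` subclasses are illustrative assumptions, not part of this commit:

```python
import re

from spacy.lang.en import English


class CustomEnglishDefaults(English.Defaults):
    # Applied only after prefixes/suffixes have been stripped from the substring.
    url_match = re.compile(r"^(?:https?://|www\.)\S+$").match


class CustomEnglish(English):
    Defaults = CustomEnglishDefaults


nlp = CustomEnglish()
print([t.text for t in nlp("(www.example.com)")])
# expected: ['(', 'www.example.com', ')']
```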

@@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"]
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
def test_should_match(en_tokenizer, url):
assert en_tokenizer.token_match_with_affixes(url) is not None
assert en_tokenizer.url_match(url) is not None
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
def test_should_not_match(en_tokenizer, url):
assert en_tokenizer.token_match_with_affixes(url) is None
assert en_tokenizer.url_match(url) is None
@pytest.mark.parametrize("url", URLS_BASIC)

@@ -17,7 +17,7 @@ cdef class Tokenizer:
cpdef readonly Vocab vocab
cdef object _token_match
cdef object _token_match_with_affixes
cdef object _url_match
cdef object _prefix_search
cdef object _suffix_search
cdef object _infix_finditer

@@ -31,7 +31,7 @@ cdef class Tokenizer:
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None,
token_match_with_affixes=None):
url_match=None):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
@@ -44,7 +44,7 @@ cdef class Tokenizer:
`re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be
recognised as tokens.
token_match_with_affixes (callable): A boolean function matching strings to be
url_match (callable): A boolean function matching strings to be
recognised as tokens after considering prefixes and suffixes.
RETURNS (Tokenizer): The newly constructed object.
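
For reference, a minimal sketch of passing `url_match` directly to the constructor; the vocab, the regexes, and the example text are illustrative assumptions:

```python
import re

from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

tokenizer = Tokenizer(
    Vocab(),
    prefix_search=re.compile(r"^[\(\[]").search,
    suffix_search=re.compile(r"[\)\]\.,;!?]$").search,
    infix_finditer=re.compile(r"[./]").finditer,
    url_match=re.compile(r"^https?://\S+$").match,
)
# The wrapping parentheses are split off as affixes; the remaining substring
# matches url_match, so the URL is kept as one token instead of being split
# at the "." and "/" infixes.
print([t.text for t in tokenizer("(https://example.com)")])
# expected: ['(', 'https://example.com', ')']
```
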
@@ -58,7 +58,7 @@ cdef class Tokenizer:
self._cache = PreshMap()
self._specials = PreshMap()
self.token_match = token_match
self.token_match_with_affixes = token_match_with_affixes
self.url_match = url_match
self.prefix_search = prefix_search
self.suffix_search = suffix_search
self.infix_finditer = infix_finditer
@@ -74,12 +74,12 @@ cdef class Tokenizer:
self._token_match = token_match
self._flush_cache()
property token_match_with_affixes:
property url_match:
def __get__(self):
return self._token_match_with_affixes
return self._url_match
def __set__(self, token_match_with_affixes):
self._token_match_with_affixes = token_match_with_affixes
def __set__(self, url_match):
self._url_match = url_match
self._flush_cache()
property prefix_search:
@@ -125,7 +125,7 @@ cdef class Tokenizer:
self.suffix_search,
self.infix_finditer,
self.token_match,
self.token_match_with_affixes)
self.url_match)
return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
@@ -311,8 +311,8 @@ cdef class Tokenizer:
if cache_hit:
pass
elif (self.token_match and self.token_match(string)) or \
(self.token_match_with_affixes and \
self.token_match_with_affixes(string)):
(self.url_match and \
self.url_match(string)):
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
@@ -467,9 +467,9 @@ cdef class Tokenizer:
token_match = self.token_match
if token_match is None:
token_match = re.compile("a^").match
token_match_with_affixes = self.token_match_with_affixes
if token_match_with_affixes is None:
token_match_with_affixes = re.compile("a^").match
url_match = self.url_match
if url_match is None:
url_match = re.compile("a^").match
special_cases = {}
for orth, special_tokens in self.rules.items():
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
@@ -505,8 +505,8 @@ cdef class Tokenizer:
if token_match(substring):
tokens.append(("TOKEN_MATCH", substring))
substring = ''
elif token_match_with_affixes(substring):
tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring))
elif url_match(substring):
tokens.append(("URL_MATCH", substring))
substring = ''
elif substring in special_cases:
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
@@ -572,7 +572,7 @@ cdef class Tokenizer:
("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
("token_match", lambda: _get_regex_pattern(self.token_match)),
("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)),
("url_match", lambda: _get_regex_pattern(self.url_match)),
("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
))
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
@@ -594,12 +594,12 @@ cdef class Tokenizer:
("suffix_search", lambda b: data.setdefault("suffix_search", b)),
("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
("token_match", lambda b: data.setdefault("token_match", b)),
("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)),
("url_match", lambda b: data.setdefault("url_match", b)),
("exceptions", lambda b: data.setdefault("rules", b))
))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]:
for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]:
if key in data:
data[key] = unescape_unicode(data[key])
if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
@@ -610,8 +610,8 @@ cdef class Tokenizer:
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
if "token_match" in data and isinstance(data["token_match"], basestring_):
self.token_match = re.compile(data["token_match"]).match
if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_):
self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match
if "url_match" in data and isinstance(data["url_match"], basestring_):
self.url_match = re.compile(data["url_match"]).match
if "rules" in data and isinstance(data["rules"], dict):
# make sure to hard reset the cache to remove data from the default exceptions
self._rules = {}
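
With the serialization changes, a custom `url_match` pattern now round-trips under the `url_match` key. A hedged sketch; the blank pipeline and the regex are illustrative:

```python
import re

import spacy

nlp = spacy.blank("en")
nlp.tokenizer.url_match = re.compile(r"^https?://\S+$").match

# The pattern string is stored under the "url_match" key and recompiled on load.
nlp2 = spacy.blank("en")
nlp2.tokenizer.from_bytes(nlp.tokenizer.to_bytes())
print(nlp2.tokenizer.url_match("https://example.com") is not None)  # expected: True
```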

@@ -35,15 +35,15 @@ the
> ```
| Name | Type | Description |
| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
## Tokenizer.\_\_call\_\_ {#call tag="method"}

@@ -759,6 +759,9 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
if token_match(substring):
tokens.append(substring)
substring = ''
elif url_match(substring):
tokens.append(substring)
substring = ''
elif substring in special_cases:
tokens.extend(special_cases[substring])
substring = ''
@@ -782,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
The algorithm can be summarized as follows:
1. Iterate over whitespace-separated substrings.
2. Look for a token match. If there is a match, stop processing and keep this token.
3. Check whether we have an explicitly defined rule for this substring. If we
do, use it.
4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
so that the token match and special cases always get priority.
2. Look for a token match. If there is a match, stop processing and keep this
token.
3. Check whether we have an explicitly defined special case for this substring.
If we do, use it.
4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to
#2, so that the token match and special cases always get priority.
5. If we didn't consume a prefix, try to consume a suffix and then go back to
#2.
6. If we can't consume a prefix or a suffix, look for a special case.
7. Look for "infixes" — stuff like hyphens etc. and split the substring into
6. If we can't consume a prefix or a suffix, look for a URL match.
7. If there's no URL match, then look for a special case.
8. Look for "infixes" — stuff like hyphens etc. and split the substring into
tokens on all infixes.
8. Once we can't consume any more of the string, handle it as a single token.
9. Once we can't consume any more of the string, handle it as a single token.
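
An illustrative check of steps 4-6 using the default English tokenizer (the pipeline and the text are examples, not part of the diff): affixes are consumed first, and the remaining substring is then kept whole by the URL match.

```python
import spacy

nlp = spacy.blank("en")
# '(' and '"' are consumed as prefixes, '"' and ')' as suffixes (steps 4-5);
# the remaining "www.example.com" is kept as one token by the URL match (step 6).
print([t.text for t in nlp('("www.example.com")')])
# expected: ['(', '"', 'www.example.com', '"', ')']
```
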
#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"}
@@ -836,6 +841,8 @@ domain. There are five things you would need to define:
hyphens etc.
5. An optional boolean function `token_match` matching strings that should never
be split, overriding the infix rules. Useful for things like URLs or numbers.
6. An optional boolean function `url_match`, which is similar to `token_match`
except prefixes and suffixes are removed before applying the match.
You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
to use `re.compile()` to build a regular expression object, and pass its