Rename token_match_with_affixes to url_match

Rename `token_match_with_affixes` to `url_match` and update docs.
This commit is contained in:
Adriane Boyd 2020-05-22 12:41:03 +02:00
parent 730fa493a4
commit e4a1b5dab1
7 changed files with 51 additions and 44 deletions

View File

@ -59,7 +59,7 @@ URL_PATTERN = (
).strip() ).strip()
TOKEN_MATCH = None TOKEN_MATCH = None
TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
BASE_EXCEPTIONS = {} BASE_EXCEPTIONS = {}

View File

@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG, NORM from .attrs import IS_STOP, LANG, NORM
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
from .lang.norm_exceptions import BASE_NORMS from .lang.norm_exceptions import BASE_NORMS
from .lang.tag_map import TAG_MAP from .lang.tag_map import TAG_MAP
from .tokens import Doc from .tokens import Doc
@ -89,7 +89,7 @@ class BaseDefaults(object):
def create_tokenizer(cls, nlp=None): def create_tokenizer(cls, nlp=None):
rules = cls.tokenizer_exceptions rules = cls.tokenizer_exceptions
token_match = cls.token_match token_match = cls.token_match
token_match_with_affixes = cls.token_match_with_affixes url_match = cls.url_match
prefix_search = ( prefix_search = (
util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None
) )
@ -107,12 +107,12 @@ class BaseDefaults(object):
suffix_search=suffix_search, suffix_search=suffix_search,
infix_finditer=infix_finditer, infix_finditer=infix_finditer,
token_match=token_match, token_match=token_match,
token_match_with_affixes=token_match_with_affixes, url_match=url_match,
) )
pipe_names = ["tagger", "parser", "ner"] pipe_names = ["tagger", "parser", "ner"]
token_match = TOKEN_MATCH token_match = TOKEN_MATCH
token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES url_match = URL_MATCH
prefixes = tuple(TOKENIZER_PREFIXES) prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES) suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES) infixes = tuple(TOKENIZER_INFIXES)

View File

@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"]
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH) @pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
def test_should_match(en_tokenizer, url): def test_should_match(en_tokenizer, url):
assert en_tokenizer.token_match_with_affixes(url) is not None assert en_tokenizer.url_match(url) is not None
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
def test_should_not_match(en_tokenizer, url): def test_should_not_match(en_tokenizer, url):
assert en_tokenizer.token_match_with_affixes(url) is None assert en_tokenizer.url_match(url) is None
@pytest.mark.parametrize("url", URLS_BASIC) @pytest.mark.parametrize("url", URLS_BASIC)

View File

@ -17,7 +17,7 @@ cdef class Tokenizer:
cpdef readonly Vocab vocab cpdef readonly Vocab vocab
cdef object _token_match cdef object _token_match
cdef object _token_match_with_affixes cdef object _url_match
cdef object _prefix_search cdef object _prefix_search
cdef object _suffix_search cdef object _suffix_search
cdef object _infix_finditer cdef object _infix_finditer

View File

@ -31,7 +31,7 @@ cdef class Tokenizer:
""" """
def __init__(self, Vocab vocab, rules=None, prefix_search=None, def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None, suffix_search=None, infix_finditer=None, token_match=None,
token_match_with_affixes=None): url_match=None):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text. """Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types. vocab (Vocab): A storage container for lexical types.
@ -44,7 +44,7 @@ cdef class Tokenizer:
`re.compile(string).finditer` to find infixes. `re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be token_match (callable): A boolean function matching strings to be
recognised as tokens. recognised as tokens.
token_match_with_affixes (callable): A boolean function matching strings to be url_match (callable): A boolean function matching strings to be
recognised as tokens after considering prefixes and suffixes. recognised as tokens after considering prefixes and suffixes.
RETURNS (Tokenizer): The newly constructed object. RETURNS (Tokenizer): The newly constructed object.
@ -58,7 +58,7 @@ cdef class Tokenizer:
self._cache = PreshMap() self._cache = PreshMap()
self._specials = PreshMap() self._specials = PreshMap()
self.token_match = token_match self.token_match = token_match
self.token_match_with_affixes = token_match_with_affixes self.url_match = url_match
self.prefix_search = prefix_search self.prefix_search = prefix_search
self.suffix_search = suffix_search self.suffix_search = suffix_search
self.infix_finditer = infix_finditer self.infix_finditer = infix_finditer
@ -74,12 +74,12 @@ cdef class Tokenizer:
self._token_match = token_match self._token_match = token_match
self._flush_cache() self._flush_cache()
property token_match_with_affixes: property url_match:
def __get__(self): def __get__(self):
return self._token_match_with_affixes return self._url_match
def __set__(self, token_match_with_affixes): def __set__(self, url_match):
self._token_match_with_affixes = token_match_with_affixes self._url_match = url_match
self._flush_cache() self._flush_cache()
property prefix_search: property prefix_search:
@ -125,7 +125,7 @@ cdef class Tokenizer:
self.suffix_search, self.suffix_search,
self.infix_finditer, self.infix_finditer,
self.token_match, self.token_match,
self.token_match_with_affixes) self.url_match)
return (self.__class__, args, None, None) return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings): cpdef Doc tokens_from_list(self, list strings):
@ -311,8 +311,8 @@ cdef class Tokenizer:
if cache_hit: if cache_hit:
pass pass
elif (self.token_match and self.token_match(string)) or \ elif (self.token_match and self.token_match(string)) or \
(self.token_match_with_affixes and \ (self.url_match and \
self.token_match_with_affixes(string)): self.url_match(string)):
# We're always saying 'no' to spaces here -- the caller will # We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original. # fix up the outermost one, with reference to the original.
# See Issue #859 # See Issue #859
@ -467,9 +467,9 @@ cdef class Tokenizer:
token_match = self.token_match token_match = self.token_match
if token_match is None: if token_match is None:
token_match = re.compile("a^").match token_match = re.compile("a^").match
token_match_with_affixes = self.token_match_with_affixes url_match = self.url_match
if token_match_with_affixes is None: if url_match is None:
token_match_with_affixes = re.compile("a^").match url_match = re.compile("a^").match
special_cases = {} special_cases = {}
for orth, special_tokens in self.rules.items(): for orth, special_tokens in self.rules.items():
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
@ -505,8 +505,8 @@ cdef class Tokenizer:
if token_match(substring): if token_match(substring):
tokens.append(("TOKEN_MATCH", substring)) tokens.append(("TOKEN_MATCH", substring))
substring = '' substring = ''
elif token_match_with_affixes(substring): elif url_match(substring):
tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring)) tokens.append(("URL_MATCH", substring))
substring = '' substring = ''
elif substring in special_cases: elif substring in special_cases:
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
@ -572,7 +572,7 @@ cdef class Tokenizer:
("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
("token_match", lambda: _get_regex_pattern(self.token_match)), ("token_match", lambda: _get_regex_pattern(self.token_match)),
("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)), ("url_match", lambda: _get_regex_pattern(self.url_match)),
("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) ("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
)) ))
exclude = util.get_serialization_exclude(serializers, exclude, kwargs) exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
@ -594,12 +594,12 @@ cdef class Tokenizer:
("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("suffix_search", lambda b: data.setdefault("suffix_search", b)),
("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
("token_match", lambda b: data.setdefault("token_match", b)), ("token_match", lambda b: data.setdefault("token_match", b)),
("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)), ("url_match", lambda b: data.setdefault("url_match", b)),
("exceptions", lambda b: data.setdefault("rules", b)) ("exceptions", lambda b: data.setdefault("rules", b))
)) ))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]: for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]:
if key in data: if key in data:
data[key] = unescape_unicode(data[key]) data[key] = unescape_unicode(data[key])
if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
@ -610,8 +610,8 @@ cdef class Tokenizer:
self.infix_finditer = re.compile(data["infix_finditer"]).finditer self.infix_finditer = re.compile(data["infix_finditer"]).finditer
if "token_match" in data and isinstance(data["token_match"], basestring_): if "token_match" in data and isinstance(data["token_match"], basestring_):
self.token_match = re.compile(data["token_match"]).match self.token_match = re.compile(data["token_match"]).match
if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_): if "url_match" in data and isinstance(data["url_match"], basestring_):
self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match self.url_match = re.compile(data["url_match"]).match
if "rules" in data and isinstance(data["rules"], dict): if "rules" in data and isinstance(data["rules"], dict):
# make sure to hard reset the cache to remove data from the default exceptions # make sure to hard reset the cache to remove data from the default exceptions
self._rules = {} self._rules = {}

View File

@ -35,14 +35,14 @@ the
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | | ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A storage container for lexical types. | | `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. | | `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | | `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | | `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | | `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | | `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | | `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
| **RETURNS** | `Tokenizer` | The newly constructed object. | | **RETURNS** | `Tokenizer` | The newly constructed object. |
## Tokenizer.\_\_call\_\_ {#call tag="method"} ## Tokenizer.\_\_call\_\_ {#call tag="method"}

View File

@ -759,6 +759,9 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
if token_match(substring): if token_match(substring):
tokens.append(substring) tokens.append(substring)
substring = '' substring = ''
elif url_match(substring):
tokens.append(substring)
substring = ''
elif substring in special_cases: elif substring in special_cases:
tokens.extend(special_cases[substring]) tokens.extend(special_cases[substring])
substring = '' substring = ''
@ -782,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
The algorithm can be summarized as follows: The algorithm can be summarized as follows:
1. Iterate over whitespace-separated substrings. 1. Iterate over whitespace-separated substrings.
2. Look for a token match. If there is a match, stop processing and keep this token. 2. Look for a token match. If there is a match, stop processing and keep this
3. Check whether we have an explicitly defined rule for this substring. If we token.
do, use it. 3. Check whether we have an explicitly defined special case for this substring.
4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, If we do, use it.
so that the token match and special cases always get priority. 4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to
#2, so that the token match and special cases always get priority.
5. If we didn't consume a prefix, try to consume a suffix and then go back to 5. If we didn't consume a prefix, try to consume a suffix and then go back to
#2. #2.
6. If we can't consume a prefix or a suffix, look for a special case. 6. If we can't consume a prefix or a suffix, look for a URL match.
7. Look for "infixes" — stuff like hyphens etc. and split the substring into 7. If there's no URL match, then look for a special case.
8. Look for "infixes" — stuff like hyphens etc. and split the substring into
tokens on all infixes. tokens on all infixes.
8. Once we can't consume any more of the string, handle it as a single token. 9. Once we can't consume any more of the string, handle it as a single token.
#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} #### Debugging the tokenizer {#tokenizer-debug new="2.2.3"}
@ -836,6 +841,8 @@ domain. There are five things you would need to define:
hyphens etc. hyphens etc.
5. An optional boolean function `token_match` matching strings that should never 5. An optional boolean function `token_match` matching strings that should never
be split, overriding the infix rules. Useful for things like URLs or numbers. be split, overriding the infix rules. Useful for things like URLs or numbers.
6. An optional boolean function `url_match`, which is similar to `token_match`
except that prefixes and suffixes are removed before applying the match.
You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
to use `re.compile()` to build a regular expression object, and pass its to use `re.compile()` to build a regular expression object, and pass its