Rename to url_match

Rename to `url_match` and update docs.
Adriane Boyd 2020-05-22 12:41:03 +02:00
parent 730fa493a4
commit e4a1b5dab1
7 changed files with 51 additions and 44 deletions

@@ -59,7 +59,7 @@ URL_PATTERN = (
).strip()
TOKEN_MATCH = None
TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
BASE_EXCEPTIONS = {}
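
The renamed `URL_MATCH` is simply the bound `.match` method of the compiled `URL_PATTERN`, so it returns a match object for URL-like strings and `None` otherwise. A minimal sketch, assuming this is spaCy's `spacy.lang.tokenizer_exceptions` module; the example strings are illustrative:

```python
from spacy.lang.tokenizer_exceptions import URL_MATCH

# URL_MATCH is the .match method of the compiled, anchored URL_PATTERN regex.
print(URL_MATCH("http://example.com") is not None)   # expected: True
print(URL_MATCH("this is not a url") is not None)    # expected: False
```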

@@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG, NORM
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
from .lang.norm_exceptions import BASE_NORMS
from .lang.tag_map import TAG_MAP
from .tokens import Doc
@@ -89,7 +89,7 @@ class BaseDefaults(object):
def create_tokenizer(cls, nlp=None):
rules = cls.tokenizer_exceptions
token_match = cls.token_match
token_match_with_affixes = cls.token_match_with_affixes
url_match = cls.url_match
prefix_search = (
util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None
)
@@ -107,12 +107,12 @@ class BaseDefaults(object):
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match,
token_match_with_affixes=token_match_with_affixes,
url_match=url_match,
)
pipe_names = ["tagger", "parser", "ner"]
token_match = TOKEN_MATCH
token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES
url_match = URL_MATCH
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
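
Since `url_match` is now a class attribute on `BaseDefaults`, a language subclass can override it and `create_tokenizer()` passes it through to the `Tokenizer`. A hedged sketch; the custom regex and the `English` subclasses are illustrative assumptions, not part of this commit:

```python
import re

from spacy.lang.en import English


class CustomEnglishDefaults(English.Defaults):
    # Applied only after prefixes/suffixes have been stripped from the substring.
    url_match = re.compile(r"^(?:https?://|www\.)\S+$").match


class CustomEnglish(English):
    Defaults = CustomEnglishDefaults


nlp = CustomEnglish()
print([t.text for t in nlp("(www.example.com)")])
# expected: ['(', 'www.example.com', ')']
```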

@@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"]
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
def test_should_match(en_tokenizer, url):
assert en_tokenizer.token_match_with_affixes(url) is not None
assert en_tokenizer.url_match(url) is not None
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
def test_should_not_match(en_tokenizer, url):
assert en_tokenizer.token_match_with_affixes(url) is None
assert en_tokenizer.url_match(url) is None
@pytest.mark.parametrize("url", URLS_BASIC)

@@ -17,7 +17,7 @@ cdef class Tokenizer:
cpdef readonly Vocab vocab
cdef object _token_match
cdef object _token_match_with_affixes
cdef object _url_match
cdef object _prefix_search
cdef object _suffix_search
cdef object _infix_finditer

@@ -31,7 +31,7 @@ cdef class Tokenizer:
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None,
token_match_with_affixes=None):
url_match=None):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
@@ -44,7 +44,7 @@ cdef class Tokenizer:
`re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be
recognised as tokens.
token_match_with_affixes (callable): A boolean function matching strings to be
url_match (callable): A boolean function matching strings to be
recognised as tokens after considering prefixes and suffixes.
RETURNS (Tokenizer): The newly constructed object.
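
For reference, a minimal sketch of passing `url_match` directly to the constructor; the vocab, the regexes, and the example text are illustrative assumptions:

```python
import re

from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

tokenizer = Tokenizer(
    Vocab(),
    prefix_search=re.compile(r"^[\(\[]").search,
    suffix_search=re.compile(r"[\)\]\.,;!?]$").search,
    infix_finditer=re.compile(r"[./]").finditer,
    url_match=re.compile(r"^https?://\S+$").match,
)
# The wrapping parentheses are split off as affixes; the remaining substring
# matches url_match, so the URL is kept as one token instead of being split
# at the "." and "/" infixes.
print([t.text for t in tokenizer("(https://example.com)")])
# expected: ['(', 'https://example.com', ')']
```
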
@@ -58,7 +58,7 @@ cdef class Tokenizer:
self._cache = PreshMap()
self._specials = PreshMap()
self.token_match = token_match
self.token_match_with_affixes = token_match_with_affixes
self.url_match = url_match
self.prefix_search = prefix_search
self.suffix_search = suffix_search
self.infix_finditer = infix_finditer
@@ -74,12 +74,12 @@ cdef class Tokenizer:
self._token_match = token_match
self._flush_cache()
property token_match_with_affixes:
property url_match:
def __get__(self):
return self._token_match_with_affixes
return self._url_match
def __set__(self, token_match_with_affixes):
self._token_match_with_affixes = token_match_with_affixes
def __set__(self, url_match):
self._url_match = url_match
self._flush_cache()
property prefix_search:
@@ -125,7 +125,7 @@ cdef class Tokenizer:
self.suffix_search,
self.infix_finditer,
self.token_match,
self.token_match_with_affixes)
self.url_match)
return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
@@ -311,8 +311,8 @@ cdef class Tokenizer:
if cache_hit:
pass
elif (self.token_match and self.token_match(string)) or \
(self.token_match_with_affixes and \
self.token_match_with_affixes(string)):
(self.url_match and \
self.url_match(string)):
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
@@ -467,9 +467,9 @@ cdef class Tokenizer:
token_match = self.token_match
if token_match is None:
token_match = re.compile("a^").match
token_match_with_affixes = self.token_match_with_affixes
if token_match_with_affixes is None:
token_match_with_affixes = re.compile("a^").match
url_match = self.url_match
if url_match is None:
url_match = re.compile("a^").match
special_cases = {}
for orth, special_tokens in self.rules.items():
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
@@ -505,8 +505,8 @@ cdef class Tokenizer:
if token_match(substring):
tokens.append(("TOKEN_MATCH", substring))
substring = ''
elif token_match_with_affixes(substring):
tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring))
elif url_match(substring):
tokens.append(("URL_MATCH", substring))
substring = ''
elif substring in special_cases:
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
@@ -572,7 +572,7 @@ cdef class Tokenizer:
("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
("token_match", lambda: _get_regex_pattern(self.token_match)),
("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)),
("url_match", lambda: _get_regex_pattern(self.url_match)),
("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
))
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
@@ -594,12 +594,12 @@ cdef class Tokenizer:
("suffix_search", lambda b: data.setdefault("suffix_search", b)),
("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
("token_match", lambda b: data.setdefault("token_match", b)),
("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)),
("url_match", lambda b: data.setdefault("url_match", b)),
("exceptions", lambda b: data.setdefault("rules", b))
))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]:
for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]:
if key in data:
data[key] = unescape_unicode(data[key])
if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
@@ -610,8 +610,8 @@ cdef class Tokenizer:
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
if "token_match" in data and isinstance(data["token_match"], basestring_):
self.token_match = re.compile(data["token_match"]).match
if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_):
self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match
if "url_match" in data and isinstance(data["url_match"], basestring_):
self.url_match = re.compile(data["url_match"]).match
if "rules" in data and isinstance(data["rules"], dict):
# make sure to hard reset the cache to remove data from the default exceptions
self._rules = {}
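
With the serialization changes, a custom `url_match` pattern now round-trips under the `url_match` key. A hedged sketch; the blank pipeline and the regex are illustrative:

```python
import re

import spacy

nlp = spacy.blank("en")
nlp.tokenizer.url_match = re.compile(r"^https?://\S+$").match

# The pattern string is stored under the "url_match" key and recompiled on load.
nlp2 = spacy.blank("en")
nlp2.tokenizer.from_bytes(nlp.tokenizer.to_bytes())
print(nlp2.tokenizer.url_match("https://example.com") is not None)  # expected: True
```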

@@ -35,15 +35,15 @@ the
> ```
| Name | Type | Description |
| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
## Tokenizer.\_\_call\_\_ {#call tag="method"}

@@ -759,6 +759,9 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
if token_match(substring):
tokens.append(substring)
substring = ''
elif url_match(substring):
tokens.append(substring)
substring = ''
elif substring in special_cases:
tokens.extend(special_cases[substring])
substring = ''
@@ -782,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
The algorithm can be summarized as follows:
1. Iterate over whitespace-separated substrings.
2. Look for a token match. If there is a match, stop processing and keep this token.
3. Check whether we have an explicitly defined rule for this substring. If we
do, use it.
4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
so that the token match and special cases always get priority.
2. Look for a token match. If there is a match, stop processing and keep this
token.
3. Check whether we have an explicitly defined special case for this substring.
If we do, use it.
4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to
#2, so that the token match and special cases always get priority.
5. If we didn't consume a prefix, try to consume a suffix and then go back to
#2.
6. If we can't consume a prefix or a suffix, look for a special case.
7. Look for "infixes" — stuff like hyphens etc. and split the substring into
6. If we can't consume a prefix or a suffix, look for a URL match.
7. If there's no URL match, then look for a special case.
8. Look for "infixes" — stuff like hyphens etc. and split the substring into
tokens on all infixes.
8. Once we can't consume any more of the string, handle it as a single token.
9. Once we can't consume any more of the string, handle it as a single token.
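
An illustrative check of steps 4-6 using the default English tokenizer (the pipeline and the text are examples, not part of the diff): affixes are consumed first, and the remaining substring is then kept whole by the URL match.

```python
import spacy

nlp = spacy.blank("en")
# '(' and '"' are consumed as prefixes, '"' and ')' as suffixes (steps 4-5);
# the remaining "www.example.com" is kept as one token by the URL match (step 6).
print([t.text for t in nlp('("www.example.com")')])
# expected: ['(', '"', 'www.example.com', '"', ')']
```
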
#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"}
@@ -836,6 +841,8 @@ domain. There are five things you would need to define:
hyphens etc.
5. An optional boolean function `token_match` matching strings that should never
be split, overriding the infix rules. Useful for things like URLs or numbers.
6. An optional boolean function `url_match`, which is similar to `token_match`
except prefixes and suffixes are removed before applying the match.
You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
to use `re.compile()` to build a regular expression object, and pass its