Merge pull request #5121 from adrianeboyd/bugfix/revert-token-match

Revert token_match priority changes from #4374 and extend token match options
Matthew Honnibal 2020-05-22 14:42:51 +02:00 committed by GitHub
commit f6078d866a
10 changed files with 89 additions and 46 deletions

View File

@@ -4,7 +4,6 @@ from __future__ import unicode_literals
import re
from .punctuation import ELISION, HYPHENS
from ..tokenizer_exceptions import URL_PATTERN
from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA
@@ -455,9 +454,6 @@ _regular_exp += [
for hc in _hyphen_combination
]
# URLs
_regular_exp.append(URL_PATTERN)
TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile(

View File

@@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
_currency = r"\$¢£€¥฿"
_quotes = CONCAT_QUOTES.replace("'", "")
_units = UNITS.replace("%", "")
_prefixes = (
LIST_PUNCT
@@ -21,7 +20,8 @@ _prefixes = (
)
_suffixes = (
LIST_PUNCT
[r"\+"]
+ LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
r"(?<=[0-9])(?:{u})".format(u=_units),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
),

View File

@@ -4,7 +4,6 @@ from __future__ import unicode_literals
import re
from ..punctuation import ALPHA_LOWER, CURRENCY
from ..tokenizer_exceptions import URL_PATTERN
from ...symbols import ORTH
@@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
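Since shared URL handling now lives in `url_match` (see the base `tokenizer_exceptions` change below), a language-specific `TOKEN_MATCH` such as Hungarian's only needs to cover its own exceptions. A rough sketch of the effect, using a simplified stand-in for the real `_nums` pattern:

```python
import re

# Hypothetical stand-in for the Hungarian _nums regex, for illustration only.
_nums = r"\d+(?:[.,:-]\d+)*"
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

print(bool(TOKEN_MATCH("10:35")))            # True: still caught by token_match
print(bool(TOKEN_MATCH("http://pelda.hu")))  # False: URLs are left to url_match
```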

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .char_classes import ALPHA_LOWER
from .char_classes import ALPHA_LOWER, ALPHA
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
@@ -58,7 +58,8 @@ URL_PATTERN = (
# fmt: on
).strip()
TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match
TOKEN_MATCH = None
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
BASE_EXCEPTIONS = {}
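In other words, the base exceptions module no longer ships a default `token_match`; the URL regex is exposed separately as `URL_MATCH` so that `Language` can wire it into the tokenizer's new `url_match` slot. A quick check of that split, assuming the spaCy v2.3-style module layout shown in this diff:

```python
# Sketch only: verifies the new defaults described above.
from spacy.lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH

assert TOKEN_MATCH is None
assert URL_MATCH("https://example.com") is not None
```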

View File

@@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG, NORM
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
from .lang.norm_exceptions import BASE_NORMS
from .lang.tag_map import TAG_MAP
from .tokens import Doc
@@ -89,6 +89,7 @@ class BaseDefaults(object):
def create_tokenizer(cls, nlp=None):
rules = cls.tokenizer_exceptions
token_match = cls.token_match
url_match = cls.url_match
prefix_search = (
util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None
)
@@ -106,10 +107,12 @@ class BaseDefaults(object):
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match,
url_match=url_match,
)
pipe_names = ["tagger", "parser", "ner"]
token_match = TOKEN_MATCH
url_match = URL_MATCH
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
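Because `url_match` is now a regular `BaseDefaults` attribute passed through `create_tokenizer`, a subclass can override it just like `prefixes` or `infixes`. A hedged sketch (the `CustomEnglish*` names and the narrower regex are purely illustrative):

```python
import re
from spacy.lang.en import English

class CustomEnglishDefaults(English.Defaults):
    # Only treat explicit http(s) URLs as unsplittable (illustrative pattern).
    url_match = re.compile(r"^https?://\S+$").match

class CustomEnglish(English):
    Defaults = CustomEnglishDefaults

nlp = CustomEnglish()
# The brackets are split off as affixes; the URL itself stays one token.
print([t.text for t in nlp("See (https://example.com) please.")])
```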

View File

@@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"]
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
def test_should_match(en_tokenizer, url):
assert en_tokenizer.token_match(url) is not None
assert en_tokenizer.url_match(url) is not None
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
def test_should_not_match(en_tokenizer, url):
assert en_tokenizer.token_match(url) is None
assert en_tokenizer.url_match(url) is None
@pytest.mark.parametrize("url", URLS_BASIC)

View File

@@ -17,6 +17,7 @@ cdef class Tokenizer:
cpdef readonly Vocab vocab
cdef object _token_match
cdef object _url_match
cdef object _prefix_search
cdef object _suffix_search
cdef object _infix_finditer

View File

@@ -30,7 +30,8 @@ cdef class Tokenizer:
DOCS: https://spacy.io/api/tokenizer
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None):
suffix_search=None, infix_finditer=None, token_match=None,
url_match=None):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
@@ -43,6 +44,8 @@
`re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be
recognised as tokens.
url_match (callable): A boolean function matching strings to be
recognised as tokens after considering prefixes and suffixes.
RETURNS (Tokenizer): The newly constructed object.
EXAMPLE:
@@ -55,6 +58,7 @@
self._cache = PreshMap()
self._specials = PreshMap()
self.token_match = token_match
self.url_match = url_match
self.prefix_search = prefix_search
self.suffix_search = suffix_search
self.infix_finditer = infix_finditer
@@ -70,6 +74,14 @@
self._token_match = token_match
self._flush_cache()
property url_match:
def __get__(self):
return self._url_match
def __set__(self, url_match):
self._url_match = url_match
self._flush_cache()
property prefix_search:
def __get__(self):
return self._prefix_search
@@ -108,11 +120,12 @@
def __reduce__(self):
args = (self.vocab,
self._rules,
self.rules,
self.prefix_search,
self.suffix_search,
self.infix_finditer,
self.token_match)
self.token_match,
self.url_match)
return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
@@ -240,6 +253,8 @@
cdef unicode minus_suf
cdef size_t last_size = 0
while string and len(string) != last_size:
if self.token_match and self.token_match(string):
break
if self._specials.get(hash_string(string)) != NULL:
has_special[0] = 1
break
@@ -295,7 +310,9 @@
cache_hit = self._try_cache(hash_string(string), tokens)
if cache_hit:
pass
elif self.token_match and self.token_match(string):
elif (self.token_match and self.token_match(string)) or \
(self.url_match and \
self.url_match(string)):
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
@@ -448,6 +465,11 @@
suffix_search = self.suffix_search
infix_finditer = self.infix_finditer
token_match = self.token_match
if token_match is None:
token_match = re.compile("a^").match
url_match = self.url_match
if url_match is None:
url_match = re.compile("a^").match
special_cases = {}
for orth, special_tokens in self.rules.items():
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
@@ -456,6 +478,10 @@
suffixes = []
while substring:
while prefix_search(substring) or suffix_search(substring):
if token_match(substring):
tokens.append(("TOKEN_MATCH", substring))
substring = ''
break
if substring in special_cases:
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
substring = ''
@@ -476,12 +502,15 @@
break
suffixes.append(("SUFFIX", substring[split:]))
substring = substring[:split]
if substring in special_cases:
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
substring = ''
elif token_match(substring):
if token_match(substring):
tokens.append(("TOKEN_MATCH", substring))
substring = ''
elif url_match(substring):
tokens.append(("URL_MATCH", substring))
substring = ''
elif substring in special_cases:
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
substring = ''
elif list(infix_finditer(substring)):
infixes = infix_finditer(substring)
offset = 0
@@ -543,6 +572,7 @@
("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
("token_match", lambda: _get_regex_pattern(self.token_match)),
("url_match", lambda: _get_regex_pattern(self.url_match)),
("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
))
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
@@ -564,11 +594,12 @@
("suffix_search", lambda b: data.setdefault("suffix_search", b)),
("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
("token_match", lambda b: data.setdefault("token_match", b)),
("url_match", lambda b: data.setdefault("url_match", b)),
("exceptions", lambda b: data.setdefault("rules", b))
))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]:
for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]:
if key in data:
data[key] = unescape_unicode(data[key])
if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
@@ -579,6 +610,8 @@
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
if "token_match" in data and isinstance(data["token_match"], basestring_):
self.token_match = re.compile(data["token_match"]).match
if "url_match" in data and isinstance(data["url_match"], basestring_):
self.url_match = re.compile(data["url_match"]).match
if "rules" in data and isinstance(data["rules"], dict):
# make sure to hard reset the cache to remove data from the default exceptions
self._rules = {}
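The `explain` debugging output reflects the new split as well, labelling substrings caught by `url_match` separately from `token_match`. A rough example against the default English rules (the exact tuples depend on the active prefix/suffix patterns):

```python
from spacy.lang.en import English

nlp = English()
print(nlp.tokenizer.explain("(https://example.com)"))
# e.g. [('PREFIX', '('), ('URL_MATCH', 'https://example.com'), ('SUFFIX', ')')]
```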

View File

@@ -35,14 +35,15 @@ the
> ```
| Name | Type | Description |
| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
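As a minimal construction sketch against the signature above (the regexes here are simplified stand-ins, not spaCy's real English rules):

```python
import re
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
tokenizer = Tokenizer(
    nlp.vocab,
    rules={},
    prefix_search=re.compile(r"^[\[\(]").search,
    suffix_search=re.compile(r"[\]\)\.,!?]$").search,
    infix_finditer=None,
    token_match=None,
    url_match=re.compile(r"^https?://\S+$").match,  # illustrative pattern
)
# Brackets come off as prefix/suffix; the URL is kept whole via url_match.
print([t.text for t in tokenizer("(https://example.com)")])
```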
## Tokenizer.\_\_call\_\_ {#call tag="method"}

View File

@@ -738,6 +738,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
suffixes = []
while substring:
while prefix_search(substring) or suffix_search(substring):
if token_match(substring):
tokens.append(substring)
substring = ''
break
if substring in special_cases:
tokens.extend(special_cases[substring])
substring = ''
@@ -752,12 +756,15 @@
split = suffix_search(substring).start()
suffixes.append(substring[split:])
substring = substring[:split]
if substring in special_cases:
tokens.extend(special_cases[substring])
substring = ''
elif token_match(substring):
if token_match(substring):
tokens.append(substring)
substring = ''
elif url_match(substring):
tokens.append(substring)
substring = ''
elif substring in special_cases:
tokens.extend(special_cases[substring])
substring = ''
elif list(infix_finditer(substring)):
infixes = infix_finditer(substring)
offset = 0
@@ -778,17 +785,19 @@
The algorithm can be summarized as follows:
1. Iterate over whitespace-separated substrings.
2. Check whether we have an explicitly defined rule for this substring. If we
do, use it.
3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
so that special cases always get priority.
4. If we didn't consume a prefix, try to consume a suffix and then go back to
2. Look for a token match. If there is a match, stop processing and keep this
token.
3. Check whether we have an explicitly defined special case for this substring.
If we do, use it.
4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to
#2, so that the token match and special cases always get priority.
5. If we didn't consume a prefix, try to consume a suffix and then go back to
#2.
5. If we can't consume a prefix or a suffix, look for a special case.
6. Next, look for a token match.
7. Look for "infixes" — stuff like hyphens etc. and split the substring into
6. If we can't consume a prefix or a suffix, look for a URL match.
7. If there's no URL match, then look for a special case.
8. Look for "infixes" — stuff like hyphens etc. and split the substring into
tokens on all infixes.
8. Once we can't consume any more of the string, handle it as a single token.
9. Once we can't consume any more of the string, handle it as a single token.
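A short illustration of the updated ordering, using the default English rules (output indicative; it assumes the defaults shipped with this change):

```python
from spacy.lang.en import English

nlp = English()
doc = nlp("See (https://example.com) for details.")
print([t.text for t in doc])
# The brackets are split off as prefix and suffix first; only then does the
# default url_match keep "https://example.com" together as a single token.
```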
#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"}
@@ -832,8 +841,8 @@ domain. There are five things you would need to define:
hyphens etc.
5. An optional boolean function `token_match` matching strings that should never
be split, overriding the infix rules. Useful for things like URLs or numbers.
Note that prefixes and suffixes will be split off before `token_match` is
applied.
6. An optional boolean function `url_match`, which is similar to `token_match`
except prefixes and suffixes are removed before applying the match.
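The practical difference between the two hooks is when they are consulted: `token_match` sees the substring while affixes may still be attached, whereas `url_match` only sees it once prefixes and suffixes have been removed. A hedged sketch of that contrast (the `url_re` pattern is illustrative, not spaCy's `URL_PATTERN`):

```python
import re
from spacy.lang.en import English

url_re = re.compile(r"^https?://\S+$")  # illustrative pattern

nlp = English()
nlp.tokenizer.url_match = url_re.match
print([t.text for t in nlp("More at https://example.com.")])
# The trailing "." is split off as a suffix before url_match is applied.

nlp2 = English()
nlp2.tokenizer.token_match = url_re.match
print([t.text for t in nlp2("More at https://example.com.")])
# token_match is checked before suffix splitting, so here the "." stays
# attached to the URL token (the pattern's \S+ happily matches it).
```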
You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
to use `re.compile()` to build a regular expression object, and pass its