Merge pull request #5121 from adrianeboyd/bugfix/revert-token-match

Revert token_match priority changes from #4374 and extend token match options
Matthew Honnibal 2020-05-22 14:42:51 +02:00 committed by GitHub
commit f6078d866a
10 changed files with 89 additions and 46 deletions

View File

@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 from .punctuation import ELISION, HYPHENS
-from ..tokenizer_exceptions import URL_PATTERN
 from ..char_classes import ALPHA_LOWER, ALPHA
 from ...symbols import ORTH, LEMMA
@@ -455,9 +454,6 @@ _regular_exp += [
     for hc in _hyphen_combination
 ]
-# URLs
-_regular_exp.append(URL_PATTERN)
 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(

View File

@@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
-_units = UNITS.replace("%", "")
 _prefixes = (
     LIST_PUNCT
@@ -21,7 +20,8 @@ _prefixes = (
 )
 _suffixes = (
-    LIST_PUNCT
+    [r"\+"]
+    + LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=_units),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),

View File

@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 from ..punctuation import ALPHA_LOWER, CURRENCY
-from ..tokenizer_exceptions import URL_PATTERN
 from ...symbols import ORTH
@@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
 TOKENIZER_EXCEPTIONS = _exc
-TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
+TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import re
-from .char_classes import ALPHA_LOWER
+from .char_classes import ALPHA_LOWER, ALPHA
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
@@ -58,7 +58,8 @@ URL_PATTERN = (
     # fmt: on
 ).strip()
-TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match
+TOKEN_MATCH = None
+URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
 BASE_EXCEPTIONS = {}

View File

@@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer
 from .attrs import IS_STOP, LANG, NORM
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
-from .lang.tokenizer_exceptions import TOKEN_MATCH
+from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
 from .lang.norm_exceptions import BASE_NORMS
 from .lang.tag_map import TAG_MAP
 from .tokens import Doc
@@ -89,6 +89,7 @@ class BaseDefaults(object):
     def create_tokenizer(cls, nlp=None):
         rules = cls.tokenizer_exceptions
         token_match = cls.token_match
+        url_match = cls.url_match
         prefix_search = (
             util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None
         )
@@ -106,10 +107,12 @@ class BaseDefaults(object):
             suffix_search=suffix_search,
             infix_finditer=infix_finditer,
             token_match=token_match,
+            url_match=url_match,
         )
     pipe_names = ["tagger", "parser", "ner"]
     token_match = TOKEN_MATCH
+    url_match = URL_MATCH
     prefixes = tuple(TOKENIZER_PREFIXES)
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
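To see how these defaults can be hooked into a custom pipeline, here is a minimal sketch; the `CustomEnglish*` class names and the URL regex are hypothetical, and it assumes spaCy v2.x with this change applied:

```python
import re
from spacy.lang.en import English

# Hypothetical pattern standing in for whatever URLs you want kept intact.
MY_URL_MATCH = re.compile(r"^https?://\S+$").match

class CustomEnglishDefaults(English.Defaults):
    # Picked up by BaseDefaults.create_tokenizer() alongside token_match.
    url_match = MY_URL_MATCH

class CustomEnglish(English):
    Defaults = CustomEnglishDefaults

nlp = CustomEnglish()
# The URL should stay a single token once surrounding text is split off.
print([t.text for t in nlp("Visit https://example.com today")])
```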

View File

@@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"]
 @pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
 def test_should_match(en_tokenizer, url):
-    assert en_tokenizer.token_match(url) is not None
+    assert en_tokenizer.url_match(url) is not None
 @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
 def test_should_not_match(en_tokenizer, url):
-    assert en_tokenizer.token_match(url) is None
+    assert en_tokenizer.url_match(url) is None
 @pytest.mark.parametrize("url", URLS_BASIC)

View File

@@ -17,6 +17,7 @@ cdef class Tokenizer:
     cpdef readonly Vocab vocab
     cdef object _token_match
+    cdef object _url_match
     cdef object _prefix_search
     cdef object _suffix_search
     cdef object _infix_finditer

View File

@@ -30,7 +30,8 @@ cdef class Tokenizer:
     DOCS: https://spacy.io/api/tokenizer
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
-                 suffix_search=None, infix_finditer=None, token_match=None):
+                 suffix_search=None, infix_finditer=None, token_match=None,
+                 url_match=None):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
         vocab (Vocab): A storage container for lexical types.
@@ -43,6 +44,8 @@ cdef class Tokenizer:
             `re.compile(string).finditer` to find infixes.
         token_match (callable): A boolean function matching strings to be
             recognised as tokens.
+        url_match (callable): A boolean function matching strings to be
+            recognised as tokens after considering prefixes and suffixes.
         RETURNS (Tokenizer): The newly constructed object.
         EXAMPLE:
@@ -55,6 +58,7 @@ cdef class Tokenizer:
         self._cache = PreshMap()
         self._specials = PreshMap()
         self.token_match = token_match
+        self.url_match = url_match
         self.prefix_search = prefix_search
         self.suffix_search = suffix_search
         self.infix_finditer = infix_finditer
@@ -70,6 +74,14 @@ cdef class Tokenizer:
             self._token_match = token_match
             self._flush_cache()
+    property url_match:
+        def __get__(self):
+            return self._url_match
+        def __set__(self, url_match):
+            self._url_match = url_match
+            self._flush_cache()
     property prefix_search:
         def __get__(self):
             return self._prefix_search
@@ -108,11 +120,12 @@ cdef class Tokenizer:
     def __reduce__(self):
         args = (self.vocab,
-                self._rules,
+                self.rules,
                 self.prefix_search,
                 self.suffix_search,
                 self.infix_finditer,
-                self.token_match)
+                self.token_match,
+                self.url_match)
         return (self.__class__, args, None, None)
     cpdef Doc tokens_from_list(self, list strings):
@@ -240,6 +253,8 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
+            if self.token_match and self.token_match(string):
+                break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
                 break
@@ -295,7 +310,9 @@ cdef class Tokenizer:
             cache_hit = self._try_cache(hash_string(string), tokens)
             if cache_hit:
                 pass
-            elif self.token_match and self.token_match(string):
+            elif (self.token_match and self.token_match(string)) or \
+                    (self.url_match and \
+                    self.url_match(string)):
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
@@ -448,6 +465,11 @@ cdef class Tokenizer:
         suffix_search = self.suffix_search
         infix_finditer = self.infix_finditer
         token_match = self.token_match
+        if token_match is None:
+            token_match = re.compile("a^").match
+        url_match = self.url_match
+        if url_match is None:
+            url_match = re.compile("a^").match
         special_cases = {}
         for orth, special_tokens in self.rules.items():
             special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
@@ -456,6 +478,10 @@ cdef class Tokenizer:
             suffixes = []
             while substring:
                 while prefix_search(substring) or suffix_search(substring):
+                    if token_match(substring):
+                        tokens.append(("TOKEN_MATCH", substring))
+                        substring = ''
+                        break
                     if substring in special_cases:
                         tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                         substring = ''
@@ -476,12 +502,15 @@ cdef class Tokenizer:
                         break
                     suffixes.append(("SUFFIX", substring[split:]))
                     substring = substring[:split]
-                if substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
-                    substring = ''
-                elif token_match(substring):
+                if token_match(substring):
                     tokens.append(("TOKEN_MATCH", substring))
                     substring = ''
+                elif url_match(substring):
+                    tokens.append(("URL_MATCH", substring))
+                    substring = ''
+                elif substring in special_cases:
+                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
                     offset = 0
@@ -543,6 +572,7 @@ cdef class Tokenizer:
             ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
             ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
             ("token_match", lambda: _get_regex_pattern(self.token_match)),
+            ("url_match", lambda: _get_regex_pattern(self.url_match)),
             ("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
         ))
         exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
@@ -564,11 +594,12 @@ cdef class Tokenizer:
             ("suffix_search", lambda b: data.setdefault("suffix_search", b)),
             ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
             ("token_match", lambda b: data.setdefault("token_match", b)),
+            ("url_match", lambda b: data.setdefault("url_match", b)),
             ("exceptions", lambda b: data.setdefault("rules", b))
         ))
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         msg = util.from_bytes(bytes_data, deserializers, exclude)
-        for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]:
+        for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]:
             if key in data:
                 data[key] = unescape_unicode(data[key])
         if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
@@ -579,6 +610,8 @@ cdef class Tokenizer:
             self.infix_finditer = re.compile(data["infix_finditer"]).finditer
         if "token_match" in data and isinstance(data["token_match"], basestring_):
             self.token_match = re.compile(data["token_match"]).match
+        if "url_match" in data and isinstance(data["url_match"], basestring_):
+            self.url_match = re.compile(data["url_match"]).match
         if "rules" in data and isinstance(data["rules"], dict):
             # make sure to hard reset the cache to remove data from the default exceptions
             self._rules = {}
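For a rough usage sketch of the two hooks on an existing tokenizer (the patterns below are simplified placeholders, not spaCy's built-in ones, and assume v2.x with this change):

```python
import re
import spacy

nlp = spacy.blank("en")
# token_match is tried on the substring before any prefixes/suffixes are
# split off; url_match is only tried after they have been removed.
nlp.tokenizer.token_match = re.compile(r"^#\w+$").match
nlp.tokenizer.url_match = re.compile(r"^https?://\S+$").match
# Both "#spacy" and the URL should come out roughly as single tokens,
# with the parentheses split off as prefix/suffix.
print([t.text for t in nlp.tokenizer("(#spacy) (https://example.com)")])
```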

View File

@@ -35,14 +35,15 @@ the
 > ```
 | Name             | Type        | Description                                                                          |
 | ---------------- | ----------- | ------------------------------------------------------------------------------------ |
 | `vocab`          | `Vocab`     | A storage container for lexical types.                                               |
 | `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                      |
 | `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes. |
 | `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes. |
 | `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes. |
-| `token_match`    | callable    | A function matching the signature of `re.compile(string).match to find token matches. |
+| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches. |
+| `url_match`      | callable    | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
 | **RETURNS**      | `Tokenizer` | The newly constructed object.                                                        |
 ## Tokenizer.\_\_call\_\_ {#call tag="method"}
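A possible constructor call using the new argument; this is an illustrative sketch assuming spaCy v2.x with this change applied, and the hashtag/URL patterns are placeholders:

```python
import re
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.Defaults.tokenizer_exceptions,
    prefix_search=nlp.tokenizer.prefix_search,
    suffix_search=nlp.tokenizer.suffix_search,
    infix_finditer=nlp.tokenizer.infix_finditer,
    token_match=re.compile(r"^#\w+$").match,        # checked before affix splitting
    url_match=re.compile(r"^https?://\S+$").match,  # checked after affix splitting
)
print([t.text for t in tokenizer("(#spacy: https://example.com)")])
```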

View File

@@ -738,6 +738,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
     suffixes = []
     while substring:
         while prefix_search(substring) or suffix_search(substring):
+            if token_match(substring):
+                tokens.append(substring)
+                substring = ''
+                break
             if substring in special_cases:
                 tokens.extend(special_cases[substring])
                 substring = ''
@@ -752,12 +756,15 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
                 split = suffix_search(substring).start()
                 suffixes.append(substring[split:])
                 substring = substring[:split]
-        if substring in special_cases:
-            tokens.extend(special_cases[substring])
-            substring = ''
-        elif token_match(substring):
+        if token_match(substring):
             tokens.append(substring)
             substring = ''
+        elif url_match(substring):
+            tokens.append(substring)
+            substring = ''
+        elif substring in special_cases:
+            tokens.extend(special_cases[substring])
+            substring = ''
         elif list(infix_finditer(substring)):
             infixes = infix_finditer(substring)
             offset = 0
@@ -778,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
 The algorithm can be summarized as follows:
 1. Iterate over whitespace-separated substrings.
-2. Check whether we have an explicitly defined rule for this substring. If we
-   do, use it.
-3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
-   so that special cases always get priority.
-4. If we didn't consume a prefix, try to consume a suffix and then go back to
+2. Look for a token match. If there is a match, stop processing and keep this
+   token.
+3. Check whether we have an explicitly defined special case for this substring.
+   If we do, use it.
+4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to
+   #2, so that the token match and special cases always get priority.
+5. If we didn't consume a prefix, try to consume a suffix and then go back to
    #2.
-5. If we can't consume a prefix or a suffix, look for a special case.
-6. Next, look for a token match.
-7. Look for "infixes" — stuff like hyphens etc. and split the substring into
+6. If we can't consume a prefix or a suffix, look for a URL match.
+7. If there's no URL match, then look for a special case.
+8. Look for "infixes" — stuff like hyphens etc. and split the substring into
    tokens on all infixes.
-8. Once we can't consume any more of the string, handle it as a single token.
+9. Once we can't consume any more of the string, handle it as a single token.
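To make the restored ordering concrete, a small illustration; the output shown is indicative and assumes the default English tokenizer with this change applied:

```python
import spacy

nlp = spacy.blank("en")
# The default url_match only fires after the surrounding punctuation has
# been split off as prefix/suffix, so the URL itself stays one token.
print([t.text for t in nlp("(https://example.com)")])
# roughly: ['(', 'https://example.com', ')']
```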
 #### Debugging the tokenizer {#tokenizer-debug new="2.2.3"}
@@ -832,8 +841,8 @@ domain. There are five things you would need to define:
    hyphens etc.
 5. An optional boolean function `token_match` matching strings that should never
    be split, overriding the infix rules. Useful for things like URLs or numbers.
-   Note that prefixes and suffixes will be split off before `token_match` is
-   applied.
+6. An optional boolean function `url_match`, which is similar to `token_match`
+   except prefixes and suffixes are removed before applying the match.
 You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
 to use `re.compile()` to build a regular expression object, and pass its