Add tokenizer option for token match with affixes

To fix the slow tokenizer URL matching (#4374) and allow `token_match` to take
priority over prefixes and suffixes by default, introduce a new tokenizer
option for a token match pattern that's applied after prefixes and suffixes
but before infixes.
Adriane Boyd 2020-05-05 10:35:33 +02:00
parent 792c8af8cf
commit 565e0eef73
8 changed files with 46 additions and 26 deletions
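
As a rough illustration of the intended behaviour (a sketch against this commit, not code from it): with the URL pattern moved to the new option, surrounding punctuation is split off as prefixes and suffixes while the URL itself survives as a single token.

import spacy

# Hedged sketch: assumes a blank English pipeline built at this commit, where
# the default URL pattern is registered as token_match_with_affixes.
nlp = spacy.blank("en")
doc = nlp("(https://example.com)")
print([t.text for t in doc])
# expected (roughly): ['(', 'https://example.com', ')']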

View File

@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 from .punctuation import ELISION, HYPHENS
-from ..tokenizer_exceptions import URL_PATTERN
 from ..char_classes import ALPHA_LOWER, ALPHA
 from ...symbols import ORTH, LEMMA
@@ -455,9 +454,6 @@ _regular_exp += [
     for hc in _hyphen_combination
 ]
-# URLs
-_regular_exp.append(URL_PATTERN)
 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(

View File

@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 from ..punctuation import ALPHA_LOWER, CURRENCY
-from ..tokenizer_exceptions import URL_PATTERN
 from ...symbols import ORTH
@@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
 TOKENIZER_EXCEPTIONS = _exc
-TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
+TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

View File

@@ -13,8 +13,6 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 URL_PATTERN = (
     # fmt: off
     r"^"
-    # in order to support the prefix tokenization (see prefix test cases in test_urls).
-    r"(?=[" + ALPHA + "\w])"
     # protocol identifier (mods: make optional and expand schemes)
     # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
     r"(?:(?:[\w\+\-\.]{2,})://)?"
@@ -56,13 +54,12 @@ URL_PATTERN = (
     r"(?::\d{2,5})?"
     # resource path
     r"(?:[/?#]\S*)?"
-    # in order to support the suffix tokenization (see suffix test cases in test_urls),
-    r"(?<=[" + ALPHA + "\w/])"
     r"$"
     # fmt: on
 ).strip()
-TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match
+TOKEN_MATCH = None
+TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match
 BASE_EXCEPTIONS = {}
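
A quick check of the new module-level defaults above (a sketch, assuming the package layout shown in the imports): `TOKEN_MATCH` is now `None`, and the compiled URL pattern lives under `TOKEN_MATCH_WITH_AFFIXES`.

from spacy.lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES

# The base defaults after this change: no early token_match, and the URL
# matcher applied only after affix handling.
assert TOKEN_MATCH is None
assert TOKEN_MATCH_WITH_AFFIXES("http://example.com") is not None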

View File

@@ -31,7 +31,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer
 from .attrs import IS_STOP, LANG
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
-from .lang.tokenizer_exceptions import TOKEN_MATCH
+from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES
 from .lang.tag_map import TAG_MAP
 from .tokens import Doc
 from .lang.lex_attrs import LEX_ATTRS, is_stop
@@ -86,6 +86,7 @@ class BaseDefaults(object):
     def create_tokenizer(cls, nlp=None):
         rules = cls.tokenizer_exceptions
         token_match = cls.token_match
+        token_match_with_affixes = cls.token_match_with_affixes
         prefix_search = (
             util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None
         )
@@ -103,10 +104,12 @@ class BaseDefaults(object):
             suffix_search=suffix_search,
             infix_finditer=infix_finditer,
             token_match=token_match,
+            token_match_with_affixes=token_match_with_affixes,
         )
     pipe_names = ["tagger", "parser", "ner"]
     token_match = TOKEN_MATCH
+    token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES
     prefixes = tuple(TOKENIZER_PREFIXES)
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
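
For a language-level override, the new `token_match_with_affixes` class attribute can be replaced in a `Defaults` subclass, and `create_tokenizer` passes it through to the tokenizer. A hedged sketch with a purely hypothetical pattern:

import re
from spacy.lang.en import English

class CustomEnglishDefaults(English.Defaults):
    # Hypothetical pattern, for illustration only: keep simple number
    # ranges like "10-12" intact once prefixes/suffixes are split off.
    token_match_with_affixes = re.compile(r"^\d+-\d+$").match

class CustomEnglish(English):
    Defaults = CustomEnglishDefaults

nlp = CustomEnglish()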

View File

@@ -56,12 +56,8 @@ URLS_SHOULD_MATCH = [
     pytest.param(
         "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
     ),
-    pytest.param(
-        "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()
-    ),
-    pytest.param(
-        "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
-    ),
+    "http://foo.com/blah_blah_(wikipedia)",
+    "http://foo.com/blah_blah_(wikipedia)_(again)",
     "http://www.foo.co.uk",
     "http://www.foo.co.uk/",
     "http://www.foo.co.uk/blah/blah",
@@ -126,12 +122,12 @@ SUFFIXES = ['"', ":", ">"]
 @pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
 def test_should_match(en_tokenizer, url):
-    assert en_tokenizer.token_match(url) is not None
+    assert en_tokenizer.token_match_with_affixes(url) is not None
 @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
 def test_should_not_match(en_tokenizer, url):
-    assert en_tokenizer.token_match(url) is None
+    assert en_tokenizer.token_match_with_affixes(url) is None
 @pytest.mark.parametrize("url", URLS_BASIC)
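
The two URLs whose `xfail` marks are removed above should now match the URL pattern directly, since the trailing lookbehind is gone. A sketch of the check the test exercises:

import re
from spacy.lang.tokenizer_exceptions import URL_PATTERN

# Same compilation as TOKEN_MATCH_WITH_AFFIXES in the base exceptions.
url_match = re.compile("(?u)" + URL_PATTERN).match
assert url_match("http://foo.com/blah_blah_(wikipedia)") is not None
assert url_match("http://foo.com/blah_blah_(wikipedia)_(again)") is not None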

View File

@@ -17,6 +17,7 @@ cdef class Tokenizer:
     cpdef readonly Vocab vocab
     cdef object _token_match
+    cdef object _token_match_with_affixes
     cdef object _prefix_search
     cdef object _suffix_search
     cdef object _infix_finditer

View File

@@ -30,7 +30,8 @@ cdef class Tokenizer:
     DOCS: https://spacy.io/api/tokenizer
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
-                 suffix_search=None, infix_finditer=None, token_match=None):
+                 suffix_search=None, infix_finditer=None, token_match=None,
+                 token_match_with_affixes=None):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
         vocab (Vocab): A storage container for lexical types.
@@ -43,6 +44,8 @@ cdef class Tokenizer:
             `re.compile(string).finditer` to find infixes.
         token_match (callable): A boolean function matching strings to be
             recognised as tokens.
+        token_match_with_affixes (callable): A boolean function matching strings to be
+            recognised as tokens after considering prefixes and suffixes.
         RETURNS (Tokenizer): The newly constructed object.
         EXAMPLE:
@@ -55,6 +58,7 @@ cdef class Tokenizer:
         self._cache = PreshMap()
         self._specials = PreshMap()
         self.token_match = token_match
+        self.token_match_with_affixes = token_match_with_affixes
         self.prefix_search = prefix_search
         self.suffix_search = suffix_search
         self.infix_finditer = infix_finditer
@@ -70,6 +74,14 @@ cdef class Tokenizer:
             self._token_match = token_match
             self._flush_cache()
+    property token_match_with_affixes:
+        def __get__(self):
+            return self._token_match_with_affixes
+
+        def __set__(self, token_match_with_affixes):
+            self._token_match_with_affixes = token_match_with_affixes
+            self._flush_cache()
+
     property prefix_search:
         def __get__(self):
             return self._prefix_search
@@ -108,11 +120,12 @@ cdef class Tokenizer:
     def __reduce__(self):
         args = (self.vocab,
-                self._rules,
+                self.rules,
                 self.prefix_search,
                 self.suffix_search,
                 self.infix_finditer,
-                self.token_match)
+                self.token_match,
+                self.token_match_with_affixes)
         return (self.__class__, args, None, None)
     cpdef Doc tokens_from_list(self, list strings):
@@ -297,7 +310,9 @@ cdef class Tokenizer:
         cache_hit = self._try_cache(hash_string(string), tokens)
         if cache_hit:
             pass
-        elif self.token_match and self.token_match(string):
+        elif (self.token_match and self.token_match(string)) or \
+                (self.token_match_with_affixes and \
+                self.token_match_with_affixes(string)):
             # We're always saying 'no' to spaces here -- the caller will
             # fix up the outermost one, with reference to the original.
             # See Issue #859
@@ -450,6 +465,11 @@ cdef class Tokenizer:
         suffix_search = self.suffix_search
         infix_finditer = self.infix_finditer
         token_match = self.token_match
+        if token_match is None:
+            token_match = re.compile("a^").match
+        token_match_with_affixes = self.token_match_with_affixes
+        if token_match_with_affixes is None:
+            token_match_with_affixes = re.compile("a^").match
         special_cases = {}
         for orth, special_tokens in self.rules.items():
             special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
@@ -485,6 +505,9 @@ cdef class Tokenizer:
                 if token_match(substring):
                     tokens.append(("TOKEN_MATCH", substring))
                     substring = ''
+                elif token_match_with_affixes(substring):
+                    tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring))
+                    substring = ''
                 elif substring in special_cases:
                     tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                     substring = ''
@@ -549,6 +572,7 @@ cdef class Tokenizer:
             ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
             ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
             ("token_match", lambda: _get_regex_pattern(self.token_match)),
+            ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)),
             ("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
         ))
         exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
@@ -570,11 +594,12 @@ cdef class Tokenizer:
             ("suffix_search", lambda b: data.setdefault("suffix_search", b)),
             ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
             ("token_match", lambda b: data.setdefault("token_match", b)),
+            ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)),
             ("exceptions", lambda b: data.setdefault("rules", b))
         ))
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         msg = util.from_bytes(bytes_data, deserializers, exclude)
-        for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]:
+        for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]:
             if key in data:
                 data[key] = unescape_unicode(data[key])
         if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
@@ -585,6 +610,8 @@ cdef class Tokenizer:
             self.infix_finditer = re.compile(data["infix_finditer"]).finditer
         if "token_match" in data and isinstance(data["token_match"], basestring_):
             self.token_match = re.compile(data["token_match"]).match
+        if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_):
+            self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match
         if "rules" in data and isinstance(data["rules"], dict):
             # make sure to hard reset the cache to remove data from the default exceptions
             self._rules = {}
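
The debugging path touched in the last hunks (this appears to be the tokenizer's `explain()` method) reports the new stage with its own label. A hedged sketch of what that might look like for a URL wrapped in punctuation:

import spacy

nlp = spacy.blank("en")
for label, substring in nlp.tokenizer.explain("(http://example.com)"):
    print(label, substring)
# expected (roughly):
#   PREFIX (
#   TOKEN_MATCH_WITH_AFFIXES http://example.com
#   SUFFIX )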

View File

@@ -41,7 +41,8 @@ the
 | `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
 | `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
 | `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
-| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. |
+| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
+| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
 | **RETURNS** | `Tokenizer` | The newly constructed object. |
 ## Tokenizer.\_\_call\_\_ {#call tag="method"}
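
A hedged construction example for the documented signature above (the patterns here are illustrative, not spaCy's defaults):

import re
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.Defaults.tokenizer_exceptions,
    prefix_search=re.compile(r"""^[\("']""").search,
    suffix_search=re.compile(r"""[\)"']$""").search,
    infix_finditer=re.compile(r"[-~]").finditer,
    token_match=None,
    # Applied after prefixes/suffixes are split off, before infixes.
    token_match_with_affixes=re.compile(r"^https?://\S+$").match,
)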