Use special matcher for exceptions with spaces (#6668)

Use the special cases phrase matcher for exceptions that contain space
characters, so that tokenizer exceptions spanning several
whitespace-separated pieces (e.g. "a b c") are supported.
This commit is contained in:
Adriane Boyd 2021-01-06 05:05:10 +01:00 committed by GitHub
parent afc5714d32
commit 0041dfbc7f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 7 additions and 1 deletions

View File

@@ -180,3 +180,9 @@ def test_tokenizer_special_cases_idx(tokenizer):
doc = tokenizer(text) doc = tokenizer(text)
assert doc[1].idx == 4 assert doc[1].idx == 4
assert doc[2].idx == 7 assert doc[2].idx == 7
def test_tokenizer_special_cases_spaces(tokenizer):
    """Check that a special case containing spaces yields a single token."""
    phrase = "a b c"
    # Before any special case is registered the phrase comes back as three tokens.
    tokens_before = [token.text for token in tokenizer(phrase)]
    assert tokens_before == ["a", "b", "c"]
    # Register the whole phrase, spaces included, as one special-case token.
    tokenizer.add_special_case(phrase, [{"ORTH": phrase}])
    tokens_after = [token.text for token in tokenizer(phrase)]
    assert tokens_after == [phrase]

View File

@@ -611,7 +611,7 @@ cdef class Tokenizer:
self.mem.free(stale_special) self.mem.free(stale_special)
self._rules[string] = substrings self._rules[string] = substrings
self._flush_cache() self._flush_cache()
if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string): if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
self._special_matcher.add(string, None, self._tokenize_affixes(string, False)) self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
def _reload_special_cases(self): def _reload_special_cases(self):