mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Use special matcher for exceptions with spaces (#6668)
Route exceptions that contain space characters through the special-cases phrase matcher, so that tokenizer exceptions containing spaces are supported.
This commit is contained in:
parent
afc5714d32
commit
0041dfbc7f
|
@@ -180,3 +180,9 @@ def test_tokenizer_special_cases_idx(tokenizer):
|
||||||
doc = tokenizer(text)
|
doc = tokenizer(text)
|
||||||
assert doc[1].idx == 4
|
assert doc[1].idx == 4
|
||||||
assert doc[2].idx == 7
|
assert doc[2].idx == 7
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_special_cases_spaces(tokenizer):
|
||||||
|
assert [t.text for t in tokenizer("a b c")] == ["a", "b", "c"]
|
||||||
|
tokenizer.add_special_case("a b c", [{"ORTH": "a b c"}])
|
||||||
|
assert [t.text for t in tokenizer("a b c")] == ["a b c"]
|
||||||
|
|
|
@@ -611,7 +611,7 @@ cdef class Tokenizer:
|
||||||
self.mem.free(stale_special)
|
self.mem.free(stale_special)
|
||||||
self._rules[string] = substrings
|
self._rules[string] = substrings
|
||||||
self._flush_cache()
|
self._flush_cache()
|
||||||
if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string):
|
if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
|
||||||
self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
|
self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
|
||||||
|
|
||||||
def _reload_special_cases(self):
|
def _reload_special_cases(self):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user