diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 82032b2da..4f5eddb95 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -180,3 +180,9 @@ def test_tokenizer_special_cases_idx(tokenizer):
     doc = tokenizer(text)
     assert doc[1].idx == 4
     assert doc[2].idx == 7
+
+
+def test_tokenizer_special_cases_spaces(tokenizer):
+    assert [t.text for t in tokenizer("a b c")] == ["a", "b", "c"]
+    tokenizer.add_special_case("a b c", [{"ORTH": "a b c"}])
+    assert [t.text for t in tokenizer("a b c")] == ["a b c"]
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index e22e0094b..d54c3521d 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -611,7 +611,7 @@ cdef class Tokenizer:
             self.mem.free(stale_special)
         self._rules[string] = substrings
         self._flush_cache()
-        if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string):
+        if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
             self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
 
     def _reload_special_cases(self):
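
For reference, a minimal standalone sketch of the behavior this patch enables, assuming a spaCy build that includes the change (a blank English tokenizer stands in for the `tokenizer` fixture used in the test above):

from spacy.lang.en import English

# Blank English pipeline; its tokenizer exposes add_special_case.
tokenizer = English().tokenizer

# Without a special case, whitespace splits the string as usual.
assert [t.text for t in tokenizer("a b c")] == ["a", "b", "c"]

# Because the string contains a space, the patched add_special_case also
# registers it with the special-cases matcher, so the whole string is
# matched across whitespace and kept as a single token.
tokenizer.add_special_case("a b c", [{"ORTH": "a b c"}])
assert [t.text for t in tokenizer("a b c")] == ["a b c"]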