Ignore prefix in suffix matches

Ignore the currently matched prefix when looking for suffix matches in
the tokenizer. Otherwise a lookbehind in the suffix pattern may match
incorrectly due to the presence of the prefix in the token string.
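
To make the failure mode concrete, here is a minimal sketch using only the standard re module (the patterns come from the new test below; the trailing "$" mimics the end-of-string anchoring that compile_suffix_regex adds to each suffix pattern):

import re

# Suffix pattern with a lookbehind, anchored at the end of the string.
suffix_re = re.compile(r"(?<=a)\d+\.$")

# Searched against the full token string, the lookbehind is satisfied by
# the already-matched prefix "a", so "10." is split off as one suffix.
assert suffix_re.search("a10.").group() == "10."

# Searched against the string with the prefix stripped, the lookbehind
# fails and the pattern no longer matches at all.
assert suffix_re.search("10.") is None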
Adriane Boyd 2021-09-06 15:13:16 +02:00
parent c5de9b463a
commit 96d50a3cb3
2 changed files with 19 additions and 2 deletions

spacy/tests/tokenizer/test_tokenizer.py

@@ -2,7 +2,7 @@ import pytest
 import re
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
-from spacy.util import ensure_path
+from spacy.util import ensure_path, compile_prefix_regex, compile_suffix_regex
 from spacy.lang.en import English
@@ -212,3 +212,20 @@ def test_tokenizer_flush_specials(en_vocab):
     assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
     tokenizer1.rules = {}
     assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
+
+
+def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
+    # the prefix and suffix matches overlap in the suffix lookbehind
+    prefixes = ['a(?=.)']
+    suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.']
+    prefix_re = compile_prefix_regex(prefixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        prefix_search=prefix_re.search,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("a10.")]
+    assert tokens == ["a", "10", "."]
+    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
+    assert tokens == explain_tokens
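
For context on what this test pins down: before the fix, find_suffix searched the full string "a10.", the lookbehind in r'(?<=a)\d+\.' was satisfied by the already-matched prefix "a", and the suffix "10." was split off whole, yielding ["a", "10."]. Tokenizer.explain, which removes the prefix from the working substring before searching for a suffix, already produced ["a", "10", "."], so the final assertion checks that the two code paths now agree.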

spacy/tokenizer.pyx

@@ -410,7 +410,7 @@ cdef class Tokenizer:
                     string = minus_pre
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     break
-            suf_len = self.find_suffix(string)
+            suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
                 suffix = string[-suf_len:]
                 minus_suf = string[:-suf_len]
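
The one-line change is safe because find_suffix returns only the length of the suffix match, so it can be handed the prefix-stripped slice while the negative-index slicing below still operates on the full string. A minimal sketch of that invariant, with find_suffix_len as a hypothetical stand-in for Tokenizer.find_suffix:

import re

def find_suffix_len(suffix_re, text):
    # Like find_suffix: length of the suffix match, or 0 if there is none.
    match = suffix_re.search(text)
    return (match.end() - match.start()) if match is not None else 0

string, pre_len = "a10.", 1    # prefix "a" has already been matched
suf_len = find_suffix_len(re.compile(r"(?<=\w)\.$"), string[pre_len:])
assert suf_len == 1
assert string[-suf_len:] == "."     # suffix is still sliced from the full string
assert string[:-suf_len] == "a10"   # remainder keeps the prefix for later handling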