Fix infix as prefix in Tokenizer.explain (#10140)

* Fix infix as prefix in Tokenizer.explain

Update `Tokenizer.explain` to align with the `Tokenizer` algorithm:

* skip infix matches that are prefixes in the current substring (see the repro sketch below)

* Update tokenizer pseudocode in docs
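
In short: an infix pattern that also matches at the very start of a substring (here "±" in "±10%") was split off by `Tokenizer.explain` even though the tokenizer itself keeps it attached to the token, since a match there can only be a prefix. A minimal repro sketch, assuming spaCy with this fix applied; the patterns mirror the regression test below:

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_suffix_regex

nlp = English()
infix_re = compile_infix_regex(["±"])
suffix_re = compile_suffix_regex(["%"])
tokenizer = Tokenizer(
    nlp.vocab,
    infix_finditer=infix_re.finditer,
    suffix_search=suffix_re.search,
)

# With the fix, the tokenizer and its explain() debug output agree;
# before it, explain() reported ["±", "10", "%"] for the same input.
print([t.text for t in tokenizer("±10%")])        # ["±10", "%"]
print([t[1] for t in tokenizer.explain("±10%")])  # ["±10", "%"]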
Adriane Boyd 2022-01-28 17:00:54 +01:00 committed by GitHub
parent 30cf9d6a05
commit 4f441dfa24
3 changed files with 22 additions and 0 deletions

spacy/tests/tokenizer/test_tokenizer.py

@@ -9,6 +9,7 @@
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
+from spacy.util import compile_infix_regex
 from spacy.vocab import Vocab
 from spacy.symbols import ORTH
@@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
     assert tokens == ["a", "10", "."]
     explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
     assert tokens == explain_tokens
+
+
+def test_tokenizer_infix_prefix(en_vocab):
+    # an infix match at the start of the string stays attached as a prefix
+    infixes = ["±"]
+    suffixes = ["%"]
+    infix_re = compile_infix_regex(infixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        infix_finditer=infix_re.finditer,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("±10%")]
+    assert tokens == ["±10", "%"]
+    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
+    assert tokens == explain_tokens

spacy/tokenizer.pyx

@@ -683,6 +683,8 @@ cdef class Tokenizer:
             infixes = infix_finditer(substring)
             offset = 0
             for match in infixes:
+                if offset == 0 and match.start() == 0:
+                    continue
                 if substring[offset : match.start()]:
                     tokens.append(("TOKEN", substring[offset : match.start()]))
                 if substring[match.start() : match.end()]:
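
Why the leading match is skipped: a standalone sketch of the loop above (illustrative only, not spaCy's internal code; split_infixes and skip_leading are hypothetical names), run on the substring "±10" that remains after the "%" suffix is split off:

import re

def split_infixes(substring, pattern, skip_leading=True):
    # mirror of the explain() infix loop: emit (label, piece) pairs
    tokens = []
    offset = 0
    for match in re.finditer(pattern, substring):
        # the fix: a match at the very start of the substring can only be
        # a prefix, so leave it attached to the following token
        if skip_leading and offset == 0 and match.start() == 0:
            continue
        if substring[offset : match.start()]:
            tokens.append(("TOKEN", substring[offset : match.start()]))
        tokens.append(("INFIX", substring[match.start() : match.end()]))
        offset = match.end()
    if substring[offset:]:
        tokens.append(("TOKEN", substring[offset:]))
    return tokens

print(split_infixes("±10", "±", skip_leading=False))  # [('INFIX', '±'), ('TOKEN', '10')] -- the old, wrong output
print(split_infixes("±10", "±", skip_leading=True))   # [('TOKEN', '±10')] -- matches the tokenizer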

website/docs/usage/linguistic-features.md

@@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
         infixes = infix_finditer(substring)
         offset = 0
         for match in infixes:
+            if offset == 0 and match.start() == 0:
+                continue
             tokens.append(substring[offset : match.start()])
             tokens.append(substring[match.start() : match.end()])
             offset = match.end()
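
The pseudocode can be sanity-checked against the real tokenizer via `Tokenizer.explain`, which the usage docs pair with this algorithm. A quick sketch, assuming a blank English pipeline (expected output shown in comments, following the docs' own "Let's go!" example):

import spacy

nlp = spacy.blank("en")
for pattern_type, substring in nlp.tokenizer.explain("Let's go!"):
    print(pattern_type, substring)
# SPECIAL-1 Let
# SPECIAL-2 's
# TOKEN go
# SUFFIX !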