Fix infix as prefix in Tokenizer.explain (#10140)
* Fix infix as prefix in Tokenizer.explain

Update `Tokenizer.explain` to align with the `Tokenizer` algorithm:

* skip infix matches that are prefixes in the current substring
* Update tokenizer pseudocode in docs
parent 30cf9d6a05
commit 4f441dfa24
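For context, here is a minimal, self-contained sketch of the behavior this commit aligns, mirroring the regression test added below (the `±` infix and `%` suffix patterns come from that test; the `English()` vocab is a stand-in for the test's `en_vocab` fixture): after the `%` suffix is split off, the `±` infix pattern matches at the start of the remaining substring, and `Tokenizer.explain` should report the same segments as the tokenizer itself.

```python
# Sketch only: custom patterns chosen to reproduce the "infix as prefix" case.
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_suffix_regex

nlp = English()
infix_re = compile_infix_regex(["±"])
suffix_re = compile_suffix_regex(["%"])
tokenizer = Tokenizer(
    nlp.vocab,
    infix_finditer=infix_re.finditer,
    suffix_search=suffix_re.search,
)

tokens = [t.text for t in tokenizer("±10%")]
explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
# The tokenizer splits "±10%" into ["±10", "%"]; with this fix,
# explain() reports the same segments instead of splitting off "±".
assert tokens == ["±10", "%"]
assert tokens == explain_tokens
```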
@@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
+from spacy.util import compile_infix_regex
 from spacy.vocab import Vocab
 from spacy.symbols import ORTH

@@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
     assert tokens == ["a", "10", "."]
     explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
     assert tokens == explain_tokens
+
+
+def test_tokenizer_infix_prefix(en_vocab):
+    # the infix match is a prefix of the substring left after the suffix is split off
+    infixes = ["±"]
+    suffixes = ["%"]
+    infix_re = compile_infix_regex(infixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        infix_finditer=infix_re.finditer,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("±10%")]
+    assert tokens == ["±10", "%"]
+    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
+    assert tokens == explain_tokens

@@ -683,6 +683,8 @@ cdef class Tokenizer:
                     infixes = infix_finditer(substring)
                     offset = 0
                     for match in infixes:
+                        if offset == 0 and match.start() == 0:
+                            continue
                         if substring[offset : match.start()]:
                             tokens.append(("TOKEN", substring[offset : match.start()]))
                         if substring[match.start() : match.end()]:
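As a hedged illustration of the new skip (a standalone sketch, not spaCy's internal code; `explain_infix_split` and its output labels are hypothetical): an infix match that starts at position 0 of the current substring, before anything has been consumed, is not treated as an infix, so the matched text stays attached to the following characters, which is how the tokenizer itself behaves.

```python
import re

def explain_infix_split(substring, infix_finditer):
    # Sketch of the fixed loop: an infix match at the very start of the
    # substring (offset == 0 and match.start() == 0) is skipped, so it is
    # absorbed into the following token instead of being split off.
    tokens = []
    offset = 0
    for match in infix_finditer(substring):
        if offset == 0 and match.start() == 0:
            continue
        if substring[offset : match.start()]:
            tokens.append(("TOKEN", substring[offset : match.start()]))
        if substring[match.start() : match.end()]:
            tokens.append(("INFIX", substring[match.start() : match.end()]))
        offset = match.end()
    if substring[offset:]:
        tokens.append(("TOKEN", substring[offset:]))
    return tokens

infix_finditer = re.compile("±").finditer
# "±10" is what remains of "±10%" after the "%" suffix is split off.
print(explain_infix_split("±10", infix_finditer))  # [('TOKEN', '±10')]
print(explain_infix_split("a±b", infix_finditer))  # [('TOKEN', 'a'), ('INFIX', '±'), ('TOKEN', 'b')]
```

Without the skip, the old loop would have reported `±` as a separate infix segment, so `explain("±10%")` yielded `["±", "10", "%"]` while the tokenizer produced `["±10", "%"]`, which is exactly the mismatch #10140 fixes.
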
@@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
                 infixes = infix_finditer(substring)
                 offset = 0
                 for match in infixes:
+                    if offset == 0 and match.start() == 0:
+                        continue
                     tokens.append(substring[offset : match.start()])
                     tokens.append(substring[match.start() : match.end()])
                     offset = match.end()