Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-10 19:57:17 +03:00)
Update Tokenizer.explain with special matches (#7749)
* Update Tokenizer.explain with special matches

  Update `Tokenizer.explain` and the pseudo-code in the docs to include the
  processing of special cases that contain affixes or whitespace.

* Handle optional settings in explain

* Add test for special matches in explain

  Add test for `Tokenizer.explain` for special cases containing affixes.
parent 07b41c38ae · commit 0e7f94b247
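
For context, `Tokenizer.explain` returns a list of `(pattern, token)` tuples showing which rule produced each token. The sketch below mirrors the new test added in this commit, but builds the tokenizer from a plain `English()` vocab so it can run standalone; the commented outputs are what the post-change behaviour should look like, not output captured from the commit itself.

```python
import re
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
# Special case "a." plus a suffix rule that strips a trailing "." and an
# infix rule that splits on "/" -- the same setup as the new test below.
tokenizer = Tokenizer(
    nlp.vocab,
    rules={"a.": [{"ORTH": "a."}]},
    suffix_search=re.compile(r"[\.]$").search,
    infix_finditer=re.compile(r"[/]").finditer,
)

print([t.text for t in tokenizer("a/a.")])
# expected: ['a', '/', 'a.']  (the special case is re-merged by the special matcher)
print(tokenizer.explain("a/a."))
# with this commit, explain() should agree with the tokenizer, e.g.
# [('TOKEN', 'a'), ('INFIX', '/'), ('SPECIAL-1', 'a.')]
```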
@@ -1,5 +1,7 @@
 import pytest
+import re
 from spacy.util import get_lang_class
+from spacy.tokenizer import Tokenizer

 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
@@ -60,3 +62,18 @@ def test_tokenizer_explain(lang):
         tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
         debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
         assert tokens == debug_tokens
+
+
+def test_tokenizer_explain_special_matcher(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    infix_re = re.compile(r"[/]")
+    rules = {"a.": [{"ORTH": "a."}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+        suffix_search=suffix_re.search,
+        infix_finditer=infix_re.finditer,
+    )
+    tokens = [t.text for t in tokenizer("a/a.")]
+    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
+    assert tokens == explain_tokens
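
The `Tokenizer.explain` changes that follow default every unset affix pattern to a regex that can never match, so the rest of the method can call `prefix_search`, `suffix_search` and friends unconditionally. A quick standalone sketch of why the `"a^"` pattern never matches (my own illustration, not code from the commit):

```python
import re

# "a^" requires the start-of-string anchor to occur *after* a literal "a",
# which is impossible, so search/match/finditer never find anything.
never_match = re.compile("a^")

print(never_match.search("a"))                # None
print(never_match.match("aaa"))               # None
print(list(never_match.finditer("banana")))   # []
```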
@@ -20,11 +20,12 @@ from .attrs import intify_attrs
 from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
-from .util import registry
+from .util import registry, get_words_and_spaces
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
 from .training import validate_examples
+from .tokens import Span


 cdef class Tokenizer:
@@ -638,8 +639,14 @@ cdef class Tokenizer:
         DOCS: https://spacy.io/api/tokenizer#explain
         """
         prefix_search = self.prefix_search
+        if prefix_search is None:
+            prefix_search = re.compile("a^").search
         suffix_search = self.suffix_search
+        if suffix_search is None:
+            suffix_search = re.compile("a^").search
         infix_finditer = self.infix_finditer
+        if infix_finditer is None:
+            infix_finditer = re.compile("a^").finditer
         token_match = self.token_match
         if token_match is None:
             token_match = re.compile("a^").match
@@ -687,7 +694,7 @@ cdef class Tokenizer:
                     tokens.append(("URL_MATCH", substring))
                     substring = ''
                 elif substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                     substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
@@ -705,7 +712,33 @@ cdef class Tokenizer:
                     tokens.append(("TOKEN", substring))
                     substring = ''
             tokens.extend(reversed(suffixes))
-        return tokens
+        # Find matches for special cases handled by special matcher
+        words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
+        t_words = []
+        t_spaces = []
+        for word, space in zip(words, spaces):
+            if not word.isspace():
+                t_words.append(word)
+                t_spaces.append(space)
+        doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
+        matches = self._special_matcher(doc)
+        spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
+        spans = util.filter_spans(spans)
+        # Replace matched tokens with their exceptions
+        i = 0
+        final_tokens = []
+        spans_by_start = {s.start: s for s in spans}
+        while i < len(tokens):
+            if i in spans_by_start:
+                span = spans_by_start[i]
+                exc = [d[ORTH] for d in special_cases[span.label_]]
+                for j, orth in enumerate(exc):
+                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                i += len(span)
+            else:
+                final_tokens.append(tokens[i])
+                i += 1
+        return final_tokens

     def score(self, examples, **kwargs):
         validate_examples(examples, "Tokenizer.score")
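
The heart of the new final pass is the span-replacement loop: runs of debug tokens that the special-case matcher would re-merge are swapped for `SPECIAL-n` entries. Here is a standalone sketch of just that loop on plain tuples, using hypothetical example data rather than spaCy `Doc`/`Span` objects:

```python
# Debug tokens as explain() would produce them before the final pass, for the
# text "a/a." with the special case "a." (hypothetical example data).
tokens = [("TOKEN", "a"), ("INFIX", "/"), ("TOKEN", "a"), ("SUFFIX", ".")]

# Pretend the special matcher found one match: tokens 2-3 form the special
# case "a.", whose exception texts are ["a."].
spans_by_start = {2: (2, 4, ["a."])}  # start -> (start, end, exception orths)

final_tokens = []
i = 0
while i < len(tokens):
    if i in spans_by_start:
        start, end, exc = spans_by_start[i]
        # Replace the matched run with SPECIAL-1, SPECIAL-2, ... entries
        for j, orth in enumerate(exc):
            final_tokens.append((f"SPECIAL-{j + 1}", orth))
        i += end - start
    else:
        final_tokens.append(tokens[i])
        i += 1

print(final_tokens)
# [('TOKEN', 'a'), ('INFIX', '/'), ('SPECIAL-1', 'a.')]
```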
@@ -786,6 +786,7 @@ rather than performance:

 ```python
 def tokenizer_pseudo_code(
+    text,
     special_cases,
     prefix_search,
     suffix_search,
@@ -839,12 +840,14 @@ def tokenizer_pseudo_code(
                tokens.append(substring)
                substring = ""
        tokens.extend(reversed(suffixes))
+    for match in matcher(special_cases, text):
+        tokens.replace(match, special_cases[match])
    return tokens
 ```

 The algorithm can be summarized as follows:

-1. Iterate over whitespace-separated substrings.
+1. Iterate over space-separated substrings.
 2. Look for a token match. If there is a match, stop processing and keep this
    token.
 3. Check whether we have an explicitly defined special case for this substring.
@@ -858,6 +861,8 @@ The algorithm can be summarized as follows:
 8. Look for "infixes" – stuff like hyphens etc. and split the substring into
    tokens on all infixes.
 9. Once we can't consume any more of the string, handle it as a single token.
+10. Make a final pass over the text to check for special cases that include
+    spaces or that were missed due to the incremental processing of affixes.

 </Accordion>
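
The summarized algorithm in the docs hunk above, including the new final pass, can be observed directly through `nlp.tokenizer.explain`. A small usage sketch; the commented output is illustrative and assumes `"Let's"` is handled by an English tokenizer exception:

```python
from spacy.lang.en import English

nlp = English()
for pattern, token in nlp.tokenizer.explain("Let's go!"):
    print(f"{token}\t{pattern}")
# Illustrative output:
# Let     SPECIAL-1
# 's      SPECIAL-2
# go      TOKEN
# !       SUFFIX
```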