From 0e7f94b247e0e616439339c66588871a4be30750 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 19 Apr 2021 11:08:20 +0200
Subject: [PATCH] Update Tokenizer.explain with special matches (#7749)

* Update Tokenizer.explain with special matches

Update `Tokenizer.explain` and the pseudo-code in the docs to include
the processing of special cases that contain affixes or whitespace.

* Handle optional settings in explain

* Add test for special matches in explain

Add test for `Tokenizer.explain` for special cases containing affixes.
---
 spacy/tests/tokenizer/test_explain.py     | 17 ++++++++++
 spacy/tokenizer.pyx                       | 39 +++++++++++++++++++++--
 website/docs/usage/linguistic-features.md |  7 +++-
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
index ea6cf91be..0a10ae67d 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -1,5 +1,7 @@
 import pytest
+import re
 from spacy.util import get_lang_class
+from spacy.tokenizer import Tokenizer
 
 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
@@ -60,3 +62,18 @@ def test_tokenizer_explain(lang):
     tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
     debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
     assert tokens == debug_tokens
+
+
+def test_tokenizer_explain_special_matcher(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    infix_re = re.compile(r"[/]")
+    rules = {"a.": [{"ORTH": "a."}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+        suffix_search=suffix_re.search,
+        infix_finditer=infix_re.finditer,
+    )
+    tokens = [t.text for t in tokenizer("a/a.")]
+    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
+    assert tokens == explain_tokens
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 5bd6e7aa3..41bbaeee6 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -20,11 +20,12 @@ from .attrs import intify_attrs
 from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
-from .util import registry
+from .util import registry, get_words_and_spaces
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
 from .training import validate_examples
+from .tokens import Span
 
 
 cdef class Tokenizer:
@@ -638,8 +639,14 @@ cdef class Tokenizer:
         DOCS: https://spacy.io/api/tokenizer#explain
         """
         prefix_search = self.prefix_search
+        if prefix_search is None:
+            prefix_search = re.compile("a^").search
         suffix_search = self.suffix_search
+        if suffix_search is None:
+            suffix_search = re.compile("a^").search
         infix_finditer = self.infix_finditer
+        if infix_finditer is None:
+            infix_finditer = re.compile("a^").finditer
         token_match = self.token_match
         if token_match is None:
             token_match = re.compile("a^").match
@@ -687,7 +694,7 @@ cdef class Tokenizer:
                     tokens.append(("URL_MATCH", substring))
                     substring = ''
                 elif substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                     substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
@@ -705,7 +712,33 @@ cdef class Tokenizer:
                     tokens.append(("TOKEN", substring))
                     substring = ''
             tokens.extend(reversed(suffixes))
-        return tokens
+        # Find matches for special cases handled by special matcher
+        words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
+        t_words = []
+        t_spaces = []
+        for word, space in zip(words, spaces):
+            if not word.isspace():
+                t_words.append(word)
+                t_spaces.append(space)
+        doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
+        matches = self._special_matcher(doc)
+        spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
+        spans = util.filter_spans(spans)
+        # Replace matched tokens with their exceptions
+        i = 0
+        final_tokens = []
+        spans_by_start = {s.start: s for s in spans}
+        while i < len(tokens):
+            if i in spans_by_start:
+                span = spans_by_start[i]
+                exc = [d[ORTH] for d in special_cases[span.label_]]
+                for j, orth in enumerate(exc):
+                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                i += len(span)
+            else:
+                final_tokens.append(tokens[i])
+                i += 1
+        return final_tokens
 
     def score(self, examples, **kwargs):
         validate_examples(examples, "Tokenizer.score")
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 2d3390049..077b1a556 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -786,6 +786,7 @@ rather than performance:
 
 ```python
 def tokenizer_pseudo_code(
+    text,
     special_cases,
     prefix_search,
     suffix_search,
@@ -839,12 +840,14 @@ def tokenizer_pseudo_code(
                 tokens.append(substring)
                 substring = ""
         tokens.extend(reversed(suffixes))
+    for match in matcher(special_cases, text):
+        tokens.replace(match, special_cases[match])
     return tokens
 ```
 
 The algorithm can be summarized as follows:
 
-1. Iterate over whitespace-separated substrings.
+1. Iterate over space-separated substrings.
 2. Look for a token match. If there is a match, stop processing and keep this
    token.
 3. Check whether we have an explicitly defined special case for this substring.
@@ -858,6 +861,8 @@ The algorithm can be summarized as follows:
 8. Look for "infixes" – stuff like hyphens etc. and split the substring into
    tokens on all infixes.
 9. Once we can't consume any more of the string, handle it as a single token.
+10. Make a final pass over the text to check for special cases that include
+    spaces or that were missed due to the incremental processing of affixes.
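
A minimal usage sketch (not part of the patch) of the behavior this change targets, mirroring the new test above: a custom special case ("a.") whose surface form overlaps with the suffix pattern. The `English` vocab and the outputs shown in the comments are illustrative assumptions; the point is that `Tokenizer.explain` now runs the same final special-cases pass as the tokenizer itself, so the two agree.

```python
# Sketch based on the test added in this patch: a special case ("a.") that
# would otherwise be split by the suffix rule is reported by
# Tokenizer.explain the same way the tokenizer actually emits it.
import re

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
suffix_re = re.compile(r"[\.]$")  # treat a trailing "." as a suffix
infix_re = re.compile(r"[/]")     # split on "/" as an infix
rules = {"a.": [{"ORTH": "a."}]}  # special case keeping "a." as one token

tokenizer = Tokenizer(
    nlp.vocab,
    rules=rules,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)

text = "a/a."
tokens = [t.text for t in tokenizer(text)]
explained = tokenizer.explain(text)  # list of (pattern, substring) pairs

print(tokens)                     # expected: ['a', '/', 'a.']
print([t[1] for t in explained])  # now matches the tokenizer output above
```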