From 297dd82c86372c7aa0a181e55dc72512718aafe8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Mar 2022 10:50:47 +0100 Subject: [PATCH] Fix initial special cases for Tokenizer.explain (#10460) Add the missing initial check for special cases to `Tokenizer.explain` to align with `Tokenizer._tokenize_affixes`. --- spacy/tests/tokenizer/test_tokenizer.py | 13 +++++++++++ spacy/tokenizer.pyx | 4 ++++ website/docs/usage/linguistic-features.md | 28 ++++++++++++++--------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index a7270cb1e..ed11508b4 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -521,3 +521,16 @@ def test_tokenizer_infix_prefix(en_vocab): assert tokens == ["±10", "%"] explain_tokens = [t[1] for t in tokenizer.explain("±10%")] assert tokens == explain_tokens + + +def test_tokenizer_initial_special_case_explain(en_vocab): + tokenizer = Tokenizer( + en_vocab, + token_match=re.compile("^id$").match, + rules={ + "id": [{"ORTH": "i"}, {"ORTH": "d"}], + } + ) + tokens = [t.text for t in tokenizer("id")] + explain_tokens = [t[1] for t in tokenizer.explain("id")] + assert tokens == explain_tokens diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 91f228032..ac55a61f3 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -643,6 +643,10 @@ cdef class Tokenizer: for substring in text.split(): suffixes = [] while substring: + if substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' + continue while prefix_search(substring) or suffix_search(substring): if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index f8baf5588..c3f25565a 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -799,6 +799,10 @@ def tokenizer_pseudo_code( for substring in text.split(): suffixes = [] while substring: + if substring in special_cases: + tokens.extend(special_cases[substring]) + substring = "" + continue while prefix_search(substring) or suffix_search(substring): if token_match(substring): tokens.append(substring) @@ -851,20 +855,22 @@ def tokenizer_pseudo_code( The algorithm can be summarized as follows: 1. Iterate over space-separated substrings. -2. Look for a token match. If there is a match, stop processing and keep this - token. -3. Check whether we have an explicitly defined special case for this substring. +2. Check whether we have an explicitly defined special case for this substring. If we do, use it. -4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, +3. Look for a token match. If there is a match, stop processing and keep this + token. +4. Check whether we have an explicitly defined special case for this substring. + If we do, use it. +5. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #3, so that the token match and special cases always get priority. -5. If we didn't consume a prefix, try to consume a suffix and then go back to - #2. -6. If we can't consume a prefix or a suffix, look for a URL match. -7. If there's no URL match, then look for a special case. -8. Look for "infixes" – stuff like hyphens etc. and split the substring into +6. If we didn't consume a prefix, try to consume a suffix and then go back to + #3. +7. If we can't consume a prefix or a suffix, look for a URL match. +8. If there's no URL match, then look for a special case. +9. Look for "infixes" – stuff like hyphens etc. and split the substring into tokens on all infixes. -9. Once we can't consume any more of the string, handle it as a single token. -10. Make a final pass over the text to check for special cases that include +10. Once we can't consume any more of the string, handle it as a single token. +11. Make a final pass over the text to check for special cases that include spaces or that were missed due to the incremental processing of affixes.