mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 15:54:13 +03:00
Fix initial special cases for Tokenizer.explain (#10460)
Add the missing initial check for special cases to `Tokenizer.explain` to align with `Tokenizer._tokenize_affixes`.
This commit is contained in:
parent
01ec6349ea
commit
297dd82c86
|
@ -521,3 +521,16 @@ def test_tokenizer_infix_prefix(en_vocab):
|
|||
assert tokens == ["±10", "%"]
|
||||
explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
|
||||
assert tokens == explain_tokens
|
||||
|
||||
|
||||
def test_tokenizer_initial_special_case_explain(en_vocab):
    """Compare tokenizer output with `Tokenizer.explain` for a special case.

    The input "id" matches the `token_match` pattern and also has a
    special-case rule splitting it into "i" and "d". The token texts produced
    by calling the tokenizer must equal the token texts reported by
    `explain` for the same input.
    """
    text = "id"
    special_cases = {
        "id": [{"ORTH": "i"}, {"ORTH": "d"}],
    }
    tokenizer = Tokenizer(
        en_vocab,
        token_match=re.compile("^id$").match,
        rules=special_cases,
    )
    tokenized = [token.text for token in tokenizer(text)]
    # Each entry returned by explain() is indexed at [1] for its text.
    explained = [entry[1] for entry in tokenizer.explain(text)]
    assert tokenized == explained
|
||||
|
|
|
@ -643,6 +643,10 @@ cdef class Tokenizer:
|
|||
for substring in text.split():
|
||||
suffixes = []
|
||||
while substring:
|
||||
if substring in special_cases:
|
||||
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
|
||||
substring = ''
|
||||
continue
|
||||
while prefix_search(substring) or suffix_search(substring):
|
||||
if token_match(substring):
|
||||
tokens.append(("TOKEN_MATCH", substring))
|
||||
|
|
|
@ -799,6 +799,10 @@ def tokenizer_pseudo_code(
|
|||
for substring in text.split():
|
||||
suffixes = []
|
||||
while substring:
|
||||
if substring in special_cases:
|
||||
tokens.extend(special_cases[substring])
|
||||
substring = ""
|
||||
continue
|
||||
while prefix_search(substring) or suffix_search(substring):
|
||||
if token_match(substring):
|
||||
tokens.append(substring)
|
||||
|
@ -851,20 +855,22 @@ def tokenizer_pseudo_code(
|
|||
The algorithm can be summarized as follows:
|
||||
|
||||
1. Iterate over space-separated substrings.
|
||||
2. Look for a token match. If there is a match, stop processing and keep this
|
||||
token.
|
||||
3. Check whether we have an explicitly defined special case for this substring.
|
||||
2. Check whether we have an explicitly defined special case for this substring.
|
||||
If we do, use it.
|
||||
4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
|
||||
3. Look for a token match. If there is a match, stop processing and keep this
|
||||
token.
|
||||
4. Check whether we have an explicitly defined special case for this substring.
|
||||
If we do, use it.
|
||||
5. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #3,
|
||||
so that the token match and special cases always get priority.
|
||||
5. If we didn't consume a prefix, try to consume a suffix and then go back to
|
||||
#2.
|
||||
6. If we can't consume a prefix or a suffix, look for a URL match.
|
||||
7. If there's no URL match, then look for a special case.
|
||||
8. Look for "infixes" – stuff like hyphens etc. and split the substring into
|
||||
6. If we didn't consume a prefix, try to consume a suffix and then go back to
|
||||
#3.
|
||||
7. If we can't consume a prefix or a suffix, look for a URL match.
|
||||
8. If there's no URL match, then look for a special case.
|
||||
9. Look for "infixes" – stuff like hyphens etc. and split the substring into
|
||||
tokens on all infixes.
|
||||
9. Once we can't consume any more of the string, handle it as a single token.
|
||||
10. Make a final pass over the text to check for special cases that include
|
||||
10. Once we can't consume any more of the string, handle it as a single token.
|
||||
11. Make a final pass over the text to check for special cases that include
|
||||
spaces or that were missed due to the incremental processing of affixes.
|
||||
|
||||
</Accordion>
|
||||
|
|
Loading…
Reference in New Issue
Block a user