Ignore prefix in suffix matches (#9155)
* Ignore prefix in suffix matches
Ignore the currently matched prefix when looking for suffix matches in
the tokenizer. Otherwise a lookbehind in the suffix pattern may match
incorrectly due to the presence of the prefix in the token string (a
short sketch below illustrates this).
* Move °[cfkCFK]. to a tokenizer exception
* Adjust exceptions for same tokenization as v3.1
* Also update test accordingly
* Continue to split . after °CFK if ° is not a prefix
* Exclude new ° exceptions for pl
* Switch back to default tokenization of "° C ."
* Revert "Exclude new ° exceptions for pl"
This reverts commit 952013a5b4.
* Add exceptions for °C for hu
This commit is contained in:
parent 4170110ce7
commit 2ea9b58006
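The problem is easiest to see with a plain regular-expression search. A minimal sketch, using a pattern and string that mirror the regression test added below (pre_len stands for the length already claimed by the prefix rule, and the trailing "$" reflects how suffix patterns are anchored):

import re

# Suffix pattern with a lookbehind on "a", as in the regression test below.
suffix_re = re.compile(r"(?<=a)\d+\.$")

text = "a10."
pre_len = 1  # assume the prefix rule has already matched the leading "a"

# Searching the full string lets the lookbehind see the prefix, so "10." is
# incorrectly claimed as a suffix.
print(suffix_re.search(text))            # <re.Match ...; match='10.'>

# Searching only the part after the prefix, as this change does, finds no
# match here, so the plain "\." suffix rule can apply instead.
print(suffix_re.search(text[pre_len:]))  # None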
@@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
 )
 
 
+for u in "cfkCFK":
+    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
+    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+
+
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
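For reference, a quick way to try the new Hungarian exceptions; this assumes a blank spacy.lang.hu.Hungarian pipeline built from this branch, and the expected output is an assumption based on the exception entries above:

from spacy.lang.hu import Hungarian

nlp = Hungarian()
# "°C." matches the special case added above, so the degree sign stays
# attached to the unit and only the period is split off.
print([t.text for t in nlp("25 °C.")])  # expected: ['25', '°C', '.']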
@@ -250,3 +250,9 @@ o.0
 
 for orth in emoticons:
     BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
+
+
+# Moved from a suffix setting due to #9155 removing prefixes from consideration
+# for lookbehinds
+for u in "cfkCFK":
+    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]
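Outside Hungarian, this base exception keeps the v3.1 behaviour of splitting the degree sign from the unit letter. A small sanity check, assuming a blank English pipeline (whose tokenizer exceptions are built on top of BASE_EXCEPTIONS):

from spacy.lang.en import English

nlp = English()
# The base exception splits "°F." into degree sign, unit letter, and period.
print([t.text for t in nlp("100 °F.")])  # expected: ['100', '°', 'F', '.']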
@@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
     if sys.maxunicode >= 1114111:
         tokens = tokenizer(text)
         assert len(tokens) == length
+
+
+def test_tokenizer_degree(tokenizer):
+    for u in "cfkCFK":
+        assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."]
+        assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."]
@@ -2,7 +2,7 @@ import pytest
 import re
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
-from spacy.util import ensure_path
+from spacy.util import ensure_path, compile_prefix_regex, compile_suffix_regex
 from spacy.lang.en import English
 
 
@@ -212,3 +212,20 @@ def test_tokenizer_flush_specials(en_vocab):
     assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
     tokenizer1.rules = {}
     assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
+
+
+def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
+    # the prefix and suffix matches overlap in the suffix lookbehind
+    prefixes = ['a(?=.)']
+    suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.']
+    prefix_re = compile_prefix_regex(prefixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        prefix_search=prefix_re.search,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("a10.")]
+    assert tokens == ["a", "10", "."]
+    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
+    assert tokens == explain_tokens
@@ -408,7 +408,7 @@ cdef class Tokenizer:
                     string = minus_pre
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     break
-            suf_len = self.find_suffix(string)
+            suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
                 suffix = string[-suf_len:]
                 minus_suf = string[:-suf_len]
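The one-line change above is the core of the fix. A simplified pure-Python analogue of this affix-splitting step (not the real Cython implementation; the helper name and structure are illustrative only):

from spacy.util import compile_prefix_regex, compile_suffix_regex

def split_affixes_once(string, prefix_search, suffix_search):
    # Find a prefix on the full string.
    pre_match = prefix_search(string)
    pre_len = pre_match.end() if pre_match else 0
    # Search for a suffix only in the part after the prefix, so a lookbehind
    # in a suffix pattern can no longer match against the prefix characters.
    suf_match = suffix_search(string[pre_len:])
    suf_len = (suf_match.end() - suf_match.start()) if suf_match else 0
    prefix = string[:pre_len]
    suffix = string[-suf_len:] if suf_len else ""
    middle = string[pre_len:len(string) - suf_len]
    return prefix, middle, suffix

# Same prefix/suffix rules as the regression test: with the prefix stripped
# before the suffix search, "a10." splits into prefix, middle, and suffix.
prefix_re = compile_prefix_regex(["a(?=.)"])
suffix_re = compile_suffix_regex([r"(?<=\w)\.", r"(?<=a)\d+\."])
print(split_affixes_once("a10.", prefix_re.search, suffix_re.search))  # ('a', '10', '.')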