mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-01 04:46:38 +03:00
3711af74e5
* Add tokenizer option to allow Matcher handling for all rules
Add tokenizer option `with_faster_rules_heuristics` that determines
whether the special cases applied by the internal `Matcher` are filtered
by whether they contain affixes or space. If `True` (default), the rules
are filtered to prioritize speed over rare edge cases. If `False`, all
rules are included in the final `Matcher`-based pass over the doc.
* Reset all caches when reloading special cases
* Revert "Reset all caches when reloading special cases"
This reverts commit 4ef6bd171d.
* Initialize max_length properly
* Add new tag to API docs
* Rename to faster heuristics
145 lines
5.3 KiB
Python
145 lines
5.3 KiB
Python
import pickle
|
||
import re
|
||
|
||
import pytest
|
||
|
||
from spacy.attrs import ENT_IOB, ENT_TYPE
|
||
from spacy.lang.en import English
|
||
from spacy.tokenizer import Tokenizer
|
||
from spacy.tokens import Doc
|
||
from spacy.util import compile_infix_regex, compile_prefix_regex
|
||
from spacy.util import compile_suffix_regex, get_lang_class, load_model
|
||
|
||
from ..util import assert_packed_msg_equal, make_tempdir
|
||
|
||
|
||
def load_tokenizer(b):
    """Create a new English tokenizer and load serialized state into it.

    b: serialized tokenizer data to pass to ``from_bytes``.
    RETURNS: the newly created English tokenizer after loading ``b``.
    """
    fresh_tokenizer = get_lang_class("en")().tokenizer
    fresh_tokenizer.from_bytes(b)
    return fresh_tokenizer
|
||
|
||
|
||
@pytest.mark.issue(2833)
def test_issue2833(en_vocab):
    """Pickling a token or a span must raise NotImplementedError."""
    words = ["Hello", "world"]
    doc = Doc(en_vocab, words=words)
    # A single token taken from the doc cannot be pickled on its own.
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0])
    # The same holds for a span sliced out of the doc.
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0:2])
|
||
|
||
|
||
@pytest.mark.issue(3012)
def test_issue3012(en_vocab):
    """Test that tag annotation is kept when ``from_array`` is called with
    entity columns only, and after a ``to_bytes``/``from_bytes`` round trip."""

    def summarize(token):
        # The four token fields compared at every stage of the test.
        return (token.text, token.pos_, token.tag_, token.ent_type_)

    doc = Doc(
        en_vocab,
        words=["This", "is", "10", "%", "."],
        tags=["DT", "VBZ", "CD", "NN", "."],
        pos=["DET", "VERB", "NUM", "NOUN", "PUNCT"],
        ents=["O", "O", "B-PERCENT", "I-PERCENT", "O"],
    )
    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
    assert summarize(doc[2]) == expected
    # Export only the entity columns and write them straight back.
    ent_columns = [ENT_IOB, ENT_TYPE]
    exported = doc.to_array(ent_columns)
    doc.from_array(ent_columns, exported)
    assert summarize(doc[2]) == expected
    # Serializing then deserializing
    restored = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert summarize(restored[2]) == expected
|
||
|
||
|
||
@pytest.mark.issue(4190)
def test_issue4190():
    """Test that a customized tokenizer gives the same tokens, and keeps
    ``faster_heuristics=False``, after the pipeline is saved and reloaded."""

    def customize_tokenizer(nlp):
        """Swap ``nlp.tokenizer`` for one built from a reduced exception set."""
        defaults = nlp.Defaults
        # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
        kept_exceptions = {}
        for key, value in dict(defaults.tokenizer_exceptions).items():
            if len(key) == 2 and key[1] == ".":
                continue
            kept_exceptions[key] = value
        nlp.tokenizer = Tokenizer(
            nlp.vocab,
            kept_exceptions,
            prefix_search=compile_prefix_regex(defaults.prefixes).search,
            suffix_search=compile_suffix_regex(defaults.suffixes).search,
            infix_finditer=compile_infix_regex(defaults.infixes).finditer,
            token_match=nlp.tokenizer.token_match,
            faster_heuristics=False,
        )

    text = "Test c."
    # Load default language
    nlp = English()
    # Run the default tokenizer once before replacing it; the tokens are not
    # compared against anything, hence the noqa marker.
    default_tokens = [token.text for token in nlp(text)]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp)
    customized_tokens = [token.text for token in nlp(text)]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp_reloaded = load_model(model_dir)
    # This should be the modified tokenizer
    reloaded_tokens = [token.text for token in nlp_reloaded(text)]
    assert customized_tokens == reloaded_tokens
    assert nlp_reloaded.tokenizer.faster_heuristics is False
|
||
|
||
|
||
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Test serialization round trips for custom tokenizers where only some
    functions are defined or where properties are empty (see #2494, #4991)."""
    # Names of the optional callables checked before and after loading.
    optional_attrs = ("token_match", "url_match", "prefix_search", "infix_finditer")

    sparse_tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    sparse_bytes = sparse_tokenizer.to_bytes()
    Tokenizer(en_vocab).from_bytes(sparse_bytes)

    # test that empty/unset values are set correctly on deserialization
    english_tokenizer = get_lang_class("en")().tokenizer
    english_tokenizer.token_match = re.compile("test").match
    assert english_tokenizer.rules != {}
    for attr in optional_attrs:
        assert getattr(english_tokenizer, attr) is not None, attr
    english_tokenizer.from_bytes(sparse_bytes)
    assert english_tokenizer.rules == {}
    for attr in optional_attrs:
        assert getattr(english_tokenizer, attr) is None, attr

    # Rules that were set at construction and then emptied must reload as empty.
    emptied_tokenizer = Tokenizer(
        en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}
    )
    emptied_tokenizer.rules = {}
    emptied_bytes = emptied_tokenizer.to_bytes()
    reloaded = Tokenizer(en_vocab).from_bytes(emptied_bytes)
    assert reloaded.rules == {}
|
||
|
||
|
||
@pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
    """Test that a tokenizer reloaded from bytes serializes to the same data
    and splits ``text`` into the same tokens as the original."""
    reloaded = load_tokenizer(en_tokenizer.to_bytes())
    # Compare the unpacked messages first, then the raw byte strings.
    assert_packed_msg_equal(reloaded.to_bytes(), en_tokenizer.to_bytes())
    assert reloaded.to_bytes() == en_tokenizer.to_bytes()
    original_tokens = [token.text for token in en_tokenizer(text)]
    reloaded_tokens = [token.text for token in reloaded(text)]
    assert original_tokens == reloaded_tokens
|
||
|
||
|
||
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
    """Test that a tokenizer saved to disk reloads with identical bytes.

    The saved data is loaded into a separately constructed English tokenizer.
    Loading it back into the very tokenizer that was saved would overwrite
    that tokenizer with its own serialized state, so the comparison could
    never reveal anything lost during serialization.
    """
    tokenizer = en_tokenizer
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        tokenizer.to_disk(file_path)
        # Build an independent tokenizer and load the saved state into it.
        tokenizer_d = get_lang_class("en")().tokenizer
        tokenizer_d.from_disk(file_path)
        assert tokenizer.to_bytes() == tokenizer_d.to_bytes()
|