import pytest
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRuler, EntityRecognizer
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.attrs import ENT_IOB, ENT_TYPE
from spacy.compat import pickle
from spacy import displacy
import numpy

from spacy.vectors import Vectors
from ..util import get_doc


def test_issue3002():
    """Test that the tokenizer doesn't hang on a long list of dots"""
    nlp = German()
    doc = nlp(
        "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
    )
    assert len(doc) == 5


def test_issue3009(en_vocab):
    """Test problem with matcher quantifiers"""
    patterns = [
        [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
        [
            {"ORTH": "has"},
            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
            {"LOWER": "to"},
            {"LOWER": "do"},
            {"TAG": "IN"},
        ],
        [
            {"ORTH": "has"},
            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
            {"LOWER": "to"},
            {"LOWER": "do"},
            {"TAG": "IN"},
        ],
    ]
    words = ["also", "has", "to", "do", "with"]
    tags = ["RB", "VBZ", "TO", "VB", "IN"]
    pos = ["ADV", "VERB", "ADP", "VERB", "ADP"]
    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos)
    matcher = Matcher(en_vocab)
    for i, pattern in enumerate(patterns):
        matcher.add(str(i), [pattern])
    matches = matcher(doc)
    assert matches


def test_issue3012(en_vocab):
    """Test that the is_tagged attribute doesn't get overwritten when we from_array
    without tag information."""
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = [(2, 4, "PERCENT")]
    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.is_tagged

    expected = ("10", "NUM", "CD", "PERCENT")
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected

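    # Round-trip only the entity attributes through to_array/from_array; the
    # existing tag and POS annotation should be left untouched.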
    header = [ENT_IOB, ENT_TYPE]
    ent_array = doc.to_array(header)
    doc.from_array(header, ent_array)

    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected

    # Serializing then deserializing
    doc_bytes = doc.to_bytes()
    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected


def test_issue3199():
    """Test that Span.noun_chunks works correctly if no noun chunks iterator
    is available. To make this test future-proof, we're constructing a Doc
    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
    """
    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
    doc.is_parsed = True
    assert list(doc[0:3].noun_chunks) == []


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    """
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)

    ner.add_label("ANIMAL")
    nlp.begin_training()
    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
    assert ner.move_names == move_names
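    # Resize the second pipeline's NER model to the same number of moves
    # before loading the serialized weights into it.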
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    model = nlp2.get_pipe("ner").model
    model.attrs["resize_output"](model, ner.moves.n_moves)
    nlp2.from_bytes(nlp.to_bytes())
    assert nlp2.get_pipe("ner").move_names == move_names


def test_issue3248_1():
    """Test that the PhraseMatcher correctly reports its number of rules, not
    total number of patterns."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
    matcher.add("TEST2", [nlp("d")])
    assert len(matcher) == 2


def test_issue3248_2():
    """Test that the PhraseMatcher can be pickled correctly."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
    matcher.add("TEST2", [nlp("d")])
    data = pickle.dumps(matcher)
    new_matcher = pickle.loads(data)
    assert len(new_matcher) == len(matcher)


def test_issue3277(es_tokenizer):
    """Test that hyphens are split correctly as prefixes."""
    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
    assert len(doc) == 14
    assert doc[0].text == "\u2014"
    assert doc[5].text == "\u2013"
    assert doc[9].text == "\u2013"


def test_issue3288(en_vocab):
    """Test that retokenization works correctly via displaCy when punctuation
    is merged onto the preceding token and tensor is resized."""
    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
    heads = [1, 0, -1, 1, 0, 1, -2, -3]
    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
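    # Rendering collapses the punctuation tokens onto the preceding tokens,
    # which retokenizes the doc and has to resize the tensor accordingly.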
    displacy.render(doc)


def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    new_nlp.add_pipe(nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)


def test_issue3328(en_vocab):
    doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
    matcher = Matcher(en_vocab)
    patterns = [
        [{"LOWER": {"IN": ["hello", "how"]}}],
        [{"LOWER": {"IN": ["you", "doing"]}}],
    ]
    matcher.add("TEST", patterns)
    matches = matcher(doc)
    assert len(matches) == 4
    matched_texts = [doc[start:end].text for _, start, end in matches]
    assert matched_texts == ["Hello", "how", "you", "doing"]


def test_issue3331(en_vocab):
    """Test that duplicate patterns for different rules result in multiple
    matches, one per rule.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
    matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
    assert sorted(match_ids) == ["A", "B"]


def test_issue3345():
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(doc.vocab, default_ner(), **config)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
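    # The ruler has preset "New York" as a GPE entity even though "York" was
    # marked above as the start of a new sentence.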
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")


def test_issue3412():
    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
    vectors = Vectors(data=data, keys=["A", "B", "C"])
    keys, best_rows, scores = vectors.most_similar(
        numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
    )
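    # The first query vector [9, 8, 7] is identical to the vector stored in
    # row 2, so that row should be its nearest neighbour.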
    assert best_rows[0] == 2


@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
def test_issue3449():
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
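    # Three variants of the same sentence pair, differing only in the
    # whitespace after the first "I.".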
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)
    assert t1[5].text == "I"
    assert t2[5].text == "I"
    assert t3[5].text == "I"


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3456():
    # this crashed because of a padding error in layer.ops.unflatten in thinc
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.begin_training()
    list(nlp.pipe(["hi", ""]))


def test_issue3468():
    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.is_sentenced
    assert len(list(doc.sents)) == 1
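    # The sentence boundaries should survive a to_bytes/from_bytes round-trip.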
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.is_sentenced
    assert len(list(new_doc.sents)) == 1