2020-12-02 14:57:08 +03:00
|
|
|
import pytest
|
2021-04-22 15:58:29 +03:00
|
|
|
from numpy.testing import assert_almost_equal
|
|
|
|
from thinc.api import Config, fix_random_seed, get_current_ops
|
2020-12-02 14:57:08 +03:00
|
|
|
|
2020-10-15 11:20:21 +03:00
|
|
|
from spacy.lang.en import English
|
2021-01-06 05:07:14 +03:00
|
|
|
from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config
|
|
|
|
from spacy.pipeline.textcat import single_label_cnn_config
|
|
|
|
from spacy.pipeline.textcat_multilabel import multi_label_default_config
|
|
|
|
from spacy.pipeline.textcat_multilabel import multi_label_bow_config
|
|
|
|
from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
|
2020-10-15 11:20:21 +03:00
|
|
|
from spacy.tokens import Span
|
|
|
|
from spacy import displacy
|
|
|
|
from spacy.pipeline import merge_entities
|
2020-12-02 14:57:08 +03:00
|
|
|
from spacy.training import Example
|
2020-10-15 11:20:21 +03:00
|
|
|
|
|
|
|
|
2020-12-02 14:57:08 +03:00
|
|
|
@pytest.mark.parametrize(
    "textcat_config",
    [
        single_label_default_config,
        single_label_bow_config,
        single_label_cnn_config,
        multi_label_default_config,
        multi_label_bow_config,
        multi_label_cnn_config,
    ],
)
@pytest.mark.issue(5551)
def test_issue5551(textcat_config):
    """Check that a fixed random seed makes the textcat pipeline reproducible.

    The same pipeline is built, updated once and queried three times, each
    time after resetting the seed to 0; the three predictions must agree to
    five decimal places.
    """
    component = "textcat"
    pipe_cfg = Config().from_str(textcat_config)

    def train_and_predict():
        # One complete round: reseed, build a fresh pipeline, run a single
        # update step and return the model's prediction for the one doc.
        fix_random_seed(0)
        nlp = English()
        text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g."
        annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}
        pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
        for label in set(annots["cats"]):
            pipe.add_label(label)
        # Train
        nlp.initialize()
        doc = nlp.make_doc(text)
        nlp.update([Example.from_dict(doc, annots)])
        # Only the first (and only) row of the prediction is kept
        return pipe.model.predict([doc])[0]

    results = [train_and_predict() for _ in range(3)]

    # Every run must match the first one because the seed was fixed
    assert len(results) == 3
    ops = get_current_ops()
    for other in results[1:]:
        assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(other), decimal=5)
|
2020-10-15 11:20:21 +03:00
|
|
|
|
|
|
|
|
2021-11-05 04:27:19 +03:00
|
|
|
@pytest.mark.issue(5838)
def test_issue5838():
    """Check displacy's entity rendering of line breaks after the last entity.

    The sample text contains four newlines, one of them after the only
    entity; the rendered HTML must contain four "</br>" markers.
    """
    sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n"
    doc = English()(sample_text)
    # Mark a single token as an entity so that text remains after it
    doc.ents = [Span(doc, 7, 8, label="test")]
    rendered = displacy.render(doc, style="ent")
    assert rendered.count("</br>") == 4
|
|
|
|
|
|
|
|
|
2021-11-05 04:27:19 +03:00
|
|
|
@pytest.mark.issue(5918)
def test_issue5918():
    """Edge case for merge_entities: the entity count must survive merging.

    An entity ruler tags three ORG spans in the text; after merge_entities
    the doc must still report three entities.
    """
    nlp = English()
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(
        [
            {"label": "ORG", "pattern": "Digicon Inc"},
            {"label": "ORG", "pattern": "Rotan Mosle Inc's"},
            {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"},
        ]
    )

    # Same value as the original multi-line literal: a leading newline,
    # three lines of text, and a trailing newline.
    text = (
        "\n"
        "Digicon Inc said it has completed the previously-announced disposition\n"
        "of its computer systems division to an investment group led by\n"
        "Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate.\n"
    )
    doc = nlp(text)
    assert len(doc.ents) == 3

    # The scenario behind bug #5918: the third span's head lies inside the
    # entity (ent_iob=I), and that I was wrongly transferred to the full
    # entity, giving 2 instead of 3 final ents.
    # TODO: test for logging here; the head reassignment is still disabled:
    # with pytest.warns(UserWarning):
    #     doc[29].head = doc[33]
    merged = merge_entities(doc)
    assert len(merged.ents) == 3
|