spaCy/spacy/tests/doc/test_add_entities.py
Matthew Honnibal 6f5e308d17
Support negative examples in partial NER annotations (#8106)
* Support a cfg field in transition system

* Make NER 'has gold' check use right alignment for span

* Pass 'negative_samples_key' property into NER transition system

* Add field for negative samples to NER transition system

* Check neg_key in NER has_gold

* Support negative examples in NER oracle

* Test for negative examples in NER

* Fix name of config variable in NER

* Remove vestiges of old-style partial annotation

* Remove obsolete tests

* Add comment noting lack of support for negative samples in parser

* Additions to "neg examples" PR (#8201)

* add custom error and test for deprecated format

* add test for unlearning an entity

* add break also for Begin's cost

* add negative_samples_key property on Parser

* rename

* extend docs & fix some older docs issues

* add subclass constructors, clean up tests, fix docs

* add flaky test with ValueError if gold parse was not found

* remove ValueError if n_gold == 0

* fix docstring

* Hack in environment variables to try out training

* Remove hack

* Remove NER hack, and support 'negative O' samples

* Fix O oracle

* Fix transition parser

* Remove 'not O' from oracle

* Fix NER oracle

* check for spans in both gold.ents and gold.spans and raise if so, to prevent memory access violation

* use set instead of list in consistency check

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2021-06-17 17:33:00 +10:00

57 lines
1.8 KiB
Python

from spacy.pipeline.ner import DEFAULT_NER_MODEL
from spacy.training import Example
from spacy.pipeline import EntityRecognizer
from spacy.tokens import Span, Doc
from spacy import registry
import pytest
def _ner_example(ner):
    """Return one annotated Example used to initialize ``ner`` in these tests.

    The Doc is built on ``ner.vocab`` from a fixed seven-word sentence; the
    reference annotation labels "Joe" as PERSON and "London" as LOC.
    """
    words = ["Joe", "loves", "visiting", "London", "during", "the", "weekend"]
    # Offsets line up with the space-joined words: "Joe" spans 0-3 and
    # "London" spans 19-25.
    annotations = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
    return Example.from_dict(Doc(ner.vocab, words=words), annotations)
def test_doc_add_entities_set_ents_iob(en_vocab):
    """Check per-token IOB tags after assigning ``doc.ents`` as tuples."""
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    resolved = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)
    ner = EntityRecognizer(en_vocab, resolved["model"])

    def get_examples():
        return [_ner_example(ner)]

    ner.initialize(get_examples)
    ner(doc)

    # Each case is a (label, start, end) entity tuple plus the IOB tags
    # expected right after assigning it. The second case expects the
    # earlier ANIMAL span on token 3 to be gone again.
    cases = [
        (("ANIMAL", 3, 4), ["O", "O", "O", "B"]),
        (("WORD", 0, 2), ["B", "I", "O", "O"]),
    ]
    for entity, expected_iob in cases:
        doc.ents = [entity]
        assert [token.ent_iob_ for token in doc] == expected_iob
def test_ents_reset(en_vocab):
    """Re-assigning ``doc.ents`` to its own contents must keep the IOB tags."""
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    resolved = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)
    ner = EntityRecognizer(en_vocab, resolved["model"])

    def get_examples():
        return [_ner_example(ner)]

    ner.initialize(get_examples)
    ner(doc)

    # Snapshot the tags produced by the pipe, then round-trip the entities.
    iob_before = [token.ent_iob_ for token in doc]
    current_ents = list(doc.ents)
    doc.ents = current_ents
    iob_after = [token.ent_iob_ for token in doc]
    assert iob_after == iob_before
def test_add_overlapping_entities(en_vocab):
    """Adding a span that overlaps an existing entity must raise ValueError."""
    words = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=words)
    # First entity covers all four tokens.
    doc.ents = [Span(doc, 0, 4, label=391)]
    # Token 0 already lies inside the entity set above.
    overlapping = Span(doc, 0, 1, label=392)
    with pytest.raises(ValueError):
        doc.ents = [*doc.ents, overlapping]