mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-13 13:17:06 +03:00
06f0a8daa0
* fix grad_clip naming * cleaning up pretrained_vectors out of cfg * further refactoring Model init's * move Model building out of pipes * further refactor to require a model config when creating a pipe * small fixes * making cfg in nn_parser more consistent * fixing nr_class for parser * fixing nn_parser's nO * fix printing of loss * architectures in own file per type, consistent naming * convenience methods default_tagger_config and default_tok2vec_config * let create_pipe access default config if available for that component * default_parser_config * move defaults to separate folder * allow reading nlp from package or dir with argument 'name' * architecture spacy.VocabVectors.v1 to read static vectors from file * cleanup * default configs for nel, textcat, morphologizer, tensorizer * fix imports * fixing unit tests * fixes and clean up * fixing defaults, nO, fix unit tests * restore parser IO * fix IO * 'fix' serialization test * add *.cfg to manifest * fix example configs with additional arguments * replace Morpohologizer with Tagger * add IO bit when testing overfitting of tagger (currently failing) * fix IO - don't initialize when reading from disk * expand overfitting tests to also check IO goes OK * remove dropout from HashEmbed to fix Tagger performance * add defaults for sentrec * update thinc * always pass a Model instance to a Pipe * fix piped_added statement * remove obsolete W029 * remove obsolete errors * restore byte checking tests (work again) * clean up test * further test cleanup * convert from config to Model in create_pipe * bring back error when component is not initialized * cleanup * remove calls for nlp2.begin_training * use thinc.api in imports * allow setting charembed's nM and nC * fix for hardcoded nM/nC + unit test * formatting fixes * trigger build
45 lines
1.3 KiB
Python
45 lines
1.3 KiB
Python
from spacy.pipeline import EntityRecognizer
|
|
from spacy.tokens import Span
|
|
import pytest
|
|
|
|
from ..util import get_doc
|
|
from ...ml.models.defaults import default_ner
|
|
|
|
|
|
def test_doc_add_entities_set_ents_iob(en_vocab):
    """Assigning doc.ents sets per-token IOB tags.

    An untrained NER pass leaves every token "O"; setting entities via
    (label, start, end) tuples then produces the expected B/I/O pattern.
    """
    words = ["This", "is", "a", "lion"]
    doc = get_doc(en_vocab, words)
    ner = EntityRecognizer(en_vocab, default_ner())
    ner.begin_training([])
    ner(doc)
    # Untrained model: no entities, all tokens outside any span.
    assert list(doc.ents) == []
    assert [token.ent_iob_ for token in doc] == ["O"] * len(doc)

    # Single-token entity at the end -> lone "B" on that token.
    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
    assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "B"]

    # Two-token entity at the start -> "B" then "I"; previous ents replaced.
    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
    assert [token.ent_iob_ for token in doc] == ["B", "I", "O", "O"]
|
|
|
|
|
|
def test_ents_reset(en_vocab):
    """Round-tripping doc.ents through its own list must not change IOB tags."""
    words = ["This", "is", "a", "lion"]
    doc = get_doc(en_vocab, words)
    ner = EntityRecognizer(en_vocab, default_ner())
    ner.begin_training([])
    ner(doc)
    all_outside = ["O"] * len(doc)
    assert [token.ent_iob_ for token in doc] == all_outside
    # Re-assign the existing entity spans back; tags should be untouched.
    doc.ents = list(doc.ents)
    assert [token.ent_iob_ for token in doc] == all_outside
|
|
|
|
|
|
def test_add_overlapping_entities(en_vocab):
    """Setting doc.ents with overlapping spans raises ValueError."""
    words = ["Louisiana", "Office", "of", "Conservation"]
    doc = get_doc(en_vocab, words)
    doc.ents = [Span(doc, 0, 4, label=391)]

    # A one-token span inside the existing four-token entity overlaps it.
    overlapping = Span(doc, 0, 1, label=392)
    with pytest.raises(ValueError):
        doc.ents = list(doc.ents) + [overlapping]
|