From 2d5f5cb5fb736074a9b0e7efbc98907304c65936 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 24 Jun 2020 15:49:25 +0200
Subject: [PATCH] Add NER data test

---
 spacy/tests/test_gold.py | 73 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 4b6d8e785..392a1e3f8 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -3,6 +3,7 @@ from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
 from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
+from spacy.gold.converters import json2docs
 from spacy.lang.en import English
 from spacy.syntax.nonproj import is_nonproj_tree
 from spacy.tokens import Doc, DocBin
@@ -152,6 +153,77 @@ def test_gold_biluo_misalign(en_vocab):
     assert tags == ["O", "O", "O", "-", "-", "-"]
 
 
+def test_example_from_dict_no_ner(en_vocab):
+    words = ["a", "b", "c", "d"]
+    spaces = [True, True, False, True]
+    predicted = Doc(en_vocab, words=words, spaces=spaces)
+    example = Example.from_dict(predicted, {"words": words})
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == [None, None, None, None]
+
+def test_json2docs_no_ner(en_vocab):
+    data = [{
+        "id":1,
+        "paragraphs":[
+            {
+                "sentences":[
+                    {
+                        "tokens":[
+                            {
+                                "dep":"nn",
+                                "head":1,
+                                "tag":"NNP",
+                                "orth":"Ms."
+                            },
+                            {
+                                "dep":"nsubj",
+                                "head":1,
+                                "tag":"NNP",
+                                "orth":"Haag"
+                            },
+                            {
+                                "dep":"ROOT",
+                                "head":0,
+                                "tag":"VBZ",
+                                "orth":"plays"
+                            },
+                            {
+                                "dep":"dobj",
+                                "head":-1,
+                                "tag":"NNP",
+                                "orth":"Elianti"
+                            },
+                            {
+                                "dep":"punct",
+                                "head":-2,
+                                "tag":".",
+                                "orth":"."
+                            }
+                        ]
+                    }
+                ]
+            }
+        ]
+    }]
+    docs = json2docs(data)
+    assert len(docs) == 1
+    for doc in docs:
+        assert not doc.is_nered
+        for token in doc:
+            assert token.ent_iob == 0
+        eg = Example(
+            Doc(
+                doc.vocab,
+                words=[w.text for w in doc],
+                spaces=[bool(w.whitespace_) for w in doc]
+            ),
+            doc
+        )
+        ner_tags = eg.get_aligned_ner()
+        assert ner_tags == [None, None, None, None, None]
+
+
+
 def test_split_sentences(en_vocab):
     words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
     doc = Doc(en_vocab, words=words)
@@ -504,6 +576,7 @@ def test_tuple_format_implicit_invalid():
     _train(train_data)
 
 
+
 def _train(train_data):
     nlp = English()
     ner = nlp.create_pipe("ner")