From f32ee2e533c709c8f2cc00b9cce28b779f4a0304 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:24:52 +0100 Subject: [PATCH] Fix NER check in CoNLL-U converter (#10302) * Fix NER check in CoNLL-U converter Leave ents unset if no NER annotation is found in the MISC column. * Revert to global rather than per-sentence NER check * Update spacy/training/converters/conllu_to_docs.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/tests/test_cli.py | 8 ++++++-- spacy/training/converters/conllu_to_docs.py | 12 +++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index fc35ff86e..ec512b839 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -34,7 +34,7 @@ from .util import make_tempdir @pytest.mark.issue(4665) -def test_issue4665(): +def test_cli_converters_conllu_empty_heads_ner(): """ conllu_to_docs should not raise an exception if the HEAD column contains an underscore @@ -59,7 +59,11 @@ def test_issue4665(): 17 . _ PUNCT . _ _ punct _ _ 18 ] _ PUNCT -RRB- _ _ punct _ _ """ - conllu_to_docs(input_data) + docs = list(conllu_to_docs(input_data)) + # heads are all 0 + assert not all([t.head.i for t in docs[0]]) + # NER is unset + assert not docs[0].has_annotation("ENT_IOB") @pytest.mark.issue(4924) diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index 7a4f44d3b..a4e70b01f 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -71,6 +71,7 @@ def read_conllx( ): """Yield docs, one for each sentence""" vocab = Vocab() # need vocab to make a minimal Doc + set_ents = has_ner(input_data, ner_tag_pattern) for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: @@ -83,6 +84,7 @@ def read_conllx( merge_subtokens=merge_subtokens, append_morphology=append_morphology, ner_map=ner_map, + set_ents=set_ents, ) yield doc @@ -133,6 +135,7 @@ def conllu_sentence_to_doc( merge_subtokens=False, append_morphology=False, ner_map=None, + set_ents=False, ): """Create an Example from the lines for one CoNLL-U sentence, merging subtokens and appending morphology to tags if required. @@ -214,8 +217,10 @@ def conllu_sentence_to_doc( doc[i]._.merged_morph = morphs[i] doc[i]._.merged_lemma = lemmas[i] doc[i]._.merged_spaceafter = spaces[i] - ents = get_entities(lines, ner_tag_pattern, ner_map) - doc.ents = biluo_tags_to_spans(doc, ents) + ents = None + if set_ents: + ents = get_entities(lines, ner_tag_pattern, ner_map) + doc.ents = biluo_tags_to_spans(doc, ents) if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) @@ -247,7 +252,8 @@ def conllu_sentence_to_doc( deps=deps, heads=heads, ) - doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] + if set_ents: + doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] return doc_x