Fix NER check in CoNLL-U converter (#10302)

* Fix NER check in CoNLL-U converter

Leave ents unset if no NER annotation is found in the MISC column.

* Revert to global rather than per-sentence NER check

* Update spacy/training/converters/conllu_to_docs.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
Adriane Boyd 2022-02-21 10:24:52 +01:00 committed by GitHub
parent 3358fb9bdd
commit f32ee2e533
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 15 additions and 5 deletions

View File

@ -34,7 +34,7 @@ from .util import make_tempdir
@pytest.mark.issue(4665)
def test_issue4665():
def test_cli_converters_conllu_empty_heads_ner():
"""
conllu_to_docs should not raise an exception if the HEAD column contains an
underscore
@ -59,7 +59,11 @@ def test_issue4665():
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
conllu_to_docs(input_data)
docs = list(conllu_to_docs(input_data))
# heads are all 0
assert not all([t.head.i for t in docs[0]])
# NER is unset
assert not docs[0].has_annotation("ENT_IOB")
@pytest.mark.issue(4924)

View File

@ -71,6 +71,7 @@ def read_conllx(
):
"""Yield docs, one for each sentence"""
vocab = Vocab() # need vocab to make a minimal Doc
set_ents = has_ner(input_data, ner_tag_pattern)
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
@ -83,6 +84,7 @@ def read_conllx(
merge_subtokens=merge_subtokens,
append_morphology=append_morphology,
ner_map=ner_map,
set_ents=set_ents,
)
yield doc
@ -133,6 +135,7 @@ def conllu_sentence_to_doc(
merge_subtokens=False,
append_morphology=False,
ner_map=None,
set_ents=False,
):
"""Create an Example from the lines for one CoNLL-U sentence, merging
subtokens and appending morphology to tags if required.
@ -214,6 +217,8 @@ def conllu_sentence_to_doc(
doc[i]._.merged_morph = morphs[i]
doc[i]._.merged_lemma = lemmas[i]
doc[i]._.merged_spaceafter = spaces[i]
ents = None
if set_ents:
ents = get_entities(lines, ner_tag_pattern, ner_map)
doc.ents = biluo_tags_to_spans(doc, ents)
@ -247,6 +252,7 @@ def conllu_sentence_to_doc(
deps=deps,
heads=heads,
)
if set_ents:
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
return doc_x