mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Fix NER check in CoNLL-U converter (#10302)
* Fix NER check in CoNLL-U converter

  Leave ents unset if no NER annotation is found in the MISC column.

* Revert to global rather than per-sentence NER check

* Update spacy/training/converters/conllu_to_docs.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
3358fb9bdd
commit
f32ee2e533
|
@ -34,7 +34,7 @@ from .util import make_tempdir
|
|||
|
||||
|
||||
@pytest.mark.issue(4665)
|
||||
def test_issue4665():
|
||||
def test_cli_converters_conllu_empty_heads_ner():
|
||||
"""
|
||||
conllu_to_docs should not raise an exception if the HEAD column contains an
|
||||
underscore
|
||||
|
@ -59,7 +59,11 @@ def test_issue4665():
|
|||
17 . _ PUNCT . _ _ punct _ _
|
||||
18 ] _ PUNCT -RRB- _ _ punct _ _
|
||||
"""
|
||||
conllu_to_docs(input_data)
|
||||
docs = list(conllu_to_docs(input_data))
|
||||
# heads are all 0
|
||||
assert not all([t.head.i for t in docs[0]])
|
||||
# NER is unset
|
||||
assert not docs[0].has_annotation("ENT_IOB")
|
||||
|
||||
|
||||
@pytest.mark.issue(4924)
|
||||
|
|
|
@ -71,6 +71,7 @@ def read_conllx(
|
|||
):
|
||||
"""Yield docs, one for each sentence"""
|
||||
vocab = Vocab() # need vocab to make a minimal Doc
|
||||
set_ents = has_ner(input_data, ner_tag_pattern)
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
|
@ -83,6 +84,7 @@ def read_conllx(
|
|||
merge_subtokens=merge_subtokens,
|
||||
append_morphology=append_morphology,
|
||||
ner_map=ner_map,
|
||||
set_ents=set_ents,
|
||||
)
|
||||
yield doc
|
||||
|
||||
|
@ -133,6 +135,7 @@ def conllu_sentence_to_doc(
|
|||
merge_subtokens=False,
|
||||
append_morphology=False,
|
||||
ner_map=None,
|
||||
set_ents=False,
|
||||
):
|
||||
"""Create an Example from the lines for one CoNLL-U sentence, merging
|
||||
subtokens and appending morphology to tags if required.
|
||||
|
@ -214,8 +217,10 @@ def conllu_sentence_to_doc(
|
|||
doc[i]._.merged_morph = morphs[i]
|
||||
doc[i]._.merged_lemma = lemmas[i]
|
||||
doc[i]._.merged_spaceafter = spaces[i]
|
||||
ents = get_entities(lines, ner_tag_pattern, ner_map)
|
||||
doc.ents = biluo_tags_to_spans(doc, ents)
|
||||
ents = None
|
||||
if set_ents:
|
||||
ents = get_entities(lines, ner_tag_pattern, ner_map)
|
||||
doc.ents = biluo_tags_to_spans(doc, ents)
|
||||
|
||||
if merge_subtokens:
|
||||
doc = merge_conllu_subtokens(lines, doc)
|
||||
|
@ -247,7 +252,8 @@ def conllu_sentence_to_doc(
|
|||
deps=deps,
|
||||
heads=heads,
|
||||
)
|
||||
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
|
||||
if set_ents:
|
||||
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
|
||||
|
||||
return doc_x
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user