mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Fix NER check in CoNLL-U converter (#10302)
* Fix NER check in CoNLL-U converter

  Leave ents unset if no NER annotation is found in the MISC column.

* Revert to global rather than per-sentence NER check

* Update spacy/training/converters/conllu_to_docs.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									3358fb9bdd
								
							
						
					
					
						commit
						f32ee2e533
					
				|  | @ -34,7 +34,7 @@ from .util import make_tempdir | |||
| 
 | ||||
| 
 | ||||
| @pytest.mark.issue(4665) | ||||
| def test_issue4665(): | ||||
| def test_cli_converters_conllu_empty_heads_ner(): | ||||
|     """ | ||||
|     conllu_to_docs should not raise an exception if the HEAD column contains an | ||||
|     underscore | ||||
|  | @ -59,7 +59,11 @@ def test_issue4665(): | |||
| 17	.	_	PUNCT	.	_	_	punct	_	_ | ||||
| 18	]	_	PUNCT	-RRB-	_	_	punct	_	_ | ||||
| """ | ||||
|     conllu_to_docs(input_data) | ||||
|     docs = list(conllu_to_docs(input_data)) | ||||
|     # heads are all 0 | ||||
|     assert not all([t.head.i for t in docs[0]]) | ||||
|     # NER is unset | ||||
|     assert not docs[0].has_annotation("ENT_IOB") | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.issue(4924) | ||||
|  |  | |||
|  | @ -71,6 +71,7 @@ def read_conllx( | |||
| ): | ||||
|     """Yield docs, one for each sentence""" | ||||
|     vocab = Vocab()  # need vocab to make a minimal Doc | ||||
|     set_ents = has_ner(input_data, ner_tag_pattern) | ||||
|     for sent in input_data.strip().split("\n\n"): | ||||
|         lines = sent.strip().split("\n") | ||||
|         if lines: | ||||
|  | @ -83,6 +84,7 @@ def read_conllx( | |||
|                 merge_subtokens=merge_subtokens, | ||||
|                 append_morphology=append_morphology, | ||||
|                 ner_map=ner_map, | ||||
|                 set_ents=set_ents, | ||||
|             ) | ||||
|             yield doc | ||||
| 
 | ||||
|  | @ -133,6 +135,7 @@ def conllu_sentence_to_doc( | |||
|     merge_subtokens=False, | ||||
|     append_morphology=False, | ||||
|     ner_map=None, | ||||
|     set_ents=False, | ||||
| ): | ||||
|     """Create an Example from the lines for one CoNLL-U sentence, merging | ||||
|     subtokens and appending morphology to tags if required. | ||||
|  | @ -214,8 +217,10 @@ def conllu_sentence_to_doc( | |||
|         doc[i]._.merged_morph = morphs[i] | ||||
|         doc[i]._.merged_lemma = lemmas[i] | ||||
|         doc[i]._.merged_spaceafter = spaces[i] | ||||
|     ents = get_entities(lines, ner_tag_pattern, ner_map) | ||||
|     doc.ents = biluo_tags_to_spans(doc, ents) | ||||
|     ents = None | ||||
|     if set_ents: | ||||
|         ents = get_entities(lines, ner_tag_pattern, ner_map) | ||||
|         doc.ents = biluo_tags_to_spans(doc, ents) | ||||
| 
 | ||||
|     if merge_subtokens: | ||||
|         doc = merge_conllu_subtokens(lines, doc) | ||||
|  | @ -247,7 +252,8 @@ def conllu_sentence_to_doc( | |||
|         deps=deps, | ||||
|         heads=heads, | ||||
|     ) | ||||
|     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] | ||||
|     if set_ents: | ||||
|         doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] | ||||
| 
 | ||||
|     return doc_x | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user