mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Fix NER check in CoNLL-U converter (#10302)
* Fix NER check in CoNLL-U converter

Leave ents unset if no NER annotation is found in the MISC column.

* Revert to global rather than per-sentence NER check

* Update spacy/training/converters/conllu_to_docs.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									3358fb9bdd
								
							
						
					
					
						commit
						f32ee2e533
					
				|  | @ -34,7 +34,7 @@ from .util import make_tempdir | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.issue(4665) | @pytest.mark.issue(4665) | ||||||
| def test_issue4665(): | def test_cli_converters_conllu_empty_heads_ner(): | ||||||
|     """ |     """ | ||||||
|     conllu_to_docs should not raise an exception if the HEAD column contains an |     conllu_to_docs should not raise an exception if the HEAD column contains an | ||||||
|     underscore |     underscore | ||||||
|  | @ -59,7 +59,11 @@ def test_issue4665(): | ||||||
| 17	.	_	PUNCT	.	_	_	punct	_	_ | 17	.	_	PUNCT	.	_	_	punct	_	_ | ||||||
| 18	]	_	PUNCT	-RRB-	_	_	punct	_	_ | 18	]	_	PUNCT	-RRB-	_	_	punct	_	_ | ||||||
| """ | """ | ||||||
|     conllu_to_docs(input_data) |     docs = list(conllu_to_docs(input_data)) | ||||||
|  |     # heads are all 0 | ||||||
|  |     assert not all([t.head.i for t in docs[0]]) | ||||||
|  |     # NER is unset | ||||||
|  |     assert not docs[0].has_annotation("ENT_IOB") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.issue(4924) | @pytest.mark.issue(4924) | ||||||
|  |  | ||||||
|  | @ -71,6 +71,7 @@ def read_conllx( | ||||||
| ): | ): | ||||||
|     """Yield docs, one for each sentence""" |     """Yield docs, one for each sentence""" | ||||||
|     vocab = Vocab()  # need vocab to make a minimal Doc |     vocab = Vocab()  # need vocab to make a minimal Doc | ||||||
|  |     set_ents = has_ner(input_data, ner_tag_pattern) | ||||||
|     for sent in input_data.strip().split("\n\n"): |     for sent in input_data.strip().split("\n\n"): | ||||||
|         lines = sent.strip().split("\n") |         lines = sent.strip().split("\n") | ||||||
|         if lines: |         if lines: | ||||||
|  | @ -83,6 +84,7 @@ def read_conllx( | ||||||
|                 merge_subtokens=merge_subtokens, |                 merge_subtokens=merge_subtokens, | ||||||
|                 append_morphology=append_morphology, |                 append_morphology=append_morphology, | ||||||
|                 ner_map=ner_map, |                 ner_map=ner_map, | ||||||
|  |                 set_ents=set_ents, | ||||||
|             ) |             ) | ||||||
|             yield doc |             yield doc | ||||||
| 
 | 
 | ||||||
|  | @ -133,6 +135,7 @@ def conllu_sentence_to_doc( | ||||||
|     merge_subtokens=False, |     merge_subtokens=False, | ||||||
|     append_morphology=False, |     append_morphology=False, | ||||||
|     ner_map=None, |     ner_map=None, | ||||||
|  |     set_ents=False, | ||||||
| ): | ): | ||||||
|     """Create an Example from the lines for one CoNLL-U sentence, merging |     """Create an Example from the lines for one CoNLL-U sentence, merging | ||||||
|     subtokens and appending morphology to tags if required. |     subtokens and appending morphology to tags if required. | ||||||
|  | @ -214,8 +217,10 @@ def conllu_sentence_to_doc( | ||||||
|         doc[i]._.merged_morph = morphs[i] |         doc[i]._.merged_morph = morphs[i] | ||||||
|         doc[i]._.merged_lemma = lemmas[i] |         doc[i]._.merged_lemma = lemmas[i] | ||||||
|         doc[i]._.merged_spaceafter = spaces[i] |         doc[i]._.merged_spaceafter = spaces[i] | ||||||
|     ents = get_entities(lines, ner_tag_pattern, ner_map) |     ents = None | ||||||
|     doc.ents = biluo_tags_to_spans(doc, ents) |     if set_ents: | ||||||
|  |         ents = get_entities(lines, ner_tag_pattern, ner_map) | ||||||
|  |         doc.ents = biluo_tags_to_spans(doc, ents) | ||||||
| 
 | 
 | ||||||
|     if merge_subtokens: |     if merge_subtokens: | ||||||
|         doc = merge_conllu_subtokens(lines, doc) |         doc = merge_conllu_subtokens(lines, doc) | ||||||
|  | @ -247,7 +252,8 @@ def conllu_sentence_to_doc( | ||||||
|         deps=deps, |         deps=deps, | ||||||
|         heads=heads, |         heads=heads, | ||||||
|     ) |     ) | ||||||
|     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] |     if set_ents: | ||||||
|  |         doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] | ||||||
| 
 | 
 | ||||||
|     return doc_x |     return doc_x | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user