mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Add NER map option to convert CLI (#4763)
Instead of a hard-coded NER tag simplification function that was only
intended for NorNE, map NER tags in CoNLL-U converter using a dict
provided as JSON as a command-line option.
Map NER entity types to a new tag, or to "" for 'O', e.g.:
```
{"PER": "PERSON", "BAD": ""}
=>
B-PER -> B-PERSON
B-BAD -> O
```
			
			
This commit is contained in:
		
							parent
							
								
									68f711b409
								
							
						
					
					
						commit
						eb9b1858c4
					
				|  | @ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl") | |||
|     converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), | ||||
|     lang=("Language (if tokenizer required)", "option", "l", str), | ||||
|     morphology=("Enable appending morphology to tags", "flag", "m", bool), | ||||
|     ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path), | ||||
| ) | ||||
| def convert( | ||||
|     input_file, | ||||
|  | @ -49,6 +50,7 @@ def convert( | |||
|     model=None, | ||||
|     morphology=False, | ||||
|     converter="auto", | ||||
|     ner_map_path=None, | ||||
|     lang=None, | ||||
| ): | ||||
|     """ | ||||
|  | @ -94,6 +96,9 @@ def convert( | |||
|             ) | ||||
|     if converter not in CONVERTERS: | ||||
|         msg.fail("Can't find converter for {}".format(converter), exits=1) | ||||
|     ner_map = None | ||||
|     if ner_map_path is not None: | ||||
|         ner_map = srsly.read_json(ner_map_path) | ||||
|     # Use converter function to convert data | ||||
|     func = CONVERTERS[converter] | ||||
|     data = func( | ||||
|  | @ -104,6 +109,7 @@ def convert( | |||
|         lang=lang, | ||||
|         model=model, | ||||
|         no_print=no_print, | ||||
|         ner_map=ner_map, | ||||
|     ) | ||||
|     if output_dir != "-": | ||||
|         # Export data to a file | ||||
|  |  | |||
|  | @ -7,7 +7,8 @@ from spacy.gold import Example | |||
| from ...gold import iob_to_biluo | ||||
| 
 | ||||
| 
 | ||||
| def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): | ||||
| def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, | ||||
|                 ner_map=None, **_): | ||||
|     """ | ||||
|     Convert conllu files into JSON format for use with train cli. | ||||
|     use_morphology parameter enables appending morphology to tags, which is | ||||
|  | @ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): | |||
|             checked_for_ner = True | ||||
|         raw += example.text | ||||
|         sentences.append(generate_sentence(example.token_annotation, | ||||
|                 has_ner_tags, MISC_NER_PATTERN)) | ||||
|                 has_ner_tags, MISC_NER_PATTERN, | ||||
|                 ner_map=ner_map)) | ||||
|         # Real-sized documents could be extracted using the comments on the | ||||
|         # conllu document | ||||
|         if len(sentences) % n_sents == 0: | ||||
|  | @ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0): | |||
|                 break | ||||
| 
 | ||||
| 
 | ||||
| def simplify_tags(iob, tag_pattern): | ||||
| def extract_tags(iob, tag_pattern, ner_map=None): | ||||
|     """ | ||||
|     Extract tag from MISC column according to `tag_pattern` and map to final | ||||
|     entity type with `ner_map` if mapping present. | ||||
| 
 | ||||
|     For NorNE: | ||||
|     Simplify tags obtained from the dataset in order to follow Wikipedia | ||||
|     scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while | ||||
|     'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to | ||||
|  | @ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern): | |||
|             prefix = tag_match.group(2) | ||||
|             suffix = tag_match.group(3) | ||||
|             if prefix and suffix: | ||||
|                 if suffix == "GPE_LOC": | ||||
|                     suffix = "LOC" | ||||
|                 elif suffix == "GPE_ORG": | ||||
|                     suffix = "ORG" | ||||
|                 elif suffix != "PER" and suffix != "LOC" and suffix != "ORG": | ||||
|                     suffix = "MISC" | ||||
|                 new_tag = prefix + "-" + suffix | ||||
|                 if ner_map: | ||||
|                     suffix = ner_map.get(suffix, suffix) | ||||
|                     if suffix == "": | ||||
|                         new_tag = "O" | ||||
|                     else: | ||||
|                         new_tag = prefix + "-" + suffix | ||||
|         new_iob.append(new_tag) | ||||
|     return new_iob | ||||
| 
 | ||||
| 
 | ||||
| def generate_sentence(token_annotation, has_ner_tags, tag_pattern): | ||||
| def generate_sentence(token_annotation, has_ner_tags, tag_pattern, | ||||
|                       ner_map=None): | ||||
|     sentence = {} | ||||
|     tokens = [] | ||||
|     if has_ner_tags: | ||||
|         iob = simplify_tags(token_annotation.entities, tag_pattern) | ||||
|         iob = extract_tags(token_annotation.entities, tag_pattern, | ||||
|                             ner_map=ner_map) | ||||
|         biluo = iob_to_biluo(iob) | ||||
|     for i, id in enumerate(token_annotation.ids): | ||||
|         token = {} | ||||
|  |  | |||
|  | @ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs | |||
| 
 | ||||
| 
 | ||||
| def test_cli_converters_conllu2json(): | ||||
|     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu | ||||
|     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu | ||||
|     lines = [ | ||||
|         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", | ||||
|         "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER", | ||||
|  | @ -32,17 +32,16 @@ def test_cli_converters_conllu2json(): | |||
|     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] | ||||
| 
 | ||||
| 
 | ||||
| def test_cli_converters_conllu2json(): | ||||
|     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu | ||||
| def test_cli_converters_conllu2json_name_ner_map(): | ||||
|     lines = [ | ||||
|         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", | ||||
|         "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", | ||||
|         "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", | ||||
|         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", | ||||
|         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", | ||||
|         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", | ||||
|     ] | ||||
|     input_data = "\n".join(lines) | ||||
|     converted = conllu2json(input_data, n_sents=1) | ||||
|     converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) | ||||
|     assert len(converted) == 1 | ||||
|     assert converted[0]["id"] == 0 | ||||
|     assert len(converted[0]["paragraphs"]) == 1 | ||||
|  | @ -55,7 +54,7 @@ def test_cli_converters_conllu2json(): | |||
|     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] | ||||
|     assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] | ||||
|     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] | ||||
|     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"] | ||||
|     assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] | ||||
| 
 | ||||
| 
 | ||||
| def test_cli_converters_iob2json(): | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user