mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Add NER map option to convert CLI (#4763)
Instead of a hard-coded NER tag simplification function that was only
intended for NorNE, map NER tags in the CoNLL-U converter using a dict
provided as a JSON file via a command-line option.
Map NER entity types to new tags, or to "" for 'O', e.g.:
```
{"PER": "PERSON", "BAD": ""}
=>
B-PER -> B-PERSON
B-BAD -> O
```
			
			
This commit is contained in:
		
							parent
							
								
									68f711b409
								
							
						
					
					
						commit
						eb9b1858c4
					
				|  | @ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl") | ||||||
|     converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), |     converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), | ||||||
|     lang=("Language (if tokenizer required)", "option", "l", str), |     lang=("Language (if tokenizer required)", "option", "l", str), | ||||||
|     morphology=("Enable appending morphology to tags", "flag", "m", bool), |     morphology=("Enable appending morphology to tags", "flag", "m", bool), | ||||||
|  |     ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path), | ||||||
| ) | ) | ||||||
| def convert( | def convert( | ||||||
|     input_file, |     input_file, | ||||||
|  | @ -49,6 +50,7 @@ def convert( | ||||||
|     model=None, |     model=None, | ||||||
|     morphology=False, |     morphology=False, | ||||||
|     converter="auto", |     converter="auto", | ||||||
|  |     ner_map_path=None, | ||||||
|     lang=None, |     lang=None, | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|  | @ -94,6 +96,9 @@ def convert( | ||||||
|             ) |             ) | ||||||
|     if converter not in CONVERTERS: |     if converter not in CONVERTERS: | ||||||
|         msg.fail("Can't find converter for {}".format(converter), exits=1) |         msg.fail("Can't find converter for {}".format(converter), exits=1) | ||||||
|  |     ner_map = None | ||||||
|  |     if ner_map_path is not None: | ||||||
|  |         ner_map = srsly.read_json(ner_map_path) | ||||||
|     # Use converter function to convert data |     # Use converter function to convert data | ||||||
|     func = CONVERTERS[converter] |     func = CONVERTERS[converter] | ||||||
|     data = func( |     data = func( | ||||||
|  | @ -104,6 +109,7 @@ def convert( | ||||||
|         lang=lang, |         lang=lang, | ||||||
|         model=model, |         model=model, | ||||||
|         no_print=no_print, |         no_print=no_print, | ||||||
|  |         ner_map=ner_map, | ||||||
|     ) |     ) | ||||||
|     if output_dir != "-": |     if output_dir != "-": | ||||||
|         # Export data to a file |         # Export data to a file | ||||||
|  |  | ||||||
|  | @ -7,7 +7,8 @@ from spacy.gold import Example | ||||||
| from ...gold import iob_to_biluo | from ...gold import iob_to_biluo | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): | def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, | ||||||
|  |                 ner_map=None, **_): | ||||||
|     """ |     """ | ||||||
|     Convert conllu files into JSON format for use with train cli. |     Convert conllu files into JSON format for use with train cli. | ||||||
|     use_morphology parameter enables appending morphology to tags, which is |     use_morphology parameter enables appending morphology to tags, which is | ||||||
|  | @ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): | ||||||
|             checked_for_ner = True |             checked_for_ner = True | ||||||
|         raw += example.text |         raw += example.text | ||||||
|         sentences.append(generate_sentence(example.token_annotation, |         sentences.append(generate_sentence(example.token_annotation, | ||||||
|                 has_ner_tags, MISC_NER_PATTERN)) |                 has_ner_tags, MISC_NER_PATTERN, | ||||||
|  |                 ner_map=ner_map)) | ||||||
|         # Real-sized documents could be extracted using the comments on the |         # Real-sized documents could be extracted using the comments on the | ||||||
|         # conllu document |         # conllu document | ||||||
|         if len(sentences) % n_sents == 0: |         if len(sentences) % n_sents == 0: | ||||||
|  | @ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0): | ||||||
|                 break |                 break | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def simplify_tags(iob, tag_pattern): | def extract_tags(iob, tag_pattern, ner_map=None): | ||||||
|     """ |     """ | ||||||
|  |     Extract tag from MISC column according to `tag_pattern` and map to final | ||||||
|  |     entity type with `ner_map` if mapping present. | ||||||
|  | 
 | ||||||
|  |     For NorNE: | ||||||
|     Simplify tags obtained from the dataset in order to follow Wikipedia |     Simplify tags obtained from the dataset in order to follow Wikipedia | ||||||
|     scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while |     scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while | ||||||
|     'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to |     'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to | ||||||
|  | @ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern): | ||||||
|             prefix = tag_match.group(2) |             prefix = tag_match.group(2) | ||||||
|             suffix = tag_match.group(3) |             suffix = tag_match.group(3) | ||||||
|             if prefix and suffix: |             if prefix and suffix: | ||||||
|                 if suffix == "GPE_LOC": |                 new_tag = prefix + "-" + suffix | ||||||
|                     suffix = "LOC" |                 if ner_map: | ||||||
|                 elif suffix == "GPE_ORG": |                     suffix = ner_map.get(suffix, suffix) | ||||||
|                     suffix = "ORG" |                     if suffix == "": | ||||||
|                 elif suffix != "PER" and suffix != "LOC" and suffix != "ORG": |                         new_tag = "O" | ||||||
|                     suffix = "MISC" |                     else: | ||||||
|                         new_tag = prefix + "-" + suffix |                         new_tag = prefix + "-" + suffix | ||||||
|         new_iob.append(new_tag) |         new_iob.append(new_tag) | ||||||
|     return new_iob |     return new_iob | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def generate_sentence(token_annotation, has_ner_tags, tag_pattern): | def generate_sentence(token_annotation, has_ner_tags, tag_pattern, | ||||||
|  |                       ner_map=None): | ||||||
|     sentence = {} |     sentence = {} | ||||||
|     tokens = [] |     tokens = [] | ||||||
|     if has_ner_tags: |     if has_ner_tags: | ||||||
|         iob = simplify_tags(token_annotation.entities, tag_pattern) |         iob = extract_tags(token_annotation.entities, tag_pattern, | ||||||
|  |                             ner_map=ner_map) | ||||||
|         biluo = iob_to_biluo(iob) |         biluo = iob_to_biluo(iob) | ||||||
|     for i, id in enumerate(token_annotation.ids): |     for i, id in enumerate(token_annotation.ids): | ||||||
|         token = {} |         token = {} | ||||||
|  |  | ||||||
|  | @ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_cli_converters_conllu2json(): | def test_cli_converters_conllu2json(): | ||||||
|     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu |     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu | ||||||
|     lines = [ |     lines = [ | ||||||
|         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", |         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", | ||||||
|         "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER", |         "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER", | ||||||
|  | @ -32,17 +32,16 @@ def test_cli_converters_conllu2json(): | ||||||
|     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] |     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_cli_converters_conllu2json(): | def test_cli_converters_conllu2json_name_ner_map(): | ||||||
|     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu |  | ||||||
|     lines = [ |     lines = [ | ||||||
|         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", |         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", | ||||||
|         "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", |         "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", | ||||||
|         "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", |         "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", | ||||||
|         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", |         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", | ||||||
|         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", |         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", | ||||||
|     ] |     ] | ||||||
|     input_data = "\n".join(lines) |     input_data = "\n".join(lines) | ||||||
|     converted = conllu2json(input_data, n_sents=1) |     converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) | ||||||
|     assert len(converted) == 1 |     assert len(converted) == 1 | ||||||
|     assert converted[0]["id"] == 0 |     assert converted[0]["id"] == 0 | ||||||
|     assert len(converted[0]["paragraphs"]) == 1 |     assert len(converted[0]["paragraphs"]) == 1 | ||||||
|  | @ -55,7 +54,7 @@ def test_cli_converters_conllu2json(): | ||||||
|     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] |     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] | ||||||
|     assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] |     assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] | ||||||
|     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] |     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] | ||||||
|     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"] |     assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_cli_converters_iob2json(): | def test_cli_converters_iob2json(): | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user