Add NER map option to convert CLI (#4763)

Instead of a hard-coded NER tag simplification function that was only
intended for NorNE, map NER tags in CoNLL-U converter using a dict
provided as JSON as a command-line option.

Map NER entity types to new entity types, or to "" to convert the tag to 'O', e.g.:

```
{"PER": "PERSON", "BAD": ""}

=>

B-PER -> B-PERSON
B-BAD -> O
```
This commit is contained in:
adrianeboyd 2019-12-11 18:20:49 +01:00 committed by Ines Montani
parent 68f711b409
commit eb9b1858c4
3 changed files with 30 additions and 17 deletions

View File

@ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str), lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool), morphology=("Enable appending morphology to tags", "flag", "m", bool),
ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
) )
def convert( def convert(
input_file, input_file,
@ -49,6 +50,7 @@ def convert(
model=None, model=None,
morphology=False, morphology=False,
converter="auto", converter="auto",
ner_map_path=None,
lang=None, lang=None,
): ):
""" """
@ -94,6 +96,9 @@ def convert(
) )
if converter not in CONVERTERS: if converter not in CONVERTERS:
msg.fail("Can't find converter for {}".format(converter), exits=1) msg.fail("Can't find converter for {}".format(converter), exits=1)
ner_map = None
if ner_map_path is not None:
ner_map = srsly.read_json(ner_map_path)
# Use converter function to convert data # Use converter function to convert data
func = CONVERTERS[converter] func = CONVERTERS[converter]
data = func( data = func(
@ -104,6 +109,7 @@ def convert(
lang=lang, lang=lang,
model=model, model=model,
no_print=no_print, no_print=no_print,
ner_map=ner_map,
) )
if output_dir != "-": if output_dir != "-":
# Export data to a file # Export data to a file

View File

@ -7,7 +7,8 @@ from spacy.gold import Example
from ...gold import iob_to_biluo from ...gold import iob_to_biluo
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None,
ner_map=None, **_):
""" """
Convert conllu files into JSON format for use with train cli. Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is use_morphology parameter enables appending morphology to tags, which is
@ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
checked_for_ner = True checked_for_ner = True
raw += example.text raw += example.text
sentences.append(generate_sentence(example.token_annotation, sentences.append(generate_sentence(example.token_annotation,
has_ner_tags, MISC_NER_PATTERN)) has_ner_tags, MISC_NER_PATTERN,
ner_map=ner_map))
# Real-sized documents could be extracted using the comments on the # Real-sized documents could be extracted using the comments on the
# conllu document # conllu document
if len(sentences) % n_sents == 0: if len(sentences) % n_sents == 0:
@ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0):
break break
def simplify_tags(iob, tag_pattern): def extract_tags(iob, tag_pattern, ner_map=None):
""" """
Extract tag from MISC column according to `tag_pattern` and map to final
entity type with `ner_map` if mapping present.
For NorNE:
Simplify tags obtained from the dataset in order to follow Wikipedia Simplify tags obtained from the dataset in order to follow Wikipedia
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to 'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
@ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern):
prefix = tag_match.group(2) prefix = tag_match.group(2)
suffix = tag_match.group(3) suffix = tag_match.group(3)
if prefix and suffix: if prefix and suffix:
if suffix == "GPE_LOC":
suffix = "LOC"
elif suffix == "GPE_ORG":
suffix = "ORG"
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
suffix = "MISC"
new_tag = prefix + "-" + suffix new_tag = prefix + "-" + suffix
if ner_map:
suffix = ner_map.get(suffix, suffix)
if suffix == "":
new_tag = "O"
else:
new_tag = prefix + "-" + suffix
new_iob.append(new_tag) new_iob.append(new_tag)
return new_iob return new_iob
def generate_sentence(token_annotation, has_ner_tags, tag_pattern): def generate_sentence(token_annotation, has_ner_tags, tag_pattern,
ner_map=None):
sentence = {} sentence = {}
tokens = [] tokens = []
if has_ner_tags: if has_ner_tags:
iob = simplify_tags(token_annotation.entities, tag_pattern) iob = extract_tags(token_annotation.entities, tag_pattern,
ner_map=ner_map)
biluo = iob_to_biluo(iob) biluo = iob_to_biluo(iob)
for i, id in enumerate(token_annotation.ids): for i, id in enumerate(token_annotation.ids):
token = {} token = {}

View File

@ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs
def test_cli_converters_conllu2json(): def test_cli_converters_conllu2json():
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
lines = [ lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
@ -32,17 +32,16 @@ def test_cli_converters_conllu2json():
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
def test_cli_converters_conllu2json(): def test_cli_converters_conllu2json_name_ner_map():
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
lines = [ lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
] ]
input_data = "\n".join(lines) input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1) converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1 assert len(converted) == 1
assert converted[0]["id"] == 0 assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1 assert len(converted[0]["paragraphs"]) == 1
@ -55,7 +54,7 @@ def test_cli_converters_conllu2json():
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"] assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_iob2json(): def test_cli_converters_iob2json():