Add NER map option to convert CLI (#4763)
Instead of a hard-coded NER tag simplification function that was only intended for NorNE, map NER tags in the CoNLL-U converter using a dict provided as JSON via a command-line option. Map NER entity types to new types, or to "" for 'O', e.g.:

```
{"PER": "PERSON", "BAD": ""}

=> B-PER -> B-PERSON
   B-BAD -> O
```
This commit is contained in:
parent 68f711b409
commit eb9b1858c4
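As a usage sketch, the new mapping can be exercised directly through the converter function. The example below is adapted from the updated test in this commit; the `spacy.cli.converters` import path is assumed from the test file's imports:

```python
from spacy.cli.converters import conllu2json  # assumed import path

# CoNLL-U lines with NER tags in the MISC column, taken from the updated test.
lines = [
    "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
    "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
    "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
    "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
    "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
]
# "PER" is renamed to "PERSON"; "BAD" is mapped to "" and therefore becomes "O".
converted = conllu2json("\n".join(lines), n_sents=1,
                        ner_map={"PER": "PERSON", "BAD": ""})
# Resulting NER tags: ["O", "B-PERSON", "L-PERSON", "O", "O"]
```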
```diff
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
     converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
     lang=("Language (if tokenizer required)", "option", "l", str),
     morphology=("Enable appending morphology to tags", "flag", "m", bool),
+    ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
 )
 def convert(
     input_file,
@@ -49,6 +50,7 @@ def convert(
     model=None,
     morphology=False,
     converter="auto",
+    ner_map_path=None,
     lang=None,
 ):
     """
@@ -94,6 +96,9 @@ def convert(
         )
     if converter not in CONVERTERS:
         msg.fail("Can't find converter for {}".format(converter), exits=1)
+    ner_map = None
+    if ner_map_path is not None:
+        ner_map = srsly.read_json(ner_map_path)
     # Use converter function to convert data
     func = CONVERTERS[converter]
     data = func(
@@ -104,6 +109,7 @@ def convert(
         lang=lang,
         model=model,
         no_print=no_print,
+        ner_map=ner_map,
     )
     if output_dir != "-":
         # Export data to a file
```
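The mapping file itself is plain JSON read with `srsly`, as the hunk above shows. A minimal sketch of what the CLI does with the path, assuming an illustrative file named `ner_map.json`:

```python
import srsly

# Contents of ner_map.json (illustrative file name and contents):
#   {"PER": "PERSON", "BAD": ""}
ner_map = srsly.read_json("ner_map.json")
assert ner_map == {"PER": "PERSON", "BAD": ""}
```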
```diff
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -7,7 +7,8 @@ from spacy.gold import Example
 from ...gold import iob_to_biluo
 
 
-def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
+def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None,
+                ner_map=None, **_):
     """
     Convert conllu files into JSON format for use with train cli.
     use_morphology parameter enables appending morphology to tags, which is
@@ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
             checked_for_ner = True
         raw += example.text
         sentences.append(generate_sentence(example.token_annotation,
-                                           has_ner_tags, MISC_NER_PATTERN))
+                                           has_ner_tags, MISC_NER_PATTERN,
+                                           ner_map=ner_map))
         # Real-sized documents could be extracted using the comments on the
         # conllu document
         if len(sentences) % n_sents == 0:
@@ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0):
             break
 
 
-def simplify_tags(iob, tag_pattern):
+def extract_tags(iob, tag_pattern, ner_map=None):
     """
+    Extract tag from MISC column according to `tag_pattern` and map to final
+    entity type with `ner_map` if mapping present.
+
+    For NorNE:
     Simplify tags obtained from the dataset in order to follow Wikipedia
     scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
     'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
@@ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern):
             prefix = tag_match.group(2)
             suffix = tag_match.group(3)
         if prefix and suffix:
-            if suffix == "GPE_LOC":
-                suffix = "LOC"
-            elif suffix == "GPE_ORG":
-                suffix = "ORG"
-            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
-                suffix = "MISC"
             new_tag = prefix + "-" + suffix
+            if ner_map:
+                suffix = ner_map.get(suffix, suffix)
+                if suffix == "":
+                    new_tag = "O"
+                else:
+                    new_tag = prefix + "-" + suffix
         new_iob.append(new_tag)
     return new_iob
 
 
-def generate_sentence(token_annotation, has_ner_tags, tag_pattern):
+def generate_sentence(token_annotation, has_ner_tags, tag_pattern,
+                      ner_map=None):
     sentence = {}
     tokens = []
     if has_ner_tags:
-        iob = simplify_tags(token_annotation.entities, tag_pattern)
+        iob = extract_tags(token_annotation.entities, tag_pattern,
+                           ner_map=ner_map)
         biluo = iob_to_biluo(iob)
         for i, id in enumerate(token_annotation.ids):
             token = {}
```
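To make the new branch in `extract_tags` concrete, here is a standalone sketch of just the mapping step. The `map_tag` helper is hypothetical (not part of the commit); it mirrors the logic added in the hunk above:

```python
def map_tag(prefix, suffix, ner_map=None):
    # Mirrors the new branch in extract_tags(): build the tag, then
    # optionally rename the entity type, with "" collapsing to "O".
    new_tag = prefix + "-" + suffix
    if ner_map:
        suffix = ner_map.get(suffix, suffix)  # unmapped types pass through
        if suffix == "":
            new_tag = "O"
        else:
            new_tag = prefix + "-" + suffix
    return new_tag

ner_map = {"PER": "PERSON", "BAD": ""}
assert map_tag("B", "PER", ner_map) == "B-PERSON"
assert map_tag("B", "BAD", ner_map) == "O"
assert map_tag("I", "LOC", ner_map) == "I-LOC"  # not in the map: unchanged
```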
```diff
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs
 
 
 def test_cli_converters_conllu2json():
-    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+    # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
     lines = [
         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
         "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
@@ -32,17 +32,16 @@ def test_cli_converters_conllu2json():
     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
 
 
-def test_cli_converters_conllu2json():
-    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+def test_cli_converters_conllu2json_name_ner_map():
     lines = [
         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
         "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
         "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
-        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
+        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
     ]
     input_data = "\n".join(lines)
-    converted = conllu2json(input_data, n_sents=1)
+    converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
     assert len(converted) == 1
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
@@ -55,7 +54,7 @@ def test_cli_converters_conllu2json():
     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
     assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
-    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"]
+    assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
 def test_cli_converters_iob2json():
```