Improve handling of NER in CoNLL-U MISC

This commit is contained in:
Adriane Boyd 2020-05-20 18:48:51 +02:00
parent fda7355508
commit 4b229bfc22
2 changed files with 43 additions and 30 deletions

View File

@ -26,7 +26,7 @@ def conllu2json(
Extract NER tags if available and convert them so that they follow Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme BILUO and the Wikipedia scheme
""" """
MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?" MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
msg = Printer(no_print=no_print) msg = Printer(no_print=no_print)
n_sents_info(msg, n_sents) n_sents_info(msg, n_sents)
docs = [] docs = []
@ -39,7 +39,7 @@ def conllu2json(
ner_map=ner_map, ner_map=ner_map,
merge_subtokens=merge_subtokens, merge_subtokens=merge_subtokens,
) )
has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN) has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
for i, example in enumerate(conll_data): for i, example in enumerate(conll_data):
raw += example.text raw += example.text
sentences.append( sentences.append(
@ -65,21 +65,20 @@ def conllu2json(
def has_ner(input_data, ner_tag_pattern): def has_ner(input_data, ner_tag_pattern):
""" """
Check the 10th column of the first token to determine if the file contains Check the MISC column for NER tags.
NER tags
""" """
for sent in input_data.strip().split("\n\n"): for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n") lines = sent.strip().split("\n")
if lines: if lines:
while lines[0].startswith("#"): while lines[0].startswith("#"):
lines.pop(0) lines.pop(0)
if lines: for line in lines:
parts = lines[0].split("\t") parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if re.search(ner_tag_pattern, misc): for misc_part in misc.split("|"):
return True if re.match(ner_tag_pattern, misc_part):
else: return True
return False return False
def read_conllx( def read_conllx(
@ -127,19 +126,21 @@ def get_entities(lines, tag_pattern, ner_map=None):
iob = [] iob = []
for misc in miscs: for misc in miscs:
tag_match = re.search(tag_pattern, misc)
iob_tag = "O" iob_tag = "O"
if tag_match: for misc_part in misc.split("|"):
prefix = tag_match.group(2) tag_match = re.match(tag_pattern, misc_part)
suffix = tag_match.group(3) if tag_match:
if prefix and suffix: prefix = tag_match.group(2)
iob_tag = prefix + "-" + suffix suffix = tag_match.group(3)
if ner_map: if prefix and suffix:
suffix = ner_map.get(suffix, suffix) iob_tag = prefix + "-" + suffix
if suffix == "": if ner_map:
iob_tag = "O" suffix = ner_map.get(suffix, suffix)
else: if suffix == "":
iob_tag = prefix + "-" + suffix iob_tag = "O"
else:
iob_tag = prefix + "-" + suffix
break
iob.append(iob_tag) iob.append(iob_tag)
return iob_to_biluo(iob) return iob_to_biluo(iob)

View File

@ -29,14 +29,26 @@ def test_cli_converters_conllu2json():
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
def test_cli_converters_conllu2json_name_ner_map(): @pytest.mark.parametrize(
lines = [ "lines",
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", [
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", (
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
] "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
),
(
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\t_",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|NE=B-PER",
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tNE=L-PER",
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tNE=B-BAD",
),
],
)
def test_cli_converters_conllu2json_name_ner_map(lines):
input_data = "\n".join(lines) input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1 assert len(converted) == 1