Add conllu2docs converter (#5704)

Add a conllu2docs converter, adapted from the existing conllu2json converter.
Adriane Boyd · 2020-07-03 12:54:32 +02:00 · committed by GitHub
parent e4dcac4a4b
commit abad56db7d
4 changed files with 62 additions and 94 deletions
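The new converter takes raw CoNLL-U text and returns spaCy Doc objects instead of the old JSON dicts; docs_to_json can still produce the JSON training format when needed (see the updated tests below). A hedged usage sketch against the API added in this commit; the two-token sentence is made up:

from spacy.gold import docs_to_json
from spacy.gold.converters import conllu2docs

# Made-up two-token CoNLL-U sentence in the same 10-column format as the test fixtures
conllu_text = (
    "1\tAnna\tAnna\tPROPN\t_\t_\t2\tnsubj\t_\tname=U-PER\n"
    "2\tsover\tsove\tVERB\t_\t_\t0\troot\t_\tO\n"
)

docs = conllu2docs(conllu_text, n_sents=1)  # one merged Doc per n_sents sentences
json_data = docs_to_json(docs)              # optional: old-style JSON training format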

View File

@@ -9,7 +9,7 @@ import sys
 from ._app import app, Arg, Opt
 from ..gold import docs_to_json
 from ..tokens import DocBin
-from ..gold.converters import iob2docs, conll_ner2docs, json2docs
+from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
 # Converters are matched by file extension except for ner/iob, which are
@@ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs
 # imported from /converters.
 CONVERTERS = {
-    # "conllubio": conllu2docs, TODO
-    # "conllu": conllu2docs, TODO
-    # "conll": conllu2docs, TODO
+    "conllubio": conllu2docs,
+    "conllu": conllu2docs,
+    "conll": conllu2docs,
     "ner": conll_ner2docs,
     "iob": iob2docs,
     "json": json2docs,

View File

@@ -1,6 +1,4 @@
 from .iob2docs import iob2docs  # noqa: F401
 from .conll_ner2docs import conll_ner2docs  # noqa: F401
 from .json2docs import json2docs
+from .conllu2docs import conllu2docs  # noqa: F401
-# TODO: Update this one
-# from .conllu2docs import conllu2docs  # noqa: F401

View File

@@ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info
 from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags
 from ...language import Language
-from ...tokens import Doc, Token
+from ...tokens import Doc, Token, Span
 from wasabi import Printer
-def conllu2json(
+def conllu2docs(
     input_data,
     n_sents=10,
     append_morphology=False,
@@ -28,34 +28,22 @@ def conllu2json(
     MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
     msg = Printer(no_print=no_print)
     n_sents_info(msg, n_sents)
-    docs = []
-    raw = ""
-    sentences = []
-    conll_data = read_conllx(
+    sent_docs = read_conllx(
         input_data,
         append_morphology=append_morphology,
         ner_tag_pattern=MISC_NER_PATTERN,
         ner_map=ner_map,
         merge_subtokens=merge_subtokens,
     )
-    has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
-    for i, example in enumerate(conll_data):
-        raw += example.text
-        sentences.append(
-            generate_sentence(
-                example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map,
-            )
-        )
-        # Real-sized documents could be extracted using the comments on the
-        # conllu document
-        if len(sentences) % n_sents == 0:
-            doc = create_json_doc(raw, sentences, i)
-            docs.append(doc)
-            raw = ""
-            sentences = []
-    if sentences:
-        doc = create_json_doc(raw, sentences, i)
-        docs.append(doc)
+    docs = []
+    sent_docs_to_merge = []
+    for sent_doc in sent_docs:
+        sent_docs_to_merge.append(sent_doc)
+        if len(sent_docs_to_merge) % n_sents == 0:
+            docs.append(Doc.from_docs(sent_docs_to_merge))
+            sent_docs_to_merge = []
+    if sent_docs_to_merge:
+        docs.append(Doc.from_docs(sent_docs_to_merge))
     return docs
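The rewritten body reads one Doc per sentence from read_conllx and merges every n_sents of them into a single Doc with Doc.from_docs (a spaCy v3 API). A standalone sketch of just that grouping step, using hypothetical single-sentence Docs; five sentences with n_sents=2 produce three merged Docs (2 + 2 + 1):

from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
# Hypothetical per-sentence Docs standing in for the read_conllx output
sent_docs = [Doc(vocab, words=["Sentence", str(i), "."]) for i in range(5)]

n_sents = 2
docs, to_merge = [], []
for sent_doc in sent_docs:
    to_merge.append(sent_doc)
    if len(to_merge) % n_sents == 0:
        docs.append(Doc.from_docs(to_merge))
        to_merge = []
if to_merge:
    docs.append(Doc.from_docs(to_merge))  # remaining sentences form the last Doc

assert len(docs) == 3  # groups of 2, 2 and 1 sentences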
@@ -84,14 +72,14 @@ def read_conllx(
     ner_tag_pattern="",
     ner_map=None,
 ):
-    """ Yield examples, one for each sentence """
+    """ Yield docs, one for each sentence """
     vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            example = example_from_conllu_sentence(
+            doc = doc_from_conllu_sentence(
                 vocab,
                 lines,
                 ner_tag_pattern,
@@ -99,7 +87,7 @@ def read_conllx(
                 append_morphology=append_morphology,
                 ner_map=ner_map,
             )
-            yield example
+            yield doc
 def get_entities(lines, tag_pattern, ner_map=None):
@@ -141,39 +129,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
     return iob_to_biluo(iob)
-def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None):
-    sentence = {}
-    tokens = []
-    token_annotation = example_dict["token_annotation"]
-    for i, id_ in enumerate(token_annotation["ids"]):
-        token = {}
-        token["id"] = id_
-        token["orth"] = token_annotation["words"][i]
-        token["tag"] = token_annotation["tags"][i]
-        token["pos"] = token_annotation["pos"][i]
-        token["lemma"] = token_annotation["lemmas"][i]
-        token["morph"] = token_annotation["morphs"][i]
-        token["head"] = token_annotation["heads"][i] - i
-        token["dep"] = token_annotation["deps"][i]
-        if has_ner_tags:
-            token["ner"] = example_dict["doc_annotation"]["entities"][i]
-        tokens.append(token)
-    sentence["tokens"] = tokens
-    return sentence
-def create_json_doc(raw, sentences, id_):
-    doc = {}
-    paragraph = {}
-    doc["id"] = id_
-    doc["paragraphs"] = []
-    paragraph["raw"] = raw.strip()
-    paragraph["sentences"] = sentences
-    doc["paragraphs"].append(paragraph)
-    return doc
-def example_from_conllu_sentence(
+def doc_from_conllu_sentence(
     vocab,
     lines,
     ner_tag_pattern,
@@ -263,8 +219,9 @@ def example_from_conllu_sentence(
     if merge_subtokens:
         doc = merge_conllu_subtokens(lines, doc)
-    # create Example from custom Doc annotation
-    words, spaces, tags, morphs, lemmas = [], [], [], [], []
+    # create final Doc from custom Doc annotation
+    words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
+    heads, deps = [], []
     for i, t in enumerate(doc):
         words.append(t._.merged_orth)
         lemmas.append(t._.merged_lemma)
@@ -274,16 +231,23 @@ def example_from_conllu_sentence(
             tags.append(t.tag_ + "__" + t._.merged_morph)
         else:
             tags.append(t.tag_)
+        poses.append(t.pos_)
+        heads.append(t.head.i)
+        deps.append(t.dep_)
     doc_x = Doc(vocab, words=words, spaces=spaces)
-    ref_dict = Example(doc_x, reference=doc).to_dict()
-    ref_dict["words"] = words
-    ref_dict["lemmas"] = lemmas
-    ref_dict["spaces"] = spaces
-    ref_dict["tags"] = tags
-    ref_dict["morphs"] = morphs
-    example = Example.from_dict(doc_x, ref_dict)
-    return example
+    for i in range(len(doc)):
+        doc_x[i].tag_ = tags[i]
+        doc_x[i].morph_ = morphs[i]
+        doc_x[i].lemma_ = lemmas[i]
+        doc_x[i].pos_ = poses[i]
+        doc_x[i].dep_ = deps[i]
+        doc_x[i].head = doc_x[heads[i]]
+    doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
+    doc_x.is_parsed = True
+    doc_x.is_tagged = True
+    return doc_x
 def merge_conllu_subtokens(lines, doc):
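doc_from_conllu_sentence now returns an annotated Doc rather than an Example: per-token attributes are copied onto a freshly constructed Doc, entity spans are re-created with Span, and the is_tagged/is_parsed flags are set. A minimal standalone sketch of that pattern, assuming the development-branch Doc API used here (where is_parsed/is_tagged are settable flags, as in the hunk above); the toy sentence and labels are made up:

from spacy.tokens import Doc, Span
from spacy.vocab import Vocab

vocab = Vocab()
doc_x = Doc(vocab, words=["Anna", "sover", "."], spaces=[True, False, False])

tags = ["PROPN", "VERB", "PUNCT"]  # per-token annotation to copy over
deps = ["nsubj", "ROOT", "punct"]
heads = [1, 1, 1]                  # head token index; the root points to itself

for i, token in enumerate(doc_x):
    token.tag_ = tags[i]
    token.dep_ = deps[i]
    token.head = doc_x[heads[i]]

doc_x.ents = [Span(doc_x, 0, 1, label="PER")]  # re-attach entity spans
doc_x.is_tagged = True  # mark the Doc as carrying tag/parse annotation
doc_x.is_parsed = True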

View File

@@ -1,14 +1,10 @@
 import pytest
-from spacy.gold import docs_to_json
-from spacy.gold.converters import iob2docs, conll_ner2docs
-from spacy.gold.converters.conllu2json import conllu2json
+from spacy.gold import docs_to_json, biluo_tags_from_offsets
+from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
 from spacy.cli.pretrain import make_docs
-# TODO
-# from spacy.gold.converters import conllu2docs
 def test_cli_converters_conllu2json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@@ -19,8 +15,9 @@ def test_cli_converters_conllu2json():
         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
     ]
     input_data = "\n".join(lines)
-    converted = conllu2json(input_data, n_sents=1)
-    assert len(converted) == 1
+    converted_docs = conllu2docs(input_data, n_sents=1)
+    assert len(converted_docs) == 1
+    converted = [docs_to_json(converted_docs)]
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
@@ -31,7 +28,9 @@ def test_cli_converters_conllu2json():
     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
     assert [t["head"] for t in tokens] == [1, 2, -1, 0]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
-    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
+    ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
+    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
 @pytest.mark.parametrize(
@@ -55,11 +54,12 @@ def test_cli_converters_conllu2json():
 )
 def test_cli_converters_conllu2json_name_ner_map(lines):
     input_data = "\n".join(lines)
-    converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
-    assert len(converted) == 1
+    converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
+    assert len(converted_docs) == 1
+    converted = [docs_to_json(converted_docs)]
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
-    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår."
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. "
     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
     sent = converted[0]["paragraphs"][0]["sentences"][0]
     assert len(sent["tokens"]) == 5
@@ -68,7 +68,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
     assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
-    assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
+    ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
+    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 def test_cli_converters_conllu2json_subtokens():
@@ -82,13 +84,15 @@ def test_cli_converters_conllu2json_subtokens():
         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
     ]
     input_data = "\n".join(lines)
-    converted = conllu2json(
+    converted_docs = conllu2docs(
         input_data, n_sents=1, merge_subtokens=True, append_morphology=True
     )
-    assert len(converted) == 1
+    assert len(converted_docs) == 1
+    converted = [docs_to_json(converted_docs)]
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
-    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår."
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. "
     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
     sent = converted[0]["paragraphs"][0]["sentences"][0]
     assert len(sent["tokens"]) == 4
@@ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
     assert [t["head"] for t in tokens] == [1, 1, 0, -1]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
-    assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
+    ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
+    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    assert biluo_tags == ["O", "U-PER", "O", "O"]
 def test_cli_converters_iob2json(en_vocab):
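Across all three tests, the old per-token "ner" assertions are replaced by the same pattern: take the character-offset entities from the docs_to_json output and convert them back to BILUO tags on the converted Doc with biluo_tags_from_offsets. A small standalone sketch of that check, with a hypothetical hand-built Doc standing in for the converter output:

from spacy.gold import biluo_tags_from_offsets
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["Dommer", "Finn", "Eilertsen", "avstår"])
doc.ents = [Span(doc, 1, 3, label="PER")]  # "Finn Eilertsen"

# character offsets, the same shape as paragraphs[0]["entities"] in docs_to_json output
ent_offsets = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing="O")
assert biluo_tags == ["O", "B-PER", "L-PER", "O"]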