mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Add conllu2docs converter (#5704)
Add conllu2docs converter adapted from conllu2json converter
This commit is contained in:
parent
e4dcac4a4b
commit
abad56db7d
|
@ -9,7 +9,7 @@ import sys
|
||||||
from ._app import app, Arg, Opt
|
from ._app import app, Arg, Opt
|
||||||
from ..gold import docs_to_json
|
from ..gold import docs_to_json
|
||||||
from ..tokens import DocBin
|
from ..tokens import DocBin
|
||||||
from ..gold.converters import iob2docs, conll_ner2docs, json2docs
|
from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
|
||||||
|
|
||||||
|
|
||||||
# Converters are matched by file extension except for ner/iob, which are
|
# Converters are matched by file extension except for ner/iob, which are
|
||||||
|
@ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs
|
||||||
# imported from /converters.
|
# imported from /converters.
|
||||||
|
|
||||||
CONVERTERS = {
|
CONVERTERS = {
|
||||||
# "conllubio": conllu2docs, TODO
|
"conllubio": conllu2docs,
|
||||||
# "conllu": conllu2docs, TODO
|
"conllu": conllu2docs,
|
||||||
# "conll": conllu2docs, TODO
|
"conll": conllu2docs,
|
||||||
"ner": conll_ner2docs,
|
"ner": conll_ner2docs,
|
||||||
"iob": iob2docs,
|
"iob": iob2docs,
|
||||||
"json": json2docs,
|
"json": json2docs,
|
||||||
|
|
|
@ -1,6 +1,4 @@
|
||||||
from .iob2docs import iob2docs # noqa: F401
|
from .iob2docs import iob2docs # noqa: F401
|
||||||
from .conll_ner2docs import conll_ner2docs # noqa: F401
|
from .conll_ner2docs import conll_ner2docs # noqa: F401
|
||||||
from .json2docs import json2docs
|
from .json2docs import json2docs
|
||||||
|
from .conllu2docs import conllu2docs # noqa: F401
|
||||||
# TODO: Update this one
|
|
||||||
# from .conllu2docs import conllu2docs # noqa: F401
|
|
||||||
|
|
|
@ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info
|
||||||
from ...gold import Example
|
from ...gold import Example
|
||||||
from ...gold import iob_to_biluo, spans_from_biluo_tags
|
from ...gold import iob_to_biluo, spans_from_biluo_tags
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...tokens import Doc, Token
|
from ...tokens import Doc, Token, Span
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
|
|
||||||
def conllu2json(
|
def conllu2docs(
|
||||||
input_data,
|
input_data,
|
||||||
n_sents=10,
|
n_sents=10,
|
||||||
append_morphology=False,
|
append_morphology=False,
|
||||||
|
@ -28,34 +28,22 @@ def conllu2json(
|
||||||
MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
|
MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
|
||||||
msg = Printer(no_print=no_print)
|
msg = Printer(no_print=no_print)
|
||||||
n_sents_info(msg, n_sents)
|
n_sents_info(msg, n_sents)
|
||||||
docs = []
|
sent_docs = read_conllx(
|
||||||
raw = ""
|
|
||||||
sentences = []
|
|
||||||
conll_data = read_conllx(
|
|
||||||
input_data,
|
input_data,
|
||||||
append_morphology=append_morphology,
|
append_morphology=append_morphology,
|
||||||
ner_tag_pattern=MISC_NER_PATTERN,
|
ner_tag_pattern=MISC_NER_PATTERN,
|
||||||
ner_map=ner_map,
|
ner_map=ner_map,
|
||||||
merge_subtokens=merge_subtokens,
|
merge_subtokens=merge_subtokens,
|
||||||
)
|
)
|
||||||
has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
|
docs = []
|
||||||
for i, example in enumerate(conll_data):
|
sent_docs_to_merge = []
|
||||||
raw += example.text
|
for sent_doc in sent_docs:
|
||||||
sentences.append(
|
sent_docs_to_merge.append(sent_doc)
|
||||||
generate_sentence(
|
if len(sent_docs_to_merge) % n_sents == 0:
|
||||||
example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map,
|
docs.append(Doc.from_docs(sent_docs_to_merge))
|
||||||
)
|
sent_docs_to_merge = []
|
||||||
)
|
if sent_docs_to_merge:
|
||||||
# Real-sized documents could be extracted using the comments on the
|
docs.append(Doc.from_docs(sent_docs_to_merge))
|
||||||
# conllu document
|
|
||||||
if len(sentences) % n_sents == 0:
|
|
||||||
doc = create_json_doc(raw, sentences, i)
|
|
||||||
docs.append(doc)
|
|
||||||
raw = ""
|
|
||||||
sentences = []
|
|
||||||
if sentences:
|
|
||||||
doc = create_json_doc(raw, sentences, i)
|
|
||||||
docs.append(doc)
|
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
|
||||||
|
@ -84,14 +72,14 @@ def read_conllx(
|
||||||
ner_tag_pattern="",
|
ner_tag_pattern="",
|
||||||
ner_map=None,
|
ner_map=None,
|
||||||
):
|
):
|
||||||
""" Yield examples, one for each sentence """
|
""" Yield docs, one for each sentence """
|
||||||
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
|
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
|
||||||
for sent in input_data.strip().split("\n\n"):
|
for sent in input_data.strip().split("\n\n"):
|
||||||
lines = sent.strip().split("\n")
|
lines = sent.strip().split("\n")
|
||||||
if lines:
|
if lines:
|
||||||
while lines[0].startswith("#"):
|
while lines[0].startswith("#"):
|
||||||
lines.pop(0)
|
lines.pop(0)
|
||||||
example = example_from_conllu_sentence(
|
doc = doc_from_conllu_sentence(
|
||||||
vocab,
|
vocab,
|
||||||
lines,
|
lines,
|
||||||
ner_tag_pattern,
|
ner_tag_pattern,
|
||||||
|
@ -99,7 +87,7 @@ def read_conllx(
|
||||||
append_morphology=append_morphology,
|
append_morphology=append_morphology,
|
||||||
ner_map=ner_map,
|
ner_map=ner_map,
|
||||||
)
|
)
|
||||||
yield example
|
yield doc
|
||||||
|
|
||||||
|
|
||||||
def get_entities(lines, tag_pattern, ner_map=None):
|
def get_entities(lines, tag_pattern, ner_map=None):
|
||||||
|
@ -141,39 +129,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
|
||||||
return iob_to_biluo(iob)
|
return iob_to_biluo(iob)
|
||||||
|
|
||||||
|
|
||||||
def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None):
|
def doc_from_conllu_sentence(
|
||||||
sentence = {}
|
|
||||||
tokens = []
|
|
||||||
token_annotation = example_dict["token_annotation"]
|
|
||||||
for i, id_ in enumerate(token_annotation["ids"]):
|
|
||||||
token = {}
|
|
||||||
token["id"] = id_
|
|
||||||
token["orth"] = token_annotation["words"][i]
|
|
||||||
token["tag"] = token_annotation["tags"][i]
|
|
||||||
token["pos"] = token_annotation["pos"][i]
|
|
||||||
token["lemma"] = token_annotation["lemmas"][i]
|
|
||||||
token["morph"] = token_annotation["morphs"][i]
|
|
||||||
token["head"] = token_annotation["heads"][i] - i
|
|
||||||
token["dep"] = token_annotation["deps"][i]
|
|
||||||
if has_ner_tags:
|
|
||||||
token["ner"] = example_dict["doc_annotation"]["entities"][i]
|
|
||||||
tokens.append(token)
|
|
||||||
sentence["tokens"] = tokens
|
|
||||||
return sentence
|
|
||||||
|
|
||||||
|
|
||||||
def create_json_doc(raw, sentences, id_):
|
|
||||||
doc = {}
|
|
||||||
paragraph = {}
|
|
||||||
doc["id"] = id_
|
|
||||||
doc["paragraphs"] = []
|
|
||||||
paragraph["raw"] = raw.strip()
|
|
||||||
paragraph["sentences"] = sentences
|
|
||||||
doc["paragraphs"].append(paragraph)
|
|
||||||
return doc
|
|
||||||
|
|
||||||
|
|
||||||
def example_from_conllu_sentence(
|
|
||||||
vocab,
|
vocab,
|
||||||
lines,
|
lines,
|
||||||
ner_tag_pattern,
|
ner_tag_pattern,
|
||||||
|
@ -263,8 +219,9 @@ def example_from_conllu_sentence(
|
||||||
if merge_subtokens:
|
if merge_subtokens:
|
||||||
doc = merge_conllu_subtokens(lines, doc)
|
doc = merge_conllu_subtokens(lines, doc)
|
||||||
|
|
||||||
# create Example from custom Doc annotation
|
# create final Doc from custom Doc annotation
|
||||||
words, spaces, tags, morphs, lemmas = [], [], [], [], []
|
words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
|
||||||
|
heads, deps = [], []
|
||||||
for i, t in enumerate(doc):
|
for i, t in enumerate(doc):
|
||||||
words.append(t._.merged_orth)
|
words.append(t._.merged_orth)
|
||||||
lemmas.append(t._.merged_lemma)
|
lemmas.append(t._.merged_lemma)
|
||||||
|
@ -274,16 +231,23 @@ def example_from_conllu_sentence(
|
||||||
tags.append(t.tag_ + "__" + t._.merged_morph)
|
tags.append(t.tag_ + "__" + t._.merged_morph)
|
||||||
else:
|
else:
|
||||||
tags.append(t.tag_)
|
tags.append(t.tag_)
|
||||||
|
poses.append(t.pos_)
|
||||||
|
heads.append(t.head.i)
|
||||||
|
deps.append(t.dep_)
|
||||||
|
|
||||||
doc_x = Doc(vocab, words=words, spaces=spaces)
|
doc_x = Doc(vocab, words=words, spaces=spaces)
|
||||||
ref_dict = Example(doc_x, reference=doc).to_dict()
|
for i in range(len(doc)):
|
||||||
ref_dict["words"] = words
|
doc_x[i].tag_ = tags[i]
|
||||||
ref_dict["lemmas"] = lemmas
|
doc_x[i].morph_ = morphs[i]
|
||||||
ref_dict["spaces"] = spaces
|
doc_x[i].lemma_ = lemmas[i]
|
||||||
ref_dict["tags"] = tags
|
doc_x[i].pos_ = poses[i]
|
||||||
ref_dict["morphs"] = morphs
|
doc_x[i].dep_ = deps[i]
|
||||||
example = Example.from_dict(doc_x, ref_dict)
|
doc_x[i].head = doc_x[heads[i]]
|
||||||
return example
|
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
|
||||||
|
doc_x.is_parsed = True
|
||||||
|
doc_x.is_tagged = True
|
||||||
|
|
||||||
|
return doc_x
|
||||||
|
|
||||||
|
|
||||||
def merge_conllu_subtokens(lines, doc):
|
def merge_conllu_subtokens(lines, doc):
|
|
@ -1,14 +1,10 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.gold import docs_to_json
|
from spacy.gold import docs_to_json, biluo_tags_from_offsets
|
||||||
from spacy.gold.converters import iob2docs, conll_ner2docs
|
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
|
||||||
from spacy.gold.converters.conllu2json import conllu2json
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.cli.pretrain import make_docs
|
from spacy.cli.pretrain import make_docs
|
||||||
|
|
||||||
# TODO
|
|
||||||
# from spacy.gold.converters import conllu2docs
|
|
||||||
|
|
||||||
|
|
||||||
def test_cli_converters_conllu2json():
|
def test_cli_converters_conllu2json():
|
||||||
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
|
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
|
||||||
|
@ -19,8 +15,9 @@ def test_cli_converters_conllu2json():
|
||||||
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
|
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
|
||||||
]
|
]
|
||||||
input_data = "\n".join(lines)
|
input_data = "\n".join(lines)
|
||||||
converted = conllu2json(input_data, n_sents=1)
|
converted_docs = conllu2docs(input_data, n_sents=1)
|
||||||
assert len(converted) == 1
|
assert len(converted_docs) == 1
|
||||||
|
converted = [docs_to_json(converted_docs)]
|
||||||
assert converted[0]["id"] == 0
|
assert converted[0]["id"] == 0
|
||||||
assert len(converted[0]["paragraphs"]) == 1
|
assert len(converted[0]["paragraphs"]) == 1
|
||||||
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
|
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
|
||||||
|
@ -31,7 +28,9 @@ def test_cli_converters_conllu2json():
|
||||||
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
|
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
|
||||||
assert [t["head"] for t in tokens] == [1, 2, -1, 0]
|
assert [t["head"] for t in tokens] == [1, 2, -1, 0]
|
||||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
|
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
|
||||||
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
|
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
|
||||||
|
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
||||||
|
assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
@ -55,11 +54,12 @@ def test_cli_converters_conllu2json():
|
||||||
)
|
)
|
||||||
def test_cli_converters_conllu2json_name_ner_map(lines):
|
def test_cli_converters_conllu2json_name_ner_map(lines):
|
||||||
input_data = "\n".join(lines)
|
input_data = "\n".join(lines)
|
||||||
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
|
converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
|
||||||
assert len(converted) == 1
|
assert len(converted_docs) == 1
|
||||||
|
converted = [docs_to_json(converted_docs)]
|
||||||
assert converted[0]["id"] == 0
|
assert converted[0]["id"] == 0
|
||||||
assert len(converted[0]["paragraphs"]) == 1
|
assert len(converted[0]["paragraphs"]) == 1
|
||||||
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår."
|
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. "
|
||||||
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
|
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
|
||||||
sent = converted[0]["paragraphs"][0]["sentences"][0]
|
sent = converted[0]["paragraphs"][0]["sentences"][0]
|
||||||
assert len(sent["tokens"]) == 5
|
assert len(sent["tokens"]) == 5
|
||||||
|
@ -68,7 +68,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
|
||||||
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
|
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
|
||||||
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
|
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
|
||||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
|
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
|
||||||
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
|
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
|
||||||
|
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
||||||
|
assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
|
||||||
|
|
||||||
|
|
||||||
def test_cli_converters_conllu2json_subtokens():
|
def test_cli_converters_conllu2json_subtokens():
|
||||||
|
@ -82,13 +84,15 @@ def test_cli_converters_conllu2json_subtokens():
|
||||||
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
|
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
|
||||||
]
|
]
|
||||||
input_data = "\n".join(lines)
|
input_data = "\n".join(lines)
|
||||||
converted = conllu2json(
|
converted_docs = conllu2docs(
|
||||||
input_data, n_sents=1, merge_subtokens=True, append_morphology=True
|
input_data, n_sents=1, merge_subtokens=True, append_morphology=True
|
||||||
)
|
)
|
||||||
assert len(converted) == 1
|
assert len(converted_docs) == 1
|
||||||
|
converted = [docs_to_json(converted_docs)]
|
||||||
|
|
||||||
assert converted[0]["id"] == 0
|
assert converted[0]["id"] == 0
|
||||||
assert len(converted[0]["paragraphs"]) == 1
|
assert len(converted[0]["paragraphs"]) == 1
|
||||||
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår."
|
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. "
|
||||||
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
|
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
|
||||||
sent = converted[0]["paragraphs"][0]["sentences"][0]
|
sent = converted[0]["paragraphs"][0]["sentences"][0]
|
||||||
assert len(sent["tokens"]) == 4
|
assert len(sent["tokens"]) == 4
|
||||||
|
@ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens():
|
||||||
assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
|
assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
|
||||||
assert [t["head"] for t in tokens] == [1, 1, 0, -1]
|
assert [t["head"] for t in tokens] == [1, 1, 0, -1]
|
||||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
|
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
|
||||||
assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
|
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
|
||||||
|
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
||||||
|
assert biluo_tags == ["O", "U-PER", "O", "O"]
|
||||||
|
|
||||||
|
|
||||||
def test_cli_converters_iob2json(en_vocab):
|
def test_cli_converters_iob2json(en_vocab):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user