diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index b008e2f93..976fe7910 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -9,7 +9,7 @@ import sys
 from ._app import app, Arg, Opt
 from ..gold import docs_to_json
 from ..tokens import DocBin
-from ..gold.converters import iob2docs, conll_ner2docs, json2docs
+from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
@@ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs
 # imported from /converters.
 
 CONVERTERS = {
-    # "conllubio": conllu2docs, TODO
-    # "conllu": conllu2docs, TODO
-    # "conll": conllu2docs, TODO
+    "conllubio": conllu2docs,
+    "conllu": conllu2docs,
+    "conll": conllu2docs,
     "ner": conll_ner2docs,
     "iob": iob2docs,
     "json": json2docs,
diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py
index 3e366933a..63d52ad9d 100644
--- a/spacy/gold/converters/__init__.py
+++ b/spacy/gold/converters/__init__.py
@@ -1,6 +1,4 @@
 from .iob2docs import iob2docs  # noqa: F401
 from .conll_ner2docs import conll_ner2docs  # noqa: F401
 from .json2docs import json2docs
-
-# TODO: Update this one
-# from .conllu2docs import conllu2docs  # noqa: F401
+from .conllu2docs import conllu2docs  # noqa: F401
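Note: the registry change above is what activates the CoNLL-U support, since the convert CLI matches converters by file extension when no converter is named (the comment in convert.py confirms this, with ner/iob as the exceptions). A minimal sketch of that lookup; pick_converter is a hypothetical helper for illustration, the real logic lives in the convert command, and CONVERTERS is the dict patched above:

from pathlib import Path

def pick_converter(input_path, converter="auto"):
    # "auto" falls back to the file extension, so data.conllu now resolves
    # to conllu2docs; "ner" and "iob" still need to be requested explicitly
    # because their files usually carry ambiguous extensions like .txt/.tsv.
    if converter == "auto":
        converter = Path(input_path).suffix.lstrip(".")
    if converter not in CONVERTERS:
        raise ValueError(f"Can't find converter for {converter}")
    return CONVERTERS[converter]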
diff --git a/spacy/gold/converters/conllu2json.py b/spacy/gold/converters/conllu2docs.py
similarity index 79%
rename from spacy/gold/converters/conllu2json.py
rename to spacy/gold/converters/conllu2docs.py
index 73fdf57e7..b591d3218 100644
--- a/spacy/gold/converters/conllu2json.py
+++ b/spacy/gold/converters/conllu2docs.py
@@ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info
 from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags
 from ...language import Language
-from ...tokens import Doc, Token
+from ...tokens import Doc, Token, Span
 from wasabi import Printer
 
 
-def conllu2json(
+def conllu2docs(
     input_data,
     n_sents=10,
     append_morphology=False,
@@ -28,34 +28,22 @@
     MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
     msg = Printer(no_print=no_print)
     n_sents_info(msg, n_sents)
-    docs = []
-    raw = ""
-    sentences = []
-    conll_data = read_conllx(
+    sent_docs = read_conllx(
         input_data,
         append_morphology=append_morphology,
         ner_tag_pattern=MISC_NER_PATTERN,
         ner_map=ner_map,
         merge_subtokens=merge_subtokens,
     )
-    has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
-    for i, example in enumerate(conll_data):
-        raw += example.text
-        sentences.append(
-            generate_sentence(
-                example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map,
-            )
-        )
-        # Real-sized documents could be extracted using the comments on the
-        # conllu document
-        if len(sentences) % n_sents == 0:
-            doc = create_json_doc(raw, sentences, i)
-            docs.append(doc)
-            raw = ""
-            sentences = []
-    if sentences:
-        doc = create_json_doc(raw, sentences, i)
-        docs.append(doc)
+    docs = []
+    sent_docs_to_merge = []
+    for sent_doc in sent_docs:
+        sent_docs_to_merge.append(sent_doc)
+        if len(sent_docs_to_merge) % n_sents == 0:
+            docs.append(Doc.from_docs(sent_docs_to_merge))
+            sent_docs_to_merge = []
+    if sent_docs_to_merge:
+        docs.append(Doc.from_docs(sent_docs_to_merge))
     return docs
 
 
@@ -84,14 +72,14 @@ def read_conllx(
     ner_tag_pattern="",
     ner_map=None,
 ):
-    """ Yield examples, one for each sentence """
+    """ Yield docs, one for each sentence """
    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            example = example_from_conllu_sentence(
+            doc = doc_from_conllu_sentence(
                 vocab,
                 lines,
                 ner_tag_pattern,
@@ -99,7 +87,7 @@
                 append_morphology=append_morphology,
                 ner_map=ner_map,
             )
-            yield example
+            yield doc
 
 
 def get_entities(lines, tag_pattern, ner_map=None):
@@ -141,39 +129,7 @@
     return iob_to_biluo(iob)
 
 
-def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None):
-    sentence = {}
-    tokens = []
-    token_annotation = example_dict["token_annotation"]
-    for i, id_ in enumerate(token_annotation["ids"]):
-        token = {}
-        token["id"] = id_
-        token["orth"] = token_annotation["words"][i]
-        token["tag"] = token_annotation["tags"][i]
-        token["pos"] = token_annotation["pos"][i]
-        token["lemma"] = token_annotation["lemmas"][i]
-        token["morph"] = token_annotation["morphs"][i]
-        token["head"] = token_annotation["heads"][i] - i
-        token["dep"] = token_annotation["deps"][i]
-        if has_ner_tags:
-            token["ner"] = example_dict["doc_annotation"]["entities"][i]
-        tokens.append(token)
-    sentence["tokens"] = tokens
-    return sentence
-
-
-def create_json_doc(raw, sentences, id_):
-    doc = {}
-    paragraph = {}
-    doc["id"] = id_
-    doc["paragraphs"] = []
-    paragraph["raw"] = raw.strip()
-    paragraph["sentences"] = sentences
-    doc["paragraphs"].append(paragraph)
-    return doc
-
-
-def example_from_conllu_sentence(
+def doc_from_conllu_sentence(
     vocab,
     lines,
     ner_tag_pattern,
@@ -263,8 +219,9 @@
     if merge_subtokens:
         doc = merge_conllu_subtokens(lines, doc)
 
-    # create Example from custom Doc annotation
-    words, spaces, tags, morphs, lemmas = [], [], [], [], []
+    # create final Doc from custom Doc annotation
+    words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
+    heads, deps = [], []
     for i, t in enumerate(doc):
         words.append(t._.merged_orth)
         lemmas.append(t._.merged_lemma)
@@ -274,16 +231,23 @@
             tags.append(t.tag_ + "__" + t._.merged_morph)
         else:
             tags.append(t.tag_)
+        poses.append(t.pos_)
+        heads.append(t.head.i)
+        deps.append(t.dep_)
 
     doc_x = Doc(vocab, words=words, spaces=spaces)
-    ref_dict = Example(doc_x, reference=doc).to_dict()
-    ref_dict["words"] = words
-    ref_dict["lemmas"] = lemmas
-    ref_dict["spaces"] = spaces
-    ref_dict["tags"] = tags
-    ref_dict["morphs"] = morphs
-    example = Example.from_dict(doc_x, ref_dict)
-    return example
+    for i in range(len(doc)):
+        doc_x[i].tag_ = tags[i]
+        doc_x[i].morph_ = morphs[i]
+        doc_x[i].lemma_ = lemmas[i]
+        doc_x[i].pos_ = poses[i]
+        doc_x[i].dep_ = deps[i]
+        doc_x[i].head = doc_x[heads[i]]
+    doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
+    doc_x.is_parsed = True
+    doc_x.is_tagged = True
+
+    return doc_x
 
 
 def merge_conllu_subtokens(lines, doc):
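Note on the rewritten conllu2docs body above: instead of accumulating JSON sentence dicts, it buffers one Doc per sentence and merges every n_sents of them with Doc.from_docs, flushing any shorter remainder at the end. The same batching pattern in isolation, as a runnable sketch (assumes Doc.from_docs as used on this branch; the English pipeline and texts are invented):

from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
sent_docs = [nlp(text) for text in ["One sentence.", "Another one.", "A third."]]

docs, batch = [], []
for sent_doc in sent_docs:
    batch.append(sent_doc)
    if len(batch) == 2:  # n_sents
        docs.append(Doc.from_docs(batch))
        batch = []
if batch:  # flush the final, shorter batch
    docs.append(Doc.from_docs(batch))
print([doc.text for doc in docs])  # two merged docs: 2 sentences + 1 sentence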
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index ca0f3710f..e8928f33a 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,14 +1,10 @@
 import pytest
 
-from spacy.gold import docs_to_json
-from spacy.gold.converters import iob2docs, conll_ner2docs
-from spacy.gold.converters.conllu2json import conllu2json
+from spacy.gold import docs_to_json, biluo_tags_from_offsets
+from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
 from spacy.cli.pretrain import make_docs
 
-# TODO
-# from spacy.gold.converters import conllu2docs
-
 
 def test_cli_converters_conllu2json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@@ -19,8 +15,9 @@ def test_cli_converters_conllu2json():
         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
     ]
     input_data = "\n".join(lines)
-    converted = conllu2json(input_data, n_sents=1)
-    assert len(converted) == 1
+    converted_docs = conllu2docs(input_data, n_sents=1)
+    assert len(converted_docs) == 1
+    converted = [docs_to_json(converted_docs)]
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
@@ -31,7 +28,9 @@
     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
     assert [t["head"] for t in tokens] == [1, 2, -1, 0]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
-    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
+    ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
+    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
 
 
 @pytest.mark.parametrize(
@@ -55,11 +54,12 @@
 )
 def test_cli_converters_conllu2json_name_ner_map(lines):
     input_data = "\n".join(lines)
-    converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
-    assert len(converted) == 1
+    converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
+    assert len(converted_docs) == 1
+    converted = [docs_to_json(converted_docs)]
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
-    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår."
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. "
     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
     sent = converted[0]["paragraphs"][0]["sentences"][0]
     assert len(sent["tokens"]) == 5
@@ -68,7 +68,9 @@
     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
     assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
-    assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
+    ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
+    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
 def test_cli_converters_conllu2json_subtokens():
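Note on the reworked NER assertions: docs_to_json now emits entities as character offsets rather than per-token tags, so the updated tests recover token-level BILUO tags from those offsets via biluo_tags_from_offsets. A minimal round trip with a plain English tokenizer (sentence and offsets invented for illustration):

from spacy.gold import biluo_tags_from_offsets
from spacy.lang.en import English

nlp = English()
doc = nlp("Dommer Finn Eilertsen avstår")
# (start_char, end_char, label) triples, the shape found under
# json["paragraphs"][0]["entities"]
offsets = [(7, 21, "PER")]
print(biluo_tags_from_offsets(doc, offsets))  # ['O', 'B-PER', 'L-PER', 'O']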
" assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 sent = converted[0]["paragraphs"][0]["sentences"][0] assert len(sent["tokens"]) == 4 @@ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens(): assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] assert [t["head"] for t in tokens] == [1, 1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] - assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] + ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + assert biluo_tags == ["O", "U-PER", "O", "O"] def test_cli_converters_iob2json(en_vocab):