# coding: utf8
from __future__ import unicode_literals

from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo


def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False):
    """
    Convert CoNLL-U files carrying BIO NER tags into JSON format for use with
    the train cli. The use_morphology parameter enables appending morphology
    to tags, which is useful for languages such as Spanish, where UD tags are
    not so rich.

    input_path (Path): path to the .conll / .conllu input file.
    output_path (Path): directory the .json output file is written into.
    n_sents (int): number of sentences grouped into each output document.
    use_morphology (bool): append the MORPH column to each tag.
    """
    # by @dvsrepo, via #11 explosion/spacy-dev-resources
    docs = []
    sentences = []
    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)

    last_index = -1
    for i, (raw_text, tokens) in enumerate(conll_tuples):
        sentence, brackets = tokens[0]
        sentences.append(generate_sentence(sentence))
        # Real-sized documents could be extracted using the comments on the
        # CoNLL-U document.
        if len(sentences) % n_sents == 0:
            doc = create_doc(sentences, i)
            docs.append(doc)
            sentences = []
        last_index = i
    # Flush any remaining sentences so that trailing data is not silently
    # dropped when the total is not a multiple of n_sents.
    if sentences:
        docs.append(create_doc(sentences, last_index))

    # Handle ".conllu" before ".conll": the previous code computed the
    # ".conll" replacement first and then unconditionally overwrote it, so
    # ".conll" inputs never received the ".json" extension.
    output_filename = input_path.parts[-1].replace(".conllu", ".json")
    output_filename = output_filename.replace(".conll", ".json")
    output_file = output_path / output_filename
    with output_file.open('w', encoding='utf-8') as f:
        f.write(json_dumps(docs))
    prints("Created %d documents" % len(docs),
           title="Generated output file %s" % path2str(output_file))


def read_conllx(input_path, use_morphology=False, n=0):
    """Yield (raw_text, [[token_tuples, brackets]]) for each sentence block.

    Each token tuple column list is (ids, words, tags, heads, deps, ners).
    n (int): stop after n sentences when n >= 1; 0 reads everything.
    """
    # Read via a context manager so the file handle is always released.
    with input_path.open('r', encoding='utf-8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        lines = sent.strip().split('\n')
        # Skip leading comment lines; the `lines and` guard prevents an
        # IndexError on a block that consists only of comments.
        while lines and lines[0].startswith('#'):
            lines.pop(0)
        if not lines:
            continue
        tokens = []
        for line in lines:
            parts = line.split('\t')
            id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
            # Skip multi-word tokens (id like "4-5") and empty nodes ("4.1").
            if '-' in id_ or '.' in id_:
                continue
            try:
                id_ = int(id_) - 1
                head = (int(head) - 1) if head != '0' else id_
                dep = 'ROOT' if dep == 'root' else dep
                tag = pos if tag == '_' else tag
                tag = tag + '__' + morph if use_morphology else tag
                ner = ner if ner else 'O'
                tokens.append((id_, word, tag, head, dep, ner))
            except Exception:
                # Show the offending line for debugging, then re-raise.
                print(line)
                raise
        tuples = [list(t) for t in zip(*tokens)]
        yield (None, [[tuples, []]])
        i += 1
        if n >= 1 and i >= n:
            break


def generate_sentence(sent):
    """Build a train-JSON sentence dict from column-wise token tuples.

    NER tags are converted from BIO to BILUO; heads are stored as offsets
    relative to each token's own id, as the training format expects.
    """
    (id_, word, tag, head, dep, ner) = sent
    sentence = {}
    tokens = []
    ner = iob_to_biluo(ner)
    # `token_id` avoids shadowing the builtin `id`.
    for i, token_id in enumerate(id_):
        token = {}
        token["orth"] = word[i]
        token["tag"] = tag[i]
        token["head"] = head[i] - token_id
        token["dep"] = dep[i]
        token["ner"] = ner[i]
        tokens.append(token)
    sentence["tokens"] = tokens
    return sentence


def create_doc(sentences, id):
    """Wrap a list of sentence dicts into a single-paragraph document dict."""
    doc = {}
    paragraph = {}
    doc["id"] = id
    doc["paragraphs"] = []
    paragraph["sentences"] = sentences
    doc["paragraphs"].append(paragraph)
    return doc