spaCy/spacy/cli/converters/conllu2json.py
Sofie Van Landeghem e48a09df4e Example class for training data (#4543)
* OrigAnnot class instead of gold.orig_annot list of zipped tuples

* from_orig to replace from_annot_tuples

* rename to RawAnnot

* some unit tests for GoldParse creation and internal format

* removing orig_annot and switching to lists instead of tuple

* rewriting tuples to use RawAnnot (+ debug statements, WIP)

* fix pop() changing the data

* small fixes

* pop-append fixes

* return RawAnnot for existing GoldParse to have uniform interface

* clean up imports

* fix merge_sents

* add unit test for 4402 with new structure (not working yet)

* introduce DocAnnot

* typo fixes

* add unit test for merge_sents

* rename from_orig to from_raw

* fixing unit tests

* fix nn parser

* read_annots to produce text, doc_annot pairs

* _make_golds fix

* rename golds_to_gold_annots

* small fixes

* fix encoding

* have golds_to_gold_annots use DocAnnot

* missed a spot

* merge_sents as function in DocAnnot

* allow specifying only part of the token-level annotations

* refactor with Example class + underlying dicts

* pipeline components to work with Example objects (wip)

* input checking

* fix yielding

* fix calls to update

* small fixes

* fix scorer unit test with new format

* fix kwargs order

* fixes for ud and conllu scripts

* fix reading data for conllu script

* add in proper errors (not fixed numbering yet to avoid merge conflicts)

* fixing few more small bugs

* fix EL script
2019-11-11 17:35:27 +01:00

# coding: utf8
from __future__ import unicode_literals

import re

from spacy.gold import Example
from ...gold import iob_to_biluo


def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
"""
Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is
useful for languages such as Spanish, where UD tags are not so rich.
Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme
"""
# by @dvsrepo, via #11 explosion/spacy-dev-resources
# by @katarkor
docs = []
sentences = []
conll_data = read_conllx(input_data, use_morphology=use_morphology)
checked_for_ner = False
has_ner_tags = False
for i, example in enumerate(conll_data):
for token_annotation in example.token_annotations:
if not checked_for_ner:
has_ner_tags = is_ner(token_annotation.entities[0])
checked_for_ner = True
sentences.append(generate_sentence(token_annotation, has_ner_tags))
        # Real-sized documents could be extracted using the comments on the
        # CoNLL-U document
        if len(sentences) % n_sents == 0:
            doc = create_doc(sentences, i)
            docs.append(doc)
            sentences = []
    # Flush any leftover sentences so an incomplete final batch is not
    # silently dropped
    if sentences:
        doc = create_doc(sentences, i)
        docs.append(doc)
    return docs
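

# A minimal usage sketch (hypothetical file name, not part of this module):
# given a CoNLL-U string with tab-separated columns, the converter produces
# one dict per batch of n_sents sentences.
#
#     conllu_text = open("train.conllu", encoding="utf8").read()
#     docs = conllu2json(conllu_text, n_sents=10)
#     # docs -> [{"id": ..., "paragraphs": [{"sentences": [...]}]}, ...]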


def is_ner(tag):
"""
Check the 10th column of the first token to determine if the file contains
NER tags
"""
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
if tag_match:
return True
elif tag == "O":
return True
else:
return False
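

# For example, "B-PER" and "U-LOC" match the tag pattern and "O" is accepted
# directly, so both indicate NER annotation; a bare "_" (or any other
# unmatched value) in the tenth column does not, and the file is treated as
# having no NER tags.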


def read_conllx(input_data, use_morphology=False, n=0):
""" Yield example data points, one for each sentence """
i = 0
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
            while lines and lines[0].startswith("#"):
lines.pop(0)
ids, words, tags, heads, deps, ents = [], [], [], [], [], []
for line in lines:
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
if "-" in id_ or "." in id_:
continue
try:
id_ = int(id_) - 1
head = (int(head) - 1) if head != "0" else id_
dep = "ROOT" if dep == "root" else dep
tag = pos if tag == "_" else tag
tag = tag + "__" + morph if use_morphology else tag
iob = iob if iob else "O"
ids.append(id_)
words.append(word)
tags.append(tag)
heads.append(head)
deps.append(dep)
ents.append(iob)
except: # noqa: E722
print(line)
raise
example = Example(doc=None)
example.add_token_annotation(ids=ids, words=words, tags=tags,
heads=heads, deps=deps, entities=ents)
yield example
i += 1
if 1 <= n <= i:
break
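

# A hypothetical three-token CoNLL-U sentence (columns are tab-separated:
# ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC):
#
#     1   They    they    PRON   PRP   Case=Nom      2   nsubj   _   O
#     2   buy     buy     VERB   VBP   Number=Plur   0   root    _   O
#     3   books   book    NOUN   NNS   Number=Plur   2   obj     _   O
#
# yields one Example whose token annotation has ids=[0, 1, 2] (zero-based),
# heads=[1, 1, 1] (the root token points to itself) and deps=["nsubj",
# "ROOT", "obj"].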


def simplify_tags(iob):
"""
    Simplify tags obtained from the dataset in order to follow the Wikipedia
    scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
    'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG', and all remaining
    tags to 'MISC'.
"""
new_iob = []
for tag in iob:
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
if tag_match:
prefix = tag_match.group(1)
suffix = tag_match.group(2)
if suffix == "GPE_LOC":
suffix = "LOC"
elif suffix == "GPE_ORG":
suffix = "ORG"
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
suffix = "MISC"
tag = prefix + "-" + suffix
new_iob.append(tag)
return new_iob
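

# For instance (hypothetical tags), ["B-GPE_LOC", "I-GPE_ORG", "B-EVT", "O"]
# is simplified to ["B-LOC", "I-ORG", "B-MISC", "O"]: the GPE_* suffixes are
# collapsed into LOC/ORG, unknown entity types fall back to MISC, and tags
# that do not match the prefix-suffix pattern (like "O") pass through
# unchanged.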


def generate_sentence(token_annotation, has_ner_tags):
sentence = {}
tokens = []
if has_ner_tags:
iob = simplify_tags(token_annotation.entities)
biluo = iob_to_biluo(iob)
for i, id in enumerate(token_annotation.ids):
token = {}
token["id"] = id
token["orth"] = token_annotation.words[i]
token["tag"] = token_annotation.tags[i]
token["head"] = token_annotation.heads[i] - id
token["dep"] = token_annotation.deps[i]
if has_ner_tags:
token["ner"] = biluo[i]
tokens.append(token)
sentence["tokens"] = tokens
return sentence
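

# A generated sentence dict looks like (hypothetical values):
#
#     {"tokens": [{"id": 0, "orth": "They", "tag": "PRP", "head": 1,
#                  "dep": "nsubj", "ner": "O"}, ...]}
#
# Note that "head" is stored as a relative offset (head index minus token id),
# which is the representation spaCy's JSON training format expects.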


def create_doc(sentences, id):
doc = {}
paragraph = {}
doc["id"] = id
doc["paragraphs"] = []
paragraph["sentences"] = sentences
doc["paragraphs"].append(paragraph)
return doc
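

# The resulting document wraps all sentences of one batch in a single
# paragraph, e.g. (hypothetical id):
#
#     {"id": 9, "paragraphs": [{"sentences": [sentence_dict, ...]}]}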