Fix jsonl to json conversion (#3419)

* Fix spacy.gold.docs_to_json function

* Fix jsonl2json converter
This commit is contained in:
Matthew Honnibal 2019-03-17 22:12:54 +01:00 committed by Ines Montani
parent 0a4b074184
commit 47e110375d
2 changed files with 61 additions and 12 deletions

View File

@ -3,18 +3,51 @@ from __future__ import unicode_literals
import srsly
from ...util import get_lang_class
from ...gold import docs_to_json
from ...util import get_lang_class, minibatch
# Convert newline-delimited JSON NER annotations into a list of JSON-serializable
# training documents. A language class is instantiated from `lang` to tokenize the
# raw text, so a missing `lang` raises ValueError.
# `use_morphology` is accepted but not referenced by any statement shown here.
# NOTE(review): this listing has no indentation and no +/- diff markers. The
# statements built on `input_tuples` and the ones built on `input_examples` look
# like two revisions of the same body rendered together -- confirm against the
# repository before treating every line below as live code.
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
if lang is None:
raise ValueError("No --lang specified, but tokenization required")
json_docs = []
# Variant A: JSON-decodes every item obtained by iterating `input_data` directly.
input_tuples = [srsly.json_loads(line) for line in input_data]
# Variant B: treats `input_data` as a single string -- strips it, splits on "\n"
# and JSON-decodes each resulting line.
input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
nlp = get_lang_class(lang)()
# Variant A loop: each decoded item unpacks to (raw_text, ents), where
# ents["entities"] yields (start, end, label) triples. One doc.to_json() result
# is appended per input item.
for i, (raw_text, ents) in enumerate(input_tuples):
doc = nlp.make_doc(raw_text)
doc[0].is_sent_start = True
doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
json_docs.append(doc.to_json())
# Variant B loop: records are grouped with minibatch(size=n_sents); each group
# becomes one docs_to_json() result whose id is the batch index.
sentencizer = nlp.create_pipe("sentencizer")
for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
docs = []
for record in batch:
raw_text = record["text"]
# Annotations are read from "entities" when present, otherwise from "spans".
if "entities" in record:
ents = record["entities"]
else:
ents = record["spans"]
# Turns dicts with "start"/"end"/"label" keys into (start, end, label) triples.
# NOTE(review): presumably this applies only to the "spans" branch, since the
# "entities" items are unpacked as triples elsewhere -- nesting is not visible
# here, confirm.
ents = [(e["start"], e["end"], e["label"]) for e in ents]
doc = nlp.make_doc(raw_text)
# Run the sentencizer pipe on the doc before entities are assigned.
sentencizer(doc)
# The result of char_span is checked for None in _cleanup_spans, which filters
# the spans before they are assigned to doc.ents.
spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
doc.ents = _cleanup_spans(spans)
docs.append(doc)
json_docs.append(docs_to_json(docs, id=i))
return json_docs
def _cleanup_spans(spans):
output = []
seen = set()
for span in spans:
if span is not None:
# Trim whitespace
while len(span) and span[0].is_space:
span = span[1:]
while len(span) and span[-1].is_space:
span = span[:-1]
if not len(span):
continue
for i in range(span.start, span.end):
if i in seen:
break
else:
output.append(span)
seen.update(range(span.start, span.end))
return output

View File

@ -598,19 +598,35 @@ cdef class GoldParse:
self.c.sent_start[i] = 0
# NOTE(review): two signatures appear back to back (`underscore=None` and `id=0`)
# and the body below contains two return paths. This looks like an unmarked diff
# showing both revisions of the function -- confirm which lines are live.
def docs_to_json(docs, underscore=None):
def docs_to_json(docs, id=0):
"""Convert a list of Doc objects into the JSON-serializable format used by
the spacy train command.
docs (iterable / Doc): The Doc object(s) to convert.
underscore (list): Optional list of string names of custom doc._.
attributes. Attribute values need to be JSON-serializable. Values will
be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
id (int): Id for the JSON.
RETURNS (list): The data in spaCy's JSON format.
"""
# NOTE(review): the docstring documents both `underscore` and `id`, and says
# RETURNS (list). The `json_doc` path further down returns a single dict with
# "id" and "paragraphs" keys, not a list.
# A lone Doc is wrapped in a list so both input forms are iterated the same way.
if isinstance(docs, Doc):
docs = [docs]
# Return path 1: one doc.to_json(underscore=...) result per doc, as a list.
return [doc.to_json(underscore=underscore) for doc in docs]
# Return path 2: a single dict holding one "paragraphs" entry per doc.
json_doc = {"id": id, "paragraphs": []}
# The enumerate indices `i` and `j` are not used by the statements shown.
for i, doc in enumerate(docs):
json_para = {'raw': doc.text, "sentences": []}
# Entities are expressed as character offsets and converted to one tag per
# token by biluo_tags_from_offsets; the tags are looked up by token.i below.
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
for j, sent in enumerate(doc.sents):
# "brackets" is created empty and is not filled by any statement shown here.
json_sent = {"tokens": [], "brackets": []}
for token in sent:
json_token = {"id": token.i, "orth": token.text}
# "tag" is written only when doc.is_tagged; "head" and "dep" only when
# doc.is_parsed.
if doc.is_tagged:
json_token["tag"] = token.tag_
if doc.is_parsed:
# "head" is stored as an offset relative to the token's own index.
json_token["head"] = token.head.i-token.i
json_token["dep"] = token.dep_
json_token["ner"] = biluo_tags[token.i]
json_sent["tokens"].append(json_token)
json_para["sentences"].append(json_sent)
json_doc["paragraphs"].append(json_para)
return json_doc
def biluo_tags_from_offsets(doc, entities, missing="O"):