Add token whitespace ("space" field) to the JSON output format

This commit is contained in:
svlandeg 2020-06-16 19:30:03 +02:00
parent ba80ad7efd
commit 8b66c11ff2
2 changed files with 7 additions and 4 deletions

View File

@@ -53,7 +53,7 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"):
for j, sent in enumerate(doc.sents):
json_sent = {"tokens": [], "brackets": []}
for token in sent:
json_token = {"id": token.i, "orth": token.text}
json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
if doc.is_tagged:
json_token["tag"] = token.tag_
json_token["pos"] = token.pos_
@@ -91,6 +91,7 @@ def json_to_annotations(doc):
for paragraph in doc["paragraphs"]:
example = {"text": paragraph.get("raw", None)}
words = []
spaces = []
ids = []
tags = []
pos = []
@@ -104,6 +105,7 @@ def json_to_annotations(doc):
sent_start_i = len(words)
for i, token in enumerate(sent["tokens"]):
words.append(token["orth"])
spaces.append(token["space"])
ids.append(token.get('id', sent_start_i + i))
tags.append(token.get('tag', "-"))
pos.append(token.get("pos", ""))
@@ -126,6 +128,7 @@ def json_to_annotations(doc):
example["token_annotation"] = dict(
ids=ids,
words=words,
spaces=spaces,
tags=tags,
pos=pos,
morphs=morphs,

View File

@@ -262,7 +262,7 @@ def test_roundtrip_docs_to_json(doc):
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
cats = doc.cats
ents = doc.ents
ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
# roundtrip to JSON
with make_tempdir() as tmpdir:
@@ -272,7 +272,7 @@ def test_roundtrip_docs_to_json(doc):
reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
assert len(doc) == goldcorpus.count_train()
assert text == reloaded_example.predicted.text
assert text == reloaded_example.reference.text
assert idx == [t.idx for t in reloaded_example.reference]
assert tags == [t.tag_ for t in reloaded_example.reference]
assert pos == [t.pos_ for t in reloaded_example.reference]
@@ -280,7 +280,7 @@ def test_roundtrip_docs_to_json(doc):
assert lemmas == [t.lemma_ for t in reloaded_example.reference]
assert deps == [t.dep_ for t in reloaded_example.reference]
assert heads == [t.head.i for t in reloaded_example.reference]
assert ents == reloaded_example.reference.ents
assert ents == [(e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents]
assert "TRAVEL" in reloaded_example.reference.cats
assert "BAKING" in reloaded_example.reference.cats
assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]