mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 09:00:36 +03:00
Add per-token trailing whitespace (the "space" field) to the JSON output format
This commit is contained in:
parent
ba80ad7efd
commit
8b66c11ff2
|
@ -53,7 +53,7 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"):
|
|||
for j, sent in enumerate(doc.sents):
|
||||
json_sent = {"tokens": [], "brackets": []}
|
||||
for token in sent:
|
||||
json_token = {"id": token.i, "orth": token.text}
|
||||
json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
|
||||
if doc.is_tagged:
|
||||
json_token["tag"] = token.tag_
|
||||
json_token["pos"] = token.pos_
|
||||
|
@ -91,6 +91,7 @@ def json_to_annotations(doc):
|
|||
for paragraph in doc["paragraphs"]:
|
||||
example = {"text": paragraph.get("raw", None)}
|
||||
words = []
|
||||
spaces = []
|
||||
ids = []
|
||||
tags = []
|
||||
pos = []
|
||||
|
@ -104,6 +105,7 @@ def json_to_annotations(doc):
|
|||
sent_start_i = len(words)
|
||||
for i, token in enumerate(sent["tokens"]):
|
||||
words.append(token["orth"])
|
||||
spaces.append(token["space"])
|
||||
ids.append(token.get('id', sent_start_i + i))
|
||||
tags.append(token.get('tag', "-"))
|
||||
pos.append(token.get("pos", ""))
|
||||
|
@ -126,6 +128,7 @@ def json_to_annotations(doc):
|
|||
example["token_annotation"] = dict(
|
||||
ids=ids,
|
||||
words=words,
|
||||
spaces=spaces,
|
||||
tags=tags,
|
||||
pos=pos,
|
||||
morphs=morphs,
|
||||
|
|
|
@ -262,7 +262,7 @@ def test_roundtrip_docs_to_json(doc):
|
|||
deps = [t.dep_ for t in doc]
|
||||
heads = [t.head.i for t in doc]
|
||||
cats = doc.cats
|
||||
ents = doc.ents
|
||||
ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
|
||||
|
||||
# roundtrip to JSON
|
||||
with make_tempdir() as tmpdir:
|
||||
|
@ -272,7 +272,7 @@ def test_roundtrip_docs_to_json(doc):
|
|||
|
||||
reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
|
||||
assert len(doc) == goldcorpus.count_train()
|
||||
assert text == reloaded_example.predicted.text
|
||||
assert text == reloaded_example.reference.text
|
||||
assert idx == [t.idx for t in reloaded_example.reference]
|
||||
assert tags == [t.tag_ for t in reloaded_example.reference]
|
||||
assert pos == [t.pos_ for t in reloaded_example.reference]
|
||||
|
@ -280,7 +280,7 @@ def test_roundtrip_docs_to_json(doc):
|
|||
assert lemmas == [t.lemma_ for t in reloaded_example.reference]
|
||||
assert deps == [t.dep_ for t in reloaded_example.reference]
|
||||
assert heads == [t.head.i for t in reloaded_example.reference]
|
||||
assert ents == reloaded_example.reference.ents
|
||||
assert ents == [(e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents]
|
||||
assert "TRAVEL" in reloaded_example.reference.cats
|
||||
assert "BAKING" in reloaded_example.reference.cats
|
||||
assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
|
||||
|
|
Loading…
Reference in New Issue
Block a user