diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 1c66ff5e9..967bee060 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -53,7 +53,7 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"): for j, sent in enumerate(doc.sents): json_sent = {"tokens": [], "brackets": []} for token in sent: - json_token = {"id": token.i, "orth": token.text} + json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_} if doc.is_tagged: json_token["tag"] = token.tag_ json_token["pos"] = token.pos_ @@ -91,6 +91,7 @@ def json_to_annotations(doc): for paragraph in doc["paragraphs"]: example = {"text": paragraph.get("raw", None)} words = [] + spaces = [] ids = [] tags = [] pos = [] @@ -104,6 +105,7 @@ def json_to_annotations(doc): sent_start_i = len(words) for i, token in enumerate(sent["tokens"]): words.append(token["orth"]) + spaces.append(token["space"]) ids.append(token.get('id', sent_start_i + i)) tags.append(token.get('tag', "-")) pos.append(token.get("pos", "")) @@ -126,6 +128,7 @@ def json_to_annotations(doc): example["token_annotation"] = dict( ids=ids, words=words, + spaces=spaces, tags=tags, pos=pos, morphs=morphs, diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 5fd060088..83489799c 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -262,7 +262,7 @@ def test_roundtrip_docs_to_json(doc): deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] cats = doc.cats - ents = doc.ents + ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents] # roundtrip to JSON with make_tempdir() as tmpdir: @@ -272,7 +272,7 @@ def test_roundtrip_docs_to_json(doc): reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.predicted.text + assert text == reloaded_example.reference.text assert idx == [t.idx for t in reloaded_example.reference] assert tags == [t.tag_ for t in reloaded_example.reference] assert pos == [t.pos_ for t in reloaded_example.reference] @@ -280,7 +280,7 @@ def test_roundtrip_docs_to_json(doc): assert lemmas == [t.lemma_ for t in reloaded_example.reference] assert deps == [t.dep_ for t in reloaded_example.reference] assert heads == [t.head.i for t in reloaded_example.reference] - assert ents == reloaded_example.reference.ents + assert ents == [(e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents] assert "TRAVEL" in reloaded_example.reference.cats assert "BAKING" in reloaded_example.reference.cats assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]