spaCy/spacy/tests/regression/test_issue4402.py
Sofie Van Landeghem e48a09df4e Example class for training data (#4543)
* OrigAnnot class instead of gold.orig_annot list of zipped tuples

* from_orig to replace from_annot_tuples

* rename to RawAnnot

* some unit tests for GoldParse creation and internal format

* removing orig_annot and switching to lists instead of tuple

* rewriting tuples to use RawAnnot (+ debug statements, WIP)

* fix pop() changing the data

* small fixes

* pop-append fixes

* return RawAnnot for existing GoldParse to have uniform interface

* clean up imports

* fix merge_sents

* add unit test for 4402 with new structure (not working yet)

* introduce DocAnnot

* typo fixes

* add unit test for merge_sents

* rename from_orig to from_raw

* fixing unit tests

* fix nn parser

* read_annots to produce text, doc_annot pairs

* _make_golds fix

* rename golds_to_gold_annots

* small fixes

* fix encoding

* have golds_to_gold_annots use DocAnnot

* missed a spot

* merge_sents as function in DocAnnot

* allow specifying only part of the token-level annotations

* refactor with Example class + underlying dicts

* pipeline components to work with Example objects (wip)

* input checking

* fix yielding

* fix calls to update

* small fixes

* fix scorer unit test with new format

* fix kwargs order

* fixes for ud and conllu scripts

* fix reading data for conllu script

* add in proper errors (not fixed numbering yet to avoid merge conflicts)

* fixing few more small bugs

* fix EL script
2019-11-11 17:35:27 +01:00

96 lines
3.9 KiB
Python

# coding: utf8
from __future__ import unicode_literals
import srsly
from spacy.gold import GoldCorpus
from spacy.lang.en import English
from spacy.tests.util import make_tempdir
def test_issue4402():
nlp = English()
with make_tempdir() as tmpdir:
json_path = tmpdir / "test4402.json"
srsly.write_json(json_path, json_data)
corpus = GoldCorpus(str(json_path), str(json_path))
train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
# assert that the data got split into 4 sentences
assert len(train_data) == 4
json_data = [
{
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}
]