avoid writing to a temp dir in json2docs, fixing the issue 4402 test

svlandeg 2020-06-22 14:27:35 +02:00
parent ffddff03b8
commit 5e71919322
5 changed files with 28 additions and 26 deletions

View File

@@ -2,7 +2,7 @@ import tempfile
 import contextlib
 import shutil
 from pathlib import Path
-from ..gold_io import read_json_file
+from ..gold_io import json_to_annotations
 from ..example import annotations2doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
@@ -19,11 +19,7 @@ def make_tempdir():
 def json2docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
     docs = []
-    with make_tempdir() as tmp_dir:
-        json_path = Path(tmp_dir) / "data.json"
-        with (json_path).open("w") as file_:
-            file_.write(input_data)
-        for json_annot in read_json_file(json_path):
+    for json_annot in json_to_annotations(input_data):
         example_dict = _fix_legacy_dict_data(json_annot)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
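
With this change the converter runs entirely in memory. A minimal sketch of how it can be called, assuming `json_data` holds parsed v2-style JSON training data (as in the updated regression test further down); the loop over entities is illustrative only:

from spacy.gold.converters import json2docs

# Sketch only: `json_data` stands in for v2-style JSON training data,
# e.g. the `json_data` structure defined in the regression test below.
docs = json2docs(json_data)  # parsed in memory, no temp file round-trip
for doc in docs:
    print(doc.text, [(ent.text, ent.label_) for ent in doc.ents])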

View File

@@ -43,7 +43,7 @@ class Corpus:
                 locs.append(path)
         return locs

-    def make_examples(self, nlp, reference_docs, **kwargs):
+    def make_examples(self, nlp, reference_docs):
         for reference in reference_docs:
             predicted = nlp.make_doc(reference.text)
             yield Example(predicted, reference)
@@ -72,15 +72,15 @@
             i += 1
         return n

-    def train_dataset(self, nlp, shuffle=True, **kwargs):
+    def train_dataset(self, nlp, shuffle=True):
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
+        examples = self.make_examples(nlp, ref_docs)
         if shuffle:
             examples = list(examples)
             random.shuffle(examples)
         yield from examples

-    def dev_dataset(self, nlp, **kwargs):
+    def dev_dataset(self, nlp):
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
+        examples = self.make_examples(nlp, ref_docs)
         yield from examples
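
Note that make_examples(), train_dataset() and dev_dataset() no longer forward **kwargs, so options like gold_preproc or max_length cannot be passed through the corpus anymore; sentence-level splitting is done on the Example objects instead, as the updated test below does. A minimal sketch under that assumption, with placeholder .spacy paths:

from spacy.gold import Corpus
from spacy.lang.en import English

nlp = English()
# Placeholder paths: DocBin-serialized data in the binary .spacy format
corpus = Corpus(train_loc="./train.spacy", dev_loc="./dev.spacy")

train_examples = list(corpus.train_dataset(nlp))  # no **kwargs accepted anymore
split_examples = []
for eg in train_examples:
    split_examples.extend(eg.split_sents())  # per-sentence Examples, handled by the caller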

View File

@@ -9,7 +9,6 @@ from .align cimport Alignment
 from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 from .align import Alignment
 from ..errors import Errors, AlignmentError
-from ..structs cimport TokenC
 from ..syntax import nonproj
@@ -19,6 +18,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if array.size:
         output = output.from_array(attrs, array)
+    # TODO: links ?!
     output.cats.update(doc_annot.get("cats", {}))
     return output
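
For orientation, annotations2doc() turns a token-level annotation dict plus a doc-level annotation dict into a Doc. Only the "ORTH", "SPACY" and "cats" keys are visible in this hunk, so the shapes below are assumptions based on just those; it is an internal helper, so treat this strictly as a sketch:

from spacy.lang.en import English
from spacy.gold.example import annotations2doc

nlp = English()
# Token-level annotations: one list per attribute, aligned by token position
tok_annot = {"ORTH": ["hello", "world"], "SPACY": [True, False]}
# Doc-level annotations, e.g. text categories (entity links are still a TODO above)
doc_annot = {"cats": {"GREETING": 1.0}}
doc = annotations2doc(nlp.vocab, tok_annot, doc_annot)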

View File

@@ -2,7 +2,7 @@ import warnings
 import srsly
 from .. import util
 from ..errors import Warnings
-from ..tokens import Token, Doc
+from ..tokens import Doc
 from .iob_utils import biluo_tags_from_offsets

View File

@@ -1,24 +1,31 @@
-import srsly
 from spacy.gold import Corpus
 from spacy.lang.en import English

 from ..util import make_tempdir
+from ...gold.converters import json2docs
+from ...tokens import DocBin


 def test_issue4402():
     nlp = English()
     with make_tempdir() as tmpdir:
-        json_path = tmpdir / "test4402.json"
-        srsly.write_json(json_path, json_data)
-        corpus = Corpus(str(json_path), str(json_path))
-        train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
-        # assert that the data got split into 4 sentences
-        assert len(train_data) == 4
+        output_file = tmpdir / "test4402.spacy"
+        docs = json2docs(json_data)
+        data = DocBin(docs=docs, attrs=["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
+        train_data = list(corpus.train_dataset(nlp))
+        assert len(train_data) == 2
+        split_train_data = []
+        for eg in train_data:
+            split_train_data.extend(eg.split_sents())
+        assert len(split_train_data) == 4


-json_data = [
+json_data =\
     {
         "id": 0,
         "paragraphs": [
@@ -89,4 +96,3 @@ json_data = [
            },
        ],
    }
-]
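
For completeness, a hedged sketch of reading the serialized data back outside of Corpus, using only the public DocBin API (from_bytes and get_docs as the counterparts of the to_bytes call in the test); `output_file` refers to the path created in the test above:

from spacy.lang.en import English
from spacy.tokens import DocBin

nlp = English()
with output_file.open("rb") as file_:
    doc_bin = DocBin().from_bytes(file_.read())
docs = list(doc_bin.get_docs(nlp.vocab))  # Docs restored with the stored attrs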