From 5e7191932250787b057f5c867f68830be2204dae Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jun 2020 14:27:35 +0200 Subject: [PATCH] avoid writing temp dir in json2docs, fixing 4402 test --- spacy/gold/converters/json2docs.py | 16 ++++++---------- spacy/gold/corpus.py | 10 +++++----- spacy/gold/example.pyx | 2 +- spacy/gold/gold_io.pyx | 2 +- spacy/tests/regression/test_issue4402.py | 24 +++++++++++++++--------- 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py index 8f94e169e..3ca821893 100644 --- a/spacy/gold/converters/json2docs.py +++ b/spacy/gold/converters/json2docs.py @@ -2,7 +2,7 @@ import tempfile import contextlib import shutil from pathlib import Path -from ..gold_io import read_json_file +from ..gold_io import json_to_annotations from ..example import annotations2doc from ..example import _fix_legacy_dict_data, _parse_example_dict_data from ...util import load_model @@ -19,13 +19,9 @@ def make_tempdir(): def json2docs(input_data, model=None, **kwargs): nlp = load_model(model) if model is not None else MultiLanguage() docs = [] - with make_tempdir() as tmp_dir: - json_path = Path(tmp_dir) / "data.json" - with (json_path).open("w") as file_: - file_.write(input_data) - for json_annot in read_json_file(json_path): - example_dict = _fix_legacy_dict_data(json_annot) - tok_dict, doc_dict = _parse_example_dict_data(example_dict) - doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) - docs.append(doc) + for json_annot in json_to_annotations(input_data): + example_dict = _fix_legacy_dict_data(json_annot) + tok_dict, doc_dict = _parse_example_dict_data(example_dict) + doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) + docs.append(doc) return docs diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 25252a1ca..1244e2516 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -43,7 +43,7 @@ class Corpus: locs.append(path) return locs - def make_examples(self, nlp, reference_docs, **kwargs): + def make_examples(self, nlp, reference_docs): for reference in reference_docs: predicted = nlp.make_doc(reference.text) yield Example(predicted, reference) @@ -72,15 +72,15 @@ class Corpus: i += 1 return n - def train_dataset(self, nlp, shuffle=True, **kwargs): + def train_dataset(self, nlp, shuffle=True): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) - examples = self.make_examples(nlp, ref_docs, **kwargs) + examples = self.make_examples(nlp, ref_docs) if shuffle: examples = list(examples) random.shuffle(examples) yield from examples - def dev_dataset(self, nlp, **kwargs): + def dev_dataset(self, nlp): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc)) - examples = self.make_examples(nlp, ref_docs, **kwargs) + examples = self.make_examples(nlp, ref_docs) yield from examples diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index ac8246585..5aa799996 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -9,7 +9,6 @@ from .align cimport Alignment from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .align import Alignment from ..errors import Errors, AlignmentError -from ..structs cimport TokenC from ..syntax import nonproj @@ -19,6 +18,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) if array.size: output = output.from_array(attrs, array) + # TODO: links ?! output.cats.update(doc_annot.get("cats", {})) return output diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 2d105b6cd..aa8273bfb 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -2,7 +2,7 @@ import warnings import srsly from .. import util from ..errors import Warnings -from ..tokens import Token, Doc +from ..tokens import Doc from .iob_utils import biluo_tags_from_offsets diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index 71ed7ec14..462bb8ea1 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -1,24 +1,31 @@ -import srsly from spacy.gold import Corpus from spacy.lang.en import English from ..util import make_tempdir +from ...gold.converters import json2docs +from ...tokens import DocBin def test_issue4402(): nlp = English() with make_tempdir() as tmpdir: - json_path = tmpdir / "test4402.json" - srsly.write_json(json_path, json_data) + output_file = tmpdir / "test4402.spacy" + docs = json2docs(json_data) + data = DocBin(docs=docs, attrs =["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - corpus = Corpus(str(json_path), str(json_path)) + train_data = list(corpus.train_dataset(nlp)) + assert len(train_data) == 2 - train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0)) - # assert that the data got split into 4 sentences - assert len(train_data) == 4 + split_train_data = [] + for eg in train_data: + split_train_data.extend(eg.split_sents()) + assert len(split_train_data) == 4 -json_data = [ +json_data =\ { "id": 0, "paragraphs": [ @@ -89,4 +96,3 @@ json_data = [ }, ], } -]