From 0d64c435b08921ba57da5d17a7ab409f343abea6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jun 2020 10:05:12 +0200 Subject: [PATCH 1/6] small fixes --- examples/training/create_kb.py | 2 +- spacy/gold/corpus.py | 4 ++-- spacy/gold/example.pyx | 18 ++++++++++-------- spacy/ml/models/multi_task.py | 4 +--- spacy/ml/models/tok2vec.py | 2 +- spacy/tests/parser/test_ner.py | 2 +- spacy/tests/test_gold.py | 6 ++---- 7 files changed, 18 insertions(+), 20 deletions(-) diff --git a/examples/training/create_kb.py b/examples/training/create_kb.py index cbdb5c05b..5b17bb59e 100644 --- a/examples/training/create_kb.py +++ b/examples/training/create_kb.py @@ -30,7 +30,7 @@ ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)} model=("Model name, should have pretrained word embeddings", "positional", None, str), output_dir=("Optional output directory", "option", "o", Path), ) -def main(model=None, output_dir=None): +def main(model, output_dir=None): """Load the model and create the KB with pre-defined entity encodings. If an output_dir is provided, the KB will be stored there in a file 'kb'. The updated vocab will also be written to a directory in the output_dir.""" diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 9efa71ff7..d04a7bb7a 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -14,11 +14,11 @@ class Corpus: """ def __init__(self, train_loc, dev_loc, limit=0): - """Create a GoldCorpus. + """Create a Corpus. train (str / Path): File or directory of training data. dev (str / Path): File or directory of development data. - RETURNS (GoldCorpus): The newly created object. + RETURNS (Corpus): The newly created object. """ self.train_loc = train_loc self.dev_loc = dev_loc diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 7ddc59cda..ac8246585 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -1,3 +1,5 @@ +import warnings + import numpy from ..tokens import Token @@ -204,24 +206,23 @@ def _annot2array(vocab, tok_annot, doc_annot): values = [] for key, value in doc_annot.items(): - if key == "entities": - if value: + if value: + if key == "entities": words = tok_annot["ORTH"] spaces = tok_annot["SPACY"] ent_iobs, ent_types = _parse_ner_tags(value, vocab, words, spaces) tok_annot["ENT_IOB"] = ent_iobs tok_annot["ENT_TYPE"] = ent_types - elif key == "links": - if value: + elif key == "links": entities = doc_annot.get("entities", {}) if value and not entities: raise ValueError(Errors.E981) ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities) tok_annot["ENT_KB_ID"] = ent_kb_ids - elif key == "cats": - pass - else: - raise ValueError(f"Unknown doc attribute: {key}") + elif key == "cats": + pass + else: + raise ValueError(f"Unknown doc attribute: {key}") for key, value in tok_annot.items(): if key not in IDS: @@ -298,6 +299,7 @@ def _fix_legacy_dict_data(example_dict): if "HEAD" in token_dict and "SENT_START" in token_dict: # If heads are set, we don't also redundantly specify SENT_START. 
token_dict.pop("SENT_START") + warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set") return { "token_annotation": token_dict, "doc_annotation": doc_dict diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index b3a9e0815..6f154bc81 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -48,9 +48,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): def mlm_forward(model, docs, is_train): mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) - output, backprop = model.get_ref("wrapped-model").begin_update( - docs - ) # drop=drop + output, backprop = model.get_ref("wrapped-model").begin_update(docs) def mlm_backward(d_output): d_output *= 1 - mask diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index b1bed1ea1..e329601da 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -147,7 +147,7 @@ def hash_char_embed_bilstm_v1( @registry.architectures.register("spacy.LayerNormalizedMaxout.v1") def LayerNormalizedMaxout(width, maxout_pieces): - return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,) + return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True) @registry.architectures.register("spacy.MultiHashEmbed.v1") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 61e25ffee..40723ab20 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -7,10 +7,10 @@ from spacy.pipeline.defaults import default_ner from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown +from spacy.gold import Example from spacy.tokens import Doc from ..util import make_tempdir -from ...gold import Example TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 61b9ca57c..b60dd2d42 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -596,8 +596,6 @@ def test_split_sents(merged_dict): assert token_annotation_2["sent_starts"] == [1, 0, 0, 0] -# This fails on some None value? Need to look into that. -@pytest.mark.xfail # TODO def test_tuples_to_example(vocab, merged_dict): cats = {"TRAVEL": 1.0, "BAKING": 0.0} merged_dict = dict(merged_dict) @@ -607,6 +605,6 @@ def test_tuples_to_example(vocab, merged_dict): assert words == merged_dict["words"] tags = [token.tag_ for token in ex.reference] assert tags == merged_dict["tags"] - sent_starts = [token.is_sent_start for token in ex.reference] + sent_starts = [bool(token.is_sent_start) for token in ex.reference] assert sent_starts == [bool(v) for v in merged_dict["sent_starts"]] - ex.reference.cats == cats + assert ex.reference.cats == cats From 0b3985d307b3ee389f0476d0f9230ab3e0e70bc7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jun 2020 10:22:26 +0200 Subject: [PATCH 2/6] limit arg for Corpus --- spacy/gold/corpus.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index d04a7bb7a..25252a1ca 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -1,5 +1,3 @@ -import srsly -from pathlib import Path import random from .. import util from .example import Example @@ -7,8 +5,8 @@ from ..tokens import DocBin class Corpus: - """An annotated corpus, using the JSON file format. 
Manages - annotations for tagging, dependency parsing and NER. + """An annotated corpus, reading train and dev datasets from + the DocBin (.spacy) format. DOCS: https://spacy.io/api/goldcorpus """ @@ -18,10 +16,12 @@ class Corpus: train (str / Path): File or directory of training data. dev (str / Path): File or directory of development data. + limit (int): Max. number of examples returned RETURNS (Corpus): The newly created object. """ self.train_loc = train_loc self.dev_loc = dev_loc + self.limit = limit @staticmethod def walk_corpus(path): @@ -48,7 +48,7 @@ class Corpus: predicted = nlp.make_doc(reference.text) yield Example(predicted, reference) - def read_docbin(self, vocab, locs, limit=0): + def read_docbin(self, vocab, locs): """ Yield training examples as example dicts """ i = 0 for loc in locs: @@ -57,6 +57,9 @@ class Corpus: with loc.open("rb") as file_: doc_bin = DocBin().from_bytes(file_.read()) yield from doc_bin.get_docs(vocab) + i += len(doc_bin) # TODO: should we restrict to EXACTLY the limit ? + if i >= self.limit: + break def count_train(self, nlp): """Returns count of words in train examples""" @@ -64,7 +67,7 @@ class Corpus: i = 0 for example in self.train_dataset(nlp): n += len(example.predicted) - if self.limit and i >= self.limit: + if i >= self.limit: break i += 1 return n From bb87e8c2b13894e139b7969c5ba721b1bbff6cd9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jun 2020 10:48:04 +0200 Subject: [PATCH 3/6] fix test_roundtrip_docs_to_docbin --- spacy/tests/test_gold.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index b60dd2d42..6ddfd5c28 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -5,7 +5,7 @@ from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree -from spacy.tokens import Doc +from spacy.tokens import Doc, DocBin from spacy.util import get_words_and_spaces, compounding, minibatch import pytest import srsly @@ -349,7 +349,7 @@ def test_iob_to_biluo(): iob_to_biluo(bad_iob) -def test_roundtrip_docs_to_json(doc): +def test_roundtrip_docs_to_docbin(doc): nlp = English() text = doc.text idx = [t.idx for t in doc] @@ -362,14 +362,16 @@ def test_roundtrip_docs_to_json(doc): cats = doc.cats ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - # roundtrip to JSON + # roundtrip to DocBin with make_tempdir() as tmpdir: - json_file = tmpdir / "roundtrip.json" - srsly.write_json(json_file, [docs_to_json(doc)]) - goldcorpus = Corpus(train=str(json_file), dev=str(json_file)) + output_file = tmpdir / "roundtrip.spacy" + data = DocBin(docs=[doc]).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) - assert len(doc) == goldcorpus.count_train() + assert len(doc) == goldcorpus.count_train(nlp) assert text == reloaded_example.reference.text assert idx == [t.idx for t in reloaded_example.reference] assert tags == [t.tag_ for t in reloaded_example.reference] From 8ba8defa7836f70b9178c0860bab50c17af037fc Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jun 2020 11:52:17 +0200 Subject: [PATCH 4/6] fix test_make_orth_variants --- spacy/tests/test_gold.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py 
index 6ddfd5c28..227b9acb0 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -396,10 +396,11 @@ def test_projective_train_vs_nonprojective_dev(doc):
     heads = [t.head.i for t in doc]
 
     with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        # write to JSON train dicts
-        srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = Corpus(str(json_file), str(json_file))
+        output_file = tmpdir / "roundtrip.spacy"
+        data = DocBin(docs=[doc]).to_bytes()
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
 
         train_reloaded_example = next(goldcorpus.train_dataset(nlp))
         train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
@@ -455,10 +456,11 @@ def test_ignore_misaligned(doc):
 def test_make_orth_variants(doc):
     nlp = English()
     with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        # write to JSON train dicts
-        srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = Corpus(str(json_file), str(json_file))
+        output_file = tmpdir / "roundtrip.spacy"
+        data = DocBin(docs=[doc]).to_bytes()
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
 
         # due to randomness, test only that this runs with no errors for now
         train_example = next(goldcorpus.train_dataset(nlp))

From ffddff03b8fe94aa7eb48283c06f72dc260a49ea Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 22 Jun 2020 11:58:59 +0200
Subject: [PATCH 5/6] fix add_label test

---
 spacy/tests/parser/test_add_label.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 87675e94d..5809f16b8 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -54,7 +54,8 @@ def test_add_label(parser):
     losses = {}
     doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
     gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
-    parser.update((doc, gold), sgd=sgd, losses=losses)
+    example = Example.from_dict(doc, gold)
+    parser.update([example], sgd=sgd, losses=losses)
     doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
     doc = parser(doc)
     assert doc[0].dep_ == "right"

From 5e7191932250787b057f5c867f68830be2204dae Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 22 Jun 2020 14:27:35 +0200
Subject: [PATCH 6/6] avoid writing temp dir in json2docs, fixing 4402 test

---
 spacy/gold/converters/json2docs.py       | 16 ++++++----------
 spacy/gold/corpus.py                     | 10 +++++-----
 spacy/gold/example.pyx                   |  2 +-
 spacy/gold/gold_io.pyx                   |  2 +-
 spacy/tests/regression/test_issue4402.py | 24 +++++++++++++++---------
 5 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py
index 8f94e169e..3ca821893 100644
--- a/spacy/gold/converters/json2docs.py
+++ b/spacy/gold/converters/json2docs.py
@@ -2,7 +2,7 @@ import tempfile
 import contextlib
 import shutil
 from pathlib import Path
-from ..gold_io import read_json_file
+from ..gold_io import json_to_annotations
 from ..example import annotations2doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
@@ -19,13 +19,9 @@ def make_tempdir():
 def json2docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
     docs = []
-    with make_tempdir() as tmp_dir:
-        json_path = Path(tmp_dir) / "data.json"
-        with (json_path).open("w") as file_:
-            file_.write(input_data)
-        for json_annot in read_json_file(json_path):
-            example_dict = _fix_legacy_dict_data(json_annot)
-            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
-            docs.append(doc)
+    for json_annot in json_to_annotations(input_data):
+        example_dict = _fix_legacy_dict_data(json_annot)
+        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+        doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+        docs.append(doc)
     return docs
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index 25252a1ca..1244e2516 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -43,7 +43,7 @@ class Corpus:
                 locs.append(path)
         return locs
 
-    def make_examples(self, nlp, reference_docs, **kwargs):
+    def make_examples(self, nlp, reference_docs):
         for reference in reference_docs:
             predicted = nlp.make_doc(reference.text)
             yield Example(predicted, reference)
@@ -72,15 +72,15 @@ class Corpus:
             i += 1
         return n
 
-    def train_dataset(self, nlp, shuffle=True, **kwargs):
+    def train_dataset(self, nlp, shuffle=True):
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
+        examples = self.make_examples(nlp, ref_docs)
         if shuffle:
             examples = list(examples)
             random.shuffle(examples)
         yield from examples
 
-    def dev_dataset(self, nlp, **kwargs):
+    def dev_dataset(self, nlp):
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
+        examples = self.make_examples(nlp, ref_docs)
         yield from examples
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index ac8246585..5aa799996 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -9,7 +9,6 @@ from .align cimport Alignment
 from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 from .align import Alignment
 from ..errors import Errors, AlignmentError
-from ..structs cimport TokenC
 from ..syntax import nonproj
 
 
@@ -19,6 +18,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if array.size:
         output = output.from_array(attrs, array)
+    # TODO: links ?!
     output.cats.update(doc_annot.get("cats", {}))
     return output
 
diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx
index 2d105b6cd..aa8273bfb 100644
--- a/spacy/gold/gold_io.pyx
+++ b/spacy/gold/gold_io.pyx
@@ -2,7 +2,7 @@ import warnings
 import srsly
 from .. import util
 from ..errors import Warnings
-from ..tokens import Token, Doc
+from ..tokens import Doc
 from .iob_utils import biluo_tags_from_offsets
 
 
diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py
index 71ed7ec14..462bb8ea1 100644
--- a/spacy/tests/regression/test_issue4402.py
+++ b/spacy/tests/regression/test_issue4402.py
@@ -1,24 +1,31 @@
-import srsly
 from spacy.gold import Corpus
 from spacy.lang.en import English
 
 from ..util import make_tempdir
+from ...gold.converters import json2docs
+from ...tokens import DocBin
 
 
 def test_issue4402():
     nlp = English()
     with make_tempdir() as tmpdir:
-        json_path = tmpdir / "test4402.json"
-        srsly.write_json(json_path, json_data)
+        output_file = tmpdir / "test4402.spacy"
+        docs = json2docs(json_data)
+        data = DocBin(docs=docs, attrs=["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
 
-        corpus = Corpus(str(json_path), str(json_path))
+        train_data = list(corpus.train_dataset(nlp))
+        assert len(train_data) == 2
 
-        train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
-        # assert that the data got split into 4 sentences
-        assert len(train_data) == 4
+        split_train_data = []
+        for eg in train_data:
+            split_train_data.extend(eg.split_sents())
+        assert len(split_train_data) == 4
 
 
-json_data = [
+json_data = \
     {
         "id": 0,
        "paragraphs": [
@@ -89,4 +96,3 @@
             },
         ],
     }
-]
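
Taken together, these patches replace the JSON training format with DocBin
(.spacy) files read by Corpus. Below is a minimal sketch of the resulting
end-to-end flow, assuming the in-development v3 API exactly as it appears in
the diffs above; the path "train.spacy" and the sample sentence are
illustrative only, and released spaCy versions may expose different
signatures.

    from spacy.gold import Corpus
    from spacy.lang.en import English
    from spacy.tokens import DocBin

    nlp = English()
    doc = nlp.make_doc("Who is Shaka Khan?")
    # Gold annotations now live on the Doc itself rather than in a JSON dict.
    doc.ents = [doc.char_span(7, 17, label="PERSON")]

    # Serialize the annotated Doc(s) to the binary .spacy format, using the
    # same attrs as the test_issue4402 fix above.
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    data = DocBin(docs=[doc], attrs=attrs).to_bytes()
    with open("train.spacy", "wb") as file_:
        file_.write(data)

    # Corpus streams the Docs back out as Example objects for training;
    # with the limit guard in patch 2, limit=0 means no cap on examples.
    corpus = Corpus(train_loc="train.spacy", dev_loc="train.spacy", limit=0)
    for example in corpus.train_dataset(nlp):
        print(example.reference.ents)  # (Shaka Khan,)

Because the gold data is stored as serialized Docs, it round-trips through
the same DocBin machinery as pipeline output, which is what lets the tests
above drop the JSON intermediate entirely.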