diff --git a/examples/training/create_kb.py b/examples/training/create_kb.py
index cbdb5c05b..5b17bb59e 100644
--- a/examples/training/create_kb.py
+++ b/examples/training/create_kb.py
@@ -30,7 +30,7 @@ ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
     model=("Model name, should have pretrained word embeddings", "positional", None, str),
     output_dir=("Optional output directory", "option", "o", Path),
 )
-def main(model=None, output_dir=None):
+def main(model, output_dir=None):
     """Load the model and create the KB with pre-defined entity encodings.
     If an output_dir is provided, the KB will be stored there in a file 'kb'.
     The updated vocab will also be written to a directory in the output_dir."""
diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py
index 8f94e169e..3ca821893 100644
--- a/spacy/gold/converters/json2docs.py
+++ b/spacy/gold/converters/json2docs.py
@@ -2,7 +2,7 @@ import tempfile
 import contextlib
 import shutil
 from pathlib import Path
-from ..gold_io import read_json_file
+from ..gold_io import json_to_annotations
 from ..example import annotations2doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
@@ -19,13 +19,9 @@ def make_tempdir():
 def json2docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
     docs = []
-    with make_tempdir() as tmp_dir:
-        json_path = Path(tmp_dir) / "data.json"
-        with (json_path).open("w") as file_:
-            file_.write(input_data)
-        for json_annot in read_json_file(json_path):
-            example_dict = _fix_legacy_dict_data(json_annot)
-            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
-            docs.append(doc)
+    for json_annot in json_to_annotations(input_data):
+        example_dict = _fix_legacy_dict_data(json_annot)
+        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+        doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+        docs.append(doc)
     return docs
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index 9efa71ff7..1244e2516 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -1,5 +1,3 @@
-import srsly
-from pathlib import Path
 import random
 from .. import util
 from .example import Example
@@ -7,21 +5,23 @@ from ..tokens import DocBin


 class Corpus:
-    """An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing and NER.
+    """An annotated corpus, reading train and dev datasets from
+    the DocBin (.spacy) format.

     DOCS: https://spacy.io/api/goldcorpus
     """

     def __init__(self, train_loc, dev_loc, limit=0):
-        """Create a GoldCorpus.
+        """Create a Corpus.

         train (str / Path): File or directory of training data.
         dev (str / Path): File or directory of development data.
-        RETURNS (GoldCorpus): The newly created object.
+        limit (int): Max. number of examples returned
+        RETURNS (Corpus): The newly created object.
""" self.train_loc = train_loc self.dev_loc = dev_loc + self.limit = limit @staticmethod def walk_corpus(path): @@ -43,12 +43,12 @@ class Corpus: locs.append(path) return locs - def make_examples(self, nlp, reference_docs, **kwargs): + def make_examples(self, nlp, reference_docs): for reference in reference_docs: predicted = nlp.make_doc(reference.text) yield Example(predicted, reference) - def read_docbin(self, vocab, locs, limit=0): + def read_docbin(self, vocab, locs): """ Yield training examples as example dicts """ i = 0 for loc in locs: @@ -57,6 +57,9 @@ class Corpus: with loc.open("rb") as file_: doc_bin = DocBin().from_bytes(file_.read()) yield from doc_bin.get_docs(vocab) + i += len(doc_bin) # TODO: should we restrict to EXACTLY the limit ? + if i >= self.limit: + break def count_train(self, nlp): """Returns count of words in train examples""" @@ -64,20 +67,20 @@ class Corpus: i = 0 for example in self.train_dataset(nlp): n += len(example.predicted) - if self.limit and i >= self.limit: + if i >= self.limit: break i += 1 return n - def train_dataset(self, nlp, shuffle=True, **kwargs): + def train_dataset(self, nlp, shuffle=True): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) - examples = self.make_examples(nlp, ref_docs, **kwargs) + examples = self.make_examples(nlp, ref_docs) if shuffle: examples = list(examples) random.shuffle(examples) yield from examples - def dev_dataset(self, nlp, **kwargs): + def dev_dataset(self, nlp): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc)) - examples = self.make_examples(nlp, ref_docs, **kwargs) + examples = self.make_examples(nlp, ref_docs) yield from examples diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index bfc0eb1e8..20f1a783e 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -1,3 +1,5 @@ +import warnings + import numpy from ..tokens import Token @@ -8,7 +10,6 @@ from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_do from .iob_utils import spans_from_biluo_tags from .align import Alignment from ..errors import Errors, AlignmentError -from ..structs cimport TokenC from ..syntax import nonproj @@ -18,6 +19,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) if array.size: output = output.from_array(attrs, array) + # TODO: links ?! output.cats.update(doc_annot.get("cats", {})) return output @@ -262,24 +264,23 @@ def _annot2array(vocab, tok_annot, doc_annot): values = [] for key, value in doc_annot.items(): - if key == "entities": - if value: + if value: + if key == "entities": words = tok_annot["ORTH"] spaces = tok_annot["SPACY"] ent_iobs, ent_types = _parse_ner_tags(value, vocab, words, spaces) tok_annot["ENT_IOB"] = ent_iobs tok_annot["ENT_TYPE"] = ent_types - elif key == "links": - if value: + elif key == "links": entities = doc_annot.get("entities", {}) if value and not entities: raise ValueError(Errors.E981) ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities) tok_annot["ENT_KB_ID"] = ent_kb_ids - elif key == "cats": - pass - else: - raise ValueError(f"Unknown doc attribute: {key}") + elif key == "cats": + pass + else: + raise ValueError(f"Unknown doc attribute: {key}") for key, value in tok_annot.items(): if key not in IDS: @@ -356,6 +357,7 @@ def _fix_legacy_dict_data(example_dict): if "HEAD" in token_dict and "SENT_START" in token_dict: # If heads are set, we don't also redundantly specify SENT_START. 
token_dict.pop("SENT_START") + warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set") return { "token_annotation": token_dict, "doc_annotation": doc_dict diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 2d105b6cd..aa8273bfb 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -2,7 +2,7 @@ import warnings import srsly from .. import util from ..errors import Warnings -from ..tokens import Token, Doc +from ..tokens import Doc from .iob_utils import biluo_tags_from_offsets diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index b3a9e0815..6f154bc81 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -48,9 +48,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): def mlm_forward(model, docs, is_train): mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) - output, backprop = model.get_ref("wrapped-model").begin_update( - docs - ) # drop=drop + output, backprop = model.get_ref("wrapped-model").begin_update(docs) def mlm_backward(d_output): d_output *= 1 - mask diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index b1bed1ea1..e329601da 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -147,7 +147,7 @@ def hash_char_embed_bilstm_v1( @registry.architectures.register("spacy.LayerNormalizedMaxout.v1") def LayerNormalizedMaxout(width, maxout_pieces): - return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,) + return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True) @registry.architectures.register("spacy.MultiHashEmbed.v1") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index ee143ae0d..7199e229f 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -7,10 +7,10 @@ from spacy.pipeline.defaults import default_ner from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown +from spacy.gold import Example from spacy.tokens import Doc from ..util import make_tempdir -from ...gold import Example TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index 71ed7ec14..462bb8ea1 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -1,24 +1,31 @@ -import srsly from spacy.gold import Corpus from spacy.lang.en import English from ..util import make_tempdir +from ...gold.converters import json2docs +from ...tokens import DocBin def test_issue4402(): nlp = English() with make_tempdir() as tmpdir: - json_path = tmpdir / "test4402.json" - srsly.write_json(json_path, json_data) + output_file = tmpdir / "test4402.spacy" + docs = json2docs(json_data) + data = DocBin(docs=docs, attrs =["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - corpus = Corpus(str(json_path), str(json_path)) + train_data = list(corpus.train_dataset(nlp)) + assert len(train_data) == 2 - train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0)) - # assert that the data got split into 4 sentences - assert len(train_data) == 4 + split_train_data = [] + for eg in train_data: + split_train_data.extend(eg.split_sents()) 
+        assert len(split_train_data) == 4


-json_data = [
+json_data =\
     {
         "id": 0,
         "paragraphs": [
@@ -89,4 +96,3 @@ json_data = [
         },
     ],
 }
-]
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index d0a05d48b..1adbeb68a 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -5,7 +5,7 @@ from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
 from spacy.lang.en import English
 from spacy.syntax.nonproj import is_nonproj_tree
-from spacy.tokens import Doc
+from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, compounding, minibatch
 import pytest
 import srsly
@@ -349,9 +349,7 @@ def test_iob_to_biluo():
         iob_to_biluo(bad_iob)


-# This test is outdated as we use DocBin now. It should probably be removed?
-@pytest.mark.xfail(reason="Outdated")
-def test_roundtrip_docs_to_json(doc):
+def test_roundtrip_docs_to_docbin(doc):
     nlp = English()
     text = doc.text
     idx = [t.idx for t in doc]
@@ -364,14 +362,18 @@ def test_roundtrip_docs_to_json(doc):
     cats = doc.cats
     ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]

-    # roundtrip to JSON
+    # roundtrip to DocBin
    with make_tempdir() as tmpdir:
         json_file = tmpdir / "roundtrip.json"
         srsly.write_json(json_file, [docs_to_json(doc)])
         goldcorpus = Corpus(str(json_file), str(json_file))
-
+        output_file = tmpdir / "roundtrip.spacy"
+        data = DocBin(docs=[doc]).to_bytes()
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
         reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
-        assert len(doc) == goldcorpus.count_train()
+        assert len(doc) == goldcorpus.count_train(nlp)
         assert text == reloaded_example.reference.text
         assert idx == [t.idx for t in reloaded_example.reference]
         assert tags == [t.tag_ for t in reloaded_example.reference]
@@ -425,14 +427,14 @@

 # We probably want the orth variant logic back, but this test won't be quite
 # right -- we need to go from DocBin.
-@pytest.mark.xfail(reason="Outdated")
 def test_make_orth_variants(doc):
     nlp = English()
     with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        # write to JSON train dicts
-        srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = Corpus(str(json_file), str(json_file))
+        output_file = tmpdir / "roundtrip.spacy"
+        data = DocBin(docs=[doc]).to_bytes()
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

         # due to randomness, test only that this runs with no errors for now
         train_example = next(goldcorpus.train_dataset(nlp))
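
Usage note (not part of the patch): a minimal sketch of the workflow these changes enable, with Corpus reading DocBin (.spacy) data instead of training JSON. The file name and example text are illustrative only.

    from spacy.gold import Corpus
    from spacy.lang.en import English
    from spacy.tokens import DocBin

    nlp = English()
    # Pack Docs into the binary DocBin (.spacy) format and write it to disk.
    docs = [nlp.make_doc("Who is Shaka Khan?")]
    with open("train.spacy", "wb") as file_:
        file_.write(DocBin(docs=docs).to_bytes())

    # Corpus now reads .spacy files directly and yields Example objects that
    # pair a freshly tokenized "predicted" Doc with the stored reference Doc.
    corpus = Corpus(train_loc="train.spacy", dev_loc="train.spacy")
    train_examples = list(corpus.train_dataset(nlp))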