Update test

Matthew Honnibal 2020-06-22 14:59:05 +02:00
commit b250f6b62f
10 changed files with 67 additions and 60 deletions

View File

@@ -30,7 +30,7 @@ ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
     model=("Model name, should have pretrained word embeddings", "positional", None, str),
     output_dir=("Optional output directory", "option", "o", Path),
 )
-def main(model=None, output_dir=None):
+def main(model, output_dir=None):
     """Load the model and create the KB with pre-defined entity encodings.
     If an output_dir is provided, the KB will be stored there in a file 'kb'.
     The updated vocab will also be written to a directory in the output_dir."""
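
The annotation tuples above follow the (help, kind, abbreviation, type) convention of the plac argument parser that spaCy's example scripts used at the time. Below is a minimal, self-contained sketch of what the changed signature means in practice: with the default removed, the model name becomes a required positional argument, while the output directory stays optional. The plac.call wiring and the toy body are assumptions, not the script's actual implementation.

from pathlib import Path

import plac


@plac.annotations(
    model=("Model name, should have pretrained word embeddings", "positional", None, str),
    output_dir=("Optional output directory", "option", "o", Path),
)
def main(model, output_dir=None):
    # Toy stand-in for the real KB-building logic.
    print(f"Would build a KB from '{model}', writing to {output_dir or 'memory only'}")


if __name__ == "__main__":
    # e.g. python create_kb.py en_core_web_md -o /tmp/kb_output
    plac.call(main)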

View File

@@ -2,7 +2,7 @@ import tempfile
 import contextlib
 import shutil
 from pathlib import Path
-from ..gold_io import read_json_file
+from ..gold_io import json_to_annotations
 from ..example import annotations2doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
@@ -19,13 +19,9 @@ def make_tempdir():
 def json2docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
     docs = []
-    with make_tempdir() as tmp_dir:
-        json_path = Path(tmp_dir) / "data.json"
-        with (json_path).open("w") as file_:
-            file_.write(input_data)
-        for json_annot in read_json_file(json_path):
-            example_dict = _fix_legacy_dict_data(json_annot)
-            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
-            docs.append(doc)
+    for json_annot in json_to_annotations(input_data):
+        example_dict = _fix_legacy_dict_data(json_annot)
+        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+        doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+        docs.append(doc)
     return docs
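
With this change the converter iterates over the loaded JSON training data directly instead of writing it to a temporary file and reading it back. The sketch below mirrors the call pattern used by the regression test later in this commit; the absolute import path spacy.gold.converters, the toy document, and the output file name are assumptions for this development branch.

from spacy.gold.converters import json2docs
from spacy.tokens import DocBin

# A tiny document in spaCy's v2 JSON training format ("head" is a relative offset).
json_data = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "I like cats .",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                        {"id": 1, "orth": "like", "tag": "VBP", "head": 0, "dep": "ROOT", "ner": "O"},
                        {"id": 2, "orth": "cats", "tag": "NNS", "head": -1, "dep": "dobj", "ner": "O"},
                        {"id": 3, "orth": ".", "tag": ".", "head": -2, "dep": "punct", "ner": "O"},
                    ],
                    "brackets": [],
                }
            ],
        }
    ],
}

# No temp-file round trip: the data structure is converted to Doc objects in memory.
docs = json2docs(json_data)

# The Docs can then be packed into the binary .spacy format used for training.
data = DocBin(docs=docs).to_bytes()
with open("train.spacy", "wb") as file_:
    file_.write(data)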

View File

@@ -1,5 +1,3 @@
-import srsly
-from pathlib import Path
 import random
 from .. import util
 from .example import Example
@@ -7,21 +5,23 @@ from ..tokens import DocBin
 class Corpus:
-    """An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing and NER.
+    """An annotated corpus, reading train and dev datasets from
+    the DocBin (.spacy) format.

     DOCS: https://spacy.io/api/goldcorpus
     """

     def __init__(self, train_loc, dev_loc, limit=0):
-        """Create a GoldCorpus.
+        """Create a Corpus.

         train (str / Path): File or directory of training data.
         dev (str / Path): File or directory of development data.
-        RETURNS (GoldCorpus): The newly created object.
+        limit (int): Max. number of examples returned
+        RETURNS (Corpus): The newly created object.
         """
         self.train_loc = train_loc
         self.dev_loc = dev_loc
+        self.limit = limit

     @staticmethod
     def walk_corpus(path):
@@ -43,12 +43,12 @@ class Corpus:
                 locs.append(path)
         return locs

-    def make_examples(self, nlp, reference_docs, **kwargs):
+    def make_examples(self, nlp, reference_docs):
         for reference in reference_docs:
             predicted = nlp.make_doc(reference.text)
             yield Example(predicted, reference)

-    def read_docbin(self, vocab, locs, limit=0):
+    def read_docbin(self, vocab, locs):
         """ Yield training examples as example dicts """
         i = 0
         for loc in locs:
@@ -57,6 +57,9 @@ class Corpus:
                 with loc.open("rb") as file_:
                     doc_bin = DocBin().from_bytes(file_.read())
                 yield from doc_bin.get_docs(vocab)
+                i += len(doc_bin)  # TODO: should we restrict to EXACTLY the limit ?
+                if i >= self.limit:
+                    break

     def count_train(self, nlp):
         """Returns count of words in train examples"""
@@ -64,20 +67,20 @@ class Corpus:
         i = 0
         for example in self.train_dataset(nlp):
             n += len(example.predicted)
-            if self.limit and i >= self.limit:
+            if i >= self.limit:
                 break
             i += 1
         return n

-    def train_dataset(self, nlp, shuffle=True, **kwargs):
+    def train_dataset(self, nlp, shuffle=True):
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
+        examples = self.make_examples(nlp, ref_docs)
         if shuffle:
             examples = list(examples)
             random.shuffle(examples)
         yield from examples

-    def dev_dataset(self, nlp, **kwargs):
+    def dev_dataset(self, nlp):
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
+        examples = self.make_examples(nlp, ref_docs)
         yield from examples
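
Corpus now reads the binary DocBin (.spacy) format rather than JSON. Here is a short usage sketch following the pattern of the tests updated in this commit: serialize reference Docs with DocBin, write them to a .spacy file, and point Corpus at it. The blank pipeline, example sentences, and file name are illustrative assumptions.

import spacy
from spacy.gold import Corpus
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp("Anna lives in Berlin."), nlp("She works at a bakery.")]

# Pack the reference Docs into the binary .spacy format expected by Corpus.
data = DocBin(docs=docs).to_bytes()
with open("train.spacy", "wb") as file_:
    file_.write(data)

# train_loc / dev_loc may be a single .spacy file or a directory of them.
corpus = Corpus(train_loc="train.spacy", dev_loc="train.spacy")
for example in corpus.train_dataset(nlp):
    print(example.reference.text)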

View File

@@ -1,3 +1,5 @@
+import warnings
+
 import numpy
 from ..tokens import Token
@@ -8,7 +10,6 @@ from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_do
 from .iob_utils import spans_from_biluo_tags
 from .align import Alignment
 from ..errors import Errors, AlignmentError
-from ..structs cimport TokenC
 from ..syntax import nonproj
@@ -18,6 +19,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if array.size:
         output = output.from_array(attrs, array)
+    # TODO: links ?!
     output.cats.update(doc_annot.get("cats", {}))
     return output
@@ -262,24 +264,23 @@ def _annot2array(vocab, tok_annot, doc_annot):
     values = []
     for key, value in doc_annot.items():
-        if key == "entities":
-            if value:
+        if value:
+            if key == "entities":
                 words = tok_annot["ORTH"]
                 spaces = tok_annot["SPACY"]
                 ent_iobs, ent_types = _parse_ner_tags(value, vocab, words, spaces)
                 tok_annot["ENT_IOB"] = ent_iobs
                 tok_annot["ENT_TYPE"] = ent_types
-        elif key == "links":
-            if value:
+            elif key == "links":
                 entities = doc_annot.get("entities", {})
                 if value and not entities:
                     raise ValueError(Errors.E981)
                 ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities)
                 tok_annot["ENT_KB_ID"] = ent_kb_ids
-        elif key == "cats":
-            pass
-        else:
-            raise ValueError(f"Unknown doc attribute: {key}")
+            elif key == "cats":
+                pass
+            else:
+                raise ValueError(f"Unknown doc attribute: {key}")

     for key, value in tok_annot.items():
         if key not in IDS:
@@ -356,6 +357,7 @@ def _fix_legacy_dict_data(example_dict):
     if "HEAD" in token_dict and "SENT_START" in token_dict:
         # If heads are set, we don't also redundantly specify SENT_START.
         token_dict.pop("SENT_START")
+        warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set")
     return {
         "token_annotation": token_dict,
         "doc_annotation": doc_dict

View File

@@ -2,7 +2,7 @@ import warnings
 import srsly
 from .. import util
 from ..errors import Warnings
-from ..tokens import Token, Doc
+from ..tokens import Doc
 from .iob_utils import biluo_tags_from_offsets

View File

@@ -48,9 +48,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
     def mlm_forward(model, docs, is_train):
         mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
         mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
-        output, backprop = model.get_ref("wrapped-model").begin_update(
-            docs
-        )  # drop=drop
+        output, backprop = model.get_ref("wrapped-model").begin_update(docs)

         def mlm_backward(d_output):
             d_output *= 1 - mask

View File

@@ -147,7 +147,7 @@ def hash_char_embed_bilstm_v1(
 @registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
 def LayerNormalizedMaxout(width, maxout_pieces):
-    return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,)
+    return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True)


 @registry.architectures.register("spacy.MultiHashEmbed.v1")

View File

@@ -7,10 +7,10 @@ from spacy.pipeline.defaults import default_ner
 from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab
 from spacy.syntax.ner import BiluoPushDown
+from spacy.gold import Example
 from spacy.tokens import Doc
 from ..util import make_tempdir
-from ...gold import Example

 TRAIN_DATA = [
     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),

View File

@@ -1,24 +1,31 @@
-import srsly
 from spacy.gold import Corpus
 from spacy.lang.en import English
 from ..util import make_tempdir
+from ...gold.converters import json2docs
+from ...tokens import DocBin


 def test_issue4402():
     nlp = English()
     with make_tempdir() as tmpdir:
-        json_path = tmpdir / "test4402.json"
-        srsly.write_json(json_path, json_data)
+        output_file = tmpdir / "test4402.spacy"
+        docs = json2docs(json_data)
+        data = DocBin(docs=docs, attrs=["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

-        corpus = Corpus(str(json_path), str(json_path))
+        train_data = list(corpus.train_dataset(nlp))
+        assert len(train_data) == 2

-        train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
-        # assert that the data got split into 4 sentences
-        assert len(train_data) == 4
+        split_train_data = []
+        for eg in train_data:
+            split_train_data.extend(eg.split_sents())
+        assert len(split_train_data) == 4


-json_data = [
+json_data =\
     {
         "id": 0,
         "paragraphs": [
@@ -89,4 +96,3 @@ json_data = [
             },
         ],
     }
-]

View File

@@ -5,7 +5,7 @@ from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
 from spacy.lang.en import English
 from spacy.syntax.nonproj import is_nonproj_tree
-from spacy.tokens import Doc
+from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, compounding, minibatch
 import pytest
 import srsly
@@ -349,9 +349,7 @@ def test_iob_to_biluo():
         iob_to_biluo(bad_iob)


-# This test is outdated as we use DocBin now. It should probably be removed?
-@pytest.mark.xfail(reason="Outdated")
-def test_roundtrip_docs_to_json(doc):
+def test_roundtrip_docs_to_docbin(doc):
     nlp = English()
     text = doc.text
     idx = [t.idx for t in doc]
@@ -364,14 +362,18 @@ def test_roundtrip_docs_to_json(doc):
     cats = doc.cats
     ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]

-    # roundtrip to JSON
+    # roundtrip to DocBin
     with make_tempdir() as tmpdir:
         json_file = tmpdir / "roundtrip.json"
         srsly.write_json(json_file, [docs_to_json(doc)])
         goldcorpus = Corpus(str(json_file), str(json_file))
+        output_file = tmpdir / "roundtrip.spacy"
+        data = DocBin(docs=[doc]).to_bytes()
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
         reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
-        assert len(doc) == goldcorpus.count_train()
+        assert len(doc) == goldcorpus.count_train(nlp)
         assert text == reloaded_example.reference.text
         assert idx == [t.idx for t in reloaded_example.reference]
         assert tags == [t.tag_ for t in reloaded_example.reference]
@@ -425,14 +427,14 @@ def test_ignore_misaligned(doc):
 # We probably want the orth variant logic back, but this test won't be quite
 # right -- we need to go from DocBin.
-@pytest.mark.xfail(reason="Outdated")
 def test_make_orth_variants(doc):
     nlp = English()

     with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        # write to JSON train dicts
-        srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = Corpus(str(json_file), str(json_file))
+        output_file = tmpdir / "roundtrip.spacy"
+        data = DocBin(docs=[doc]).to_bytes()
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

         # due to randomness, test only that this runs with no errors for now
         train_example = next(goldcorpus.train_dataset(nlp))