diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index bdb574126..95fdb10d5 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -428,7 +428,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): try: weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) except KeyError as e: - raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys()))) + raise KeyError(Errors.E983.format(dict='score_weights', key=str(e), keys=list(scores.keys()))) scores["speed"] = wps return weighted_score, scores @@ -577,7 +577,7 @@ def setup_printer(training, nlp): ] except KeyError as e: raise KeyError( - Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) + Errors.E983.format(dict='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) try: scores = [ @@ -585,7 +585,7 @@ def setup_printer(training, nlp): for col in score_cols ] except KeyError as e: - raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) + raise KeyError(Errors.E983.format(dict='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) data = ( [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] ) diff --git a/spacy/errors.py b/spacy/errors.py index 27fc91496..1f035afcf 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -580,13 +580,14 @@ class Errors(object): "table, which contains {n_rows} vectors.") # TODO: fix numbering after merging develop into master + E979 = ("Cannot convert {type} to an Example object.") E980 = ("Each link annotation should refer to a dictionary with at most one " "identifier mapping to 1.0, and all others to 0.0.") E981 = ("The offsets of the annotations for 'links' need to refer exactly " "to the offsets of the 'entities' annotations.") E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " "into {values}, but found {value}.") - E983 = ("Invalid key for '{dict_name}': {key}. Available keys: " + E983 = ("Invalid key for '{dict}': {key}. Available keys: " "{keys}") E985 = ("The pipeline component '{component}' is already available in the base " "model. The settings in the component block in the config file are " diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 01de682fd..f3eaabc4e 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -216,11 +216,11 @@ class GoldCorpus(object): examples = example.split_sents() else: examples = [example] - for ex in examples: - if (not max_length) or len(ex.predicted) < max_length: + for eg in examples: + if (not max_length) or len(eg.predicted) < max_length: if ignore_misaligned: try: - _ = ex._deprecated_get_gold() + _ = eg._deprecated_get_gold() except AlignmentError: continue - yield ex + yield eg diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 107c63cba..1fc671010 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -4,7 +4,6 @@ from ..tokens import Token from ..tokens.doc cimport Doc from ..attrs import IDS from .align cimport Alignment -from .annotation import TokenAnnotation, DocAnnotation from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .align import Alignment from ..errors import Errors, AlignmentError @@ -13,7 +12,7 @@ from ..errors import Errors, AlignmentError cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot): # TODO: Improve and test this words = tok_annot.get("ORTH", [tok.text for tok in predicted]) - attrs, array = _annot2array(predicted.vocab, tok_annot, doc_annot) + attrs, array = _annot2array(predicted, tok_annot, doc_annot) output = Doc(predicted.vocab, words=words) if array.size: output = output.from_array(attrs, array) @@ -63,8 +62,6 @@ cdef class Example: @property def alignment(self): if self._alignment is None: - if self.doc is None: - return None spacy_words = [token.orth_ for token in self.predicted] gold_words = [token.orth_ for token in self.reference] if gold_words == []: @@ -99,6 +96,7 @@ cdef class Example: return { "doc_annotation": { "cats": dict(self.reference.cats), + "entities": biluo_tags_from_doc(self.reference), "links": [], # TODO }, "token_annotation": { @@ -110,8 +108,7 @@ cdef class Example: "morphs": [t.morph_ for t in self.reference], "heads": [t.head.i for t in self.reference], "deps": [t.dep_ for t in self.reference], - "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference], - "entities": biluo_tags_from_doc(self.reference) + "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference] } } @@ -142,21 +139,21 @@ cdef class Example: return self.x.text -def _annot2array(vocab, tok_annot, doc_annot): +def _annot2array(predicted, tok_annot, doc_annot): attrs = [] values = [] for key, value in doc_annot.items(): if key == "entities": - words = tok_annot["ORTH"] - ent_iobs, ent_types = _parse_ner_tags(vocab, words, value) + words = tok_annot.get("ORTH", [tok.text for tok in predicted]) + ent_iobs, ent_types = _parse_ner_tags(predicted.vocab, words, value) tok_annot["ENT_IOB"] = ent_iobs tok_annot["ENT_TYPE"] = ent_types elif key == "links": entities = doc_annot.get("entities", {}) if value and not entities: raise ValueError(Errors.E981) - ent_kb_ids = _parse_links(vocab, words, value, entities) + ent_kb_ids = _parse_links(predicted.vocab, words, value, entities) tok_annot["ENT_KB_ID"] = ent_kb_ids elif key == "cats": pass @@ -176,7 +173,7 @@ def _annot2array(vocab, tok_annot, doc_annot): values.append(value) elif key == "MORPH": attrs.append(key) - values.append([vocab.morphology.add(v) for v in value]) + values.append([predicted.vocab.morphology.add(v) for v in value]) elif key == "ENT_IOB": iob_strings = Token.iob_strings() attrs.append(key) @@ -186,7 +183,7 @@ def _annot2array(vocab, tok_annot, doc_annot): raise ValueError(Errors.E982.format(values=iob_strings, value=values)) else: attrs.append(key) - values.append([vocab.strings.add(v) for v in value]) + values.append([predicted.vocab.strings.add(v) for v in value]) array = numpy.asarray(values, dtype="uint64") return attrs, array.T @@ -227,12 +224,12 @@ def _fix_legacy_dict_data(predicted, example_dict): old_token_dict = token_dict token_dict = {} for key, value in old_token_dict.items(): - if key in ("text", "ids", "entities", "ner", "brackets"): + if key in ("text", "ids", "brackets"): pass elif key in remapping: token_dict[remapping[key]] = value else: - raise ValueError(f"Unknown attr: {key}") + raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys())) if "HEAD" in token_dict and "SENT_START" in token_dict: # If heads are set, we don't also redundantly specify SENT_START. token_dict.pop("SENT_START") diff --git a/spacy/language.py b/spacy/language.py index b9829b543..510c64d5b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -526,6 +526,23 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) + def _convert_examples(self, examples): + converted_examples = [] + if isinstance(examples, tuple): + examples = [examples] + for eg in examples: + if isinstance(eg, Example): + converted_examples.append(eg) + elif isinstance(eg, tuple): + doc, annot = eg + if isinstance(doc, str): + doc = self.make_doc(doc) + converted_examples.append(Example.from_dict(doc, annot)) + else: + raise ValueError(Errors.E979.format(type=type(eg))) + return converted_examples + + def update( self, examples, @@ -553,7 +570,7 @@ class Language(object): if len(examples) == 0: return - examples = Example.to_example_objects(examples, make_doc=self.make_doc) + examples = self._convert_examples(examples) if sgd is None: if self._optimizer is None: @@ -601,7 +618,7 @@ class Language(object): # TODO: document if len(examples) == 0: return - examples = Example.to_example_objects(examples, make_doc=self.make_doc) + examples = self._convert_examples(examples) if sgd is None: if self._optimizer is None: self._optimizer = create_default_optimizer() @@ -640,8 +657,8 @@ class Language(object): for name, proc in self.pipeline: if hasattr(proc, "preprocess_gold"): examples = proc.preprocess_gold(examples) - for ex in examples: - yield ex + for eg in examples: + yield eg def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg): """Allocate models, pre-process training data and acquire a trainer and @@ -723,7 +740,7 @@ class Language(object): DOCS: https://spacy.io/api/language#evaluate """ - examples = Example.to_example_objects(examples) + examples = self._convert_examples(examples) if scorer is None: scorer = Scorer(pipeline=self.pipeline) if component_cfg is None: @@ -738,7 +755,7 @@ class Language(object): docs = pipe.pipe(docs, **kwargs) for doc, eg in zip(docs, examples): if verbose: - print(ex.doc) + print(doc) eg.predicted = doc kwargs = component_cfg.get("scorer", {}) kwargs.setdefault("verbose", verbose) @@ -1189,9 +1206,9 @@ def _pipe(examples, proc, kwargs): for arg in ["n_threads", "batch_size"]: if arg in kwargs: kwargs.pop(arg) - for ex in examples: - ex = proc(ex, **kwargs) - yield ex + for eg in examples: + eg = proc(eg, **kwargs) + yield eg def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 227b5f6af..b693e4fd6 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -91,9 +91,9 @@ class Morphologizer(Tagger): correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) known_labels = numpy.ones((scores.shape[0], 1), dtype="f") - for ex in examples: - pos_tags = ex.get_aligned("POS") - morphs = ex.get_aligned("MORPH") + for eg in examples: + pos_tags = eg.get_aligned("POS") + morphs = eg.get_aligned("MORPH") for i in range(len(morphs)): pos = pos_tags[i] morph = morphs[i] @@ -116,7 +116,7 @@ class Morphologizer(Tagger): d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() - docs = [ex.doc for ex in examples] + docs = [eg.doc for eg in examples] d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index c674046af..692c74a38 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -72,8 +72,7 @@ class SimpleNER(Pipe): def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): if not any(_has_ner(eg) for eg in examples): return 0 - examples = Example.to_example_objects(examples) - docs = [ex.doc for ex in examples] + docs = [eg.doc for eg in examples] set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) loss, d_scores = self.get_loss(examples, scores) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index de30a55f0..69582908a 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -97,7 +97,6 @@ class Tok2Vec(Pipe): """ if losses is None: losses = {} - examples = Example.to_example_objects(examples) docs = [eg.doc for eg in examples] if isinstance(docs, Doc): docs = [docs] diff --git a/spacy/scorer.py b/spacy/scorer.py index 706e0cbc9..71cbc019a 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,6 +1,5 @@ import numpy as np -from .gold import tags_to_entities, GoldParse, DocAnnotation from .errors import Errors @@ -273,7 +272,7 @@ class Scorer(object): } def score(self, example, verbose=False, punct_labels=("p", "punct")): - """Update the evaluation scores from a single Doc / GoldParse pair. + """Update the evaluation scores from a single Example. example (Example): The predicted annotations + correct annotations. verbose (bool): Print debugging information. @@ -283,17 +282,9 @@ class Scorer(object): DOCS: https://spacy.io/api/scorer#score """ - if isinstance(example, tuple) and len(example) == 2: - doc, gold = example - else: - gold = example._deprecated_get_gold() - doc = example.doc - - if len(doc) != len(gold): - doc_annotation = DocAnnotation(cats=gold.cats) - token_annotation = gold.orig - gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation) - orig = gold.orig + doc = example.predicted + gold_doc = example.reference + align = example.alignment gold_deps = set() gold_deps_per_dep = {} gold_tags = set() @@ -301,28 +292,28 @@ class Scorer(object): gold_morphs = set() gold_morphs_per_feat = {} gold_sent_starts = set() - gold_ents = set(tags_to_entities(orig.entities)) - for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts): - gold_tags.add((id_, tag)) - gold_pos.add((id_, pos)) - gold_morphs.add((id_, morph)) - if morph: - for feat in morph.split("|"): + for gold_i, token in enumerate(gold_doc): + gold_tags.add((gold_i, token.tag_)) + gold_pos.add((gold_i, token.pos_)) + gold_morphs.add((gold_i, token.morph_)) + if token.morph_: + for feat in token.morph_.split("|"): field, values = feat.split("=") if field not in self.morphs_per_feat: self.morphs_per_feat[field] = PRFScore() if field not in gold_morphs_per_feat: gold_morphs_per_feat[field] = set() - gold_morphs_per_feat[field].add((id_, feat)) - if sent_start: - gold_sent_starts.add(id_) - if dep not in (None, "") and dep.lower() not in punct_labels: - gold_deps.add((id_, head, dep.lower())) - if dep.lower() not in self.labelled_per_dep: - self.labelled_per_dep[dep.lower()] = PRFScore() - if dep.lower() not in gold_deps_per_dep: - gold_deps_per_dep[dep.lower()] = set() - gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower())) + gold_morphs_per_feat[field].add((gold_i, feat)) + if token.sent_start: + gold_sent_starts.add(gold_i) + dep = token.dep_.lower() + if dep not in punct_labels: + gold_deps.add((gold_i, token.head.i, dep)) + if dep not in self.labelled_per_dep: + self.labelled_per_dep[dep] = PRFScore() + if dep not in gold_deps_per_dep: + gold_deps_per_dep[dep] = set() + gold_deps_per_dep[dep].add((gold_i, token.head.i, dep)) cand_deps = set() cand_deps_per_dep = {} cand_tags = set() @@ -333,7 +324,7 @@ class Scorer(object): for token in doc: if token.orth_.isspace(): continue - gold_i = gold.cand_to_gold[token.i] + gold_i = align.cand_to_gold[token.i] if gold_i is None: self.tokens.fp += 1 else: @@ -352,7 +343,7 @@ class Scorer(object): if token.is_sent_start: cand_sent_starts.add(gold_i) if token.dep_.lower() not in punct_labels and token.orth_.strip(): - gold_head = gold.cand_to_gold[token.head.i] + gold_head = align.cand_to_gold[token.head.i] # None is indistinct, so we can't just add it to the set # Multiple (None, None) deps are possible if gold_i is None or gold_head is None: @@ -367,35 +358,36 @@ class Scorer(object): cand_deps_per_dep[token.dep_.lower()].add( (gold_i, gold_head, token.dep_.lower()) ) - if "-" not in [token[-1] for token in orig.entities]: - # Find all NER labels in gold and doc - ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents]) - # Set up all labels for per type scoring and prepare gold per type - gold_per_ents = {ent_label: set() for ent_label in ent_labels} - for ent_label in ent_labels: - if ent_label not in self.ner_per_ents: - self.ner_per_ents[ent_label] = PRFScore() - gold_per_ents[ent_label].update( - [x for x in gold_ents if x[0] == ent_label] - ) - # Find all candidate labels, for all and per type - cand_ents = set() - cand_per_ents = {ent_label: set() for ent_label in ent_labels} - for ent in doc.ents: - first = gold.cand_to_gold[ent.start] - last = gold.cand_to_gold[ent.end - 1] - if first is None or last is None: - self.ner.fp += 1 - self.ner_per_ents[ent.label_].fp += 1 - else: - cand_ents.add((ent.label_, first, last)) - cand_per_ents[ent.label_].add((ent.label_, first, last)) - # Scores per ent - for k, v in self.ner_per_ents.items(): - if k in cand_per_ents: - v.score_set(cand_per_ents[k], gold_per_ents[k]) - # Score for all ents - self.ner.score_set(cand_ents, gold_ents) + # Find all NER labels in gold and doc + ent_labels = set([k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents]) + # Set up all labels for per type scoring and prepare gold per type + gold_per_ents = {ent_label: set() for ent_label in ent_labels} + for ent_label in ent_labels: + if ent_label not in self.ner_per_ents: + self.ner_per_ents[ent_label] = PRFScore() + # Find all candidate labels, for all and per type + gold_ents = set() + for ent in gold_doc.ents: + gold_ent = (ent.label_, ent.start, ent.end - 1) + gold_ents.add(gold_ent) + gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1)) + cand_ents = set() + cand_per_ents = {ent_label: set() for ent_label in ent_labels} + for ent in doc.ents: + first = align.cand_to_gold[ent.start] + last = align.cand_to_gold[ent.end - 1] + if first is None or last is None: + self.ner.fp += 1 + self.ner_per_ents[ent.label_].fp += 1 + else: + cand_ents.add((ent.label_, first, last)) + cand_per_ents[ent.label_].add((ent.label_, first, last)) + # Scores per ent + for k, v in self.ner_per_ents.items(): + if k in cand_per_ents: + v.score_set(cand_per_ents[k], gold_per_ents[k]) + # Score for all ents + self.ner.score_set(cand_ents, gold_ents) self.tags.score_set(cand_tags, gold_tags) self.pos.score_set(cand_pos, gold_pos) self.morphs.score_set(cand_morphs, gold_morphs) @@ -411,38 +403,38 @@ class Scorer(object): set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps) ) if ( - len(gold.cats) > 0 - and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats) - and set(gold.cats) == set(doc.cats) + len(gold_doc.cats) > 0 + and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold_doc.cats) + and set(gold_doc.cats) == set(doc.cats) ): - goldcat = max(gold.cats, key=gold.cats.get) + goldcat = max(gold_doc.cats, key=gold_doc.cats.get) candcat = max(doc.cats, key=doc.cats.get) if self.textcat_positive_label: self.textcat.score_set( set([self.textcat_positive_label]) & set([candcat]), set([self.textcat_positive_label]) & set([goldcat]), ) - for label in set(gold.cats): + for label in set(gold_doc.cats): self.textcat_auc_per_cat[label].score_set( - doc.cats[label], gold.cats[label] + doc.cats[label], gold_doc.cats[label] ) self.textcat_f_per_cat[label].score_set( set([label]) & set([candcat]), set([label]) & set([goldcat]) ) elif len(self.textcat_f_per_cat) > 0: model_labels = set(self.textcat_f_per_cat) - eval_labels = set(gold.cats) + eval_labels = set(gold_doc.cats) raise ValueError( Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) ) elif len(self.textcat_auc_per_cat) > 0: model_labels = set(self.textcat_auc_per_cat) - eval_labels = set(gold.cats) + eval_labels = set(gold_doc.cats) raise ValueError( Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) ) if verbose: - gold_words = orig.words + gold_words = gold_doc.words for w_id, h_id, dep in cand_deps - gold_deps: print("F", gold_words[w_id], dep, gold_words[h_id]) for w_id, h_id, dep in gold_deps - cand_deps: diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index f7b8bc266..eef5723f3 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -23,7 +23,6 @@ from ..compat import copy_array from ..errors import Errors, TempErrors from ..util import link_vectors_to_models, create_default_optimizer from .. import util -from . import _beam_utils from . import nonproj @@ -260,10 +259,9 @@ class ParserStepModel(Model): def mark_class_seen(self, class_): self._class_mask[class_] = 1 - def get_token_ids(self, batch): - states = _beam_utils.collect_states(batch) + def get_token_ids(self, states): cdef StateClass state - states = [state for state in states if not state.is_final()] + states = [state for state in states() if not state.is_final()] cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF), dtype='i', order='C') ids.fill(-1) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 8c9050351..14b350067 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -263,8 +263,6 @@ cdef class Parser: free(is_valid) def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): - examples = Example.to_example_objects(examples) - if losses is None: losses = {} losses.setdefault(self.name, 0.) @@ -275,7 +273,7 @@ cdef class Parser: states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final() and g is not None] # Prepare the stepwise model, and get the callback for finishing the batch - model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples]) + model, backprop_tok2vec = self.model.begin_update([eg.doc for eg in examples]) all_states = list(states) for _ in range(max_steps): if not states_golds: @@ -291,13 +289,12 @@ cdef class Parser: if sgd is not None: self.model.finish_update(sgd) if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.doc for eg in examples] self.set_annotations(docs, all_states) return losses def rehearse(self, examples, sgd=None, losses=None, **cfg): """Perform a "rehearsal" update, to prevent catastrophic forgetting.""" - examples = Example.to_example_objects(examples) if losses is None: losses = {} for multitask in self._multitasks: @@ -307,7 +304,7 @@ cdef class Parser: return None losses.setdefault(self.name, 0.) - docs = [ex.doc for ex in examples] + docs = [eg.doc for eg in examples] states = self.moves.init_batch(docs) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 363366eeb..9da89e947 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -1,6 +1,5 @@ import itertools import pytest -from spacy.gold import GoldParse from spacy.language import Language from spacy.tokens import Doc, Span from spacy.vocab import Vocab @@ -19,7 +18,6 @@ def nlp(): return nlp -@pytest.mark.xfail # TODO def test_language_update(nlp): text = "hello world" annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} @@ -32,7 +30,7 @@ def test_language_update(nlp): # Update badly with pytest.raises(ValueError): nlp.update((doc, None)) - with pytest.raises(TypeError): + with pytest.raises(KeyError): nlp.update((text, wrongkeyannots)) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index aea4c71ab..3cacf82ae 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -1,5 +1,5 @@ import pytest -from spacy.gold.example as Example +from spacy.gold.example import Example from spacy.tokens import Doc from spacy.vocab import Vocab @@ -7,19 +7,19 @@ from spacy.vocab import Vocab def test_Example_init_requires_doc_objects(): vocab = Vocab() with pytest.raises(TypeError): - eg = Example(None, None) + example = Example(None, None) with pytest.raises(TypeError): - eg = Example(Doc(vocab, words=["hi"]), None) + example = Example(Doc(vocab, words=["hi"]), None) with pytest.raises(TypeError): - eg = Example(None, Doc(vocab, words=["hi"])) + example = Example(None, Doc(vocab, words=["hi"])) def test_Example_from_dict_basic(): - eg = Example.from_dict( + example = Example.from_dict( Doc(Vocab(), words=["hello", "world"]), {"words": ["hello", "world"]} ) - assert isinstance(eg.x, Doc) - assert isinstance(eg.y, Doc) + assert isinstance(example.x, Doc) + assert isinstance(example.y, Doc) @pytest.mark.parametrize( @@ -36,8 +36,8 @@ def test_Example_from_dict_invalid(annots): def test_Example_from_dict_with_tags(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) - eg = Example.from_dict(predicted, annots) - for i, token in enumerate(eg.reference): + example = Example.from_dict(predicted, annots) + for i, token in enumerate(example.reference): assert token.tag_ == annots["tags"][i] @@ -54,8 +54,8 @@ def test_Example_from_dict_with_tags(annots): def test_Example_from_dict_with_parse(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) - eg = Example.from_dict(predicted, annots) - for i, token in enumerate(eg.reference): + example = Example.from_dict(predicted, annots) + for i, token in enumerate(example.reference): assert token.dep_ == annots["deps"][i] assert token.head.i == annots["heads"][i] @@ -77,8 +77,8 @@ def test_Example_from_dict_with_parse(annots): def test_Example_from_dict_with_morphology(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) - eg = Example.from_dict(predicted, annots) - for i, token in enumerate(eg.reference): + example = Example.from_dict(predicted, annots) + for i, token in enumerate(example.reference): assert token.morph_ == annots["morphs"][i] @@ -94,9 +94,9 @@ def test_Example_from_dict_with_morphology(annots): def test_Example_from_dict_with_sent_start(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) - eg = Example.from_dict(predicted, annots) - assert len(list(eg.reference.sents)) == 2 - for i, token in enumerate(eg.reference): + example = Example.from_dict(predicted, annots) + assert len(list(example.reference.sents)) == 2 + for i, token in enumerate(example.reference): assert bool(token.is_sent_start) == bool(annots["sent_starts"][i]) @@ -112,11 +112,11 @@ def test_Example_from_dict_with_sent_start(annots): def test_Example_from_dict_with_cats(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) - eg = Example.from_dict(predicted, annots) - assert len(list(eg.reference.cats)) == 3 - assert eg.reference.cats["cat1"] == 1.0 - assert eg.reference.cats["cat2"] == 0.0 - assert eg.reference.cats["cat3"] == 0.5 + example = Example.from_dict(predicted, annots) + assert len(list(example.reference.cats)) == 3 + assert example.reference.cats["cat1"] == 1.0 + assert example.reference.cats["cat2"] == 0.0 + assert example.reference.cats["cat3"] == 0.5 @pytest.mark.parametrize( @@ -131,18 +131,18 @@ def test_Example_from_dict_with_cats(annots): def test_Example_from_dict_with_entities(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) - eg = Example.from_dict(predicted, annots) - assert len(list(eg.reference.ents)) == 2 - assert eg.reference[0].ent_iob_ == "O" - assert eg.reference[1].ent_iob_ == "O" - assert eg.reference[2].ent_iob_ == "B" - assert eg.reference[3].ent_iob_ == "I" - assert eg.reference[4].ent_iob_ == "O" - assert eg.reference[5].ent_iob_ == "B" - assert eg.reference[6].ent_iob_ == "O" - assert eg.reference[2].ent_type_ == "LOC" - assert eg.reference[3].ent_type_ == "LOC" - assert eg.reference[5].ent_type_ == "LOC" + example = Example.from_dict(predicted, annots) + assert len(list(example.reference.ents)) == 2 + assert example.reference[0].ent_iob_ == "O" + assert example.reference[1].ent_iob_ == "O" + assert example.reference[2].ent_iob_ == "B" + assert example.reference[3].ent_iob_ == "I" + assert example.reference[4].ent_iob_ == "O" + assert example.reference[5].ent_iob_ == "B" + assert example.reference[6].ent_iob_ == "O" + assert example.reference[2].ent_type_ == "LOC" + assert example.reference[3].ent_type_ == "LOC" + assert example.reference[5].ent_type_ == "LOC" @pytest.mark.parametrize( @@ -158,14 +158,14 @@ def test_Example_from_dict_with_entities(annots): def test_Example_from_dict_with_links(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) - eg = Example.from_dict(predicted, annots) - assert eg.reference[0].ent_kb_id_ == "" - assert eg.reference[1].ent_kb_id_ == "" - assert eg.reference[2].ent_kb_id_ == "Q60" - assert eg.reference[3].ent_kb_id_ == "Q60" - assert eg.reference[4].ent_kb_id_ == "" - assert eg.reference[5].ent_kb_id_ == "Q64" - assert eg.reference[6].ent_kb_id_ == "" + example = Example.from_dict(predicted, annots) + assert example.reference[0].ent_kb_id_ == "" + assert example.reference[1].ent_kb_id_ == "" + assert example.reference[2].ent_kb_id_ == "Q60" + assert example.reference[3].ent_kb_id_ == "Q60" + assert example.reference[4].ent_kb_id_ == "" + assert example.reference[5].ent_kb_id_ == "Q64" + assert example.reference[6].ent_kb_id_ == "" @pytest.mark.parametrize( diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 5eaf8d5b3..5574b7d6a 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,7 +1,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx -from spacy.gold import Example, GoldParse, TokenAnnotation +from spacy.gold import Example from spacy.gold.iob_utils import biluo_tags_from_offsets from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve @@ -90,8 +90,9 @@ def test_las_per_type(en_vocab): heads=([h - i for i, h in enumerate(annot["heads"])]), deps=annot["deps"], ) - gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) - scorer.score((doc, gold)) + gold = {"heads": annot["heads"], "deps": annot["deps"]} + example = Example.from_dict(doc, gold) + scorer.score(example) results = scorer.scores assert results["uas"] == 100 @@ -112,9 +113,10 @@ def test_las_per_type(en_vocab): heads=([h - i for i, h in enumerate(annot["heads"])]), deps=annot["deps"], ) - gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) + gold = {"heads": annot["heads"], "deps": annot["deps"]} doc[0].dep_ = "compound" - scorer.score((doc, gold)) + example = Example.from_dict(doc, gold) + scorer.score(example) results = scorer.scores assert results["uas"] == 100 @@ -137,10 +139,7 @@ def test_ner_per_type(en_vocab): ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], ) entities = biluo_tags_from_offsets(doc, annot["entities"]) - ex = Example( - doc=doc, - token_annotation=TokenAnnotation(entities=entities) - ) + ex = Example.from_dict(doc, {"entities": entities}) scorer.score(ex) results = scorer.scores @@ -161,10 +160,7 @@ def test_ner_per_type(en_vocab): ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], ) entities = biluo_tags_from_offsets(doc, annot["entities"]) - ex = Example( - doc=doc, - token_annotation=TokenAnnotation(entities=entities) - ) + ex = Example.from_dict(doc, {"entities": entities}) scorer.score(ex) results = scorer.scores @@ -188,13 +184,13 @@ def test_ner_per_type(en_vocab): def test_tag_score(tagged_doc): # Gold and Doc are identical scorer = Scorer() - gold = GoldParse( - tagged_doc, - tags=[t.tag_ for t in tagged_doc], - pos=[t.pos_ for t in tagged_doc], - morphs=[t.morph_ for t in tagged_doc] - ) - scorer.score((tagged_doc, gold)) + gold = { + "tags": [t.tag_ for t in tagged_doc], + "pos": [t.pos_ for t in tagged_doc], + "morphs": [t.morph_ for t in tagged_doc], + } + example = Example.from_dict(tagged_doc, gold) + scorer.score(example) results = scorer.scores assert results["tags_acc"] == 100 @@ -211,8 +207,9 @@ def test_tag_score(tagged_doc): morphs = [t.morph_ for t in tagged_doc] morphs[1] = "Number=sing" morphs[2] = "Number=plur" - gold = GoldParse(tagged_doc, tags=tags, pos=pos, morphs=morphs) - scorer.score((tagged_doc, gold)) + gold = {"tags": tags, "pos": pos, "morphs": morphs} + example = Example.from_dict(tagged_doc, gold) + scorer.score(example) results = scorer.scores assert results["tags_acc"] == 90 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 22a6b0830..601d4f4a7 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -704,7 +704,7 @@ cdef class Doc: for id_ in py_attr_ids] except KeyError as msg: keys = [k for k in IDS.keys() if not k.startswith("FLAG")] - raise KeyError(Errors.E983.format(dict_name="IDS", key=msg, keys=keys)) + raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) # Make an array from the attributes --- otherwise our inner loop is # Python dict iteration. cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")