Mirror of https://github.com/explosion/spaCy.git
fixing language and scoring tests
commit fd5f199feb
parent b4d914ec77
@@ -428,7 +428,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
         try:
             weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
         except KeyError as e:
-            raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys())))
+            raise KeyError(Errors.E983.format(dict='score_weights', key=str(e), keys=list(scores.keys())))
         scores["speed"] = wps
         return weighted_score, scores
@@ -577,7 +577,7 @@ def setup_printer(training, nlp):
             ]
         except KeyError as e:
             raise KeyError(
-                Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys())))
+                Errors.E983.format(dict='scores (losses)', key=str(e), keys=list(info["losses"].keys())))

         try:
             scores = [
@@ -585,7 +585,7 @@ def setup_printer(training, nlp):
                 for col in score_cols
             ]
         except KeyError as e:
-            raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
+            raise KeyError(Errors.E983.format(dict='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
         data = (
             [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
         )
@@ -580,13 +580,14 @@ class Errors(object):
             "table, which contains {n_rows} vectors.")

     # TODO: fix numbering after merging develop into master
+    E979 = ("Cannot convert {type} to an Example object.")
     E980 = ("Each link annotation should refer to a dictionary with at most one "
             "identifier mapping to 1.0, and all others to 0.0.")
     E981 = ("The offsets of the annotations for 'links' need to refer exactly "
             "to the offsets of the 'entities' annotations.")
     E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
             "into {values}, but found {value}.")
-    E983 = ("Invalid key for '{dict_name}': {key}. Available keys: "
+    E983 = ("Invalid key for '{dict}': {key}. Available keys: "
            "{keys}")
     E985 = ("The pipeline component '{component}' is already available in the base "
             "model. The settings in the component block in the config file are "
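Note: aside from adding E979, these hunks only rename the E983 placeholder from {dict_name} to {dict} and update every call site to pass dict= instead of dict_name=. A minimal standalone sketch of how the template is filled (the template string is copied from the diff; the scores/weights values are invented for illustration):

    E983 = "Invalid key for '{dict}': {key}. Available keys: {keys}"

    scores = {"tags_acc": 97.1, "uas": 91.3}
    weights = {"las": 0.4, "tags_acc": 0.6}
    try:
        weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
    except KeyError as e:
        # "las" is missing from scores, so the message lists the keys that do exist.
        print(E983.format(dict="score_weights", key=str(e), keys=list(scores.keys())))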
@@ -216,11 +216,11 @@ class GoldCorpus(object):
                 examples = example.split_sents()
             else:
                 examples = [example]
-            for ex in examples:
-                if (not max_length) or len(ex.predicted) < max_length:
+            for eg in examples:
+                if (not max_length) or len(eg.predicted) < max_length:
                     if ignore_misaligned:
                         try:
-                            _ = ex._deprecated_get_gold()
+                            _ = eg._deprecated_get_gold()
                         except AlignmentError:
                             continue
-                    yield ex
+                    yield eg
@@ -4,7 +4,6 @@ from ..tokens import Token
 from ..tokens.doc cimport Doc
 from ..attrs import IDS
 from .align cimport Alignment
-from .annotation import TokenAnnotation, DocAnnotation
 from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 from .align import Alignment
 from ..errors import Errors, AlignmentError
@@ -13,7 +12,7 @@ from ..errors import Errors, AlignmentError
 cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot):
     # TODO: Improve and test this
     words = tok_annot.get("ORTH", [tok.text for tok in predicted])
-    attrs, array = _annot2array(predicted.vocab, tok_annot, doc_annot)
+    attrs, array = _annot2array(predicted, tok_annot, doc_annot)
     output = Doc(predicted.vocab, words=words)
     if array.size:
         output = output.from_array(attrs, array)
@@ -63,8 +62,6 @@ cdef class Example:
     @property
     def alignment(self):
         if self._alignment is None:
-            if self.doc is None:
-                return None
             spacy_words = [token.orth_ for token in self.predicted]
             gold_words = [token.orth_ for token in self.reference]
             if gold_words == []:
@@ -99,6 +96,7 @@ cdef class Example:
         return {
             "doc_annotation": {
                 "cats": dict(self.reference.cats),
+                "entities": biluo_tags_from_doc(self.reference),
                 "links": [], # TODO
             },
             "token_annotation": {
@@ -110,8 +108,7 @@ cdef class Example:
                 "morphs": [t.morph_ for t in self.reference],
                 "heads": [t.head.i for t in self.reference],
                 "deps": [t.dep_ for t in self.reference],
-                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference],
-                "entities": biluo_tags_from_doc(self.reference)
+                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference]
             }
         }

@@ -142,21 +139,21 @@ cdef class Example:
         return self.x.text


-def _annot2array(vocab, tok_annot, doc_annot):
+def _annot2array(predicted, tok_annot, doc_annot):
     attrs = []
     values = []

     for key, value in doc_annot.items():
         if key == "entities":
-            words = tok_annot["ORTH"]
-            ent_iobs, ent_types = _parse_ner_tags(vocab, words, value)
+            words = tok_annot.get("ORTH", [tok.text for tok in predicted])
+            ent_iobs, ent_types = _parse_ner_tags(predicted.vocab, words, value)
             tok_annot["ENT_IOB"] = ent_iobs
             tok_annot["ENT_TYPE"] = ent_types
         elif key == "links":
             entities = doc_annot.get("entities", {})
             if value and not entities:
                 raise ValueError(Errors.E981)
-            ent_kb_ids = _parse_links(vocab, words, value, entities)
+            ent_kb_ids = _parse_links(predicted.vocab, words, value, entities)
             tok_annot["ENT_KB_ID"] = ent_kb_ids
         elif key == "cats":
             pass
@@ -176,7 +173,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append(value)
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            values.append([predicted.vocab.morphology.add(v) for v in value])
         elif key == "ENT_IOB":
             iob_strings = Token.iob_strings()
             attrs.append(key)
@@ -186,7 +183,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
                 raise ValueError(Errors.E982.format(values=iob_strings, value=values))
         else:
             attrs.append(key)
-            values.append([vocab.strings.add(v) for v in value])
+            values.append([predicted.vocab.strings.add(v) for v in value])

     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
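Note: _annot2array (and annotations2doc above it) now receive the predicted Doc instead of a bare Vocab, reaching the vocab via predicted.vocab and falling back to the predicted token texts when no "ORTH" annotation is supplied. A standalone sketch of that fallback pattern (the helper name here is illustrative, not part of spaCy's API):

    def words_for(tok_annot, predicted_tokens):
        # Prefer an explicit ORTH annotation; otherwise fall back to the predicted tokens.
        return tok_annot.get("ORTH", [str(t) for t in predicted_tokens])

    print(words_for({}, ["London", "is", "big"]))         # ['London', 'is', 'big']
    print(words_for({"ORTH": ["NYC"]}, ["New", "York"]))   # ['NYC']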
@@ -227,12 +224,12 @@ def _fix_legacy_dict_data(predicted, example_dict):
         old_token_dict = token_dict
         token_dict = {}
         for key, value in old_token_dict.items():
-            if key in ("text", "ids", "entities", "ner", "brackets"):
+            if key in ("text", "ids", "brackets"):
                 pass
             elif key in remapping:
                 token_dict[remapping[key]] = value
             else:
-                raise ValueError(f"Unknown attr: {key}")
+                raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
         if "HEAD" in token_dict and "SENT_START" in token_dict:
             # If heads are set, we don't also redundantly specify SENT_START.
             token_dict.pop("SENT_START")
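Note: an unrecognised legacy annotation key now raises a KeyError built from E983 instead of a bare ValueError, which lines up with test_language_update further down switching from expecting TypeError to KeyError. A standalone sketch of the remap-or-raise pattern (the remapping table here is an assumed, truncated stand-in for the real one in _fix_legacy_dict_data):

    E983 = "Invalid key for '{dict}': {key}. Available keys: {keys}"
    remapping = {"words": "ORTH", "tags": "TAG", "heads": "HEAD", "deps": "DEP"}

    def remap(old_token_dict):
        token_dict = {}
        for key, value in old_token_dict.items():
            if key in ("text", "ids", "brackets"):
                pass  # legacy keys that are silently ignored
            elif key in remapping:
                token_dict[remapping[key]] = value
            else:
                raise KeyError(E983.format(key=key, dict="token_annotation", keys=list(remapping.keys())))
        return token_dict

    print(remap({"words": ["hi"], "tags": ["UH"]}))  # {'ORTH': ['hi'], 'TAG': ['UH']}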
@@ -526,6 +526,23 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)

+    def _convert_examples(self, examples):
+        converted_examples = []
+        if isinstance(examples, tuple):
+            examples = [examples]
+        for eg in examples:
+            if isinstance(eg, Example):
+                converted_examples.append(eg)
+            elif isinstance(eg, tuple):
+                doc, annot = eg
+                if isinstance(doc, str):
+                    doc = self.make_doc(doc)
+                converted_examples.append(Example.from_dict(doc, annot))
+            else:
+                raise ValueError(Errors.E979.format(type=type(eg)))
+        return converted_examples
+
+
     def update(
         self,
         examples,
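Note: _convert_examples replaces the Example.to_example_objects calls removed below; update() and evaluate() (and the other call sites in this file) now all normalise their input through it, accepting Example objects, (Doc, dict) tuples or (text, dict) tuples. A hedged usage sketch of the accepted forms against this development snapshot ("en" and the annotation dict are assumptions for illustration):

    import spacy
    from spacy.gold import Example

    nlp = spacy.blank("en")
    doc = nlp.make_doc("hello world")
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}

    # Each of these is converted to an Example internally before training:
    examples = [
        Example.from_dict(doc, annots),   # already an Example
        (doc, annots),                    # (Doc, dict) tuple
        ("hello world", annots),          # (text, dict) tuple -> make_doc() is called first
    ]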
@@ -553,7 +570,7 @@ class Language(object):

         if len(examples) == 0:
             return
-        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
+        examples = self._convert_examples(examples)

         if sgd is None:
             if self._optimizer is None:
@@ -601,7 +618,7 @@ class Language(object):
         # TODO: document
         if len(examples) == 0:
             return
-        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
+        examples = self._convert_examples(examples)
         if sgd is None:
             if self._optimizer is None:
                 self._optimizer = create_default_optimizer()
@@ -640,8 +657,8 @@ class Language(object):
         for name, proc in self.pipeline:
             if hasattr(proc, "preprocess_gold"):
                 examples = proc.preprocess_gold(examples)
-        for ex in examples:
-            yield ex
+        for eg in examples:
+            yield eg

     def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
@@ -723,7 +740,7 @@ class Language(object):

         DOCS: https://spacy.io/api/language#evaluate
         """
-        examples = Example.to_example_objects(examples)
+        examples = self._convert_examples(examples)
         if scorer is None:
             scorer = Scorer(pipeline=self.pipeline)
         if component_cfg is None:
@@ -738,7 +755,7 @@ class Language(object):
             docs = pipe.pipe(docs, **kwargs)
         for doc, eg in zip(docs, examples):
             if verbose:
-                print(ex.doc)
+                print(doc)
             eg.predicted = doc
         kwargs = component_cfg.get("scorer", {})
         kwargs.setdefault("verbose", verbose)
@@ -1189,9 +1206,9 @@ def _pipe(examples, proc, kwargs):
     for arg in ["n_threads", "batch_size"]:
         if arg in kwargs:
             kwargs.pop(arg)
-    for ex in examples:
-        ex = proc(ex, **kwargs)
-        yield ex
+    for eg in examples:
+        eg = proc(eg, **kwargs)
+        yield eg


 def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
@@ -91,9 +91,9 @@ class Morphologizer(Tagger):
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
         known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
-        for ex in examples:
-            pos_tags = ex.get_aligned("POS")
-            morphs = ex.get_aligned("MORPH")
+        for eg in examples:
+            pos_tags = eg.get_aligned("POS")
+            morphs = eg.get_aligned("MORPH")
             for i in range(len(morphs)):
                 pos = pos_tags[i]
                 morph = morphs[i]
@@ -116,7 +116,7 @@ class Morphologizer(Tagger):
         d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
         d_scores *= self.model.ops.asarray(known_labels)
         loss = (d_scores**2).sum()
-        docs = [ex.doc for ex in examples]
+        docs = [eg.doc for eg in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores

@@ -72,8 +72,7 @@ class SimpleNER(Pipe):
     def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
         if not any(_has_ner(eg) for eg in examples):
             return 0
-        examples = Example.to_example_objects(examples)
-        docs = [ex.doc for ex in examples]
+        docs = [eg.doc for eg in examples]
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update(docs)
         loss, d_scores = self.get_loss(examples, scores)
@@ -97,7 +97,6 @@ class Tok2Vec(Pipe):
         """
         if losses is None:
             losses = {}
-        examples = Example.to_example_objects(examples)
         docs = [eg.doc for eg in examples]
         if isinstance(docs, Doc):
             docs = [docs]
@@ -1,6 +1,5 @@
 import numpy as np

-from .gold import tags_to_entities, GoldParse, DocAnnotation
 from .errors import Errors


@@ -273,7 +272,7 @@ class Scorer(object):
         }

     def score(self, example, verbose=False, punct_labels=("p", "punct")):
-        """Update the evaluation scores from a single Doc / GoldParse pair.
+        """Update the evaluation scores from a single Example.

         example (Example): The predicted annotations + correct annotations.
         verbose (bool): Print debugging information.
@@ -283,17 +282,9 @@ class Scorer(object):

         DOCS: https://spacy.io/api/scorer#score
         """
-        if isinstance(example, tuple) and len(example) == 2:
-            doc, gold = example
-        else:
-            gold = example._deprecated_get_gold()
-            doc = example.doc
-
-        if len(doc) != len(gold):
-            doc_annotation = DocAnnotation(cats=gold.cats)
-            token_annotation = gold.orig
-            gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation)
-        orig = gold.orig
+        doc = example.predicted
+        gold_doc = example.reference
+        align = example.alignment
         gold_deps = set()
         gold_deps_per_dep = {}
         gold_tags = set()
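Note: Scorer.score() no longer unpacks a (Doc, GoldParse) tuple or rebuilds a GoldParse; it reads everything off a single Example. A hedged sketch of the three accessors the new code relies on, mirroring how the updated tests below construct their inputs:

    from spacy.gold import Example
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    predicted = Doc(vocab, words=["New", "York", "City"])
    example = Example.from_dict(predicted, {"words": ["New", "York", "City"]})

    doc = example.predicted        # the candidate Doc produced by the pipeline (example.x)
    gold_doc = example.reference   # the reference Doc holding the gold annotations (example.y)
    align = example.alignment      # token alignment used to map candidate indices to gold indices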
@@ -301,28 +292,28 @@ class Scorer(object):
         gold_morphs = set()
         gold_morphs_per_feat = {}
         gold_sent_starts = set()
-        gold_ents = set(tags_to_entities(orig.entities))
-        for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts):
-            gold_tags.add((id_, tag))
-            gold_pos.add((id_, pos))
-            gold_morphs.add((id_, morph))
-            if morph:
-                for feat in morph.split("|"):
+        for gold_i, token in enumerate(gold_doc):
+            gold_tags.add((gold_i, token.tag_))
+            gold_pos.add((gold_i, token.pos_))
+            gold_morphs.add((gold_i, token.morph_))
+            if token.morph_:
+                for feat in token.morph_.split("|"):
                     field, values = feat.split("=")
                     if field not in self.morphs_per_feat:
                         self.morphs_per_feat[field] = PRFScore()
                     if field not in gold_morphs_per_feat:
                         gold_morphs_per_feat[field] = set()
-                    gold_morphs_per_feat[field].add((id_, feat))
-            if sent_start:
-                gold_sent_starts.add(id_)
-            if dep not in (None, "") and dep.lower() not in punct_labels:
-                gold_deps.add((id_, head, dep.lower()))
-                if dep.lower() not in self.labelled_per_dep:
-                    self.labelled_per_dep[dep.lower()] = PRFScore()
-                if dep.lower() not in gold_deps_per_dep:
-                    gold_deps_per_dep[dep.lower()] = set()
-                gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower()))
+                    gold_morphs_per_feat[field].add((gold_i, feat))
+            if token.sent_start:
+                gold_sent_starts.add(gold_i)
+            dep = token.dep_.lower()
+            if dep not in punct_labels:
+                gold_deps.add((gold_i, token.head.i, dep))
+                if dep not in self.labelled_per_dep:
+                    self.labelled_per_dep[dep] = PRFScore()
+                if dep not in gold_deps_per_dep:
+                    gold_deps_per_dep[dep] = set()
+                gold_deps_per_dep[dep].add((gold_i, token.head.i, dep))
         cand_deps = set()
         cand_deps_per_dep = {}
         cand_tags = set()
@@ -333,7 +324,7 @@ class Scorer(object):
         for token in doc:
             if token.orth_.isspace():
                 continue
-            gold_i = gold.cand_to_gold[token.i]
+            gold_i = align.cand_to_gold[token.i]
             if gold_i is None:
                 self.tokens.fp += 1
             else:
@@ -352,7 +343,7 @@ class Scorer(object):
             if token.is_sent_start:
                 cand_sent_starts.add(gold_i)
             if token.dep_.lower() not in punct_labels and token.orth_.strip():
-                gold_head = gold.cand_to_gold[token.head.i]
+                gold_head = align.cand_to_gold[token.head.i]
                 # None is indistinct, so we can't just add it to the set
                 # Multiple (None, None) deps are possible
                 if gold_i is None or gold_head is None:
@@ -367,23 +358,24 @@ class Scorer(object):
                     cand_deps_per_dep[token.dep_.lower()].add(
                         (gold_i, gold_head, token.dep_.lower())
                     )
-        if "-" not in [token[-1] for token in orig.entities]:
         # Find all NER labels in gold and doc
-        ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
+        ent_labels = set([k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents])
         # Set up all labels for per type scoring and prepare gold per type
         gold_per_ents = {ent_label: set() for ent_label in ent_labels}
         for ent_label in ent_labels:
             if ent_label not in self.ner_per_ents:
                 self.ner_per_ents[ent_label] = PRFScore()
-            gold_per_ents[ent_label].update(
-                [x for x in gold_ents if x[0] == ent_label]
-            )
         # Find all candidate labels, for all and per type
+        gold_ents = set()
+        for ent in gold_doc.ents:
+            gold_ent = (ent.label_, ent.start, ent.end - 1)
+            gold_ents.add(gold_ent)
+            gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
         cand_ents = set()
         cand_per_ents = {ent_label: set() for ent_label in ent_labels}
         for ent in doc.ents:
-            first = gold.cand_to_gold[ent.start]
-            last = gold.cand_to_gold[ent.end - 1]
+            first = align.cand_to_gold[ent.start]
+            last = align.cand_to_gold[ent.end - 1]
             if first is None or last is None:
                 self.ner.fp += 1
                 self.ner_per_ents[ent.label_].fp += 1
@@ -411,38 +403,38 @@ class Scorer(object):
             set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
         )
         if (
-            len(gold.cats) > 0
-            and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats)
-            and set(gold.cats) == set(doc.cats)
+            len(gold_doc.cats) > 0
+            and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold_doc.cats)
+            and set(gold_doc.cats) == set(doc.cats)
         ):
-            goldcat = max(gold.cats, key=gold.cats.get)
+            goldcat = max(gold_doc.cats, key=gold_doc.cats.get)
             candcat = max(doc.cats, key=doc.cats.get)
             if self.textcat_positive_label:
                 self.textcat.score_set(
                     set([self.textcat_positive_label]) & set([candcat]),
                     set([self.textcat_positive_label]) & set([goldcat]),
                 )
-            for label in set(gold.cats):
+            for label in set(gold_doc.cats):
                 self.textcat_auc_per_cat[label].score_set(
-                    doc.cats[label], gold.cats[label]
+                    doc.cats[label], gold_doc.cats[label]
                 )
                 self.textcat_f_per_cat[label].score_set(
                     set([label]) & set([candcat]), set([label]) & set([goldcat])
                 )
         elif len(self.textcat_f_per_cat) > 0:
             model_labels = set(self.textcat_f_per_cat)
-            eval_labels = set(gold.cats)
+            eval_labels = set(gold_doc.cats)
             raise ValueError(
                 Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
             )
         elif len(self.textcat_auc_per_cat) > 0:
             model_labels = set(self.textcat_auc_per_cat)
-            eval_labels = set(gold.cats)
+            eval_labels = set(gold_doc.cats)
             raise ValueError(
                 Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
             )
         if verbose:
-            gold_words = orig.words
+            gold_words = gold_doc.words
             for w_id, h_id, dep in cand_deps - gold_deps:
                 print("F", gold_words[w_id], dep, gold_words[h_id])
             for w_id, h_id, dep in gold_deps - cand_deps:
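Note: gold entities are now read directly off the reference Doc's .ents and keyed as (label, start_token, end_token) tuples, instead of being reconstructed from BILUO tags via tags_to_entities. A small illustrative sketch of that keying (the example Doc and spans are assumptions, not taken from the test suite):

    from spacy.tokens import Doc, Span
    from spacy.vocab import Vocab

    gold_doc = Doc(Vocab(), words=["London", "is", "in", "the", "UK"])
    gold_doc.ents = [Span(gold_doc, 0, 1, label="GPE"), Span(gold_doc, 4, 5, label="GPE")]

    gold_ents = set()
    for ent in gold_doc.ents:
        # Each gold entity becomes (label, first token index, last token index).
        gold_ents.add((ent.label_, ent.start, ent.end - 1))
    print(gold_ents)  # {('GPE', 0, 0), ('GPE', 4, 4)}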
@@ -23,7 +23,6 @@ from ..compat import copy_array
 from ..errors import Errors, TempErrors
 from ..util import link_vectors_to_models, create_default_optimizer
 from .. import util
-from . import _beam_utils
 from . import nonproj


@@ -260,10 +259,9 @@ class ParserStepModel(Model):
     def mark_class_seen(self, class_):
         self._class_mask[class_] = 1

-    def get_token_ids(self, batch):
-        states = _beam_utils.collect_states(batch)
+    def get_token_ids(self, states):
         cdef StateClass state
-        states = [state for state in states if not state.is_final()]
+        states = [state for state in states() if not state.is_final()]
         cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
                                           dtype='i', order='C')
         ids.fill(-1)
@@ -263,8 +263,6 @@ cdef class Parser:
         free(is_valid)

     def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
-        examples = Example.to_example_objects(examples)
-
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.)
@@ -275,7 +273,7 @@ cdef class Parser:
         states_golds = [(s, g) for (s, g) in zip(states, golds)
                         if not s.is_final() and g is not None]
         # Prepare the stepwise model, and get the callback for finishing the batch
-        model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples])
+        model, backprop_tok2vec = self.model.begin_update([eg.doc for eg in examples])
         all_states = list(states)
         for _ in range(max_steps):
             if not states_golds:
@@ -291,13 +289,12 @@ cdef class Parser:
         if sgd is not None:
             self.model.finish_update(sgd)
         if set_annotations:
-            docs = [ex.doc for ex in examples]
+            docs = [eg.doc for eg in examples]
             self.set_annotations(docs, all_states)
         return losses

     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
-        examples = Example.to_example_objects(examples)
         if losses is None:
             losses = {}
         for multitask in self._multitasks:
@@ -307,7 +304,7 @@ cdef class Parser:
             return None
         losses.setdefault(self.name, 0.)

-        docs = [ex.doc for ex in examples]
+        docs = [eg.doc for eg in examples]
         states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
@@ -1,6 +1,5 @@
 import itertools
 import pytest
-from spacy.gold import GoldParse
 from spacy.language import Language
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
@@ -19,7 +18,6 @@ def nlp():
     return nlp


-@pytest.mark.xfail # TODO
 def test_language_update(nlp):
     text = "hello world"
     annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
@@ -32,7 +30,7 @@ def test_language_update(nlp):
     # Update badly
     with pytest.raises(ValueError):
         nlp.update((doc, None))
-    with pytest.raises(TypeError):
+    with pytest.raises(KeyError):
         nlp.update((text, wrongkeyannots))


@@ -1,5 +1,5 @@
 import pytest
-from spacy.gold.example as Example
+from spacy.gold.example import Example
 from spacy.tokens import Doc
 from spacy.vocab import Vocab

@@ -7,19 +7,19 @@ from spacy.vocab import Vocab
 def test_Example_init_requires_doc_objects():
     vocab = Vocab()
     with pytest.raises(TypeError):
-        eg = Example(None, None)
+        example = Example(None, None)
     with pytest.raises(TypeError):
-        eg = Example(Doc(vocab, words=["hi"]), None)
+        example = Example(Doc(vocab, words=["hi"]), None)
     with pytest.raises(TypeError):
-        eg = Example(None, Doc(vocab, words=["hi"]))
+        example = Example(None, Doc(vocab, words=["hi"]))


 def test_Example_from_dict_basic():
-    eg = Example.from_dict(
+    example = Example.from_dict(
         Doc(Vocab(), words=["hello", "world"]), {"words": ["hello", "world"]}
     )
-    assert isinstance(eg.x, Doc)
-    assert isinstance(eg.y, Doc)
+    assert isinstance(example.x, Doc)
+    assert isinstance(example.y, Doc)


 @pytest.mark.parametrize(
@@ -36,8 +36,8 @@ def test_Example_from_dict_invalid(annots):
 def test_Example_from_dict_with_tags(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
-    eg = Example.from_dict(predicted, annots)
-    for i, token in enumerate(eg.reference):
+    example = Example.from_dict(predicted, annots)
+    for i, token in enumerate(example.reference):
         assert token.tag_ == annots["tags"][i]


@@ -54,8 +54,8 @@ def test_Example_from_dict_with_tags(annots):
 def test_Example_from_dict_with_parse(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
-    eg = Example.from_dict(predicted, annots)
-    for i, token in enumerate(eg.reference):
+    example = Example.from_dict(predicted, annots)
+    for i, token in enumerate(example.reference):
         assert token.dep_ == annots["deps"][i]
         assert token.head.i == annots["heads"][i]

@@ -77,8 +77,8 @@ def test_Example_from_dict_with_parse(annots):
 def test_Example_from_dict_with_morphology(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
-    eg = Example.from_dict(predicted, annots)
-    for i, token in enumerate(eg.reference):
+    example = Example.from_dict(predicted, annots)
+    for i, token in enumerate(example.reference):
         assert token.morph_ == annots["morphs"][i]


@@ -94,9 +94,9 @@ def test_Example_from_dict_with_morphology(annots):
 def test_Example_from_dict_with_sent_start(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
-    eg = Example.from_dict(predicted, annots)
-    assert len(list(eg.reference.sents)) == 2
-    for i, token in enumerate(eg.reference):
+    example = Example.from_dict(predicted, annots)
+    assert len(list(example.reference.sents)) == 2
+    for i, token in enumerate(example.reference):
         assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])


@@ -112,11 +112,11 @@ def test_Example_from_dict_with_sent_start(annots):
 def test_Example_from_dict_with_cats(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
-    eg = Example.from_dict(predicted, annots)
-    assert len(list(eg.reference.cats)) == 3
-    assert eg.reference.cats["cat1"] == 1.0
-    assert eg.reference.cats["cat2"] == 0.0
-    assert eg.reference.cats["cat3"] == 0.5
+    example = Example.from_dict(predicted, annots)
+    assert len(list(example.reference.cats)) == 3
+    assert example.reference.cats["cat1"] == 1.0
+    assert example.reference.cats["cat2"] == 0.0
+    assert example.reference.cats["cat3"] == 0.5


 @pytest.mark.parametrize(
@@ -131,18 +131,18 @@ def test_Example_from_dict_with_cats(annots):
 def test_Example_from_dict_with_entities(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
-    eg = Example.from_dict(predicted, annots)
-    assert len(list(eg.reference.ents)) == 2
-    assert eg.reference[0].ent_iob_ == "O"
-    assert eg.reference[1].ent_iob_ == "O"
-    assert eg.reference[2].ent_iob_ == "B"
-    assert eg.reference[3].ent_iob_ == "I"
-    assert eg.reference[4].ent_iob_ == "O"
-    assert eg.reference[5].ent_iob_ == "B"
-    assert eg.reference[6].ent_iob_ == "O"
-    assert eg.reference[2].ent_type_ == "LOC"
-    assert eg.reference[3].ent_type_ == "LOC"
-    assert eg.reference[5].ent_type_ == "LOC"
+    example = Example.from_dict(predicted, annots)
+    assert len(list(example.reference.ents)) == 2
+    assert example.reference[0].ent_iob_ == "O"
+    assert example.reference[1].ent_iob_ == "O"
+    assert example.reference[2].ent_iob_ == "B"
+    assert example.reference[3].ent_iob_ == "I"
+    assert example.reference[4].ent_iob_ == "O"
+    assert example.reference[5].ent_iob_ == "B"
+    assert example.reference[6].ent_iob_ == "O"
+    assert example.reference[2].ent_type_ == "LOC"
+    assert example.reference[3].ent_type_ == "LOC"
+    assert example.reference[5].ent_type_ == "LOC"


 @pytest.mark.parametrize(
@@ -158,14 +158,14 @@ def test_Example_from_dict_with_entities(annots):
 def test_Example_from_dict_with_links(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
-    eg = Example.from_dict(predicted, annots)
-    assert eg.reference[0].ent_kb_id_ == ""
-    assert eg.reference[1].ent_kb_id_ == ""
-    assert eg.reference[2].ent_kb_id_ == "Q60"
-    assert eg.reference[3].ent_kb_id_ == "Q60"
-    assert eg.reference[4].ent_kb_id_ == ""
-    assert eg.reference[5].ent_kb_id_ == "Q64"
-    assert eg.reference[6].ent_kb_id_ == ""
+    example = Example.from_dict(predicted, annots)
+    assert example.reference[0].ent_kb_id_ == ""
+    assert example.reference[1].ent_kb_id_ == ""
+    assert example.reference[2].ent_kb_id_ == "Q60"
+    assert example.reference[3].ent_kb_id_ == "Q60"
+    assert example.reference[4].ent_kb_id_ == ""
+    assert example.reference[5].ent_kb_id_ == "Q64"
+    assert example.reference[6].ent_kb_id_ == ""


 @pytest.mark.parametrize(
@@ -1,7 +1,7 @@
 from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
-from spacy.gold import Example, GoldParse, TokenAnnotation
+from spacy.gold import Example
 from spacy.gold.iob_utils import biluo_tags_from_offsets
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
@@ -90,8 +90,9 @@ def test_las_per_type(en_vocab):
             heads=([h - i for i, h in enumerate(annot["heads"])]),
             deps=annot["deps"],
         )
-        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
-        scorer.score((doc, gold))
+        gold = {"heads": annot["heads"], "deps": annot["deps"]}
+        example = Example.from_dict(doc, gold)
+        scorer.score(example)
     results = scorer.scores

     assert results["uas"] == 100
@@ -112,9 +113,10 @@ def test_las_per_type(en_vocab):
             heads=([h - i for i, h in enumerate(annot["heads"])]),
             deps=annot["deps"],
         )
-        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
+        gold = {"heads": annot["heads"], "deps": annot["deps"]}
         doc[0].dep_ = "compound"
-        scorer.score((doc, gold))
+        example = Example.from_dict(doc, gold)
+        scorer.score(example)
     results = scorer.scores

     assert results["uas"] == 100
@@ -137,10 +139,7 @@ def test_ner_per_type(en_vocab):
             ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
         )
         entities = biluo_tags_from_offsets(doc, annot["entities"])
-        ex = Example(
-            doc=doc,
-            token_annotation=TokenAnnotation(entities=entities)
-        )
+        ex = Example.from_dict(doc, {"entities": entities})
         scorer.score(ex)
     results = scorer.scores

@@ -161,10 +160,7 @@ def test_ner_per_type(en_vocab):
             ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
         )
         entities = biluo_tags_from_offsets(doc, annot["entities"])
-        ex = Example(
-            doc=doc,
-            token_annotation=TokenAnnotation(entities=entities)
-        )
+        ex = Example.from_dict(doc, {"entities": entities})
         scorer.score(ex)
     results = scorer.scores

@@ -188,13 +184,13 @@ def test_ner_per_type(en_vocab):
 def test_tag_score(tagged_doc):
     # Gold and Doc are identical
     scorer = Scorer()
-    gold = GoldParse(
-        tagged_doc,
-        tags=[t.tag_ for t in tagged_doc],
-        pos=[t.pos_ for t in tagged_doc],
-        morphs=[t.morph_ for t in tagged_doc]
-    )
-    scorer.score((tagged_doc, gold))
+    gold = {
+        "tags": [t.tag_ for t in tagged_doc],
+        "pos": [t.pos_ for t in tagged_doc],
+        "morphs": [t.morph_ for t in tagged_doc],
+    }
+    example = Example.from_dict(tagged_doc, gold)
+    scorer.score(example)
     results = scorer.scores

     assert results["tags_acc"] == 100
@@ -211,8 +207,9 @@ def test_tag_score(tagged_doc):
     morphs = [t.morph_ for t in tagged_doc]
     morphs[1] = "Number=sing"
     morphs[2] = "Number=plur"
-    gold = GoldParse(tagged_doc, tags=tags, pos=pos, morphs=morphs)
-    scorer.score((tagged_doc, gold))
+    gold = {"tags": tags, "pos": pos, "morphs": morphs}
+    example = Example.from_dict(tagged_doc, gold)
+    scorer.score(example)
     results = scorer.scores

     assert results["tags_acc"] == 90
@@ -704,7 +704,7 @@ cdef class Doc:
                           for id_ in py_attr_ids]
         except KeyError as msg:
             keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
-            raise KeyError(Errors.E983.format(dict_name="IDS", key=msg, keys=keys))
+            raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys))
         # Make an array from the attributes --- otherwise our inner loop is
         # Python dict iteration.
         cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")