Merge branch 'develop' into nightly.spacy.io

commit 1075b7ebb7 by Ines Montani, 2020-11-03 18:10:28 +01:00
48 changed files with 753 additions and 344 deletions

View File

@ -8,7 +8,6 @@ requires = [
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0rc0,<8.1.0",
"blis>=0.4.0,<0.8.0",
"pytokenizations",
"pathy"
]
build-backend = "setuptools.build_meta"

View File

@ -14,8 +14,7 @@ pathy
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.5.0,<2.0.0
pytokenizations
pydantic>=1.5.0,<1.7.0
# Official Python utilities
setuptools
packaging>=20.0

View File

@ -51,8 +51,8 @@ install_requires =
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
pydantic>=1.5.0,<2.0.0
pytokenizations
pydantic>=1.5.0,<1.7.0
jinja2
# Official Python utilities
setuptools
packaging>=20.0

View File

@ -49,6 +49,7 @@ MOD_NAMES = [
"spacy.pipeline._parser_internals.stateclass",
"spacy.pipeline._parser_internals.transition_system",
"spacy.tokenizer",
"spacy.training.align",
"spacy.training.gold_io",
"spacy.tokens.doc",
"spacy.tokens.span",

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0rc1"
__version__ = "3.0.0rc2"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -93,27 +93,42 @@ def evaluate(
"SPEED": "speed",
}
results = {}
data = {}
for metric, key in metrics.items():
if key in scores:
if key == "cats_score":
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
if key == "speed":
results[metric] = f"{scores[key]:.0f}"
if isinstance(scores[key], (int, float)):
if key == "speed":
results[metric] = f"{scores[key]:.0f}"
else:
results[metric] = f"{scores[key]*100:.2f}"
else:
results[metric] = f"{scores[key]*100:.2f}"
data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
results[metric] = "-"
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
msg.table(results, title="Results")
if "morph_per_feat" in scores:
if scores["morph_per_feat"]:
print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
data["morph_per_feat"] = scores["morph_per_feat"]
if "dep_las_per_type" in scores:
if scores["dep_las_per_type"]:
print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
data["dep_las_per_type"] = scores["dep_las_per_type"]
if "ents_per_type" in scores:
if scores["ents_per_type"]:
print_ents_per_type(msg, scores["ents_per_type"])
print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
data["ents_per_type"] = scores["ents_per_type"]
if "cats_f_per_type" in scores:
if scores["cats_f_per_type"]:
print_textcats_f_per_cat(msg, scores["cats_f_per_type"])
print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
data["cats_f_per_type"] = scores["cats_f_per_type"]
if "cats_auc_per_type" in scores:
if scores["cats_auc_per_type"]:
print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
data["cats_auc_per_type"] = scores["cats_auc_per_type"]
if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
@ -157,7 +172,7 @@ def render_parses(
file_.write(html)
def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None:
data = [
(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
for k, v in scores.items()
@ -166,20 +181,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
data,
header=("", "P", "R", "F"),
aligns=("l", "r", "r", "r"),
title="NER (per type)",
)
def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
data = [
(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
for k, v in scores.items()
]
msg.table(
data,
header=("", "P", "R", "F"),
aligns=("l", "r", "r", "r"),
title="Textcat F (per label)",
title=f"{name} (per {type})",
)

View File

@ -39,7 +39,7 @@ def init_vectors_cli(
nlp.to_disk(output_dir)
msg.good(
"Saved nlp object with vectors to output directory. You can now use the "
"path to it in your config as the 'vectors' setting in [initialize.vocab].",
"path to it in your config as the 'vectors' setting in [initialize].",
output_dir.resolve(),
)
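
For reference, the corrected message points at the top-level `[initialize]` block; a minimal sketch of the intended config shape (the vectors path is a placeholder):

```python
# Minimal sketch, assuming the v3 config schema: 'vectors' sits directly
# under [initialize], not under [initialize.vocab]. The path is a placeholder.
from thinc.api import Config

Config().from_str("""
[initialize]
vectors = "/path/to/vectors_model"
""")
```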
@ -100,7 +100,7 @@ def init_labels_cli(
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if not output_path.exists():
output_path.mkdir()
output_path.mkdir(parents=True)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)

View File

@ -136,15 +136,19 @@ factory = "textcat"
{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
exclusive_classes = false
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
@ -271,15 +275,19 @@ factory = "textcat"
{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
exclusive_classes = false
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"

View File

@ -44,7 +44,7 @@ def train_cli(
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if output_path is not None and not output_path.exists():
output_path.mkdir()
output_path.mkdir(parents=True)
msg.good(f"Created output directory: {output_path}")
overrides = parse_config_overrides(ctx.args)
import_code(code_path)

View File

@ -398,8 +398,8 @@ class Errors:
E163 = ("cumsum was found to be unstable: its last element does not "
"correspond to sum")
E164 = ("x is neither increasing nor decreasing: {x}.")
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
"that case.")
E165 = ("Only one class present in the gold labels: {label}. "
"ROC AUC score is not defined in that case.")
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
"Current DocBin: {current}\nOther DocBin: {other}")
E169 = ("Can't find module: {module}")
@ -456,6 +456,8 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master
E897 = ("Field '{field}' should be a dot-notation string referring to the "
"relevant section in the config, but found type {type} instead.")
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
"is not set or None. If you've implemented a custom component, make "
"sure to store the component model as `self.model` in your "
@ -562,7 +564,10 @@ class Errors:
"a string value from {expected} but got: '{arg}'")
E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
"a list, but got: {arg_type}")
E949 = ("Can only create an alignment when the texts are the same.")
E949 = ("Unable to align tokens for the predicted and reference docs. It "
"is only possible to align the docs when both texts are the same "
"except for whitespace and capitalization. The predicted tokens "
"start with: {x}. The reference tokens start with: {y}.")
E952 = ("The section '{name}' is not a valid section in the provided config.")
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
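
The reworded E949 comes from the new alignment code, which only tolerates differences in whitespace and capitalization; a minimal sketch of when it fires, mirroring `test_alignment_different_texts` later in this diff:

```python
# Sketch: Alignment.from_strings raises ValueError (E949) when the
# underlying texts differ beyond whitespace and capitalization.
from spacy.training import Alignment

Alignment.from_strings(["i", "listened"], ["I", "LISTENED"])   # ok: case-insensitive
Alignment.from_strings(["i", "listened"], ["he", "listened"])  # raises ValueError (E949)
```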

View File

@ -286,10 +286,10 @@ cdef class DependencyMatcher:
self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees)
for matched_tree in matched_trees:
matched_key_trees.append((key, matched_tree))
for i, (match_id, nodes) in enumerate(matched_key_trees):
on_match = self._callbacks.get(match_id)
if on_match is not None:
on_match(self, doc, i, matched_key_trees)
for i, (match_id, nodes) in enumerate(matched_key_trees):
on_match = self._callbacks.get(match_id)
if on_match is not None:
on_match(self, doc, i, matched_key_trees)
return matched_key_trees
def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):

View File

@ -1,4 +1,6 @@
from typing import Optional
from typing import Optional, List
from thinc.types import Floats2d
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
@ -10,12 +12,13 @@ from ...util import registry
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...tokens import Doc
@registry.architectures.register("spacy.TextCatCNN.v1")
def build_simple_cnn_text_classifier(
tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
) -> Model:
) -> Model[List[Doc], Floats2d]:
"""
Build a simple CNN text classifier, given a token-to-vector model as input.
If exclusive_classes=True, a softmax non-linearity is applied, so that the
@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier(
is applied instead, so that outputs are in the range [0, 1].
"""
with Model.define_operators({">>": chain}):
cnn = tok2vec >> list2ragged() >> reduce_mean()
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
model = cnn >> output_layer
model.set_ref("output_layer", output_layer)
else:
linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
model = (
tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
)
model = cnn >> linear_layer >> Logistic()
model.set_ref("output_layer", linear_layer)
model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nO)
@ -45,8 +47,7 @@ def build_bow_text_classifier(
ngram_size: int,
no_output_layer: bool,
nO: Optional[int] = None,
) -> Model:
# Don't document this yet, I'm not sure it's right.
) -> Model[List[Doc], Floats2d]:
with Model.define_operators({">>": chain}):
sparse_linear = SparseLinear(nO)
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@ -59,6 +60,39 @@ def build_bow_text_classifier(
return model
@registry.architectures.register("spacy.TextCatEnsemble.v2")
def build_text_classifier(
tok2vec: Model[List[Doc], List[Floats2d]],
linear_model: Model[List[Doc], Floats2d],
nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
exclusive_classes = not linear_model.attrs["multi_label"]
with Model.define_operators({">>": chain, "|": concatenate}):
width = tok2vec.get_dim("nO")
cnn_model = (
tok2vec
>> list2ragged()
>> ParametricAttention(width) # TODO: benchmark performance difference of this layer
>> reduce_sum()
>> residual(Maxout(nO=width, nI=width))
>> Linear(nO=nO, nI=width)
>> Dropout(0.0)
)
nO_double = nO * 2 if nO else None
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nO_double)
else:
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False:
model.set_dim("nO", nO)
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
model.attrs["multi_label"] = not exclusive_classes
return model
# TODO: move to legacy
@registry.architectures.register("spacy.TextCatEnsemble.v1")
def build_text_classifier(
width: int,
@ -158,11 +192,8 @@ def build_text_classifier(
@registry.architectures.register("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata(
width: int,
pretrained_vectors: Optional[bool],
dropout: Optional[float],
nO: Optional[int] = None,
) -> Model:
width: int, dropout: Optional[float], nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
# Don't document this yet, I'm not sure it's right.
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
with Model.define_operators({">>": chain, "**": clone}):

View File

@ -106,7 +106,7 @@ def MultiHashEmbed(
) -> Model[List[Doc], List[Floats2d]]:
"""Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representations.
through a feed-forward subnetwork to build a mixed representation.
The features used can be configured with the 'attrs' argument. The suggested
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into

View File

@ -226,6 +226,9 @@ class AttributeRuler(Pipe):
DOCS: https://nightly.spacy.io/api/tagger#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "AttributeRuler.score")
results = {}
attrs = set()
@ -237,7 +240,8 @@ class AttributeRuler(Pipe):
elif attr == POS:
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
elif attr == MORPH:
results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs))
elif attr == LEMMA:
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
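
The new `morph_key_getter` exists because `Token.morph` is now a `MorphAnalysis` object rather than a plain string, so the scorer compares its hash key. A small sketch of the value it reads (assuming a blank pipeline):

```python
# Sketch of the value morph_key_getter returns for scoring.
import spacy

nlp = spacy.blank("en")
doc = nlp("test")
doc[0].set_morph("Feat=Val")
morph_key = doc[0].morph.key  # integer hash, suitable for set-based PRF comparison
```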

View File

@ -155,13 +155,16 @@ cdef class DependencyParser(Parser):
DOCS: https://nightly.spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "DependencyParser.score")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", **kwargs))
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))

View File

@ -10,7 +10,7 @@ from ..errors import Errors
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..scorer import Scorer
from ..scorer import get_ner_prf
from ..training import validate_examples
@ -340,7 +340,7 @@ class EntityRuler(Pipe):
def score(self, examples, **kwargs):
validate_examples(examples, "EntityRuler.score")
return Scorer.score_spans(examples, "ents", **kwargs)
return get_ner_prf(examples)
def from_bytes(
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()

View File

@ -251,10 +251,13 @@ class Morphologizer(Tagger):
DOCS: https://nightly.spacy.io/api/morphologizer#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "Morphologizer.score")
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", **kwargs))
"morph", getter=morph_key_getter, **kwargs))
return results

View File

@ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser):
DOCS: https://nightly.spacy.io/api/entityrecognizer#score
"""
validate_examples(examples, "EntityRecognizer.score")
score_per_type = get_ner_prf(examples)
totals = PRFScore()
for prf in score_per_type.values():
totals += prf
return {
"ents_p": totals.precision,
"ents_r": totals.recall,
"ents_f": totals.fscore,
"ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
}
return get_ner_prf(examples)

View File

@ -155,8 +155,11 @@ class Sentencizer(Pipe):
DOCS: https://nightly.spacy.io/api/sentencizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "Sentencizer.score")
results = Scorer.score_spans(examples, "sents", **kwargs)
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
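
The `has_annotation` hook replaces the old try/except workaround in `Scorer.score_spans`: reference docs without sentence annotation are skipped, and if every doc is skipped the scores come back as `None`. A minimal end-to-end sketch, assuming the v3 `Doc(..., sent_starts=...)` constructor:

```python
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training import Example

nlp = spacy.blank("en")
words = ["Hello", "world", "."]
pred = Doc(nlp.vocab, words=words, sent_starts=[True, False, False])
ref = Doc(nlp.vocab, words=words)  # reference has no sentence annotation

def has_sents(doc):
    return doc.has_annotation("SENT_START")

# The unannotated reference is skipped instead of raising an error
scores = Scorer.score_spans([Example(pred, ref)], "sents", has_annotation=has_sents)
assert scores["sents_f"] is None
```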

View File

@ -160,7 +160,10 @@ class SentenceRecognizer(Tagger):
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://nightly.spacy.io/api/sentencerecognizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "SentenceRecognizer.score")
results = Scorer.score_spans(examples, "sents", **kwargs)
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results

View File

@ -16,15 +16,30 @@ from ..vocab import Vocab
default_model_config = """
[model]
@architectures = "spacy.TextCatEnsemble.v1"
exclusive_classes = false
pretrained_vectors = null
@architectures = "spacy.TextCatEnsemble.v2"
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
conv_depth = 2
embed_size = 2000
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 2
[model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
dropout = null
no_output_layer = false
"""
DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
@ -60,9 +75,11 @@ subword_features = true
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
"cats_p": None,
"cats_r": None,
"cats_f": None,
"cats_micro_p": None,
"cats_micro_r": None,
"cats_micro_f": None,
"cats_macro_p": None,
"cats_macro_r": None,
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,

View File

@ -1,9 +1,9 @@
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING
import numpy as np
from collections import defaultdict
from .training import Example
from .tokens import Token, Doc, Span
from .tokens import Token, Doc, Span, MorphAnalysis
from .errors import Errors
from .util import get_lang_class, SimpleFrozenList
from .morphology import Morphology
@ -13,7 +13,8 @@ if TYPE_CHECKING:
from .language import Language # noqa: F401
DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
MISSING_VALUES = frozenset([None, 0, ""])
class PRFScore:
@ -24,6 +25,9 @@ class PRFScore:
self.fp = 0
self.fn = 0
def __len__(self) -> int:
return self.tp + self.fp + self.fn
def __iadd__(self, other):
self.tp += other.tp
self.fp += other.fp
@ -59,7 +63,9 @@ class PRFScore:
class ROCAUCScore:
"""An AUC ROC score."""
"""An AUC ROC score. This is only defined for binary classification.
Use the method is_binary before calculating the score, otherwise it
may throw an error."""
def __init__(self) -> None:
self.golds = []
@ -71,16 +77,16 @@ class ROCAUCScore:
self.cands.append(cand)
self.golds.append(gold)
def is_binary(self):
return len(np.unique(self.golds)) == 2
@property
def score(self):
if not self.is_binary():
raise ValueError(Errors.E165.format(label=set(self.golds)))
if len(self.golds) == self.saved_score_at_len:
return self.saved_score
try:
self.saved_score = _roc_auc_score(self.golds, self.cands)
# catch ValueError: Only one class present in y_true.
# ROC AUC score is not defined in that case.
except ValueError:
self.saved_score = -float("inf")
self.saved_score = _roc_auc_score(self.golds, self.cands)
self.saved_score_at_len = len(self.golds)
return self.saved_score
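
With the new guard, requesting the score when the golds contain only one class raises E165 instead of silently returning `-inf`; a minimal sketch mirroring the updated tests:

```python
# Sketch: only one gold class present -> .score raises instead of returning -inf.
from spacy.scorer import ROCAUCScore

score = ROCAUCScore()
score.score_set(0.25, 0)
score.score_set(0.75, 0)
assert not score.is_binary()  # only one class among the golds
# score.score  # would raise ValueError (E165)
```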
@ -92,7 +98,7 @@ class Scorer:
self,
nlp: Optional["Language"] = None,
default_lang: str = "xx",
default_pipeline=DEFAULT_PIPELINE,
default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
**cfg,
) -> None:
"""Initialize the Scorer.
@ -124,13 +130,13 @@ class Scorer:
return scores
@staticmethod
def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
"""Returns accuracy and PRF scores for tokenization.
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for token character spans
examples (Iterable[Example]): Examples to score
RETURNS (Dict[str, float]): A dictionary containing the scores
RETURNS (Dict[str, Any]): A dictionary containing the scores
token_acc/p/r/f.
DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
@ -140,6 +146,8 @@ class Scorer:
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
if gold_doc.has_unknown_spaces:
continue
align = example.alignment
gold_spans = set()
pred_spans = set()
@ -156,12 +164,20 @@ class Scorer:
else:
acc_score.tp += 1
prf_score.score_set(pred_spans, gold_spans)
return {
"token_acc": acc_score.fscore,
"token_p": prf_score.precision,
"token_r": prf_score.recall,
"token_f": prf_score.fscore,
}
if len(acc_score) > 0:
return {
"token_acc": acc_score.fscore,
"token_p": prf_score.precision,
"token_r": prf_score.recall,
"token_f": prf_score.fscore,
}
else:
return {
"token_acc": None,
"token_p": None,
"token_r": None,
"token_f": None
}
@staticmethod
def score_token_attr(
@ -169,8 +185,9 @@ class Scorer:
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
missing_values: Set[Any] = MISSING_VALUES,
**cfg,
) -> Dict[str, float]:
) -> Dict[str, Any]:
"""Returns an accuracy score for a token-level attribute.
examples (Iterable[Example]): Examples to score
@ -178,7 +195,7 @@ class Scorer:
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
RETURNS (Dict[str, float]): A dictionary containing the accuracy score
RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
under the key attr_acc.
DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
@ -189,17 +206,27 @@ class Scorer:
pred_doc = example.predicted
align = example.alignment
gold_tags = set()
missing_indices = set()
for gold_i, token in enumerate(gold_doc):
gold_tags.add((gold_i, getter(token, attr)))
value = getter(token, attr)
if value not in missing_values:
gold_tags.add((gold_i, getter(token, attr)))
else:
missing_indices.add(gold_i)
pred_tags = set()
for token in pred_doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] == 1:
gold_i = align.x2y[token.i].dataXd[0, 0]
pred_tags.add((gold_i, getter(token, attr)))
if gold_i not in missing_indices:
pred_tags.add((gold_i, getter(token, attr)))
tag_score.score_set(pred_tags, gold_tags)
return {f"{attr}_acc": tag_score.fscore}
score_key = f"{attr}_acc"
if len(tag_score) == 0:
return {score_key: None}
else:
return {score_key: tag_score.fscore}
@staticmethod
def score_token_attr_per_feat(
@ -207,8 +234,9 @@ class Scorer:
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
missing_values: Set[Any] = MISSING_VALUES,
**cfg,
):
) -> Dict[str, Any]:
"""Return PRF scores per feat for a token attribute in UFEATS format.
examples (Iterable[Example]): Examples to score
@ -216,7 +244,7 @@ class Scorer:
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
RETURNS (dict): A dictionary containing the per-feat PRF scores unders
RETURNS (dict): A dictionary containing the per-feat PRF scores under
the key attr_per_feat.
"""
per_feat = {}
@ -225,9 +253,11 @@ class Scorer:
gold_doc = example.reference
align = example.alignment
gold_per_feat = {}
missing_indices = set()
for gold_i, token in enumerate(gold_doc):
morph = str(getter(token, attr))
if morph:
value = getter(token, attr)
morph = gold_doc.vocab.strings[value]
if value not in missing_values and morph != Morphology.EMPTY_MORPH:
for feat in morph.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP)
if field not in per_feat:
@ -235,27 +265,35 @@ class Scorer:
if field not in gold_per_feat:
gold_per_feat[field] = set()
gold_per_feat[field].add((gold_i, feat))
else:
missing_indices.add(gold_i)
pred_per_feat = {}
for token in pred_doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] == 1:
gold_i = align.x2y[token.i].dataXd[0, 0]
morph = str(getter(token, attr))
if morph:
for feat in morph.split("|"):
field, values = feat.split("=")
if field not in per_feat:
per_feat[field] = PRFScore()
if field not in pred_per_feat:
pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat))
if gold_i not in missing_indices:
value = getter(token, attr)
morph = gold_doc.vocab.strings[value]
if value not in missing_values and morph != Morphology.EMPTY_MORPH:
for feat in morph.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP)
if field not in per_feat:
per_feat[field] = PRFScore()
if field not in pred_per_feat:
pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat))
for field in per_feat:
per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
result = {k: v.to_dict() for k, v in per_feat.items()}
return {f"{attr}_per_feat": result}
score_key = f"{attr}_per_feat"
if any([len(v) for v in per_feat.values()]):
result = {k: v.to_dict() for k, v in per_feat.items()}
return {score_key: result}
else:
return {score_key: None}
@staticmethod
def score_spans(
@ -263,6 +301,7 @@ class Scorer:
attr: str,
*,
getter: Callable[[Doc, str], Iterable[Span]] = getattr,
has_annotation: Optional[Callable[[Doc], bool]] = None,
**cfg,
) -> Dict[str, Any]:
"""Returns PRF scores for labeled spans.
@ -282,18 +321,10 @@ class Scorer:
for example in examples:
pred_doc = example.predicted
gold_doc = example.reference
# TODO
# This is a temporary hack to work around the problem that the scorer
# fails if you have examples that are not fully annotated for all
# the tasks in your pipeline. For instance, you might have a corpus
# of NER annotations that does not set sentence boundaries, but the
# pipeline includes a parser or senter, and then the score_weights
# are used to evaluate that component. When the scorer attempts
# to read the sentences from the gold document, it fails.
try:
list(getter(gold_doc, attr))
except ValueError:
continue
# Option to handle docs without sents
if has_annotation is not None:
if not has_annotation(gold_doc):
continue
# Find all labels in gold and doc
labels = set(
[k.label_ for k in getter(gold_doc, attr)]
@ -321,13 +352,21 @@ class Scorer:
v.score_set(pred_per_type[k], gold_per_type[k])
# Score for all labels
score.score_set(pred_spans, gold_spans)
results = {
f"{attr}_p": score.precision,
f"{attr}_r": score.recall,
f"{attr}_f": score.fscore,
f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
}
return results
if len(score) > 0:
return {
f"{attr}_p": score.precision,
f"{attr}_r": score.recall,
f"{attr}_f": score.fscore,
f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
}
else:
return {
f"{attr}_p": None,
f"{attr}_r": None,
f"{attr}_f": None,
f"{attr}_per_type": None,
}
@staticmethod
def score_cats(
@ -362,9 +401,13 @@ class Scorer:
for all:
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
attr_score_desc (text description of the overall score),
attr_micro_p,
attr_micro_r,
attr_micro_f,
attr_macro_p,
attr_macro_r,
attr_macro_f,
attr_auc,
attr_macro_auc,
attr_f_per_type,
attr_auc_per_type
@ -384,9 +427,6 @@ class Scorer:
pred_cats = getter(example.predicted, attr)
gold_cats = getter(example.reference, attr)
# I think the AUC metric is applicable regardless of whether we're
# doing multi-label classification? Unsure. If not, move this into
# the elif pred_cats and gold_cats block below.
for label in labels:
pred_score = pred_cats.get(label, 0.0)
gold_score = gold_cats.get(label, 0.0)
@ -431,7 +471,9 @@ class Scorer:
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
# Limit macro_auc to those labels with gold annotations,
# but still divide by all cats to avoid artificial boosting of datasets with missing labels
macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
results = {
f"{attr}_score": None,
f"{attr}_score_desc": None,
@ -443,7 +485,7 @@ class Scorer:
f"{attr}_macro_f": macro_f,
f"{attr}_macro_auc": macro_auc,
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
}
if len(labels) == 2 and not multi_label and positive_label:
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
@ -534,6 +576,7 @@ class Scorer:
head_attr: str = "head",
head_getter: Callable[[Token, str], Token] = getattr,
ignore_labels: Iterable[str] = SimpleFrozenList(),
missing_values: Set[Any] = MISSING_VALUES,
**cfg,
) -> Dict[str, Any]:
"""Returns the UAS, LAS, and LAS per type scores for dependency
@ -558,6 +601,7 @@ class Scorer:
unlabelled = PRFScore()
labelled = PRFScore()
labelled_per_dep = dict()
missing_indices = set()
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
@ -567,13 +611,16 @@ class Scorer:
for gold_i, token in enumerate(gold_doc):
dep = getter(token, attr)
head = head_getter(token, head_attr)
if dep not in ignore_labels:
gold_deps.add((gold_i, head.i, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
if dep not in gold_deps_per_dep:
gold_deps_per_dep[dep] = set()
gold_deps_per_dep[dep].add((gold_i, head.i, dep))
if dep not in missing_values:
if dep not in ignore_labels:
gold_deps.add((gold_i, head.i, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
if dep not in gold_deps_per_dep:
gold_deps_per_dep[dep] = set()
gold_deps_per_dep[dep].add((gold_i, head.i, dep))
else:
missing_indices.add(gold_i)
pred_deps = set()
pred_deps_per_dep = {}
for token in pred_doc:
@ -583,25 +630,26 @@ class Scorer:
gold_i = None
else:
gold_i = align.x2y[token.i].dataXd[0, 0]
dep = getter(token, attr)
head = head_getter(token, head_attr)
if dep not in ignore_labels and token.orth_.strip():
if align.x2y.lengths[head.i] == 1:
gold_head = align.x2y[head.i].dataXd[0, 0]
else:
gold_head = None
# None is indistinct, so we can't just add it to the set
# Multiple (None, None) deps are possible
if gold_i is None or gold_head is None:
unlabelled.fp += 1
labelled.fp += 1
else:
pred_deps.add((gold_i, gold_head, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
if dep not in pred_deps_per_dep:
pred_deps_per_dep[dep] = set()
pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
if gold_i not in missing_indices:
dep = getter(token, attr)
head = head_getter(token, head_attr)
if dep not in ignore_labels and token.orth_.strip():
if align.x2y.lengths[head.i] == 1:
gold_head = align.x2y[head.i].dataXd[0, 0]
else:
gold_head = None
# None is indistinct, so we can't just add it to the set
# Multiple (None, None) deps are possible
if gold_i is None or gold_head is None:
unlabelled.fp += 1
labelled.fp += 1
else:
pred_deps.add((gold_i, gold_head, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
if dep not in pred_deps_per_dep:
pred_deps_per_dep[dep] = set()
pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
labelled.score_set(pred_deps, gold_deps)
for dep in labelled_per_dep:
labelled_per_dep[dep].score_set(
@ -610,29 +658,34 @@ class Scorer:
unlabelled.score_set(
set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
)
return {
f"{attr}_uas": unlabelled.fscore,
f"{attr}_las": labelled.fscore,
f"{attr}_las_per_type": {
k: v.to_dict() for k, v in labelled_per_dep.items()
},
}
if len(unlabelled) > 0:
return {
f"{attr}_uas": unlabelled.fscore,
f"{attr}_las": labelled.fscore,
f"{attr}_las_per_type": {
k: v.to_dict() for k, v in labelled_per_dep.items()
},
}
else:
return {
f"{attr}_uas": None,
f"{attr}_las": None,
f"{attr}_las_per_type": None,
}
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
"""Compute per-entity PRFScore objects for a sequence of examples. The
results are returned as a dictionary keyed by the entity type. You can
add the PRFScore objects to get micro-averaged total.
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples.
"""
scores = defaultdict(PRFScore)
score_per_type = defaultdict(PRFScore)
for eg in examples:
if not eg.y.has_annotation("ENT_IOB"):
continue
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
align_x2y = eg.alignment.x2y
for pred_ent in eg.x.ents:
if pred_ent.label_ not in scores:
scores[pred_ent.label_] = PRFScore()
if pred_ent.label_ not in score_per_type:
score_per_type[pred_ent.label_] = PRFScore()
indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
if len(indices):
g_span = eg.y[indices[0] : indices[-1] + 1]
@ -642,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
if all(token.ent_iob != 0 for token in g_span):
key = (pred_ent.label_, indices[0], indices[-1] + 1)
if key in golds:
scores[pred_ent.label_].tp += 1
score_per_type[pred_ent.label_].tp += 1
golds.remove(key)
else:
scores[pred_ent.label_].fp += 1
score_per_type[pred_ent.label_].fp += 1
for label, start, end in golds:
scores[label].fn += 1
return scores
score_per_type[label].fn += 1
totals = PRFScore()
for prf in score_per_type.values():
totals += prf
if len(totals) > 0:
return {
"ents_p": totals.precision,
"ents_r": totals.recall,
"ents_f": totals.fscore,
"ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
}
else:
return {
"ents_p": None,
"ents_r": None,
"ents_f": None,
"ents_per_type": None,
}
#############################################################################
@ -726,7 +795,7 @@ def _roc_auc_score(y_true, y_score):
<https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
"""
if len(np.unique(y_true)) != 2:
raise ValueError(Errors.E165)
raise ValueError(Errors.E165.format(label=np.unique(y_true)))
fpr, tpr, _ = _roc_curve(y_true, y_score)
return _auc(fpr, tpr)

View File

@ -218,11 +218,16 @@ def test_dependency_matcher_callback(en_vocab, doc):
pattern = [
{"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}},
]
nomatch_pattern = [
{"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}},
]
matcher = DependencyMatcher(en_vocab)
mock = Mock()
matcher.add("pattern", [pattern], on_match=mock)
matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock)
matches = matcher(doc)
assert len(matches) == 1
mock.assert_called_once_with(matcher, doc, 0, matches)
# check that matches with and without callback are the same (#4590)

View File

@ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts):
scores = nlp.evaluate(dev_examples)
# "cat" is the only correct lemma
assert scores["lemma_acc"] == pytest.approx(0.2)
# the empty morphs are correct
assert scores["morph_acc"] == pytest.approx(0.6)
# no morphs are set
assert scores["morph_acc"] == None
def test_attributeruler_rule_order(nlp):

View File

@ -2,6 +2,7 @@ import pytest
from spacy.language import Language
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.tokens import Doc
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
from thinc.api import Model, Linear, ConfigValidationError
@ -156,15 +157,10 @@ def test_pipe_class_component_model():
name = "test_class_component_model"
default_config = {
"model": {
"@architectures": "spacy.TextCatEnsemble.v1",
"exclusive_classes": False,
"pretrained_vectors": None,
"width": 64,
"embed_size": 2000,
"window_size": 1,
"conv_depth": 2,
"ngram_size": 1,
"dropout": None,
"@architectures": "spacy.TextCatEnsemble.v2",
"tok2vec": DEFAULT_TOK2VEC_MODEL,
"linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
"no_output_layer": False},
},
"value1": 10,
}

View File

@ -140,7 +140,7 @@ def test_overfitting_IO():
nlp = English()
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
# Set exclusive labels
config = {"model": {"exclusive_classes": True}}
config = {"model": {"linear_model": {"exclusive_classes": True}}}
textcat = nlp.add_pipe("textcat", config=config)
train_examples = []
for text, annotations in TRAIN_DATA:
@ -192,9 +192,8 @@ def test_overfitting_IO():
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
],

View File

@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate
from numpy.testing import assert_array_equal
import numpy
from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier
from spacy.ml.staticvectors import StaticVectors
from spacy.lang.en import English
from spacy.lang.en.examples import sentences as EN_SENTENCES
def get_textcat_kwargs():
def get_textcat_bow_kwargs():
return {
"width": 64,
"embed_size": 2000,
"pretrained_vectors": None,
"exclusive_classes": False,
"exclusive_classes": True,
"ngram_size": 1,
"window_size": 1,
"conv_depth": 2,
"dropout": None,
"nO": 7,
"no_output_layer": False,
"nO": 34,
}
def get_textcat_cnn_kwargs():
return {
"tok2vec": test_tok2vec(),
"exclusive_classes": False,
"nO": 13,
}
return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
def get_all_params(model):
@ -105,7 +96,7 @@ def test_multi_hash_embed():
"seed,model_func,kwargs",
[
(0, build_Tok2Vec_model, get_tok2vec_kwargs()),
(0, build_text_classifier, get_textcat_kwargs()),
(0, build_bow_text_classifier, get_textcat_bow_kwargs()),
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
],
)
@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs):
"seed,model_func,kwargs,get_X",
[
(0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
(0, build_text_classifier, get_textcat_kwargs(), get_docs),
(0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
],
)
@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X):
"seed,dropout,model_func,kwargs,get_X",
[
(0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
(0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs),
(0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
(0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
],
)

View File

@ -277,6 +277,62 @@ def test_tag_score(tagged_doc):
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
def test_partial_annotation(en_tokenizer):
pred_doc = en_tokenizer("a b c d e")
pred_doc[0].tag_ = "A"
pred_doc[0].pos_ = "X"
pred_doc[0].set_morph("Feat=Val")
pred_doc[0].dep_ = "dep"
# unannotated reference
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
for key in scores:
# cats doesn't have an unset state
if key.startswith("cats"):
continue
assert scores[key] == None
# partially annotated reference, not overlapping with predicted annotation
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
ref_doc[1].tag_ = "A"
ref_doc[1].pos_ = "X"
ref_doc[1].set_morph("Feat=Val")
ref_doc[1].dep_ = "dep"
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
assert scores["token_acc"] == None
assert scores["tag_acc"] == 0.0
assert scores["pos_acc"] == 0.0
assert scores["morph_acc"] == 0.0
assert scores["dep_uas"] == 1.0
assert scores["dep_las"] == 0.0
assert scores["sents_f"] == None
# partially annotated reference, overlapping with predicted annotation
ref_doc = en_tokenizer("a b c d e")
ref_doc.has_unknown_spaces = True
ref_doc[0].tag_ = "A"
ref_doc[0].pos_ = "X"
ref_doc[1].set_morph("Feat=Val")
ref_doc[1].dep_ = "dep"
example = Example(pred_doc, ref_doc)
scorer = Scorer()
scores = scorer.score([example])
assert scores["token_acc"] == None
assert scores["tag_acc"] == 1.0
assert scores["pos_acc"] == 1.0
assert scores["morph_acc"] == 0.0
assert scores["dep_uas"] == 1.0
assert scores["dep_las"] == 0.0
assert scores["sents_f"] == None
def test_roc_auc_score():
# Binary classification, toy tests from scikit-learn test suite
y_true = [0, 1]
@ -334,7 +390,8 @@ def test_roc_auc_score():
score = ROCAUCScore()
score.score_set(0.25, 0)
score.score_set(0.75, 0)
assert score.score == -float("inf")
with pytest.raises(ValueError):
s = score.score
y_true = [1, 1]
y_score = [0.25, 0.75]
@ -344,4 +401,5 @@ def test_roc_auc_score():
score = ROCAUCScore()
score.score_set(0.25, 1)
score.score_set(0.75, 1)
assert score.score == -float("inf")
with pytest.raises(ValueError):
s = score.score

View File

@ -51,7 +51,7 @@ def test_readers():
for example in train_corpus(nlp):
nlp.update([example], sgd=optimizer)
scores = nlp.evaluate(list(dev_corpus(nlp)))
assert scores["cats_score"]
assert scores["cats_score"] == 0.0
# ensure the pipeline runs
doc = nlp("Quick test")
assert doc.cats

View File

@ -2,6 +2,7 @@ import numpy
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
from spacy.training import biluo_tags_to_spans, iob_to_biluo
from spacy.training import Corpus, docs_to_json, Example
from spacy.training.align import get_alignments
from spacy.training.converters import json_to_docs
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc):
assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
@pytest.mark.skip("Outdated")
@pytest.mark.parametrize(
"tokens_a,tokens_b,expected",
[
(["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
(["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])),
(
["a", "b", '"', "c"],
['ab"', "c"],
(4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
([[0], [0], [0], [1]], [[0, 1, 2], [3]]),
),
(["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})),
(["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])),
(
["ab", "c", "d"],
["a", "b", "cd"],
(6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}),
([[0, 1], [2], [2]], [[0], [0], [1, 2]]),
),
(
["a", "b", "cd"],
["a", "b", "c", "d"],
(3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
),
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
([" ", "a"], ["a"], ([[], [0]], [[1]])),
],
)
def test_align(tokens_a, tokens_b, expected): # noqa
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa
assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa
a2b, b2a = get_alignments(tokens_a, tokens_b)
assert (a2b, b2a) == expected # noqa
# check symmetry
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa
a2b, b2a = get_alignments(tokens_b, tokens_a) # noqa
assert (b2a, a2b) == expected # noqa
def test_goldparse_startswith_space(en_tokenizer):
@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer):
assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]
def test_goldparse_endswith_space(en_tokenizer):
text = "a\n"
doc = en_tokenizer(text)
gold_words = ["a"]
entities = ["U-DATE"]
deps = ["ROOT"]
heads = [0]
example = Example.from_dict(
doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
)
ner_tags = example.get_aligned_ner()
assert ner_tags == ["U-DATE", "O"]
assert example.get_aligned("DEP", as_string=True) == ["ROOT", None]
def test_gold_constructor():
"""Test that the Example constructor works fine"""
nlp = English()
@ -676,6 +691,87 @@ def test_alignment_different_texts():
Alignment.from_strings(other_tokens, spacy_tokens)
def test_alignment_spaces(en_vocab):
# single leading whitespace
other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]
# multiple leading whitespace tokens
other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]
# both with leading whitespace, not identical
other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6]
assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2]
assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7]
# same leading whitespace, different tokenization
other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["  ", "i", "listened", "to", "obama", "'s", "podcasts."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6]
assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2]
assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7]
# only one with trailing whitespace
other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
# different trailing whitespace
other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "]
align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6]
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1]
assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6]
# same trailing whitespace, different tokenization
other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", "  "]
align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6]
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2]
assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7]
# differing whitespace is allowed
other_tokens = ["a", " \n ", "b", "c"]
spacy_tokens = ["a", "b", " ", "c"]
align = Alignment.from_strings(other_tokens, spacy_tokens)
assert list(align.x2y.dataXd) == [0, 1, 3]
assert list(align.y2x.dataXd) == [0, 2, 3]
# other differences in whitespace are allowed
other_tokens = [" ", "a"]
spacy_tokens = [" ", "a", " "]
align = Alignment.from_strings(other_tokens, spacy_tokens)
other_tokens = ["a", " "]
spacy_tokens = ["a", " "]
align = Alignment.from_strings(other_tokens, spacy_tokens)
def test_retokenized_docs(doc):
a = doc.to_array(["TAG"])
doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)

View File

@ -399,14 +399,13 @@ cdef class Doc:
return True
cdef int i
cdef int range_start = 0
if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
attr = SENT_START
attr = intify_attr(attr)
# adjust attributes
if attr == HEAD:
# HEAD does not have an unset state, so rely on DEP
attr = DEP
elif attr == self.vocab.strings["IS_SENT_START"]:
# as in Matcher, allow IS_SENT_START as an alias of SENT_START
attr = SENT_START
# special cases for sentence boundaries
if attr == SENT_START:
if "sents" in self.user_hooks:

View File

@ -1,6 +1,6 @@
from .corpus import Corpus # noqa: F401
from .example import Example, validate_examples, validate_get_examples # noqa: F401
from .align import Alignment # noqa: F401
from .alignment import Alignment # noqa: F401
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401

spacy/training/align.pyx (new file, 66 lines)
View File

@ -0,0 +1,66 @@
from typing import List, Tuple
from itertools import chain
import re
from ..errors import Errors
def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
# Create character-to-token mappings
char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
str_a = "".join(A).lower()
str_b = "".join(B).lower()
cdef int len_str_a = len(str_a)
cdef int len_str_b = len(str_b)
# Check that the two texts only differ in whitespace and capitalization
if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
len_str_a != len(char_to_token_a) or \
len_str_b != len(char_to_token_b):
raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
cdef int char_idx_a = 0
cdef int char_idx_b = 0
cdef int token_idx_a = 0
cdef int token_idx_b = 0
cdef int prev_token_idx_a = -1
cdef int prev_token_idx_b = -1
a2b = []
b2a = []
while char_idx_a < len_str_a and char_idx_b < len_str_b:
# Find the current token position from the character position
token_idx_a = char_to_token_a[char_idx_a]
token_idx_b = char_to_token_b[char_idx_b]
# Add a set for the next token if a token boundary has been crossed
if prev_token_idx_a != token_idx_a:
a2b.append(set())
if prev_token_idx_b != token_idx_b:
b2a.append(set())
# Process the alignment at the current position
if A[token_idx_a] == B[token_idx_b]:
# Current tokens are identical
a2b[-1].add(token_idx_b)
b2a[-1].add(token_idx_a)
char_idx_a += len(A[token_idx_a])
char_idx_b += len(B[token_idx_b])
elif str_a[char_idx_a] == str_b[char_idx_b]:
# Current chars are identical
a2b[-1].add(token_idx_b)
b2a[-1].add(token_idx_a)
char_idx_a += 1
char_idx_b += 1
elif str_a[char_idx_a].isspace():
# Skip unaligned whitespace char in A
char_idx_a += 1
elif str_b[char_idx_b].isspace():
# Skip unaligned whitespace char in B
char_idx_b += 1
else:
# This should never happen
raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
prev_token_idx_a = token_idx_a
prev_token_idx_b = token_idx_b
# Process unaligned trailing whitespace
a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:])))
b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:])))
# Return values as sorted lists per token position
return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
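
A usage sketch of the new `get_alignments`, with expected values taken from the updated `test_align` cases later in this diff: each token maps to the (possibly empty) sorted list of overlapping token indices in the other tokenization:

```python
from spacy.training.align import get_alignments

a2b, b2a = get_alignments(["a", "b", "c"], ["ab", "c"])
assert a2b == [[0], [0], [1]]  # "a" and "b" both fall inside "ab"
assert b2a == [[0, 1], [2]]    # "ab" spans tokens 0 and 1

a2b, b2a = get_alignments([" ", "a"], ["a"])
assert (a2b, b2a) == ([[], [0]], [[1]])  # unaligned whitespace maps to nothing
```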

View File

@ -2,9 +2,8 @@ from typing import List
import numpy
from thinc.types import Ragged
from dataclasses import dataclass
import tokenizations
from ..errors import Errors
from .align import get_alignments
@dataclass
@ -20,9 +19,7 @@ class Alignment:
@classmethod
def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
raise ValueError(Errors.E949)
x2y, y2x = tokenizations.get_alignments(A, B)
x2y, y2x = get_alignments(A, B)
return Alignment.from_indices(x2y=x2y, y2x=y2x)

View File

@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
from ..tokens.span cimport Span
from ..tokens.span import Span
from ..attrs import IDS
from .align import Alignment
from .alignment import Alignment
from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings

View File

@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
# Resolve all training-relevant sections using the filled nlp config
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
dot_names = [T["train_corpus"], T["dev_corpus"]]
if not isinstance(T["train_corpus"], str):
raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"])))
if not isinstance(T["dev_corpus"], str):
raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"])))
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
optimizer = T["optimizer"]
# Components that shouldn't be updated during training
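
The new checks require `training.train_corpus` and `training.dev_corpus` to be dot-notation strings referring to config sections, raising E897 otherwise; a sketch of a config shape that passes, modeled on spaCy's default layout (paths are placeholders):

```python
# Sketch: corpus settings as dot-notation strings pointing at [corpora.*] sections.
from thinc.api import Config

Config().from_str("""
[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"

[corpora.train]
@readers = "spacy.Corpus.v1"
path = "train.spacy"

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = "dev.spacy"
""")
```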

View File

@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..errors import Errors
from ..util import registry, load_model_from_config, dot_to_object
from ..util import registry, load_model_from_config, resolve_dot_names
def pretrain(
@ -38,7 +38,7 @@ def pretrain(
_config = nlp.config.interpolate()
T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
corpus = dot_to_object(T, P["corpus"])
corpus = resolve_dot_names(_config, [P["corpus"]])[0]
batcher = P["batcher"]
model = create_pretraining_model(nlp, P)
optimizer = P["optimizer"]

View File

@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline.
Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build a mixed representations. The features used
a feed-forward subnetwork to build a mixed representation. The features used
can be configured with the `attrs` argument. The suggested attributes are
`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
some subword information, without constructing a fully character-based
@ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with
different architectures and settings to determine what works best on your
specific data and challenge.
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatEnsemble.v1"
> exclusive_classes = false
> pretrained_vectors = null
> width = 64
> embed_size = 2000
> conv_depth = 2
> window_size = 1
> ngram_size = 1
> dropout = null
> @architectures = "spacy.TextCatEnsemble.v2"
> nO = null
>
> [model.linear_model]
> @architectures = "spacy.TextCatBOW.v1"
> exclusive_classes = true
> ngram_size = 1
> no_output_layer = false
>
> [model.tok2vec]
> @architectures = "spacy.Tok2Vec.v1"
>
> [model.tok2vec.embed]
> @architectures = "spacy.MultiHashEmbed.v1"
> width = 64
> rows = [2000, 2000, 1000, 1000, 1000, 1000]
> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
> include_static_vectors = false
>
> [model.tok2vec.encode]
> @architectures = "spacy.MaxoutWindowEncoder.v1"
> width = ${model.tok2vec.embed.width}
> window_size = 1
> maxout_pieces = 3
> depth = 2
> ```
Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.
Stacked ensemble of a linear bag-of-words model and a neural network model. The
neural network is built upon a Tok2Vec layer and uses attention. Whether the
model supports multi-label classification is taken from the linear model, where
the setting is stored in `model.attrs["multi_label"]`.
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~ |
| `tok2vec` | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
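As a rough sketch of where that setting comes from (registry lookup and
arguments assumed, not taken from this diff), the linear model records it in
its `attrs` when it is built:

```python
from spacy import registry

# Build the bag-of-words model directly; exclusive_classes=True means
# single-label classification, so multi_label is stored as False
create_bow = registry.architectures.get("spacy.TextCatBOW.v1")
linear_model = create_bow(exclusive_classes=True, ngram_size=1, no_output_layer=False)
print(linear_model.attrs["multi_label"])  # False
```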
<Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
Version 1 was functionally similar, but used an internal `tok2vec` instead of taking it as an argument.
| Name | Description |
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
</Accordion>
### spacy.TextCatCNN.v1 {#TextCatCNN}
> #### Example Config

View File

@ -683,6 +683,7 @@ The L2 norm of the document's vector representation.
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
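A small sketch of when this flag is set (constructor behavior assumed from the
v3 `Doc` API):

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

# No `spaces` are passed, so the spacing between tokens is unknown
doc = Doc(Vocab(), words=["Hello", "world", "!"])
print(doc.has_unknown_spaces)  # True
```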
## Serialization fields {#serialization-fields}

View File

@ -68,6 +68,8 @@ Scores the tokenization:
- `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
character spans
Docs with `has_unknown_spaces` are skipped during scoring.
> #### Example
>
> ```python
@ -81,7 +83,8 @@ Scores the tokenization:
## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}
Scores a single token attribute.
Scores a single token attribute. Tokens with missing values in the reference doc
are skipped during scoring.
> #### Example
>
@ -90,20 +93,22 @@ Scores a single token attribute.
> print(scores["pos_acc"])
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |
| Name | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |
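A hedged sketch of the new `missing_values` argument (the `examples` are
assumed to be set up as in the example above):

```python
from spacy.scorer import Scorer

# Also treat the placeholder tag "X" as missing annotation, so those
# tokens are skipped rather than counted as errors
scores = Scorer.score_token_attr(examples, "pos", missing_values={0, None, "", "X"})
print(scores["pos_acc"])
```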
## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}
Scores a single token attribute per feature for a token attribute in the
Universal Dependencies
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
format.
format. Tokens with missing values in the reference doc are skipped during
scoring.
> #### Example
>
@ -112,13 +117,14 @@ format.
> print(scores["morph_per_feat"])
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
| Name | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}
@ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans.
> print(scores["ents_f"])
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
| Name | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ |
| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
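A sketch of `has_annotation` in use (callable signature assumed from the
description above; `examples` as before):

```python
from spacy.scorer import Scorer

# Skip docs that carry no NER annotation at all, instead of scoring
# their missing entities as errors
scores = Scorer.score_spans(
    examples,
    "ents",
    has_annotation=lambda doc: doc.has_annotation("ENT_IOB"),
)
print(scores["ents_f"])
```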
## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}
Calculate the UAS, LAS, and LAS per type scores for dependency parses.
Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens
with missing values for the `attr` (typically `dep`) are skipped during scoring.
> #### Example
>
@ -160,29 +168,40 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
> print(scores["dep_uas"], scores["dep_las"])
> ```
| Name | Description |
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `head_attr` | The attribute containing the head token. ~~str~~ |
| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
| Name | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `head_attr` | The attribute containing the head token. ~~str~~ |
| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
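And a corresponding sketch for dependency scoring (argument names per the
table above; `examples` assumed as before):

```python
from spacy.scorer import Scorer

# Ignore punctuation attachments and use the default missing values
scores = Scorer.score_deps(
    examples,
    "dep",
    head_attr="head",
    ignore_labels=("punct",),
)
print(scores["dep_uas"], scores["dep_las_per_type"])
```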
## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
containing scores for each label like `Doc.cats`. The reported overall score
depends on the scorer settings:
containing scores for each label like `Doc.cats`. The returned dictionary
contains the following scores:
1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
`{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
score), `{attr}_f_per_type`, `{attr}_auc_per_type`
2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across
each label is weighted equally
- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values
across evaluations per label
- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of
scores, keyed by label
- A final `{attr}_score` and corresponding `{attr}_score_desc` (text
description)
The reported `{attr}_score` depends on the classification properties:
- **binary exclusive with positive label:** `{attr}_score` is set to the F-score
of the positive label
- **3+ exclusive classes**, macro-averaged F-score:
`{attr}_score = {attr}_macro_f`
- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc`
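A sketch of the binary exclusive case (keyword arguments assumed from the
`Scorer.score_cats` API):

```python
from spacy.scorer import Scorer

# With exclusive binary classes and a positive label, cats_score is the
# F-score of the positive label
scores = Scorer.score_cats(
    examples,
    "cats",
    labels=["POSITIVE", "NEGATIVE"],
    multi_label=False,
    positive_label="POSITIVE",
)
print(scores["cats_score"], scores["cats_score_desc"])
```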
> #### Example
>

View File

@ -115,7 +115,7 @@ print(french_fries, "<->", burgers, french_fries.similarity(burgers))
Computing similarity scores can be helpful in many situations, but it's also
important to maintain **realistic expectations** about what information it can
provide. Words can be related to each over in many ways, so a single
provide. Words can be related to each other in many ways, so a single
"similarity" score will always be a **mix of different signals**, and vectors
trained on different data can produce very different results that may not be
useful for your purpose. Here are some important considerations to keep in mind:

View File

@ -130,16 +130,31 @@ factory = "textcat"
labels = []
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
exclusive_classes = false
pretrained_vectors = null
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
dropout = 0
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
[components.textcat.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[components.textcat.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${components.textcat.model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 2
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
```
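A hedged sketch of plugging such a model definition into a pipeline
(`textcat_model_config` is a hypothetical dict holding the
`[components.textcat.model]` block above):

```python
import spacy

nlp = spacy.blank("en")
# The nested tok2vec and linear_model sub-blocks are resolved when the
# component is created
textcat = nlp.add_pipe("textcat", config={"model": textcat_model_config})
textcat.add_label("POSITIVE")
```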
spaCy has two additional built-in `textcat` architectures, and you can easily
@ -687,7 +702,7 @@ Before the model can be used, it needs to be
[initialized](/usage/training#initialization). This function receives a callback
to access the full **training data set**, or a representative sample. This data
set can be used to deduce all **relevant labels**. Alternatively, a list of
labels can be provided to `initialize`, or you can call
`RelationExtractor.add_label` directly. The number of labels defines the output
dimensionality of the network, and will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the

View File

@ -1244,15 +1244,10 @@ labels = []
# This function is created and then passed to the "textcat" component as
# the argument "model"
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
pretrained_vectors = null
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
dropout = null
no_output_layer = false
[components.other_textcat]
factory = "textcat"

View File

@ -1142,7 +1142,7 @@ pattern = [
{
"LEFT_ID": "anchor_founded",
"REL_OP": ">",
"RIGHT_ID": "subject",
"RIGHT_ID": "founded_subject",
"RIGHT_ATTRS": {"DEP": "nsubj"},
}
# ...
@ -1212,7 +1212,7 @@ pattern = [
{
"LEFT_ID": "anchor_founded",
"REL_OP": ">",
"RIGHT_ID": "subject",
"RIGHT_ID": "founded_subject",
"RIGHT_ATTRS": {"DEP": "nsubj"},
},
{

View File

@ -717,7 +717,7 @@ tabular results to a file:
```python
### functions.py
import sys
from typing import IO, Tuple, Callable, Dict, Any
from typing import IO, Tuple, Callable, Dict, Any, Optional
import spacy
from spacy import Language
from pathlib import Path
@ -729,7 +729,7 @@ def custom_logger(log_path):
stdout: IO=sys.stdout,
stderr: IO=sys.stderr
) -> Tuple[Callable, Callable]:
stdout.write(f"Logging to {log_path}\n")
stdout.write(f"Logging to {log_path}\\n")
log_file = Path(log_path).open("w", encoding="utf8")
log_file.write("step\\t")
log_file.write("score\\t")

View File

@ -433,14 +433,14 @@ The following methods, attributes and commands are new in spaCy v3.0.
| Name | Description |
| ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
| [`Token.morph`](/api/token#attributes) | Access a token's morphological analysis. |
| [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. |
| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
| [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
| [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
| [`Language.has_factory`](/api/language#has_factory)                                                                              | Check whether a component factory is registered on a language class.                                                                                                                              |
| [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
| [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. |
@ -1032,9 +1032,9 @@ change your names and imports:
Thanks to everyone who's been contributing to the spaCy ecosystem by developing
and maintaining one of the many awesome [plugins and extensions](/universe).
We've tried to make it as easy as possible for you to upgrade your packages for
spaCy v3.0. The most common use case for plugins is providing pipeline components
and extension attributes. When migrating your plugin, double-check the
following:
spaCy v3.0. The most common use case for plugins is providing pipeline
components and extension attributes. When migrating your plugin, double-check
the following:
- Use the [`@Language.factory`](/api/language#factory) decorator to register
your component and assign it a name. This allows users to refer to your

View File

@ -257,7 +257,7 @@ output_path.open("w", encoding="utf-8").write(svg)
Since each visualization is generated as a separate SVG, exporting `.svg` files
only works if you're rendering **one single doc** at a time. (This makes sense:
after all, each visualization should be a standalone graphic.) So instead of
rendering all `Doc`s at one, loop over them and export them separately.
rendering all `Doc`s at once, loop over them and export them separately.
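A minimal sketch of that loop (file names are placeholders):

```python
import spacy
from pathlib import Path
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
texts = ["This is a sentence.", "This is another one."]
for i, doc in enumerate(nlp.pipe(texts)):
    svg = displacy.render(doc, style="dep")
    # Each doc gets its own standalone SVG file
    Path(f"parse_{i}.svg").open("w", encoding="utf-8").write(svg)
```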
</Infobox>

View File

@ -120,7 +120,7 @@ function formatAccuracy(data) {
? null
: {
label,
value: value.toFixed(2),
value: (value * 100).toFixed(2),
help: MODEL_META[label],
}
})