Merge branch 'develop' into nightly.spacy.io

Author: Ines Montani
Date: 2020-11-03 18:10:28 +01:00
Commit: 1075b7ebb7

48 changed files with 753 additions and 344 deletions

View File

@@ -8,7 +8,6 @@ requires = [
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0rc0,<8.1.0",
     "blis>=0.4.0,<0.8.0",
-    "pytokenizations",
     "pathy"
 ]
 build-backend = "setuptools.build_meta"

View File

@@ -14,8 +14,7 @@ pathy
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.5.0,<2.0.0
-pytokenizations
+pydantic>=1.5.0,<1.7.0
 # Official Python utilities
 setuptools
 packaging>=20.0

View File

@@ -51,8 +51,8 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.5.0,<2.0.0
-    pytokenizations
+    pydantic>=1.5.0,<1.7.0
+    jinja2
     # Official Python utilities
     setuptools
     packaging>=20.0

View File

@@ -49,6 +49,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.tokenizer",
+    "spacy.training.align",
     "spacy.training.gold_io",
     "spacy.tokens.doc",
     "spacy.tokens.span",

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0rc1"
+__version__ = "3.0.0rc2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -93,27 +93,42 @@ def evaluate(
         "SPEED": "speed",
     }
     results = {}
+    data = {}
     for metric, key in metrics.items():
         if key in scores:
             if key == "cats_score":
                 metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            if key == "speed":
-                results[metric] = f"{scores[key]:.0f}"
-            else:
-                results[metric] = f"{scores[key]*100:.2f}"
-    data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
+            if isinstance(scores[key], (int, float)):
+                if key == "speed":
+                    results[metric] = f"{scores[key]:.0f}"
+                else:
+                    results[metric] = f"{scores[key]*100:.2f}"
+            else:
+                results[metric] = "-"
+            data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
     msg.table(results, title="Results")
+
+    if "morph_per_feat" in scores:
+        if scores["morph_per_feat"]:
+            print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
+            data["morph_per_feat"] = scores["morph_per_feat"]
+    if "dep_las_per_type" in scores:
+        if scores["dep_las_per_type"]:
+            print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
+            data["dep_las_per_type"] = scores["dep_las_per_type"]
     if "ents_per_type" in scores:
         if scores["ents_per_type"]:
-            print_ents_per_type(msg, scores["ents_per_type"])
+            print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
+            data["ents_per_type"] = scores["ents_per_type"]
     if "cats_f_per_type" in scores:
         if scores["cats_f_per_type"]:
-            print_textcats_f_per_cat(msg, scores["cats_f_per_type"])
+            print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
+            data["cats_f_per_type"] = scores["cats_f_per_type"]
     if "cats_auc_per_type" in scores:
         if scores["cats_auc_per_type"]:
             print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
+            data["cats_auc_per_type"] = scores["cats_auc_per_type"]

     if displacy_path:
         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
@@ -157,7 +172,7 @@ def render_parses(
             file_.write(html)


-def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
+def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None:
     data = [
         (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
         for k, v in scores.items()
@@ -166,20 +181,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
         data,
         header=("", "P", "R", "F"),
         aligns=("l", "r", "r", "r"),
-        title="NER (per type)",
-    )
-
-
-def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
-    data = [
-        (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
-        for k, v in scores.items()
-    ]
-    msg.table(
-        data,
-        header=("", "P", "R", "F"),
-        aligns=("l", "r", "r", "r"),
-        title="Textcat F (per label)",
+        title=f"{name} (per {type})",
     )
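The refactor above folds two near-identical table printers into one `print_prf_per_type`. A small sketch of what that helper renders, assuming wasabi's `Printer` and a made-up per-type score dict:

```python
# Illustrative only: the score dict shape matches what the scorer emits.
from wasabi import Printer

msg = Printer()
scores = {"PERSON": {"p": 0.91, "r": 0.84, "f": 0.87}}
data = [
    (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
    for k, v in scores.items()
]
msg.table(data, header=("", "P", "R", "F"), aligns=("l", "r", "r", "r"), title="NER (per type)")
```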

View File

@@ -39,7 +39,7 @@ def init_vectors_cli(
     nlp.to_disk(output_dir)
     msg.good(
         "Saved nlp object with vectors to output directory. You can now use the "
-        "path to it in your config as the 'vectors' setting in [initialize.vocab].",
+        "path to it in your config as the 'vectors' setting in [initialize].",
         output_dir.resolve(),
     )
@@ -100,7 +100,7 @@ def init_labels_cli(
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     if not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
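The `mkdir(parents=True)` change lets the init commands create nested output directories in one step; a minimal illustration of the standard pathlib behavior:

```python
from pathlib import Path

out = Path("training/experiment1/labels")
out.mkdir(parents=True)  # also creates training/ and training/experiment1/
# a bare out.mkdir() raises FileNotFoundError when the parent dirs don't exist
```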

View File

@@ -136,15 +136,19 @@ factory = "textcat"
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"
@@ -271,15 +275,19 @@ factory = "textcat"
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"

View File

@@ -44,7 +44,7 @@ def train_cli(
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
     if output_path is not None and not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
         msg.good(f"Created output directory: {output_path}")
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)

View File

@@ -398,8 +398,8 @@ class Errors:
     E163 = ("cumsum was found to be unstable: its last element does not "
             "correspond to sum")
     E164 = ("x is neither increasing nor decreasing: {x}.")
-    E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
-            "that case.")
+    E165 = ("Only one class present in the gold labels: {label}. "
+            "ROC AUC score is not defined in that case.")
     E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
@@ -456,6 +456,8 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")

     # TODO: fix numbering after merging develop into master
+    E897 = ("Field '{field}' should be a dot-notation string referring to the "
+            "relevant section in the config, but found type {type} instead.")
     E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
             "is not set or None. If you've implemented a custom component, make "
             "sure to store the component model as `self.model` in your "
@@ -562,7 +564,10 @@ class Errors:
             "a string value from {expected} but got: '{arg}'")
     E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
             "a list, but got: {arg_type}")
-    E949 = ("Can only create an alignment when the texts are the same.")
+    E949 = ("Unable to align tokens for the predicted and reference docs. It "
+            "is only possible to align the docs when both texts are the same "
+            "except for whitespace and capitalization. The predicted tokens "
+            "start with: {x}. The reference tokens start with: {y}.")
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "

View File

@@ -286,10 +286,10 @@ cdef class DependencyMatcher:
                 self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees)
             for matched_tree in matched_trees:
                 matched_key_trees.append((key, matched_tree))
-            for i, (match_id, nodes) in enumerate(matched_key_trees):
-                on_match = self._callbacks.get(match_id)
-                if on_match is not None:
-                    on_match(self, doc, i, matched_key_trees)
+        for i, (match_id, nodes) in enumerate(matched_key_trees):
+            on_match = self._callbacks.get(match_id)
+            if on_match is not None:
+                on_match(self, doc, i, matched_key_trees)
         return matched_key_trees

     def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):

View File

@@ -1,4 +1,6 @@
-from typing import Optional
+from typing import Optional, List
+from thinc.types import Floats2d
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
@@ -10,12 +12,13 @@ from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
+from ...tokens import Doc


 @registry.architectures.register("spacy.TextCatCNN.v1")
 def build_simple_cnn_text_classifier(
     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
-) -> Model:
+) -> Model[List[Doc], Floats2d]:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the
@@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier(
     is applied instead, so that outputs are in the range [0, 1].
     """
     with Model.define_operators({">>": chain}):
+        cnn = tok2vec >> list2ragged() >> reduce_mean()
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
+            model = cnn >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
             linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = (
-                tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
-            )
+            model = cnn >> linear_layer >> Logistic()
             model.set_ref("output_layer", linear_layer)
         model.set_ref("tok2vec", tok2vec)
         model.set_dim("nO", nO)
@@ -45,8 +47,7 @@ def build_bow_text_classifier(
     ngram_size: int,
     no_output_layer: bool,
     nO: Optional[int] = None,
-) -> Model:
-    # Don't document this yet, I'm not sure it's right.
+) -> Model[List[Doc], Floats2d]:
     with Model.define_operators({">>": chain}):
         sparse_linear = SparseLinear(nO)
         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@@ -59,6 +60,39 @@ def build_bow_text_classifier(
     return model


+@registry.architectures.register("spacy.TextCatEnsemble.v2")
+def build_text_classifier(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    linear_model: Model[List[Doc], Floats2d],
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    exclusive_classes = not linear_model.attrs["multi_label"]
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.get_dim("nO")
+        cnn_model = (
+            tok2vec
+            >> list2ragged()
+            >> ParametricAttention(width)  # TODO: benchmark performance difference of this layer
+            >> reduce_sum()
+            >> residual(Maxout(nO=width, nI=width))
+            >> Linear(nO=nO, nI=width)
+            >> Dropout(0.0)
+        )
+        nO_double = nO * 2 if nO else None
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO, nI=nO_double)
+        else:
+            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
+        model = (linear_model | cnn_model) >> output_layer
+        model.set_ref("tok2vec", tok2vec)
+        if model.has_dim("nO") is not False:
+            model.set_dim("nO", nO)
+        model.set_ref("output_layer", linear_model.get_ref("output_layer"))
+        model.attrs["multi_label"] = not exclusive_classes
+    return model
+
+
+# TODO: move to legacy
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
 def build_text_classifier(
     width: int,
@@ -158,11 +192,8 @@ def build_text_classifier(
 @registry.architectures.register("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
-    width: int,
-    pretrained_vectors: Optional[bool],
-    dropout: Optional[float],
-    nO: Optional[int] = None,
-) -> Model:
+    width: int, dropout: Optional[float], nO: Optional[int] = None
+) -> Model[List[Doc], Floats2d]:
     # Don't document this yet, I'm not sure it's right.
     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
     with Model.define_operators({">>": chain, "**": clone}):

View File

@@ -106,7 +106,7 @@ def MultiHashEmbed(
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
-    through a feed-forward subnetwork to build a mixed representations.
+    through a feed-forward subnetwork to build a mixed representation.

     The features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into

View File

@@ -226,6 +226,9 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/tagger#score
         """
+        def morph_key_getter(token, attr):
+            return getattr(token, attr).key
+
         validate_examples(examples, "AttributeRuler.score")
         results = {}
         attrs = set()
@@ -237,7 +240,8 @@ class AttributeRuler(Pipe):
             elif attr == POS:
                 results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
             elif attr == MORPH:
-                results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+                results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
+                results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs))
             elif attr == LEMMA:
                 results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
         return results

View File

@@ -155,13 +155,16 @@ cdef class DependencyParser(Parser):
         DOCS: https://nightly.spacy.io/api/dependencyparser#score
         """
+        def has_sents(doc):
+            return doc.has_annotation("SENT_START")
+
         validate_examples(examples, "DependencyParser.score")

         def dep_getter(token, attr):
             dep = getattr(token, attr)
             dep = token.vocab.strings.as_string(dep).lower()
             return dep

         results = {}
-        results.update(Scorer.score_spans(examples, "sents", **kwargs))
+        results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
         kwargs.setdefault("getter", dep_getter)
         kwargs.setdefault("ignore_labels", ("p", "punct"))
         results.update(Scorer.score_deps(examples, "dep", **kwargs))

View File

@@ -10,7 +10,7 @@ from ..errors import Errors
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
-from ..scorer import Scorer
+from ..scorer import get_ner_prf
 from ..training import validate_examples
@@ -340,7 +340,7 @@ class EntityRuler(Pipe):
     def score(self, examples, **kwargs):
         validate_examples(examples, "EntityRuler.score")
-        return Scorer.score_spans(examples, "ents", **kwargs)
+        return get_ner_prf(examples)

     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()

View File

@@ -251,10 +251,13 @@ class Morphologizer(Tagger):
         DOCS: https://nightly.spacy.io/api/morphologizer#score
         """
+        def morph_key_getter(token, attr):
+            return getattr(token, attr).key
+
         validate_examples(examples, "Morphologizer.score")
         results = {}
         results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-        results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+        results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
         results.update(Scorer.score_token_attr_per_feat(examples,
-            "morph", **kwargs))
+            "morph", getter=morph_key_getter, **kwargs))
         return results
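The new `morph_key_getter` compares `token.morph` by its `.key` rather than by object identity, since `token.morph` is a `MorphAnalysis`. An illustrative sketch, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("hello")
doc[0].set_morph("Feat=Val")
print(doc[0].morph)      # Feat=Val (a MorphAnalysis, not a plain string)
print(doc[0].morph.key)  # stable integer key the scorer uses for comparison
```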

View File

@@ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser):
         DOCS: https://nightly.spacy.io/api/entityrecognizer#score
         """
         validate_examples(examples, "EntityRecognizer.score")
-        score_per_type = get_ner_prf(examples)
-        totals = PRFScore()
-        for prf in score_per_type.values():
-            totals += prf
-        return {
-            "ents_p": totals.precision,
-            "ents_r": totals.recall,
-            "ents_f": totals.fscore,
-            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-        }
+        return get_ner_prf(examples)

View File

@@ -155,8 +155,11 @@ class Sentencizer(Pipe):
         DOCS: https://nightly.spacy.io/api/sentencizer#score
         """
+        def has_sents(doc):
+            return doc.has_annotation("SENT_START")
+
         validate_examples(examples, "Sentencizer.score")
-        results = Scorer.score_spans(examples, "sents", **kwargs)
+        results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
         del results["sents_per_type"]
         return results

View File

@@ -160,7 +160,10 @@ class SentenceRecognizer(Tagger):
         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#score
         """
+        def has_sents(doc):
+            return doc.has_annotation("SENT_START")
+
         validate_examples(examples, "SentenceRecognizer.score")
-        results = Scorer.score_spans(examples, "sents", **kwargs)
+        results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
         del results["sents_per_type"]
         return results

View File

@@ -16,15 +16,30 @@ from ..vocab import Vocab
 default_model_config = """
 [model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
-dropout = null
+@architectures = "spacy.TextCatEnsemble.v2"
+
+[model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 64
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
 """
 DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
@@ -60,9 +75,11 @@ subword_features = true
     default_score_weights={
         "cats_score": 1.0,
         "cats_score_desc": None,
-        "cats_p": None,
-        "cats_r": None,
-        "cats_f": None,
+        "cats_micro_p": None,
+        "cats_micro_r": None,
+        "cats_micro_f": None,
+        "cats_macro_p": None,
+        "cats_macro_r": None,
         "cats_macro_f": None,
         "cats_macro_auc": None,
         "cats_f_per_type": None,

View File

@ -1,9 +1,9 @@
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING
import numpy as np import numpy as np
from collections import defaultdict from collections import defaultdict
from .training import Example from .training import Example
from .tokens import Token, Doc, Span from .tokens import Token, Doc, Span, MorphAnalysis
from .errors import Errors from .errors import Errors
from .util import get_lang_class, SimpleFrozenList from .util import get_lang_class, SimpleFrozenList
from .morphology import Morphology from .morphology import Morphology
@ -13,7 +13,8 @@ if TYPE_CHECKING:
from .language import Language # noqa: F401 from .language import Language # noqa: F401
DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"] DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
MISSING_VALUES = frozenset([None, 0, ""])
class PRFScore: class PRFScore:
@ -24,6 +25,9 @@ class PRFScore:
self.fp = 0 self.fp = 0
self.fn = 0 self.fn = 0
def __len__(self) -> int:
return self.tp + self.fp + self.fn
def __iadd__(self, other): def __iadd__(self, other):
self.tp += other.tp self.tp += other.tp
self.fp += other.fp self.fp += other.fp
@ -59,7 +63,9 @@ class PRFScore:
class ROCAUCScore: class ROCAUCScore:
"""An AUC ROC score.""" """An AUC ROC score. This is only defined for binary classification.
Use the method is_binary before calculating the score, otherwise it
may throw an error."""
def __init__(self) -> None: def __init__(self) -> None:
self.golds = [] self.golds = []
@ -71,16 +77,16 @@ class ROCAUCScore:
self.cands.append(cand) self.cands.append(cand)
self.golds.append(gold) self.golds.append(gold)
def is_binary(self):
return len(np.unique(self.golds)) == 2
@property @property
def score(self): def score(self):
if not self.is_binary():
raise ValueError(Errors.E165.format(label=set(self.golds)))
if len(self.golds) == self.saved_score_at_len: if len(self.golds) == self.saved_score_at_len:
return self.saved_score return self.saved_score
try: self.saved_score = _roc_auc_score(self.golds, self.cands)
self.saved_score = _roc_auc_score(self.golds, self.cands)
# catch ValueError: Only one class present in y_true.
# ROC AUC score is not defined in that case.
except ValueError:
self.saved_score = -float("inf")
self.saved_score_at_len = len(self.golds) self.saved_score_at_len = len(self.golds)
return self.saved_score return self.saved_score
@@ -92,7 +98,7 @@ class Scorer:
         self,
         nlp: Optional["Language"] = None,
         default_lang: str = "xx",
-        default_pipeline=DEFAULT_PIPELINE,
+        default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
         """Initialize the Scorer.
@@ -124,13 +130,13 @@ class Scorer:
         return scores

     @staticmethod
-    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
+    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
         """Returns accuracy and PRF scores for tokenization.

         * token_acc: # correct tokens / # gold tokens
         * token_p/r/f: PRF for token character spans

         examples (Iterable[Example]): Examples to score
-        RETURNS (Dict[str, float]): A dictionary containing the scores
+        RETURNS (Dict[str, Any]): A dictionary containing the scores
             token_acc/p/r/f.

         DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
@@ -140,6 +146,8 @@ class Scorer:
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
+            if gold_doc.has_unknown_spaces:
+                continue
             align = example.alignment
             gold_spans = set()
             pred_spans = set()
@@ -156,12 +164,20 @@ class Scorer:
             else:
                 acc_score.tp += 1
             prf_score.score_set(pred_spans, gold_spans)
-        return {
-            "token_acc": acc_score.fscore,
-            "token_p": prf_score.precision,
-            "token_r": prf_score.recall,
-            "token_f": prf_score.fscore,
-        }
+        if len(acc_score) > 0:
+            return {
+                "token_acc": acc_score.fscore,
+                "token_p": prf_score.precision,
+                "token_r": prf_score.recall,
+                "token_f": prf_score.fscore,
+            }
+        else:
+            return {
+                "token_acc": None,
+                "token_p": None,
+                "token_r": None,
+                "token_f": None
+            }

     @staticmethod
     def score_token_attr(
@@ -169,8 +185,9 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Token, str], Any] = getattr,
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
-    ) -> Dict[str, float]:
+    ) -> Dict[str, Any]:
         """Returns an accuracy score for a token-level attribute.

         examples (Iterable[Example]): Examples to score
@@ -178,7 +195,7 @@ class Scorer:
         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
+        RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
             under the key attr_acc.

         DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
@@ -189,17 +206,27 @@ class Scorer:
             pred_doc = example.predicted
             align = example.alignment
             gold_tags = set()
+            missing_indices = set()
             for gold_i, token in enumerate(gold_doc):
-                gold_tags.add((gold_i, getter(token, attr)))
+                value = getter(token, attr)
+                if value not in missing_values:
+                    gold_tags.add((gold_i, getter(token, attr)))
+                else:
+                    missing_indices.add(gold_i)
             pred_tags = set()
             for token in pred_doc:
                 if token.orth_.isspace():
                     continue
                 if align.x2y.lengths[token.i] == 1:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                    pred_tags.add((gold_i, getter(token, attr)))
+                    if gold_i not in missing_indices:
+                        pred_tags.add((gold_i, getter(token, attr)))
             tag_score.score_set(pred_tags, gold_tags)
-        return {f"{attr}_acc": tag_score.fscore}
+        score_key = f"{attr}_acc"
+        if len(tag_score) == 0:
+            return {score_key: None}
+        else:
+            return {score_key: tag_score.fscore}

     @staticmethod
     def score_token_attr_per_feat(
@@ -207,8 +234,9 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Token, str], Any] = getattr,
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
-    ):
+    ) -> Dict[str, Any]:
         """Return PRF scores per feat for a token attribute in UFEATS format.

         examples (Iterable[Example]): Examples to score
@@ -216,7 +244,7 @@ class Scorer:
         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (dict): A dictionary containing the per-feat PRF scores unders
+        RETURNS (dict): A dictionary containing the per-feat PRF scores under
             the key attr_per_feat.
         """
         per_feat = {}
@@ -225,9 +253,11 @@ class Scorer:
             gold_doc = example.reference
             align = example.alignment
             gold_per_feat = {}
+            missing_indices = set()
             for gold_i, token in enumerate(gold_doc):
-                morph = str(getter(token, attr))
-                if morph:
+                value = getter(token, attr)
+                morph = gold_doc.vocab.strings[value]
+                if value not in missing_values and morph != Morphology.EMPTY_MORPH:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
@@ -235,27 +265,35 @@ class Scorer:
                         if field not in gold_per_feat:
                             gold_per_feat[field] = set()
                         gold_per_feat[field].add((gold_i, feat))
+                else:
+                    missing_indices.add(gold_i)
             pred_per_feat = {}
             for token in pred_doc:
                 if token.orth_.isspace():
                     continue
                 if align.x2y.lengths[token.i] == 1:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                    morph = str(getter(token, attr))
-                    if morph:
-                        for feat in morph.split("|"):
-                            field, values = feat.split("=")
-                            if field not in per_feat:
-                                per_feat[field] = PRFScore()
-                            if field not in pred_per_feat:
-                                pred_per_feat[field] = set()
-                            pred_per_feat[field].add((gold_i, feat))
+                    if gold_i not in missing_indices:
+                        value = getter(token, attr)
+                        morph = gold_doc.vocab.strings[value]
+                        if value not in missing_values and morph != Morphology.EMPTY_MORPH:
+                            for feat in morph.split(Morphology.FEATURE_SEP):
+                                field, values = feat.split(Morphology.FIELD_SEP)
+                                if field not in per_feat:
+                                    per_feat[field] = PRFScore()
+                                if field not in pred_per_feat:
+                                    pred_per_feat[field] = set()
+                                pred_per_feat[field].add((gold_i, feat))
         for field in per_feat:
             per_feat[field].score_set(
                 pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
             )
-        result = {k: v.to_dict() for k, v in per_feat.items()}
-        return {f"{attr}_per_feat": result}
+        score_key = f"{attr}_per_feat"
+        if any([len(v) for v in per_feat.values()]):
+            result = {k: v.to_dict() for k, v in per_feat.items()}
+            return {score_key: result}
+        else:
+            return {score_key: None}

     @staticmethod
     def score_spans(
@@ -263,6 +301,7 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Doc, str], Iterable[Span]] = getattr,
+        has_annotation: Optional[Callable[[Doc], bool]] = None,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns PRF scores for labeled spans.
@@ -282,18 +321,10 @@ class Scorer:
         for example in examples:
             pred_doc = example.predicted
             gold_doc = example.reference
-            # TODO
-            # This is a temporary hack to work around the problem that the scorer
-            # fails if you have examples that are not fully annotated for all
-            # the tasks in your pipeline. For instance, you might have a corpus
-            # of NER annotations that does not set sentence boundaries, but the
-            # pipeline includes a parser or senter, and then the score_weights
-            # are used to evaluate that component. When the scorer attempts
-            # to read the sentences from the gold document, it fails.
-            try:
-                list(getter(gold_doc, attr))
-            except ValueError:
-                continue
+            # Option to handle docs without sents
+            if has_annotation is not None:
+                if not has_annotation(gold_doc):
+                    continue
             # Find all labels in gold and doc
             labels = set(
                 [k.label_ for k in getter(gold_doc, attr)]
@@ -321,13 +352,21 @@ class Scorer:
                 v.score_set(pred_per_type[k], gold_per_type[k])
             # Score for all labels
             score.score_set(pred_spans, gold_spans)
-        results = {
-            f"{attr}_p": score.precision,
-            f"{attr}_r": score.recall,
-            f"{attr}_f": score.fscore,
-            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-        }
-        return results
+        if len(score) > 0:
+            return {
+                f"{attr}_p": score.precision,
+                f"{attr}_r": score.recall,
+                f"{attr}_f": score.fscore,
+                f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+            }
+        else:
+            return {
+                f"{attr}_p": None,
+                f"{attr}_r": None,
+                f"{attr}_f": None,
+                f"{attr}_per_type": None,
+            }

     @staticmethod
     def score_cats(
@@ -362,9 +401,13 @@ class Scorer:
         for all:
             attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
             attr_score_desc (text description of the overall score),
+            attr_micro_p,
+            attr_micro_r,
             attr_micro_f,
+            attr_macro_p,
+            attr_macro_r,
             attr_macro_f,
-            attr_auc,
+            attr_macro_auc,
             attr_f_per_type,
             attr_auc_per_type
@@ -384,9 +427,6 @@ class Scorer:
             pred_cats = getter(example.predicted, attr)
             gold_cats = getter(example.reference, attr)

-            # I think the AUC metric is applicable regardless of whether we're
-            # doing multi-label classification? Unsure. If not, move this into
-            # the elif pred_cats and gold_cats block below.
             for label in labels:
                 pred_score = pred_cats.get(label, 0.0)
                 gold_score = gold_cats.get(label, 0.0)
@@ -431,7 +471,9 @@ class Scorer:
             macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
             macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
             macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
-            macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
+            # Limit macro_auc to those labels with gold annotations,
+            # but still divide by all cats to avoid artificial boosting of datasets with missing labels
+            macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
         results = {
             f"{attr}_score": None,
             f"{attr}_score_desc": None,
@@ -443,7 +485,7 @@ class Scorer:
             f"{attr}_macro_f": macro_f,
             f"{attr}_macro_auc": macro_auc,
             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+            f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
             positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
@@ -534,6 +576,7 @@ class Scorer:
         head_attr: str = "head",
         head_getter: Callable[[Token, str], Token] = getattr,
         ignore_labels: Iterable[str] = SimpleFrozenList(),
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns the UAS, LAS, and LAS per type scores for dependency
@@ -558,6 +601,7 @@ class Scorer:
         unlabelled = PRFScore()
         labelled = PRFScore()
         labelled_per_dep = dict()
+        missing_indices = set()
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -567,13 +611,16 @@ class Scorer:
             for gold_i, token in enumerate(gold_doc):
                 dep = getter(token, attr)
                 head = head_getter(token, head_attr)
-                if dep not in ignore_labels:
-                    gold_deps.add((gold_i, head.i, dep))
-                    if dep not in labelled_per_dep:
-                        labelled_per_dep[dep] = PRFScore()
-                    if dep not in gold_deps_per_dep:
-                        gold_deps_per_dep[dep] = set()
-                    gold_deps_per_dep[dep].add((gold_i, head.i, dep))
+                if dep not in missing_values:
+                    if dep not in ignore_labels:
+                        gold_deps.add((gold_i, head.i, dep))
+                        if dep not in labelled_per_dep:
+                            labelled_per_dep[dep] = PRFScore()
+                        if dep not in gold_deps_per_dep:
+                            gold_deps_per_dep[dep] = set()
+                        gold_deps_per_dep[dep].add((gold_i, head.i, dep))
+                else:
+                    missing_indices.add(gold_i)
             pred_deps = set()
             pred_deps_per_dep = {}
             for token in pred_doc:
@@ -583,25 +630,26 @@ class Scorer:
                     gold_i = None
                 else:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                dep = getter(token, attr)
-                head = head_getter(token, head_attr)
-                if dep not in ignore_labels and token.orth_.strip():
-                    if align.x2y.lengths[head.i] == 1:
-                        gold_head = align.x2y[head.i].dataXd[0, 0]
-                    else:
-                        gold_head = None
-                    # None is indistinct, so we can't just add it to the set
-                    # Multiple (None, None) deps are possible
-                    if gold_i is None or gold_head is None:
-                        unlabelled.fp += 1
-                        labelled.fp += 1
-                    else:
-                        pred_deps.add((gold_i, gold_head, dep))
-                        if dep not in labelled_per_dep:
-                            labelled_per_dep[dep] = PRFScore()
-                        if dep not in pred_deps_per_dep:
-                            pred_deps_per_dep[dep] = set()
-                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
+                if gold_i not in missing_indices:
+                    dep = getter(token, attr)
+                    head = head_getter(token, head_attr)
+                    if dep not in ignore_labels and token.orth_.strip():
+                        if align.x2y.lengths[head.i] == 1:
+                            gold_head = align.x2y[head.i].dataXd[0, 0]
+                        else:
+                            gold_head = None
+                        # None is indistinct, so we can't just add it to the set
+                        # Multiple (None, None) deps are possible
+                        if gold_i is None or gold_head is None:
+                            unlabelled.fp += 1
+                            labelled.fp += 1
+                        else:
+                            pred_deps.add((gold_i, gold_head, dep))
+                            if dep not in labelled_per_dep:
+                                labelled_per_dep[dep] = PRFScore()
+                            if dep not in pred_deps_per_dep:
+                                pred_deps_per_dep[dep] = set()
+                            pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
             labelled.score_set(pred_deps, gold_deps)
         for dep in labelled_per_dep:
             labelled_per_dep[dep].score_set(
@@ -610,29 +658,34 @@ class Scorer:
         unlabelled.score_set(
             set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
         )
-        return {
-            f"{attr}_uas": unlabelled.fscore,
-            f"{attr}_las": labelled.fscore,
-            f"{attr}_las_per_type": {
-                k: v.to_dict() for k, v in labelled_per_dep.items()
-            },
-        }
+        if len(unlabelled) > 0:
+            return {
+                f"{attr}_uas": unlabelled.fscore,
+                f"{attr}_las": labelled.fscore,
+                f"{attr}_las_per_type": {
+                    k: v.to_dict() for k, v in labelled_per_dep.items()
+                },
+            }
+        else:
+            return {
+                f"{attr}_uas": None,
+                f"{attr}_las": None,
+                f"{attr}_las_per_type": None,
+            }


-def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
-    """Compute per-entity PRFScore objects for a sequence of examples. The
-    results are returned as a dictionary keyed by the entity type. You can
-    add the PRFScore objects to get micro-averaged total.
+def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
+    """Compute micro-PRF and per-entity PRF scores for a sequence of examples.
     """
-    scores = defaultdict(PRFScore)
+    score_per_type = defaultdict(PRFScore)
     for eg in examples:
         if not eg.y.has_annotation("ENT_IOB"):
             continue
         golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
         align_x2y = eg.alignment.x2y
         for pred_ent in eg.x.ents:
-            if pred_ent.label_ not in scores:
-                scores[pred_ent.label_] = PRFScore()
+            if pred_ent.label_ not in score_per_type:
+                score_per_type[pred_ent.label_] = PRFScore()
             indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
             if len(indices):
                 g_span = eg.y[indices[0] : indices[-1] + 1]
@@ -642,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
                 if all(token.ent_iob != 0 for token in g_span):
                     key = (pred_ent.label_, indices[0], indices[-1] + 1)
                     if key in golds:
-                        scores[pred_ent.label_].tp += 1
+                        score_per_type[pred_ent.label_].tp += 1
                         golds.remove(key)
                     else:
-                        scores[pred_ent.label_].fp += 1
+                        score_per_type[pred_ent.label_].fp += 1
         for label, start, end in golds:
-            scores[label].fn += 1
-    return scores
+            score_per_type[label].fn += 1
+    totals = PRFScore()
+    for prf in score_per_type.values():
+        totals += prf
+    if len(totals) > 0:
+        return {
+            "ents_p": totals.precision,
+            "ents_r": totals.recall,
+            "ents_f": totals.fscore,
+            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+        }
+    else:
+        return {
+            "ents_p": None,
+            "ents_r": None,
+            "ents_f": None,
+            "ents_per_type": None,
+        }


 #############################################################################
@@ -726,7 +795,7 @@ def _roc_auc_score(y_true, y_score):
            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
     """
     if len(np.unique(y_true)) != 2:
-        raise ValueError(Errors.E165)
+        raise ValueError(Errors.E165.format(label=np.unique(y_true)))
     fpr, tpr, _ = _roc_curve(y_true, y_score)
     return _auc(fpr, tpr)

View File

@@ -218,11 +218,16 @@ def test_dependency_matcher_callback(en_vocab, doc):
     pattern = [
         {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}},
     ]
+    nomatch_pattern = [
+        {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}},
+    ]

     matcher = DependencyMatcher(en_vocab)
     mock = Mock()
     matcher.add("pattern", [pattern], on_match=mock)
+    matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock)
     matches = matcher(doc)
+    assert len(matches) == 1
     mock.assert_called_once_with(matcher, doc, 0, matches)

     # check that matches with and without callback are the same (#4590)

View File

@@ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts):
     scores = nlp.evaluate(dev_examples)
     # "cat" is the only correct lemma
     assert scores["lemma_acc"] == pytest.approx(0.2)
-    # the empty morphs are correct
-    assert scores["morph_acc"] == pytest.approx(0.6)
+    # no morphs are set
+    assert scores["morph_acc"] == None


 def test_attributeruler_rule_order(nlp):

View File

@@ -2,6 +2,7 @@ import pytest
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
 from spacy.util import registry, SimpleFrozenDict, combine_score_weights
 from thinc.api import Model, Linear, ConfigValidationError
@@ -156,15 +157,10 @@ def test_pipe_class_component_model():
     name = "test_class_component_model"
     default_config = {
         "model": {
-            "@architectures": "spacy.TextCatEnsemble.v1",
-            "exclusive_classes": False,
-            "pretrained_vectors": None,
-            "width": 64,
-            "embed_size": 2000,
-            "window_size": 1,
-            "conv_depth": 2,
-            "ngram_size": 1,
-            "dropout": None,
+            "@architectures": "spacy.TextCatEnsemble.v2",
+            "tok2vec": DEFAULT_TOK2VEC_MODEL,
+            "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
+                             "no_output_layer": False},
         },
         "value1": 10,
     }

View File

@@ -140,7 +140,7 @@ def test_overfitting_IO():
     nlp = English()
     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
     # Set exclusive labels
-    config = {"model": {"exclusive_classes": True}}
+    config = {"model": {"linear_model": {"exclusive_classes": True}}}
     textcat = nlp.add_pipe("textcat", config=config)
     train_examples = []
     for text, annotations in TRAIN_DATA:
@@ -192,9 +192,8 @@ def test_overfitting_IO():
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
     ],

View File

@@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate
from numpy.testing import assert_array_equal
import numpy
from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
-from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
+from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier
from spacy.ml.staticvectors import StaticVectors
from spacy.lang.en import English
from spacy.lang.en.examples import sentences as EN_SENTENCES


-def get_textcat_kwargs():
+def get_textcat_bow_kwargs():
    return {
-        "width": 64,
-        "embed_size": 2000,
-        "pretrained_vectors": None,
-        "exclusive_classes": False,
+        "exclusive_classes": True,
        "ngram_size": 1,
-        "window_size": 1,
-        "conv_depth": 2,
-        "dropout": None,
-        "nO": 7,
+        "no_output_layer": False,
+        "nO": 34,
    }


def get_textcat_cnn_kwargs():
-    return {
-        "tok2vec": test_tok2vec(),
-        "exclusive_classes": False,
-        "nO": 13,
-    }
+    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}


def get_all_params(model):
@@ -105,7 +96,7 @@ def test_multi_hash_embed():
    "seed,model_func,kwargs",
    [
        (0, build_Tok2Vec_model, get_tok2vec_kwargs()),
-        (0, build_text_classifier, get_textcat_kwargs()),
+        (0, build_bow_text_classifier, get_textcat_bow_kwargs()),
        (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
    ],
)
@@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs):
    "seed,model_func,kwargs,get_X",
    [
        (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
-        (0, build_text_classifier, get_textcat_kwargs(), get_docs),
+        (0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
        (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
    ],
)
@@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X):
    "seed,dropout,model_func,kwargs,get_X",
    [
        (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
-        (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs),
+        (0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
        (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
    ],
)
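As a usage sketch of the renamed builder (the kwargs mirror `get_textcat_bow_kwargs()` above; treat the exact signature as an assumption based on the registered `spacy.TextCatBOW.v1` settings):

```python
# Hypothetical usage of the renamed BOW text classifier builder.
from thinc.api import fix_random_seed
from spacy.ml.models import build_bow_text_classifier

fix_random_seed(0)
model = build_bow_text_classifier(
    exclusive_classes=True, ngram_size=1, no_output_layer=False, nO=34
)
model.initialize()  # shapes are inferred on initialization
```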

View File

@@ -277,6 +277,62 @@ def test_tag_score(tagged_doc):
    assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)


+def test_partial_annotation(en_tokenizer):
+    pred_doc = en_tokenizer("a b c d e")
+    pred_doc[0].tag_ = "A"
+    pred_doc[0].pos_ = "X"
+    pred_doc[0].set_morph("Feat=Val")
+    pred_doc[0].dep_ = "dep"
+    # unannotated reference
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    for key in scores:
+        # cats doesn't have an unset state
+        if key.startswith("cats"):
+            continue
+        assert scores[key] == None
+    # partially annotated reference, not overlapping with predicted annotation
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    ref_doc[1].tag_ = "A"
+    ref_doc[1].pos_ = "X"
+    ref_doc[1].set_morph("Feat=Val")
+    ref_doc[1].dep_ = "dep"
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    assert scores["token_acc"] == None
+    assert scores["tag_acc"] == 0.0
+    assert scores["pos_acc"] == 0.0
+    assert scores["morph_acc"] == 0.0
+    assert scores["dep_uas"] == 1.0
+    assert scores["dep_las"] == 0.0
+    assert scores["sents_f"] == None
+    # partially annotated reference, overlapping with predicted annotation
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    ref_doc[0].tag_ = "A"
+    ref_doc[0].pos_ = "X"
+    ref_doc[1].set_morph("Feat=Val")
+    ref_doc[1].dep_ = "dep"
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    assert scores["token_acc"] == None
+    assert scores["tag_acc"] == 1.0
+    assert scores["pos_acc"] == 1.0
+    assert scores["morph_acc"] == 0.0
+    assert scores["dep_uas"] == 1.0
+    assert scores["dep_las"] == 0.0
+    assert scores["sents_f"] == None
def test_roc_auc_score():
    # Binary classification, toy tests from scikit-learn test suite
    y_true = [0, 1]
@@ -334,7 +390,8 @@ def test_roc_auc_score():
    score = ROCAUCScore()
    score.score_set(0.25, 0)
    score.score_set(0.75, 0)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score

    y_true = [1, 1]
    y_score = [0.25, 0.75]
@@ -344,4 +401,5 @@ def test_roc_auc_score():
    score = ROCAUCScore()
    score.score_set(0.25, 1)
    score.score_set(0.75, 1)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score
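This is a behavior change worth calling out: an undefined ROC AUC (only one class observed) now raises `ValueError` instead of returning `-inf`. A minimal sketch of handling it, assuming the rc2 `ROCAUCScore` API:

```python
# Handle the new undefined-AUC behavior in spacy-nightly rc2.
from spacy.scorer import ROCAUCScore

score = ROCAUCScore()
score.score_set(0.25, 0)
score.score_set(0.75, 0)  # all gold labels are 0, so AUC is undefined
try:
    auc = score.score
except ValueError:
    auc = None  # treat an undefined AUC as a missing score
```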

View File

@@ -51,7 +51,7 @@ def test_readers():
    for example in train_corpus(nlp):
        nlp.update([example], sgd=optimizer)
    scores = nlp.evaluate(list(dev_corpus(nlp)))
-    assert scores["cats_score"]
+    assert scores["cats_score"] == 0.0
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats

View File

@@ -2,6 +2,7 @@ import numpy
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
from spacy.training import biluo_tags_to_spans, iob_to_biluo
from spacy.training import Corpus, docs_to_json, Example
+from spacy.training.align import get_alignments
from spacy.training.converters import json_to_docs
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
@@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc):
    assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]


-@pytest.mark.skip("Outdated")
@pytest.mark.parametrize(
    "tokens_a,tokens_b,expected",
    [
-        (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
+        (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])),
        (
            ["a", "b", '"', "c"],
            ['ab"', "c"],
-            (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
+            ([[0], [0], [0], [1]], [[0, 1, 2], [3]]),
        ),
-        (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})),
+        (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])),
        (
            ["ab", "c", "d"],
            ["a", "b", "cd"],
-            (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}),
+            ([[0, 1], [2], [2]], [[0], [0], [1, 2]]),
        ),
        (
            ["a", "b", "cd"],
            ["a", "b", "c", "d"],
-            (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
+            ([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
        ),
-        ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
+        ([" ", "a"], ["a"], ([[], [0]], [[1]])),
    ],
)
def test_align(tokens_a, tokens_b, expected):  # noqa
-    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)  # noqa
-    assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected  # noqa
+    a2b, b2a = get_alignments(tokens_a, tokens_b)
+    assert (a2b, b2a) == expected  # noqa
    # check symmetry
-    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)  # noqa
-    assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected  # noqa
+    a2b, b2a = get_alignments(tokens_b, tokens_a)  # noqa
+    assert (b2a, a2b) == expected  # noqa


def test_goldparse_startswith_space(en_tokenizer):
@@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer):
    assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]

+def test_goldparse_endswith_space(en_tokenizer):
+    text = "a\n"
+    doc = en_tokenizer(text)
+    gold_words = ["a"]
+    entities = ["U-DATE"]
+    deps = ["ROOT"]
+    heads = [0]
+    example = Example.from_dict(
+        doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
+    )
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == ["U-DATE", "O"]
+    assert example.get_aligned("DEP", as_string=True) == ["ROOT", None]

def test_gold_constructor():
    """Test that the Example constructor works fine"""
    nlp = English()
@@ -676,6 +691,87 @@ def test_alignment_different_texts():
        Alignment.from_strings(other_tokens, spacy_tokens)


+def test_alignment_spaces(en_vocab):
+    # single leading whitespace
+    other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]
+
+    # multiple leading whitespace tokens
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]
+
+    # both with leading whitespace, not identical
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7]
+
+    # same leading whitespace, different tokenization
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["  ", "i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7]
+
+    # only one with trailing whitespace
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
+
+    # different trailing whitespace
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6]
+
+    # same trailing whitespace, different tokenization
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", "  "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7]
+
+    # differing whitespace is allowed
+    other_tokens = ["a", " \n ", "b", "c"]
+    spacy_tokens = ["a", "b", " ", "c"]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.dataXd) == [0, 1, 3]
+    assert list(align.y2x.dataXd) == [0, 2, 3]
+
+    # other differences in whitespace are allowed
+    other_tokens = [" ", "a"]
+    spacy_tokens = [" ", "a", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    other_tokens = ["a", " "]
+    spacy_tokens = ["a", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)

def test_retokenized_docs(doc):
    a = doc.to_array(["TAG"])
    doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)

View File

@@ -399,14 +399,13 @@ cdef class Doc:
            return True
        cdef int i
        cdef int range_start = 0
-        if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
-            attr = SENT_START
        attr = intify_attr(attr)
        # adjust attributes
        if attr == HEAD:
            # HEAD does not have an unset state, so rely on DEP
            attr = DEP
+        elif attr == self.vocab.strings["IS_SENT_START"]:
+            # as in Matcher, allow IS_SENT_START as an alias of SENT_START
+            attr = SENT_START
        # special cases for sentence boundaries
        if attr == SENT_START:
            if "sents" in self.user_hooks:

View File

@@ -1,6 +1,6 @@
from .corpus import Corpus  # noqa: F401
from .example import Example, validate_examples, validate_get_examples  # noqa: F401
-from .align import Alignment  # noqa: F401
+from .alignment import Alignment  # noqa: F401
from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401

spacy/training/align.pyx Normal file

View File

@@ -0,0 +1,66 @@
from typing import List, Tuple
from itertools import chain
import re

from ..errors import Errors


def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
    # Create character-to-token mappings
    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
    str_a = "".join(A).lower()
    str_b = "".join(B).lower()
    cdef int len_str_a = len(str_a)
    cdef int len_str_b = len(str_b)
    # Check that the two texts only differ in whitespace and capitalization
    if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
            len_str_a != len(char_to_token_a) or \
            len_str_b != len(char_to_token_b):
        raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
    cdef int char_idx_a = 0
    cdef int char_idx_b = 0
    cdef int token_idx_a = 0
    cdef int token_idx_b = 0
    cdef int prev_token_idx_a = -1
    cdef int prev_token_idx_b = -1
    a2b = []
    b2a = []
    while char_idx_a < len_str_a and char_idx_b < len_str_b:
        # Find the current token position from the character position
        token_idx_a = char_to_token_a[char_idx_a]
        token_idx_b = char_to_token_b[char_idx_b]
        # Add a set for the next token if a token boundary has been crossed
        if prev_token_idx_a != token_idx_a:
            a2b.append(set())
        if prev_token_idx_b != token_idx_b:
            b2a.append(set())
        # Process the alignment at the current position
        if A[token_idx_a] == B[token_idx_b]:
            # Current tokens are identical
            a2b[-1].add(token_idx_b)
            b2a[-1].add(token_idx_a)
            char_idx_a += len(A[token_idx_a])
            char_idx_b += len(B[token_idx_b])
        elif str_a[char_idx_a] == str_b[char_idx_b]:
            # Current chars are identical
            a2b[-1].add(token_idx_b)
            b2a[-1].add(token_idx_a)
            char_idx_a += 1
            char_idx_b += 1
        elif str_a[char_idx_a].isspace():
            # Skip unaligned whitespace char in A
            char_idx_a += 1
        elif str_b[char_idx_b].isspace():
            # Skip unaligned whitespace char in B
            char_idx_b += 1
        else:
            # This should never happen
            raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
        prev_token_idx_a = token_idx_a
        prev_token_idx_b = token_idx_b
    # Process unaligned trailing whitespace
    a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:])))
    b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:])))
    # Return values as sorted lists per token position
    return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
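A quick check of the new helper; the expected values are taken from the updated test cases above:

```python
# Verify the alignment outputs for two of the test fixtures.
from spacy.training.align import get_alignments

a2b, b2a = get_alignments(["a", "b", "c"], ["ab", "c"])
assert (a2b, b2a) == ([[0], [0], [1]], [[0, 1], [2]])
# Tokens made only of unmatched whitespace map to an empty list:
a2b, b2a = get_alignments([" ", "a"], ["a"])
assert (a2b, b2a) == ([[], [0]], [[1]])
```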

View File

@@ -2,9 +2,8 @@ from typing import List
import numpy
from thinc.types import Ragged
from dataclasses import dataclass
-import tokenizations
-from ..errors import Errors
+from .align import get_alignments


@dataclass
@@ -20,9 +19,7 @@ class Alignment:
    @classmethod
    def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
-        if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
-            raise ValueError(Errors.E949)
-        x2y, y2x = tokenizations.get_alignments(A, B)
+        x2y, y2x = get_alignments(A, B)
        return Alignment.from_indices(x2y=x2y, y2x=y2x)
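A sketch mirroring `test_alignment_spaces` above: `Alignment.from_strings` now routes through the in-repo `get_alignments` instead of the external `pytokenizations` package, with the same `Ragged` outputs:

```python
from spacy.training import Alignment

other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
print(list(align.x2y.lengths))  # [0, 3, 1, 1, 1, 1, 1]
print(list(align.x2y.dataXd))   # [0, 1, 2, 3, 4, 4, 5, 5]
```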

View File

@@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
from ..tokens.span cimport Span
from ..tokens.span import Span
from ..attrs import IDS
-from .align import Alignment
+from .alignment import Alignment
from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings

View File

@@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    if not isinstance(T["train_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"])))
+    if not isinstance(T["dev_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"])))
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
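The new E897 check means `training.train_corpus` and `training.dev_corpus` must be dot-notation strings that point into the `[corpora]` block, not inline sections. A minimal sketch of the expected shape (paths and section names are illustrative, following the spaCy v3 default config conventions):

```python
# Validate the config shape the check enforces; nothing is resolved here.
from thinc.api import Config

cfg = Config().from_str("""
[paths]
train = "corpus/train.spacy"
dev = "corpus/dev.spacy"

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
""")
assert isinstance(cfg["training"]["train_corpus"], str)
```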

View File

@@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..errors import Errors
-from ..util import registry, load_model_from_config, dot_to_object
+from ..util import registry, load_model_from_config, resolve_dot_names


def pretrain(
@@ -38,7 +38,7 @@ def pretrain(
    _config = nlp.config.interpolate()
    T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
    P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
-    corpus = dot_to_object(T, P["corpus"])
+    corpus = resolve_dot_names(_config, [P["corpus"]])[0]
    batcher = P["batcher"]
    model = create_pretraining_model(nlp, P)
    optimizer = P["optimizer"]

View File

@@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline.

Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representations. The features used
+a feed-forward subnetwork to build a mixed representation. The features used
can be configured with the `attrs` argument. The suggested attributes are
`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
some subword information, without constructing a fully character-based
@@ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with
different architectures and settings to determine what works best on your
specific data and challenge.
-### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
+### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}

> #### Example Config
>
> ```ini
> [model]
-> @architectures = "spacy.TextCatEnsemble.v1"
-> exclusive_classes = false
-> pretrained_vectors = null
-> width = 64
-> embed_size = 2000
-> conv_depth = 2
-> window_size = 1
-> ngram_size = 1
-> dropout = null
+> @architectures = "spacy.TextCatEnsemble.v2"
> nO = null
+>
+> [model.linear_model]
+> @architectures = "spacy.TextCatBOW.v1"
+> exclusive_classes = true
+> ngram_size = 1
+> no_output_layer = false
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v1"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v1"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v1"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
> ```

-Stacked ensemble of a bag-of-words model and a neural network model. The neural
-network has an internal CNN Tok2Vec layer and uses attention.
+Stacked ensemble of a linear bag-of-words model and a neural network model. The
+neural network is built upon a Tok2Vec layer and uses attention. The setting for
+whether or not this model should cater for multi-label classification is taken
+from the linear model, where it is stored in `model.attrs["multi_label"]`.

+| Name           | Description |
+| -------------- | ----------- |
+| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~ |
+| `tok2vec`      | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `nO`           | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**    | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

+<Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>

+The v1 was functionally similar, but used an internal `tok2vec` instead of
+taking it as argument.

| Name | Description |
| ---- | ----------- |
@@ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `nO`        | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

+</Accordion>
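For a pipeline-level usage sketch (assuming the default `textcat` factory in this release uses the v2 ensemble; the example texts and labels are illustrative):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")  # default model: the v2 ensemble
train_examples = [
    Example.from_dict(nlp.make_doc("This is great!"),
                      {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    Example.from_dict(nlp.make_doc("This is awful."),
                      {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
nlp.initialize(get_examples=lambda: train_examples)
print(nlp("This is great!").cats)
```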

### spacy.TextCatCNN.v1 {#TextCatCNN}

> #### Example Config

View File

@@ -683,6 +683,7 @@ The L2 norm of the document's vector representation.
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
+| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |

## Serialization fields {#serialization-fields}

View File

@@ -68,6 +68,8 @@ Scores the tokenization:

- `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
  character spans

+Docs with `has_unknown_spaces` are skipped during scoring.
+
> #### Example
>
> ```python
@@ -81,7 +83,8 @@ Scores the tokenization:

## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}

-Scores a single token attribute.
+Scores a single token attribute. Tokens with missing values in the reference
+doc are skipped during scoring.

> #### Example
>
@@ -90,20 +93,22 @@ Scores a single token attribute.
> print(scores["pos_acc"])
> ```
| Name             | Description |
| ---------------- | ----------- |
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`           | The attribute to score. ~~str~~ |
| _keyword-only_   | |
| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS**      | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |
## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}

Scores a single token attribute per feature for a token attribute in the
Universal Dependencies
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
-format.
+format. Tokens with missing values in the reference doc are skipped during
+scoring.

> #### Example
>
@@ -112,13 +117,14 @@ format.
> print(scores["morph_per_feat"])
> ```
| Name             | Description |
| ---------------- | ----------- |
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`           | The attribute to score. ~~str~~ |
| _keyword-only_   | |
| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS**      | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}

@@ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans.
> print(scores["ents_f"])
> ```
| Name             | Description |
| ---------------- | ----------- |
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`           | The attribute to score. ~~str~~ |
| _keyword-only_   | |
| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
+| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ |
| **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}

-Calculate the UAS, LAS, and LAS per type scores for dependency parses.
+Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens
+with missing values for the `attr` (typically `dep`) are skipped during scoring.

> #### Example
>
@@ -160,29 +168,40 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
> print(scores["dep_uas"], scores["dep_las"])
> ```
| Name             | Description |
| ---------------- | ----------- |
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`           | The attribute to score. ~~str~~ |
| _keyword-only_   | |
| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `head_attr`      | The attribute containing the head token. ~~str~~ |
| `head_getter`    | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
| `ignore_labels`  | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS**      | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}

Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
-containing scores for each label like `Doc.cats`. The reported overall score
-depends on the scorer settings:
+containing scores for each label like `Doc.cats`. The returned dictionary
+contains the following scores:

-1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
-   `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
-   score), `{attr}_f_per_type`, `{attr}_auc_per_type`
-2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
-3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
-4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
+- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across
+  each label is weighted equally
+- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values
+  across evaluations per label
+- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of
+  scores, keyed by label
+- A final `{attr}_score` and corresponding `{attr}_score_desc` (text
+  description)
+
+The reported `{attr}_score` depends on the classification properties:
+
+- **binary exclusive with positive label:** `{attr}_score` is set to the
+  F-score of the positive label
+- **3+ exclusive classes**, macro-averaged F-score:
+  `{attr}_score = {attr}_macro_f`
+- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc`

> #### Example
>

View File

@@ -115,7 +115,7 @@ print(french_fries, "<->", burgers, french_fries.similarity(burgers))

Computing similarity scores can be helpful in many situations, but it's also
important to maintain **realistic expectations** about what information it can
-provide. Words can be related to each over in many ways, so a single
+provide. Words can be related to each other in many ways, so a single
"similarity" score will always be a **mix of different signals**, and vectors
trained on different data can produce very different results that may not be
useful for your purpose. Here are some important considerations to keep in mind:

View File

@@ -130,16 +130,31 @@ factory = "textcat"
labels = []

[components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
-dropout = 0
+@architectures = "spacy.TextCatEnsemble.v2"
nO = null

+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.textcat.model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 64
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[components.textcat.model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${components.textcat.model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
```
spaCy has two additional built-in `textcat` architectures, and you can easily spaCy has two additional built-in `textcat` architectures, and you can easily
@@ -687,7 +702,7 @@ Before the model can be used, it needs to be
[initialized](/usage/training#initialization). This function receives a callback
to access the full **training data set**, or a representative sample. This data
set can be used to deduce all **relevant labels**. Alternatively, a list of
labels can be provided to `initialize`, or you can call
`RelationExtractor.add_label` directly. The number of labels defines the output
dimensionality of the network, and will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the

View File

@@ -1244,15 +1244,10 @@ labels = []
# This function is created and then passed to the "textcat" component as
# the argument "model"
[components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
+@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
ngram_size = 1
-dropout = null
+no_output_layer = false

[components.other_textcat]
factory = "textcat"

View File

@@ -1142,7 +1142,7 @@ pattern = [
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
-        "RIGHT_ID": "subject",
+        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    }
    # ...
@@ -1212,7 +1212,7 @@ pattern = [
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
-        "RIGHT_ID": "subject",
+        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
    {

View File

@@ -717,7 +717,7 @@ tabular results to a file:

```python
### functions.py
import sys
-from typing import IO, Tuple, Callable, Dict, Any
+from typing import IO, Tuple, Callable, Dict, Any, Optional
import spacy
from spacy import Language
from pathlib import Path
@@ -729,7 +729,7 @@ def custom_logger(log_path):
        stdout: IO=sys.stdout,
        stderr: IO=sys.stderr
    ) -> Tuple[Callable, Callable]:
-        stdout.write(f"Logging to {log_path}\n")
+        stdout.write(f"Logging to {log_path}\\n")
        log_file = Path(log_path).open("w", encoding="utf8")
        log_file.write("step\\t")
        log_file.write("score\\t")

View File

@@ -433,14 +433,14 @@ The following methods, attributes and commands are new in spaCy v3.0.

| Name | Description |
| ---- | ----------- |
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
-| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
+| [`Token.morph`](/api/token#attributes) | Access a token's morphological analysis. |
| [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. |
| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
| [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
| [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
| [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
| [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. |
@@ -1032,9 +1032,9 @@ change your names and imports:

Thanks to everyone who's been contributing to the spaCy ecosystem by developing
and maintaining one of the many awesome [plugins and extensions](/universe).
We've tried to make it as easy as possible for you to upgrade your packages for
-spaCy v3.0. The most common use case for plugins is providing pipeline components
-and extension attributes. When migrating your plugin, double-check the
-following:
+spaCy v3.0. The most common use case for plugins is providing pipeline
+components and extension attributes. When migrating your plugin, double-check
+the following:

- Use the [`@Language.factory`](/api/language#factory) decorator to register
  your component and assign it a name. This allows users to refer to your

View File

@@ -257,7 +257,7 @@ output_path.open("w", encoding="utf-8").write(svg)

Since each visualization is generated as a separate SVG, exporting `.svg` files
only works if you're rendering **one single doc** at a time. (This makes sense
after all, each visualization should be a standalone graphic.) So instead of
-rendering all `Doc`s at one, loop over them and export them separately.
+rendering all `Doc`s at once, loop over them and export them separately.

</Infobox>

View File

@@ -120,7 +120,7 @@ function formatAccuracy(data) {
          ? null
          : {
              label,
-              value: value.toFixed(2),
+              value: (value * 100).toFixed(2),
              help: MODEL_META[label],
            }
      })