diff --git a/pyproject.toml b/pyproject.toml index 14a2d7690..0ceda4454 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ requires = [ "murmurhash>=0.28.0,<1.1.0", "thinc>=8.0.0rc0,<8.1.0", "blis>=0.4.0,<0.8.0", - "pytokenizations", "pathy" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 36f0d1e92..3a777f163 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,8 +14,7 @@ pathy numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.5.0,<2.0.0 -pytokenizations +pydantic>=1.5.0,<1.7.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index adf0c0e20..95ada08ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,8 +51,8 @@ install_requires = tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 - pydantic>=1.5.0,<2.0.0 - pytokenizations + pydantic>=1.5.0,<1.7.0 + jinja2 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.py b/setup.py index 604d65745..160d2ed1c 100755 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ MOD_NAMES = [ "spacy.pipeline._parser_internals.stateclass", "spacy.pipeline._parser_internals.transition_system", "spacy.tokenizer", + "spacy.training.align", "spacy.training.gold_io", "spacy.tokens.doc", "spacy.tokens.span", diff --git a/spacy/about.py b/spacy/about.py index bf1d53a7b..24a3ead22 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc1" +__version__ = "3.0.0rc2" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 566820283..a0ea9fbc9 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -93,27 +93,42 @@ def evaluate( "SPEED": "speed", } results = {} + data = {} for metric, key in metrics.items(): if key in scores: if key == "cats_score": metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" - if key == "speed": - results[metric] = f"{scores[key]:.0f}" + if isinstance(scores[key], (int, float)): + if key == "speed": + results[metric] = f"{scores[key]:.0f}" + else: + results[metric] = f"{scores[key]*100:.2f}" else: - results[metric] = f"{scores[key]*100:.2f}" - data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} + results[metric] = "-" + data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] msg.table(results, title="Results") + if "morph_per_feat" in scores: + if scores["morph_per_feat"]: + print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat") + data["morph_per_feat"] = scores["morph_per_feat"] + if "dep_las_per_type" in scores: + if scores["dep_las_per_type"]: + print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type") + data["dep_las_per_type"] = scores["dep_las_per_type"] if "ents_per_type" in scores: if scores["ents_per_type"]: - print_ents_per_type(msg, scores["ents_per_type"]) + print_prf_per_type(msg, scores["ents_per_type"], "NER", "type") + data["ents_per_type"] = scores["ents_per_type"] if "cats_f_per_type" in scores: if scores["cats_f_per_type"]: - print_textcats_f_per_cat(msg, scores["cats_f_per_type"]) + print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label") + data["cats_f_per_type"] = scores["cats_f_per_type"] if "cats_auc_per_type" in scores: if scores["cats_auc_per_type"]: 
print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) + data["cats_auc_per_type"] = scores["cats_auc_per_type"] if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] @@ -157,7 +172,7 @@ def render_parses( file_.write(html) -def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: +def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None: data = [ (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") for k, v in scores.items() @@ -166,20 +181,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> No data, header=("", "P", "R", "F"), aligns=("l", "r", "r", "r"), - title="NER (per type)", - ) - - -def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: - data = [ - (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") - for k, v in scores.items() - ] - msg.table( - data, - header=("", "P", "R", "F"), - aligns=("l", "r", "r", "r"), - title="Textcat F (per label)", + title=f"{name} (per {type})", ) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 1c0233539..da474795e 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -39,7 +39,7 @@ def init_vectors_cli( nlp.to_disk(output_dir) msg.good( "Saved nlp object with vectors to output directory. You can now use the " - "path to it in your config as the 'vectors' setting in [initialize.vocab].", + "path to it in your config as the 'vectors' setting in [initialize].", output_dir.resolve(), ) @@ -100,7 +100,7 @@ def init_labels_cli( extract the labels.""" util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) if not output_path.exists(): - output_path.mkdir() + output_path.mkdir(parents=True) overrides = parse_config_overrides(ctx.args) import_code(code_path) setup_gpu(use_gpu) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index d92de9c15..1194438de 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -136,15 +136,19 @@ factory = "textcat" {% if optimize == "accuracy" %} [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 -ngram_size = 1 +@architectures = "spacy.TextCatEnsemble.v2" nO = null +[components.textcat.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false + {% else -%} [components.textcat.model] @architectures = "spacy.TextCatBOW.v1" @@ -271,15 +275,19 @@ factory = "textcat" {% if optimize == "accuracy" %} [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 -ngram_size = 1 +@architectures = "spacy.TextCatEnsemble.v2" nO = null +[components.textcat.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} + +[components.textcat.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false + {% else -%} [components.textcat.model] @architectures = "spacy.TextCatBOW.v1" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 
0b27f63dc..fe1e82eb2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -44,7 +44,7 @@ def train_cli( if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) if output_path is not None and not output_path.exists(): - output_path.mkdir() + output_path.mkdir(parents=True) msg.good(f"Created output directory: {output_path}") overrides = parse_config_overrides(ctx.args) import_code(code_path) diff --git a/spacy/errors.py b/spacy/errors.py index 5fab0bab1..f4fd3731f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -398,8 +398,8 @@ class Errors: E163 = ("cumsum was found to be unstable: its last element does not " "correspond to sum") E164 = ("x is neither increasing nor decreasing: {x}.") - E165 = ("Only one class present in y_true. ROC AUC score is not defined in " - "that case.") + E165 = ("Only one class present in the gold labels: {label}. " + "ROC AUC score is not defined in that case.") E166 = ("Can only merge DocBins with the same value for '{param}'.\n" "Current DocBin: {current}\nOther DocBin: {other}") E169 = ("Can't find module: {module}") @@ -456,6 +456,8 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E897 = ("Field '{field}' should be a dot-notation string referring to the " + "relevant section in the config, but found type {type} instead.") E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute " "is not set or None. If you've implemented a custom component, make " "sure to store the component model as `self.model` in your " @@ -562,7 +564,10 @@ class Errors: "a string value from {expected} but got: '{arg}'") E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " "a list, but got: {arg_type}") - E949 = ("Can only create an alignment when the texts are the same.") + E949 = ("Unable to align tokens for the predicted and reference docs. It " + "is only possible to align the docs when both texts are the same " + "except for whitespace and capitalization. The predicted tokens " + "start with: {x}. The reference tokens start with: {y}.") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. 
{id2}") E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 067b2167c..02f7c9318 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -286,10 +286,10 @@ cdef class DependencyMatcher: self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees) for matched_tree in matched_trees: matched_key_trees.append((key, matched_tree)) - for i, (match_id, nodes) in enumerate(matched_key_trees): - on_match = self._callbacks.get(match_id) - if on_match is not None: - on_match(self, doc, i, matched_key_trees) + for i, (match_id, nodes) in enumerate(matched_key_trees): + on_match = self._callbacks.get(match_id) + if on_match is not None: + on_match(self, doc, i, matched_key_trees) return matched_key_trees def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees): diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index ec8998e2d..d4aed2839 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,4 +1,6 @@ -from typing import Optional +from typing import Optional, List + +from thinc.types import Floats2d from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum @@ -10,12 +12,13 @@ from ...util import registry from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors from ..featureextractor import FeatureExtractor +from ...tokens import Doc @registry.architectures.register("spacy.TextCatCNN.v1") def build_simple_cnn_text_classifier( tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None -) -> Model: +) -> Model[List[Doc], Floats2d]: """ Build a simple CNN text classifier, given a token-to-vector model as inputs. If exclusive_classes=True, a softmax non-linearity is applied, so that the @@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier( is applied instead, so that outputs are in the range [0, 1]. """ with Model.define_operators({">>": chain}): + cnn = tok2vec >> list2ragged() >> reduce_mean() if exclusive_classes: output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer + model = cnn >> output_layer model.set_ref("output_layer", output_layer) else: linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = ( - tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() - ) + model = cnn >> linear_layer >> Logistic() model.set_ref("output_layer", linear_layer) model.set_ref("tok2vec", tok2vec) model.set_dim("nO", nO) @@ -45,8 +47,7 @@ def build_bow_text_classifier( ngram_size: int, no_output_layer: bool, nO: Optional[int] = None, -) -> Model: - # Don't document this yet, I'm not sure it's right. 
+) -> Model[List[Doc], Floats2d]: with Model.define_operators({">>": chain}): sparse_linear = SparseLinear(nO) model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear @@ -59,6 +60,39 @@ def build_bow_text_classifier( return model +@registry.architectures.register("spacy.TextCatEnsemble.v2") +def build_text_classifier( + tok2vec: Model[List[Doc], List[Floats2d]], + linear_model: Model[List[Doc], Floats2d], + nO: Optional[int] = None, +) -> Model[List[Doc], Floats2d]: + exclusive_classes = not linear_model.attrs["multi_label"] + with Model.define_operators({">>": chain, "|": concatenate}): + width = tok2vec.get_dim("nO") + cnn_model = ( + tok2vec + >> list2ragged() + >> ParametricAttention(width) # TODO: benchmark performance difference of this layer + >> reduce_sum() + >> residual(Maxout(nO=width, nI=width)) + >> Linear(nO=nO, nI=width) + >> Dropout(0.0) + ) + + nO_double = nO * 2 if nO else None + if exclusive_classes: + output_layer = Softmax(nO=nO, nI=nO_double) + else: + output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() + model = (linear_model | cnn_model) >> output_layer + model.set_ref("tok2vec", tok2vec) + if model.has_dim("nO") is not False: + model.set_dim("nO", nO) + model.set_ref("output_layer", linear_model.get_ref("output_layer")) + model.attrs["multi_label"] = not exclusive_classes + return model + +# TODO: move to legacy @registry.architectures.register("spacy.TextCatEnsemble.v1") def build_text_classifier( width: int, @@ -158,11 +192,8 @@ def build_text_classifier( @registry.architectures.register("spacy.TextCatLowData.v1") def build_text_classifier_lowdata( - width: int, - pretrained_vectors: Optional[bool], - dropout: Optional[float], - nO: Optional[int] = None, -) -> Model: + width: int, dropout: Optional[float], nO: Optional[int] = None +) -> Model[List[Doc], Floats2d]: # Don't document this yet, I'm not sure it's right. # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" with Model.define_operators({">>": chain, "**": clone}): diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 95e200927..8755d0d0d 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -106,7 +106,7 @@ def MultiHashEmbed( ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it - through a feed-forward subnetwork to build a mixed representations. + through a feed-forward subnetwork to build a mixed representation. The features used can be configured with the 'attrs' argument. The suggested attributes are NORM, PREFIX, SUFFIX and SHAPE. 
This lets the model take into diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index e17d3be98..68e26c4be 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -226,6 +226,9 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/tagger#score """ + def morph_key_getter(token, attr): + return getattr(token, attr).key + validate_examples(examples, "AttributeRuler.score") results = {} attrs = set() @@ -237,7 +240,8 @@ class AttributeRuler(Pipe): elif attr == POS: results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) elif attr == MORPH: - results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) + results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs)) elif attr == LEMMA: results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) return results diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index bdef332cc..a9dcd705e 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -155,13 +155,16 @@ cdef class DependencyParser(Parser): DOCS: https://nightly.spacy.io/api/dependencyparser#score """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + validate_examples(examples, "DependencyParser.score") def dep_getter(token, attr): dep = getattr(token, attr) dep = token.vocab.strings.as_string(dep).lower() return dep results = {} - results.update(Scorer.score_spans(examples, "sents", **kwargs)) + results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) kwargs.setdefault("getter", dep_getter) kwargs.setdefault("ignore_labels", ("p", "punct")) results.update(Scorer.score_deps(examples, "dep", **kwargs)) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 382ca338d..2a3b8dd00 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -10,7 +10,7 @@ from ..errors import Errors from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher -from ..scorer import Scorer +from ..scorer import get_ner_prf from ..training import validate_examples @@ -340,7 +340,7 @@ class EntityRuler(Pipe): def score(self, examples, **kwargs): validate_examples(examples, "EntityRuler.score") - return Scorer.score_spans(examples, "ents", **kwargs) + return get_ner_prf(examples) def from_bytes( self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index ac111f28b..a03c7daf0 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -251,10 +251,13 @@ class Morphologizer(Tagger): DOCS: https://nightly.spacy.io/api/morphologizer#score """ + def morph_key_getter(token, attr): + return getattr(token, attr).key + validate_examples(examples, "Morphologizer.score") results = {} results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) results.update(Scorer.score_token_attr_per_feat(examples, - "morph", **kwargs)) + "morph", getter=morph_key_getter, **kwargs)) return results diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 6482d6125..0f93b43ac 
100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser): DOCS: https://nightly.spacy.io/api/entityrecognizer#score """ validate_examples(examples, "EntityRecognizer.score") - score_per_type = get_ner_prf(examples) - totals = PRFScore() - for prf in score_per_type.values(): - totals += prf - return { - "ents_p": totals.precision, - "ents_r": totals.recall, - "ents_f": totals.fscore, - "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, - } + return get_ner_prf(examples) diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 7656b330c..6e8b1c324 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -155,8 +155,11 @@ class Sentencizer(Pipe): DOCS: https://nightly.spacy.io/api/sentencizer#score """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + validate_examples(examples, "Sentencizer.score") - results = Scorer.score_spans(examples, "sents", **kwargs) + results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) del results["sents_per_type"] return results diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 15a21902a..ad777ea58 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -160,7 +160,10 @@ class SentenceRecognizer(Tagger): RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. DOCS: https://nightly.spacy.io/api/sentencerecognizer#score """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + validate_examples(examples, "SentenceRecognizer.score") - results = Scorer.score_spans(examples, "sents", **kwargs) + results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) del results["sents_per_type"] return results diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 5ebe0e104..0781a000c 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -16,15 +16,30 @@ from ..vocab import Vocab default_model_config = """ [model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -pretrained_vectors = null +@architectures = "spacy.TextCatEnsemble.v2" + +[model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" width = 64 -conv_depth = 2 -embed_size = 2000 +rows = [2000, 2000, 1000, 1000, 1000, 1000] +attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +include_static_vectors = false + +[model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${model.tok2vec.embed.width} window_size = 1 +maxout_pieces = 3 +depth = 2 + +[model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false ngram_size = 1 -dropout = null +no_output_layer = false """ DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"] @@ -60,9 +75,11 @@ subword_features = true default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, - "cats_p": None, - "cats_r": None, - "cats_f": None, + "cats_micro_p": None, + "cats_micro_r": None, + "cats_micro_f": None, + "cats_macro_p": None, + "cats_macro_r": None, "cats_macro_f": None, "cats_macro_auc": None, "cats_f_per_type": None, diff --git a/spacy/scorer.py b/spacy/scorer.py index d1065f3a9..fe64c23ad 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,9 +1,9 @@ -from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING +from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING 
import numpy as np from collections import defaultdict from .training import Example -from .tokens import Token, Doc, Span +from .tokens import Token, Doc, Span, MorphAnalysis from .errors import Errors from .util import get_lang_class, SimpleFrozenList from .morphology import Morphology @@ -13,7 +13,8 @@ if TYPE_CHECKING: from .language import Language # noqa: F401 -DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"] +DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat") +MISSING_VALUES = frozenset([None, 0, ""]) class PRFScore: @@ -24,6 +25,9 @@ class PRFScore: self.fp = 0 self.fn = 0 + def __len__(self) -> int: + return self.tp + self.fp + self.fn + def __iadd__(self, other): self.tp += other.tp self.fp += other.fp @@ -59,7 +63,9 @@ class PRFScore: class ROCAUCScore: - """An AUC ROC score.""" + """An AUC ROC score. This is only defined for binary classification. + Use the method is_binary before calculating the score, otherwise it + may throw an error.""" def __init__(self) -> None: self.golds = [] @@ -71,16 +77,16 @@ class ROCAUCScore: self.cands.append(cand) self.golds.append(gold) + def is_binary(self): + return len(np.unique(self.golds)) == 2 + @property def score(self): + if not self.is_binary(): + raise ValueError(Errors.E165.format(label=set(self.golds))) if len(self.golds) == self.saved_score_at_len: return self.saved_score - try: - self.saved_score = _roc_auc_score(self.golds, self.cands) - # catch ValueError: Only one class present in y_true. - # ROC AUC score is not defined in that case. - except ValueError: - self.saved_score = -float("inf") + self.saved_score = _roc_auc_score(self.golds, self.cands) self.saved_score_at_len = len(self.golds) return self.saved_score @@ -92,7 +98,7 @@ class Scorer: self, nlp: Optional["Language"] = None, default_lang: str = "xx", - default_pipeline=DEFAULT_PIPELINE, + default_pipeline: Iterable[str] = DEFAULT_PIPELINE, **cfg, ) -> None: """Initialize the Scorer. @@ -124,13 +130,13 @@ class Scorer: return scores @staticmethod - def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]: + def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]: """Returns accuracy and PRF scores for tokenization. * token_acc: # correct tokens / # gold tokens * token_p/r/f: PRF for token character spans examples (Iterable[Example]): Examples to score - RETURNS (Dict[str, float]): A dictionary containing the scores + RETURNS (Dict[str, Any]): A dictionary containing the scores token_acc/p/r/f. 
DOCS: https://nightly.spacy.io/api/scorer#score_tokenization @@ -140,6 +146,8 @@ class Scorer: for example in examples: gold_doc = example.reference pred_doc = example.predicted + if gold_doc.has_unknown_spaces: + continue align = example.alignment gold_spans = set() pred_spans = set() @@ -156,12 +164,20 @@ class Scorer: else: acc_score.tp += 1 prf_score.score_set(pred_spans, gold_spans) - return { - "token_acc": acc_score.fscore, - "token_p": prf_score.precision, - "token_r": prf_score.recall, - "token_f": prf_score.fscore, - } + if len(acc_score) > 0: + return { + "token_acc": acc_score.fscore, + "token_p": prf_score.precision, + "token_r": prf_score.recall, + "token_f": prf_score.fscore, + } + else: + return { + "token_acc": None, + "token_p": None, + "token_r": None, + "token_f": None + } @staticmethod def score_token_attr( @@ -169,8 +185,9 @@ class Scorer: attr: str, *, getter: Callable[[Token, str], Any] = getattr, + missing_values: Set[Any] = MISSING_VALUES, **cfg, - ) -> Dict[str, float]: + ) -> Dict[str, Any]: """Returns an accuracy score for a token-level attribute. examples (Iterable[Example]): Examples to score @@ -178,7 +195,7 @@ class Scorer: getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. - RETURNS (Dict[str, float]): A dictionary containing the accuracy score + RETURNS (Dict[str, Any]): A dictionary containing the accuracy score under the key attr_acc. DOCS: https://nightly.spacy.io/api/scorer#score_token_attr @@ -189,17 +206,27 @@ class Scorer: pred_doc = example.predicted align = example.alignment gold_tags = set() + missing_indices = set() for gold_i, token in enumerate(gold_doc): - gold_tags.add((gold_i, getter(token, attr))) + value = getter(token, attr) + if value not in missing_values: + gold_tags.add((gold_i, getter(token, attr))) + else: + missing_indices.add(gold_i) pred_tags = set() for token in pred_doc: if token.orth_.isspace(): continue if align.x2y.lengths[token.i] == 1: gold_i = align.x2y[token.i].dataXd[0, 0] - pred_tags.add((gold_i, getter(token, attr))) + if gold_i not in missing_indices: + pred_tags.add((gold_i, getter(token, attr))) tag_score.score_set(pred_tags, gold_tags) - return {f"{attr}_acc": tag_score.fscore} + score_key = f"{attr}_acc" + if len(tag_score) == 0: + return {score_key: None} + else: + return {score_key: tag_score.fscore} @staticmethod def score_token_attr_per_feat( @@ -207,8 +234,9 @@ class Scorer: attr: str, *, getter: Callable[[Token, str], Any] = getattr, + missing_values: Set[Any] = MISSING_VALUES, **cfg, - ): + ) -> Dict[str, Any]: """Return PRF scores per feat for a token attribute in UFEATS format. examples (Iterable[Example]): Examples to score @@ -216,7 +244,7 @@ class Scorer: getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. - RETURNS (dict): A dictionary containing the per-feat PRF scores unders + RETURNS (dict): A dictionary containing the per-feat PRF scores under the key attr_per_feat. 
""" per_feat = {} @@ -225,9 +253,11 @@ class Scorer: gold_doc = example.reference align = example.alignment gold_per_feat = {} + missing_indices = set() for gold_i, token in enumerate(gold_doc): - morph = str(getter(token, attr)) - if morph: + value = getter(token, attr) + morph = gold_doc.vocab.strings[value] + if value not in missing_values and morph != Morphology.EMPTY_MORPH: for feat in morph.split(Morphology.FEATURE_SEP): field, values = feat.split(Morphology.FIELD_SEP) if field not in per_feat: @@ -235,27 +265,35 @@ class Scorer: if field not in gold_per_feat: gold_per_feat[field] = set() gold_per_feat[field].add((gold_i, feat)) + else: + missing_indices.add(gold_i) pred_per_feat = {} for token in pred_doc: if token.orth_.isspace(): continue if align.x2y.lengths[token.i] == 1: gold_i = align.x2y[token.i].dataXd[0, 0] - morph = str(getter(token, attr)) - if morph: - for feat in morph.split("|"): - field, values = feat.split("=") - if field not in per_feat: - per_feat[field] = PRFScore() - if field not in pred_per_feat: - pred_per_feat[field] = set() - pred_per_feat[field].add((gold_i, feat)) + if gold_i not in missing_indices: + value = getter(token, attr) + morph = gold_doc.vocab.strings[value] + if value not in missing_values and morph != Morphology.EMPTY_MORPH: + for feat in morph.split(Morphology.FEATURE_SEP): + field, values = feat.split(Morphology.FIELD_SEP) + if field not in per_feat: + per_feat[field] = PRFScore() + if field not in pred_per_feat: + pred_per_feat[field] = set() + pred_per_feat[field].add((gold_i, feat)) for field in per_feat: per_feat[field].score_set( pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) - result = {k: v.to_dict() for k, v in per_feat.items()} - return {f"{attr}_per_feat": result} + score_key = f"{attr}_per_feat" + if any([len(v) for v in per_feat.values()]): + result = {k: v.to_dict() for k, v in per_feat.items()} + return {score_key: result} + else: + return {score_key: None} @staticmethod def score_spans( @@ -263,6 +301,7 @@ class Scorer: attr: str, *, getter: Callable[[Doc, str], Iterable[Span]] = getattr, + has_annotation: Optional[Callable[[Doc], bool]] = None, **cfg, ) -> Dict[str, Any]: """Returns PRF scores for labeled spans. @@ -282,18 +321,10 @@ class Scorer: for example in examples: pred_doc = example.predicted gold_doc = example.reference - # TODO - # This is a temporary hack to work around the problem that the scorer - # fails if you have examples that are not fully annotated for all - # the tasks in your pipeline. For instance, you might have a corpus - # of NER annotations that does not set sentence boundaries, but the - # pipeline includes a parser or senter, and then the score_weights - # are used to evaluate that component. When the scorer attempts - # to read the sentences from the gold document, it fails. 
- try: - list(getter(gold_doc, attr)) - except ValueError: - continue + # Option to handle docs without sents + if has_annotation is not None: + if not has_annotation(gold_doc): + continue # Find all labels in gold and doc labels = set( [k.label_ for k in getter(gold_doc, attr)] @@ -321,13 +352,21 @@ class Scorer: v.score_set(pred_per_type[k], gold_per_type[k]) # Score for all labels score.score_set(pred_spans, gold_spans) - results = { - f"{attr}_p": score.precision, - f"{attr}_r": score.recall, - f"{attr}_f": score.fscore, - f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, - } - return results + if len(score) > 0: + return { + f"{attr}_p": score.precision, + f"{attr}_r": score.recall, + f"{attr}_f": score.fscore, + f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, + } + else: + return { + f"{attr}_p": None, + f"{attr}_r": None, + f"{attr}_f": None, + f"{attr}_per_type": None, + } + @staticmethod def score_cats( @@ -362,9 +401,13 @@ class Scorer: for all: attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc), attr_score_desc (text description of the overall score), + attr_micro_p, + attr_micro_r, attr_micro_f, + attr_macro_p, + attr_macro_r, attr_macro_f, - attr_auc, + attr_macro_auc, attr_f_per_type, attr_auc_per_type @@ -384,9 +427,6 @@ class Scorer: pred_cats = getter(example.predicted, attr) gold_cats = getter(example.reference, attr) - # I think the AUC metric is applicable regardless of whether we're - # doing multi-label classification? Unsure. If not, move this into - # the elif pred_cats and gold_cats block below. for label in labels: pred_score = pred_cats.get(label, 0.0) gold_score = gold_cats.get(label, 0.0) @@ -431,7 +471,9 @@ class Scorer: macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats - macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats + # Limit macro_auc to those labels with gold annotations, + # but still divide by all cats to avoid artificial boosting of datasets with missing labels + macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats results = { f"{attr}_score": None, f"{attr}_score_desc": None, @@ -443,7 +485,7 @@ class Scorer: f"{attr}_macro_f": macro_f, f"{attr}_macro_auc": macro_auc, f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, - f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, + f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()}, } if len(labels) == 2 and not multi_label and positive_label: positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"] @@ -534,6 +576,7 @@ class Scorer: head_attr: str = "head", head_getter: Callable[[Token, str], Token] = getattr, ignore_labels: Iterable[str] = SimpleFrozenList(), + missing_values: Set[Any] = MISSING_VALUES, **cfg, ) -> Dict[str, Any]: """Returns the UAS, LAS, and LAS per type scores for dependency @@ -558,6 +601,7 @@ class Scorer: unlabelled = PRFScore() labelled = PRFScore() labelled_per_dep = dict() + missing_indices = set() for example in examples: gold_doc = example.reference pred_doc = example.predicted @@ -567,13 +611,16 @@ class Scorer: for gold_i, token in enumerate(gold_doc): dep = getter(token, attr) head = head_getter(token, head_attr) - if dep not in ignore_labels: - gold_deps.add((gold_i, head.i, dep)) - if dep not in 
labelled_per_dep: - labelled_per_dep[dep] = PRFScore() - if dep not in gold_deps_per_dep: - gold_deps_per_dep[dep] = set() - gold_deps_per_dep[dep].add((gold_i, head.i, dep)) + if dep not in missing_values: + if dep not in ignore_labels: + gold_deps.add((gold_i, head.i, dep)) + if dep not in labelled_per_dep: + labelled_per_dep[dep] = PRFScore() + if dep not in gold_deps_per_dep: + gold_deps_per_dep[dep] = set() + gold_deps_per_dep[dep].add((gold_i, head.i, dep)) + else: + missing_indices.add(gold_i) pred_deps = set() pred_deps_per_dep = {} for token in pred_doc: @@ -583,25 +630,26 @@ class Scorer: gold_i = None else: gold_i = align.x2y[token.i].dataXd[0, 0] - dep = getter(token, attr) - head = head_getter(token, head_attr) - if dep not in ignore_labels and token.orth_.strip(): - if align.x2y.lengths[head.i] == 1: - gold_head = align.x2y[head.i].dataXd[0, 0] - else: - gold_head = None - # None is indistinct, so we can't just add it to the set - # Multiple (None, None) deps are possible - if gold_i is None or gold_head is None: - unlabelled.fp += 1 - labelled.fp += 1 - else: - pred_deps.add((gold_i, gold_head, dep)) - if dep not in labelled_per_dep: - labelled_per_dep[dep] = PRFScore() - if dep not in pred_deps_per_dep: - pred_deps_per_dep[dep] = set() - pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) + if gold_i not in missing_indices: + dep = getter(token, attr) + head = head_getter(token, head_attr) + if dep not in ignore_labels and token.orth_.strip(): + if align.x2y.lengths[head.i] == 1: + gold_head = align.x2y[head.i].dataXd[0, 0] + else: + gold_head = None + # None is indistinct, so we can't just add it to the set + # Multiple (None, None) deps are possible + if gold_i is None or gold_head is None: + unlabelled.fp += 1 + labelled.fp += 1 + else: + pred_deps.add((gold_i, gold_head, dep)) + if dep not in labelled_per_dep: + labelled_per_dep[dep] = PRFScore() + if dep not in pred_deps_per_dep: + pred_deps_per_dep[dep] = set() + pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) labelled.score_set(pred_deps, gold_deps) for dep in labelled_per_dep: labelled_per_dep[dep].score_set( @@ -610,29 +658,34 @@ class Scorer: unlabelled.score_set( set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps) ) - return { - f"{attr}_uas": unlabelled.fscore, - f"{attr}_las": labelled.fscore, - f"{attr}_las_per_type": { - k: v.to_dict() for k, v in labelled_per_dep.items() - }, - } + if len(unlabelled) > 0: + return { + f"{attr}_uas": unlabelled.fscore, + f"{attr}_las": labelled.fscore, + f"{attr}_las_per_type": { + k: v.to_dict() for k, v in labelled_per_dep.items() + }, + } + else: + return { + f"{attr}_uas": None, + f"{attr}_las": None, + f"{attr}_las_per_type": None, + } -def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: - """Compute per-entity PRFScore objects for a sequence of examples. The - results are returned as a dictionary keyed by the entity type. You can - add the PRFScore objects to get micro-averaged total. +def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: + """Compute micro-PRF and per-entity PRF scores for a sequence of examples. 
""" - scores = defaultdict(PRFScore) + score_per_type = defaultdict(PRFScore) for eg in examples: if not eg.y.has_annotation("ENT_IOB"): continue golds = {(e.label_, e.start, e.end) for e in eg.y.ents} align_x2y = eg.alignment.x2y for pred_ent in eg.x.ents: - if pred_ent.label_ not in scores: - scores[pred_ent.label_] = PRFScore() + if pred_ent.label_ not in score_per_type: + score_per_type[pred_ent.label_] = PRFScore() indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel() if len(indices): g_span = eg.y[indices[0] : indices[-1] + 1] @@ -642,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: if all(token.ent_iob != 0 for token in g_span): key = (pred_ent.label_, indices[0], indices[-1] + 1) if key in golds: - scores[pred_ent.label_].tp += 1 + score_per_type[pred_ent.label_].tp += 1 golds.remove(key) else: - scores[pred_ent.label_].fp += 1 + score_per_type[pred_ent.label_].fp += 1 for label, start, end in golds: - scores[label].fn += 1 - return scores + score_per_type[label].fn += 1 + totals = PRFScore() + for prf in score_per_type.values(): + totals += prf + if len(totals) > 0: + return { + "ents_p": totals.precision, + "ents_r": totals.recall, + "ents_f": totals.fscore, + "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, + } + else: + return { + "ents_p": None, + "ents_r": None, + "ents_f": None, + "ents_per_type": None, + } ############################################################################# @@ -726,7 +795,7 @@ def _roc_auc_score(y_true, y_score): `_ """ if len(np.unique(y_true)) != 2: - raise ValueError(Errors.E165) + raise ValueError(Errors.E165.format(label=np.unique(y_true))) fpr, tpr, _ = _roc_curve(y_true, y_score) return _auc(fpr, tpr) diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index e18a8f6d8..481187348 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -218,11 +218,16 @@ def test_dependency_matcher_callback(en_vocab, doc): pattern = [ {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}}, ] + nomatch_pattern = [ + {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}}, + ] matcher = DependencyMatcher(en_vocab) mock = Mock() matcher.add("pattern", [pattern], on_match=mock) + matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock) matches = matcher(doc) + assert len(matches) == 1 mock.assert_called_once_with(matcher, doc, 0, matches) # check that matches with and without callback are the same (#4590) diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 6c66469cc..02726172b 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts): scores = nlp.evaluate(dev_examples) # "cat" is the only correct lemma assert scores["lemma_acc"] == pytest.approx(0.2) - # the empty morphs are correct - assert scores["morph_acc"] == pytest.approx(0.6) + # no morphs are set + assert scores["morph_acc"] == None def test_attributeruler_rule_order(nlp): diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index cac394913..6f07c0220 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -2,6 +2,7 @@ import pytest from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German +from 
spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.tokens import Doc from spacy.util import registry, SimpleFrozenDict, combine_score_weights from thinc.api import Model, Linear, ConfigValidationError @@ -156,15 +157,10 @@ def test_pipe_class_component_model(): name = "test_class_component_model" default_config = { "model": { - "@architectures": "spacy.TextCatEnsemble.v1", - "exclusive_classes": False, - "pretrained_vectors": None, - "width": 64, - "embed_size": 2000, - "window_size": 1, - "conv_depth": 2, - "ngram_size": 1, - "dropout": None, + "@architectures": "spacy.TextCatEnsemble.v2", + "tok2vec": DEFAULT_TOK2VEC_MODEL, + "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, + "no_output_layer": False}, }, "value1": 10, } diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 91348b1b3..06d512a32 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -140,7 +140,7 @@ def test_overfitting_IO(): nlp = English() nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"} # Set exclusive labels - config = {"model": {"exclusive_classes": True}} + config = {"model": {"linear_model": {"exclusive_classes": True}}} textcat = nlp.add_pipe("textcat", config=config) train_examples = [] for text, annotations in TRAIN_DATA: @@ -192,9 +192,8 @@ def test_overfitting_IO(): {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, - {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None}, - {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None}, - {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None}, + {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}, + {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}, ], diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index e8884e6b2..200d7dcfd 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate from numpy.testing import assert_array_equal import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder -from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier +from spacy.ml.models 
import build_bow_text_classifier, build_simple_cnn_text_classifier from spacy.ml.staticvectors import StaticVectors from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES -def get_textcat_kwargs(): +def get_textcat_bow_kwargs(): return { - "width": 64, - "embed_size": 2000, - "pretrained_vectors": None, - "exclusive_classes": False, + "exclusive_classes": True, "ngram_size": 1, - "window_size": 1, - "conv_depth": 2, - "dropout": None, - "nO": 7, + "no_output_layer": False, + "nO": 34, } def get_textcat_cnn_kwargs(): - return { - "tok2vec": test_tok2vec(), - "exclusive_classes": False, - "nO": 13, - } + return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} def get_all_params(model): @@ -105,7 +96,7 @@ def test_multi_hash_embed(): "seed,model_func,kwargs", [ (0, build_Tok2Vec_model, get_tok2vec_kwargs()), - (0, build_text_classifier, get_textcat_kwargs()), + (0, build_bow_text_classifier, get_textcat_bow_kwargs()), (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()), ], ) @@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs): "seed,model_func,kwargs,get_X", [ (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), - (0, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), ], ) @@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X): "seed,dropout,model_func,kwargs,get_X", [ (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), - (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), ], ) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 4c1b09849..56b276f0b 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -277,6 +277,62 @@ def test_tag_score(tagged_doc): assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) +def test_partial_annotation(en_tokenizer): + pred_doc = en_tokenizer("a b c d e") + pred_doc[0].tag_ = "A" + pred_doc[0].pos_ = "X" + pred_doc[0].set_morph("Feat=Val") + pred_doc[0].dep_ = "dep" + + # unannotated reference + ref_doc = en_tokenizer("a b c d e") + ref_doc.has_unknown_spaces = True + example = Example(pred_doc, ref_doc) + scorer = Scorer() + scores = scorer.score([example]) + for key in scores: + # cats doesn't have an unset state + if key.startswith("cats"): + continue + assert scores[key] == None + + # partially annotated reference, not overlapping with predicted annotation + ref_doc = en_tokenizer("a b c d e") + ref_doc.has_unknown_spaces = True + ref_doc[1].tag_ = "A" + ref_doc[1].pos_ = "X" + ref_doc[1].set_morph("Feat=Val") + ref_doc[1].dep_ = "dep" + example = Example(pred_doc, ref_doc) + scorer = Scorer() + scores = scorer.score([example]) + assert scores["token_acc"] == None + assert scores["tag_acc"] == 0.0 + assert scores["pos_acc"] == 0.0 + assert scores["morph_acc"] == 0.0 + assert scores["dep_uas"] == 1.0 + assert scores["dep_las"] == 0.0 + assert scores["sents_f"] == None + + # partially annotated reference, overlapping with predicted annotation + ref_doc = en_tokenizer("a b c d e") + ref_doc.has_unknown_spaces = True + ref_doc[0].tag_ = "A" + ref_doc[0].pos_ = "X" + ref_doc[1].set_morph("Feat=Val") + ref_doc[1].dep_ = "dep" + example = 
Example(pred_doc, ref_doc) + scorer = Scorer() + scores = scorer.score([example]) + assert scores["token_acc"] == None + assert scores["tag_acc"] == 1.0 + assert scores["pos_acc"] == 1.0 + assert scores["morph_acc"] == 0.0 + assert scores["dep_uas"] == 1.0 + assert scores["dep_las"] == 0.0 + assert scores["sents_f"] == None + + def test_roc_auc_score(): # Binary classification, toy tests from scikit-learn test suite y_true = [0, 1] @@ -334,7 +390,8 @@ def test_roc_auc_score(): score = ROCAUCScore() score.score_set(0.25, 0) score.score_set(0.75, 0) - assert score.score == -float("inf") + with pytest.raises(ValueError): + s = score.score y_true = [1, 1] y_score = [0.25, 0.75] @@ -344,4 +401,5 @@ def test_roc_auc_score(): score = ROCAUCScore() score.score_set(0.25, 1) score.score_set(0.75, 1) - assert score.score == -float("inf") + with pytest.raises(ValueError): + s = score.score diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 9d82ca50a..ff2559d2a 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -51,7 +51,7 @@ def test_readers(): for example in train_corpus(nlp): nlp.update([example], sgd=optimizer) scores = nlp.evaluate(list(dev_corpus(nlp))) - assert scores["cats_score"] + assert scores["cats_score"] == 0.0 # ensure the pipeline runs doc = nlp("Quick test") assert doc.cats diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 07e1aef01..ba485ab45 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -2,6 +2,7 @@ import numpy from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment from spacy.training import biluo_tags_to_spans, iob_to_biluo from spacy.training import Corpus, docs_to_json, Example +from spacy.training.align import get_alignments from spacy.training.converters import json_to_docs from spacy.lang.en import English from spacy.tokens import Doc, DocBin @@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc): assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] -@pytest.mark.skip("Outdated") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ - (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), + (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])), ( ["a", "b", '"', "c"], ['ab"', "c"], - (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), + ([[0], [0], [0], [1]], [[0, 1, 2], [3]]), ), - (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})), + (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])), ( ["ab", "c", "d"], ["a", "b", "cd"], - (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}), + ([[0, 1], [2], [2]], [[0], [0], [1, 2]]), ), ( ["a", "b", "cd"], ["a", "b", "c", "d"], - (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}), + ([[0], [1], [2, 3]], [[0], [1], [2], [2]]), ), - ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})), + ([" ", "a"], ["a"], ([[], [0]], [[1]])), ], ) def test_align(tokens_a, tokens_b, expected): # noqa - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa - assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_a, tokens_b) + assert (a2b, b2a) == expected # noqa # check symmetry - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa - assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_b, tokens_a) # noqa + 
assert (b2a, a2b) == expected # noqa def test_goldparse_startswith_space(en_tokenizer): @@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer): assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] +def test_goldparse_endswith_space(en_tokenizer): + text = "a\n" + doc = en_tokenizer(text) + gold_words = ["a"] + entities = ["U-DATE"] + deps = ["ROOT"] + heads = [0] + example = Example.from_dict( + doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} + ) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["U-DATE", "O"] + assert example.get_aligned("DEP", as_string=True) == ["ROOT", None] + + def test_gold_constructor(): """Test that the Example constructor works fine""" nlp = English() @@ -676,6 +691,87 @@ def test_alignment_different_texts(): Alignment.from_strings(other_tokens, spacy_tokens) +def test_alignment_spaces(en_vocab): + # single leading whitespace + other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] + assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6] + + # multiple leading whitespace tokens + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] + assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7] + + # both with leading whitespace, not identical + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7] + + # same leading whitespace, different tokenization + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7] + + # only one with trailing whitespace + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5] + + # different trailing whitespace + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", 
"podcasts.", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6] + + # same trailing whitespace, different tokenization + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7] + + # differing whitespace is allowed + other_tokens = ["a", " \n ", "b", "c"] + spacy_tokens = ["a", "b", " ", "c"] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.dataXd) == [0, 1, 3] + assert list(align.y2x.dataXd) == [0, 2, 3] + + # other differences in whitespace are allowed + other_tokens = [" ", "a"] + spacy_tokens = [" ", "a", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + + other_tokens = ["a", " "] + spacy_tokens = ["a", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + + def test_retokenized_docs(doc): a = doc.to_array(["TAG"]) doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index abc82030d..c824b2752 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -399,14 +399,13 @@ cdef class Doc: return True cdef int i cdef int range_start = 0 + if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: + attr = SENT_START attr = intify_attr(attr) # adjust attributes if attr == HEAD: # HEAD does not have an unset state, so rely on DEP attr = DEP - elif attr == self.vocab.strings["IS_SENT_START"]: - # as in Matcher, allow IS_SENT_START as an alias of SENT_START - attr = SENT_START # special cases for sentence boundaries if attr == SENT_START: if "sents" in self.user_hooks: diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 86341dd9a..5111b80dc 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,6 +1,6 @@ from .corpus import Corpus # noqa: F401 from .example import Example, validate_examples, validate_get_examples # noqa: F401 -from .align import Alignment # noqa: F401 +from .alignment import Alignment # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401 diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx new file mode 100644 index 000000000..b9d89f789 --- /dev/null +++ b/spacy/training/align.pyx @@ -0,0 +1,66 @@ +from typing import List, Tuple +from itertools import chain +import re + +from ..errors import Errors + + +def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]: + # Create character-to-token mappings + char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A)))) + char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B)))) + str_a = "".join(A).lower() + str_b = "".join(B).lower() + cdef int len_str_a = len(str_a) + cdef int len_str_b = len(str_b) + # Check that the two 
texts only differ in whitespace and capitalization + if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \ + len_str_a != len(char_to_token_a) or \ + len_str_b != len(char_to_token_b): + raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10]))) + cdef int char_idx_a = 0 + cdef int char_idx_b = 0 + cdef int token_idx_a = 0 + cdef int token_idx_b = 0 + cdef int prev_token_idx_a = -1 + cdef int prev_token_idx_b = -1 + a2b = [] + b2a = [] + while char_idx_a < len_str_a and char_idx_b < len_str_b: + # Find the current token position from the character position + token_idx_a = char_to_token_a[char_idx_a] + token_idx_b = char_to_token_b[char_idx_b] + # Add a set for the next token if a token boundary has been crossed + if prev_token_idx_a != token_idx_a: + a2b.append(set()) + if prev_token_idx_b != token_idx_b: + b2a.append(set()) + # Process the alignment at the current position + if A[token_idx_a] == B[token_idx_b]: + # Current tokens are identical + a2b[-1].add(token_idx_b) + b2a[-1].add(token_idx_a) + char_idx_a += len(A[token_idx_a]) + char_idx_b += len(B[token_idx_b]) + elif str_a[char_idx_a] == str_b[char_idx_b]: + # Current chars are identical + a2b[-1].add(token_idx_b) + b2a[-1].add(token_idx_a) + char_idx_a += 1 + char_idx_b += 1 + elif str_a[char_idx_a].isspace(): + # Skip unaligned whitespace char in A + char_idx_a += 1 + elif str_b[char_idx_b].isspace(): + # Skip unaligned whitespace char in B + char_idx_b += 1 + else: + # This should never happen + raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10]))) + prev_token_idx_a = token_idx_a + prev_token_idx_b = token_idx_b + # Process unaligned trailing whitespace + a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:]))) + b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:]))) + # Return values as sorted lists per token position + return [sorted(x) for x in a2b], [sorted(x) for x in b2a] diff --git a/spacy/training/align.py b/spacy/training/alignment.py similarity index 75% rename from spacy/training/align.py rename to spacy/training/alignment.py index e8f17a667..3e3b60ca6 100644 --- a/spacy/training/align.py +++ b/spacy/training/alignment.py @@ -2,9 +2,8 @@ from typing import List import numpy from thinc.types import Ragged from dataclasses import dataclass -import tokenizations -from ..errors import Errors +from .align import get_alignments @dataclass @@ -20,9 +19,7 @@ class Alignment: @classmethod def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": - if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower(): - raise ValueError(Errors.E949) - x2y, y2x = tokenizations.get_alignments(A, B) + x2y, y2x = get_alignments(A, B) return Alignment.from_indices(x2y=x2y, y2x=y2x) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index a8da49c61..6a556b5e7 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc from ..tokens.span cimport Span from ..tokens.span import Span from ..attrs import IDS -from .align import Alignment +from .alignment import Alignment from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 7c84caf95..3d79eb78f 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": # Resolve all 
training-relevant sections using the filled nlp config T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"]] + if not isinstance(T["train_corpus"], str): + raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"]))) + if not isinstance(T["dev_corpus"], str): + raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"]))) train_corpus, dev_corpus = resolve_dot_names(config, dot_names) optimizer = T["optimizer"] # Components that shouldn't be updated during training diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index b91fb07a8..e5c41c70b 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model from ..ml.models.multi_task import build_cloze_characters_multi_task_model from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain from ..errors import Errors -from ..util import registry, load_model_from_config, dot_to_object +from ..util import registry, load_model_from_config, resolve_dot_names def pretrain( @@ -38,7 +38,7 @@ def pretrain( _config = nlp.config.interpolate() T = registry.resolve(_config["training"], schema=ConfigSchemaTraining) P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) - corpus = dot_to_object(T, P["corpus"]) + corpus = resolve_dot_names(_config, [P["corpus"]])[0] batcher = P["batcher"] model = create_pretraining_model(nlp, P) optimizer = P["optimizer"] diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 3157c261a..fe2223017 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline. Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it through -a feed-forward subnetwork to build a mixed representations. The features used +a feed-forward subnetwork to build a mixed representation. The features used can be configured with the `attrs` argument. The suggested attributes are `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some subword information, without construction a fully character-based @@ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with different architectures and settings to determine what works best on your specific data and challenge. 
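
As a quick reference for the alignment rework above (the new Cython `get_alignments` in `spacy/training/align.pyx` plus the renamed `spacy/training/alignment.py` wrapper), here is a minimal usage sketch. The token lists and expected values are copied from the updated tests earlier in this patch; the only assumption is that `Alignment` is imported via `spacy.training`, as re-exported in `spacy/training/__init__.py`.

```python
from spacy.training import Alignment

# Tokenizations may differ in segmentation, casing and whitespace, as long as
# the underlying texts otherwise match.
other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "]
align = Alignment.from_strings(other_tokens, spacy_tokens)

# x2y maps each "other" token to the spaCy tokens it overlaps (a Ragged array):
# "i listened to" covers three spaCy tokens, both trailing spaces map to the
# single trailing space token.
assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6]

# y2x is the reverse mapping, from spaCy tokens back to the other tokenization.
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2]
assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7]
```
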
-### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} +### spacy.TextCatEnsemble.v2 {#TextCatEnsemble} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.TextCatEnsemble.v1" -> exclusive_classes = false -> pretrained_vectors = null -> width = 64 -> embed_size = 2000 -> conv_depth = 2 -> window_size = 1 -> ngram_size = 1 -> dropout = null +> @architectures = "spacy.TextCatEnsemble.v2" > nO = null +> +> [model.linear_model] +> @architectures = "spacy.TextCatBOW.v1" +> exclusive_classes = true +> ngram_size = 1 +> no_output_layer = false +> +> [model.tok2vec] +> @architectures = "spacy.Tok2Vec.v1" +> +> [model.tok2vec.embed] +> @architectures = "spacy.MultiHashEmbed.v1" +> width = 64 +> rows = [2000, 2000, 1000, 1000, 1000, 1000] +> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +> include_static_vectors = false +> +> [model.tok2vec.encode] +> @architectures = "spacy.MaxoutWindowEncoder.v1" +> width = ${model.tok2vec.embed.width} +> window_size = 1 +> maxout_pieces = 3 +> depth = 2 > ``` -Stacked ensemble of a bag-of-words model and a neural network model. The neural -network has an internal CNN Tok2Vec layer and uses attention. +Stacked ensemble of a linear bag-of-words model and a neural network model. The +neural network is built upon a Tok2Vec layer and uses attention. The setting for +whether or not this model should cater for multi-label classification, is taken +from the linear model, where it is stored in `model.attrs["multi_label"]`. + +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~ | +| `tok2vec` | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + + + +The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as argument. | Name | Description | | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + + ### spacy.TextCatCNN.v1 {#TextCatCNN} > #### Example Config diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index d511dc889..16bbc2700 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -683,6 +683,7 @@ The L2 norm of the document's vector representation. | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. 
~~Dict[str, Callable]~~ | +| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 0dbc0de33..fb48d68cc 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -68,6 +68,8 @@ Scores the tokenization: - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token character spans +Docs with `has_unknown_spaces` are skipped during scoring. + > #### Example > > ```python @@ -81,7 +83,8 @@ Scores the tokenization: ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"} -Scores a single token attribute. +Scores a single token attribute. Tokens with missing values in the reference doc +are skipped during scoring. > #### Example > @@ -90,20 +93,22 @@ Scores a single token attribute. > print(scores["pos_acc"]) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ | ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"} Scores a single token attribute per feature for a token attribute in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) -format. +format. Tokens with missing values in the reference doc are skipped during +scoring. > #### Example > @@ -112,13 +117,14 @@ format. > print(scores["morph_per_feat"]) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. 
If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ | ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} @@ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans. > print(scores["ents_f"]) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ | -| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ | +| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~str~~ | +| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} -Calculate the UAS, LAS, and LAS per type scores for dependency parses. +Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens +with missing values for the `attr` (typically `dep`) are skipped during scoring. > #### Example > @@ -160,29 +168,40 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses. 
> print(scores["dep_uas"], scores["dep_las"]) > ``` -| Name | Description | -| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| `head_attr` | The attribute containing the head token. ~~str~~ | -| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ | -| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ | -| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `head_attr` | The attribute containing the head token. ~~str~~ | +| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ | +| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"} Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict -containing scores for each label like `Doc.cats`. The reported overall score -depends on the scorer settings: +containing scores for each label like `Doc.cats`. The returned dictionary +contains the following scores: -1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` / - `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall - score), `{attr}_f_per_type`, `{attr}_auc_per_type` -2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f` -3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`; -4. 
**multilabel**, macro-averaged AUC: `{attr}_macro_auc` +- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across + each label is weighted equally +- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values + across evaluations per label +- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of + scores, keyed by label +- A final `{attr}_score` and corresponding `{attr}_score_desc` (text + description) + +The reported `{attr}_score` depends on the classification properties: + +- **binary exclusive with positive label:** `{attr}_score` is set to the F-score + of the positive label +- **3+ exclusive classes**, macro-averaged F-score: + `{attr}_score = {attr}_macro_f` +- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc` > #### Example > diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index 2a8733f41..f05fedd7d 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -115,7 +115,7 @@ print(french_fries, "<->", burgers, french_fries.similarity(burgers)) Computing similarity scores can be helpful in many situations, but it's also important to maintain **realistic expectations** about what information it can -provide. Words can be related to each over in many ways, so a single +provide. Words can be related to each other in many ways, so a single "similarity" score will always be a **mix of different signals**, and vectors trained on different data can produce very different results that may not be useful for your purpose. Here are some important considerations to keep in mind: diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index d7b2593e7..aa62a77d4 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -130,16 +130,31 @@ factory = "textcat" labels = [] [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -pretrained_vectors = null -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 -ngram_size = 1 -dropout = 0 +@architectures = "spacy.TextCatEnsemble.v2" nO = null + +[components.textcat.model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[components.textcat.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 64 +rows = [2000, 2000, 1000, 1000, 1000, 1000] +attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +include_static_vectors = false + +[components.textcat.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${components.textcat.model.tok2vec.embed.width} +window_size = 1 +maxout_pieces = 3 +depth = 2 + +[components.textcat.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false ``` spaCy has two additional built-in `textcat` architectures, and you can easily @@ -687,7 +702,7 @@ Before the model can be used, it needs to be [initialized](/usage/training#initialization). This function receives a callback to access the full **training data set**, or a representative sample. This data set can be used to deduce all **relevant labels**. Alternatively, a list of -labels can be provided to `initialize`, or you can call +labels can be provided to `initialize`, or you can call `RelationExtractor.add_label` directly. 
The number of labels defines the output dimensionality of the network, and will be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index a0cf36909..ef44009ae 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1244,15 +1244,10 @@ labels = [] # This function is created and then passed to the "textcat" component as # the argument "model" [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" +@architectures = "spacy.TextCatBOW.v1" exclusive_classes = false -pretrained_vectors = null -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 ngram_size = 1 -dropout = null +no_output_layer = false [components.other_textcat] factory = "textcat" diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 131bd8c94..44d0fd388 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1142,7 +1142,7 @@ pattern = [ { "LEFT_ID": "anchor_founded", "REL_OP": ">", - "RIGHT_ID": "subject", + "RIGHT_ID": "founded_subject", "RIGHT_ATTRS": {"DEP": "nsubj"}, } # ... @@ -1212,7 +1212,7 @@ pattern = [ { "LEFT_ID": "anchor_founded", "REL_OP": ">", - "RIGHT_ID": "subject", + "RIGHT_ID": "founded_subject", "RIGHT_ATTRS": {"DEP": "nsubj"}, }, { diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 5a42d2172..274ea5989 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -717,7 +717,7 @@ tabular results to a file: ```python ### functions.py import sys -from typing import IO, Tuple, Callable, Dict, Any +from typing import IO, Tuple, Callable, Dict, Any, Optional import spacy from spacy import Language from pathlib import Path @@ -729,7 +729,7 @@ def custom_logger(log_path): stdout: IO=sys.stdout, stderr: IO=sys.stderr ) -> Tuple[Callable, Callable]: - stdout.write(f"Logging to {log_path}\n") + stdout.write(f"Logging to {log_path}\\n") log_file = Path(log_path).open("w", encoding="utf8") log_file.write("step\\t") log_file.write("score\\t") diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index fe4765285..b25b28a6d 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -433,14 +433,14 @@ The following methods, attributes and commands are new in spaCy v3.0. | Name | Description | | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | -| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | +| [`Token.morph`](/api/token#attributes) | Access a token's morphological analysis. | | [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. | | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. 
| | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). | | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. | | [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. | | [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. | -| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. | +| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. | | [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. | | [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. | | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. | @@ -1032,9 +1032,9 @@ change your names and imports: Thanks to everyone who's been contributing to the spaCy ecosystem by developing and maintaining one of the many awesome [plugins and extensions](/universe). We've tried to make it as easy as possible for you to upgrade your packages for -spaCy v3.0. The most common use case for plugins is providing pipeline components -and extension attributes. When migrating your plugin, double-check the -following: +spaCy v3.0. The most common use case for plugins is providing pipeline +components and extension attributes. When migrating your plugin, double-check +the following: - Use the [`@Language.factory`](/api/language#factory) decorator to register your component and assign it a name. This allows users to refer to your diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 73b2d072d..cc73e7e67 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -257,7 +257,7 @@ output_path.open("w", encoding="utf-8").write(svg) Since each visualization is generated as a separate SVG, exporting `.svg` files only works if you're rendering **one single doc** at a time. (This makes sense – after all, each visualization should be a standalone graphic.) So instead of -rendering all `Doc`s at one, loop over them and export them separately. +rendering all `Doc`s at once, loop over them and export them separately. diff --git a/website/src/templates/models.js b/website/src/templates/models.js index b9658dacd..17140b072 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -120,7 +120,7 @@ function formatAccuracy(data) { ? null : { label, - value: value.toFixed(2), + value: (value * 100).toFixed(2), help: MODEL_META[label], } })
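
To close with the visualizer guidance corrected above (export one `Doc` per SVG and loop over the docs rather than rendering them all at once), a short sketch follows. The pipeline name `en_core_web_sm` and the output filenames are placeholders for illustration, not part of the patch.

```python
from pathlib import Path
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # placeholder: any installed pipeline
texts = ["This is a sentence.", "This is another one."]

# Each visualization is a standalone SVG, so render and write the docs one at
# a time instead of passing the whole list to displacy.render().
for i, doc in enumerate(nlp.pipe(texts)):
    svg = displacy.render(doc, style="dep")
    output_path = Path(f"sentence_{i}.svg")
    output_path.open("w", encoding="utf-8").write(svg)
```
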