From 75a202ce6506177d5de97b47bfd96fd3c7909503 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 18 Oct 2020 14:50:41 +0200 Subject: [PATCH 01/16] TextCat updates and fixes (#6263) * small fix in example imports * throw error when train_corpus or dev_corpus is not a string * small fix in custom logger example * limit macro_auc to labels with 2 annotations * fix typo * also create parents of output_dir if need be * update documentation of textcat scores * refactor TextCatEnsemble * fix tests for new AUC definition * bump to 3.0.0a42 * update docs * rename to spacy.TextCatEnsemble.v2 * spacy.TextCatEnsemble.v1 in legacy * cleanup * small fix * update to 3.0.0rc2 * fix import that got lost in merge * cursed IDE * fix two typos --- spacy/about.py | 2 +- spacy/cli/init_pipeline.py | 2 +- spacy/cli/templates/quickstart_training.jinja | 36 +++++++----- spacy/cli/train.py | 2 +- spacy/errors.py | 6 +- spacy/ml/models/textcat.py | 57 ++++++++++++++----- spacy/ml/models/tok2vec.py | 2 +- spacy/pipeline/textcat.py | 35 +++++++++--- spacy/scorer.py | 30 ++++++---- spacy/tests/pipeline/test_pipe_factories.py | 14 ++--- spacy/tests/pipeline/test_textcat.py | 7 +-- spacy/tests/test_models.py | 27 +++------ spacy/tests/test_scorer.py | 6 +- spacy/tests/training/test_readers.py | 2 +- spacy/training/initialize.py | 4 ++ website/docs/api/architectures.md | 56 +++++++++++++----- website/docs/api/scorer.md | 26 ++++++--- website/docs/usage/layers-architectures.md | 35 ++++++++---- website/docs/usage/processing-pipelines.md | 9 +-- website/docs/usage/training.md | 4 +- 20 files changed, 235 insertions(+), 127 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index bf1d53a7b..24a3ead22 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc1" +__version__ = "3.0.0rc2" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 1c0233539..f45097205 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -100,7 +100,7 @@ def init_labels_cli( extract the labels.""" util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) if not output_path.exists(): - output_path.mkdir() + output_path.mkdir(parents=True) overrides = parse_config_overrides(ctx.args) import_code(code_path) setup_gpu(use_gpu) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index d92de9c15..1194438de 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -136,15 +136,19 @@ factory = "textcat" {% if optimize == "accuracy" %} [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 -ngram_size = 1 +@architectures = "spacy.TextCatEnsemble.v2" nO = null +[components.textcat.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false + {% else -%} [components.textcat.model] @architectures = "spacy.TextCatBOW.v1" @@ -271,15 +275,19 @@ factory = "textcat" {% if optimize == "accuracy" %} 
[components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 -ngram_size = 1 +@architectures = "spacy.TextCatEnsemble.v2" nO = null +[components.textcat.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} + +[components.textcat.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false + {% else -%} [components.textcat.model] @architectures = "spacy.TextCatBOW.v1" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 0b27f63dc..fe1e82eb2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -44,7 +44,7 @@ def train_cli( if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) if output_path is not None and not output_path.exists(): - output_path.mkdir() + output_path.mkdir(parents=True) msg.good(f"Created output directory: {output_path}") overrides = parse_config_overrides(ctx.args) import_code(code_path) diff --git a/spacy/errors.py b/spacy/errors.py index 5fab0bab1..2898fbcaa 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -398,8 +398,8 @@ class Errors: E163 = ("cumsum was found to be unstable: its last element does not " "correspond to sum") E164 = ("x is neither increasing nor decreasing: {x}.") - E165 = ("Only one class present in y_true. ROC AUC score is not defined in " - "that case.") + E165 = ("Only one class present in the gold labels: {label}. " + "ROC AUC score is not defined in that case.") E166 = ("Can only merge DocBins with the same value for '{param}'.\n" "Current DocBin: {current}\nOther DocBin: {other}") E169 = ("Can't find module: {module}") @@ -456,6 +456,8 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E897 = ("Field '{field}' should be a dot-notation string referring to the " + "relevant section in the config, but found type {type} instead.") E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute " "is not set or None. If you've implemented a custom component, make " "sure to store the component model as `self.model` in your " diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index ec8998e2d..d4aed2839 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,4 +1,6 @@ -from typing import Optional +from typing import Optional, List + +from thinc.types import Floats2d from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum @@ -10,12 +12,13 @@ from ...util import registry from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors from ..featureextractor import FeatureExtractor +from ...tokens import Doc @registry.architectures.register("spacy.TextCatCNN.v1") def build_simple_cnn_text_classifier( tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None -) -> Model: +) -> Model[List[Doc], Floats2d]: """ Build a simple CNN text classifier, given a token-to-vector model as inputs. If exclusive_classes=True, a softmax non-linearity is applied, so that the @@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier( is applied instead, so that outputs are in the range [0, 1]. 
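    The token-to-vector outputs are mean-pooled into a single vector per
    document before the output layer is applied.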
""" with Model.define_operators({">>": chain}): + cnn = tok2vec >> list2ragged() >> reduce_mean() if exclusive_classes: output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer + model = cnn >> output_layer model.set_ref("output_layer", output_layer) else: linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = ( - tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() - ) + model = cnn >> linear_layer >> Logistic() model.set_ref("output_layer", linear_layer) model.set_ref("tok2vec", tok2vec) model.set_dim("nO", nO) @@ -45,8 +47,7 @@ def build_bow_text_classifier( ngram_size: int, no_output_layer: bool, nO: Optional[int] = None, -) -> Model: - # Don't document this yet, I'm not sure it's right. +) -> Model[List[Doc], Floats2d]: with Model.define_operators({">>": chain}): sparse_linear = SparseLinear(nO) model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear @@ -59,6 +60,39 @@ def build_bow_text_classifier( return model +@registry.architectures.register("spacy.TextCatEnsemble.v2") +def build_text_classifier( + tok2vec: Model[List[Doc], List[Floats2d]], + linear_model: Model[List[Doc], Floats2d], + nO: Optional[int] = None, +) -> Model[List[Doc], Floats2d]: + exclusive_classes = not linear_model.attrs["multi_label"] + with Model.define_operators({">>": chain, "|": concatenate}): + width = tok2vec.get_dim("nO") + cnn_model = ( + tok2vec + >> list2ragged() + >> ParametricAttention(width) # TODO: benchmark performance difference of this layer + >> reduce_sum() + >> residual(Maxout(nO=width, nI=width)) + >> Linear(nO=nO, nI=width) + >> Dropout(0.0) + ) + + nO_double = nO * 2 if nO else None + if exclusive_classes: + output_layer = Softmax(nO=nO, nI=nO_double) + else: + output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() + model = (linear_model | cnn_model) >> output_layer + model.set_ref("tok2vec", tok2vec) + if model.has_dim("nO") is not False: + model.set_dim("nO", nO) + model.set_ref("output_layer", linear_model.get_ref("output_layer")) + model.attrs["multi_label"] = not exclusive_classes + return model + +# TODO: move to legacy @registry.architectures.register("spacy.TextCatEnsemble.v1") def build_text_classifier( width: int, @@ -158,11 +192,8 @@ def build_text_classifier( @registry.architectures.register("spacy.TextCatLowData.v1") def build_text_classifier_lowdata( - width: int, - pretrained_vectors: Optional[bool], - dropout: Optional[float], - nO: Optional[int] = None, -) -> Model: + width: int, dropout: Optional[float], nO: Optional[int] = None +) -> Model[List[Doc], Floats2d]: # Don't document this yet, I'm not sure it's right. # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" with Model.define_operators({">>": chain, "**": clone}): diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 95e200927..8755d0d0d 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -106,7 +106,7 @@ def MultiHashEmbed( ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it - through a feed-forward subnetwork to build a mixed representations. + through a feed-forward subnetwork to build a mixed representation. The features used can be configured with the 'attrs' argument. The suggested attributes are NORM, PREFIX, SUFFIX and SHAPE. 
This lets the model take into diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 5ebe0e104..0781a000c 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -16,15 +16,30 @@ from ..vocab import Vocab default_model_config = """ [model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -pretrained_vectors = null +@architectures = "spacy.TextCatEnsemble.v2" + +[model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" width = 64 -conv_depth = 2 -embed_size = 2000 +rows = [2000, 2000, 1000, 1000, 1000, 1000] +attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +include_static_vectors = false + +[model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${model.tok2vec.embed.width} window_size = 1 +maxout_pieces = 3 +depth = 2 + +[model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false ngram_size = 1 -dropout = null +no_output_layer = false """ DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"] @@ -60,9 +75,11 @@ subword_features = true default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, - "cats_p": None, - "cats_r": None, - "cats_f": None, + "cats_micro_p": None, + "cats_micro_r": None, + "cats_micro_f": None, + "cats_macro_p": None, + "cats_macro_r": None, "cats_macro_f": None, "cats_macro_auc": None, "cats_f_per_type": None, diff --git a/spacy/scorer.py b/spacy/scorer.py index d1065f3a9..273bda898 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -59,7 +59,9 @@ class PRFScore: class ROCAUCScore: - """An AUC ROC score.""" + """An AUC ROC score. This is only defined for binary classification. + Use the method is_binary before calculating the score, otherwise it + may throw an error.""" def __init__(self) -> None: self.golds = [] @@ -71,16 +73,16 @@ class ROCAUCScore: self.cands.append(cand) self.golds.append(gold) + def is_binary(self): + return len(np.unique(self.golds)) == 2 + @property def score(self): + if not self.is_binary(): + raise ValueError(Errors.E165.format(label=set(self.golds))) if len(self.golds) == self.saved_score_at_len: return self.saved_score - try: - self.saved_score = _roc_auc_score(self.golds, self.cands) - # catch ValueError: Only one class present in y_true. - # ROC AUC score is not defined in that case. 
- except ValueError: - self.saved_score = -float("inf") + self.saved_score = _roc_auc_score(self.golds, self.cands) self.saved_score_at_len = len(self.golds) return self.saved_score @@ -362,9 +364,13 @@ class Scorer: for all: attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc), attr_score_desc (text description of the overall score), + attr_micro_p, + attr_micro_r, attr_micro_f, + attr_macro_p, + attr_macro_r, attr_macro_f, - attr_auc, + attr_macro_auc, attr_f_per_type, attr_auc_per_type @@ -431,7 +437,9 @@ class Scorer: macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats - macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats + # Limit macro_auc to those labels with gold annotations, + # but still divide by all cats to avoid artificial boosting of datasets with missing labels + macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats results = { f"{attr}_score": None, f"{attr}_score_desc": None, @@ -443,7 +451,7 @@ class Scorer: f"{attr}_macro_f": macro_f, f"{attr}_macro_auc": macro_auc, f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, - f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, + f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()}, } if len(labels) == 2 and not multi_label and positive_label: positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"] @@ -726,7 +734,7 @@ def _roc_auc_score(y_true, y_score): `_ """ if len(np.unique(y_true)) != 2: - raise ValueError(Errors.E165) + raise ValueError(Errors.E165.format(label=np.unique(y_true))) fpr, tpr, _ = _roc_curve(y_true, y_score) return _auc(fpr, tpr) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index cac394913..6f07c0220 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -2,6 +2,7 @@ import pytest from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German +from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.tokens import Doc from spacy.util import registry, SimpleFrozenDict, combine_score_weights from thinc.api import Model, Linear, ConfigValidationError @@ -156,15 +157,10 @@ def test_pipe_class_component_model(): name = "test_class_component_model" default_config = { "model": { - "@architectures": "spacy.TextCatEnsemble.v1", - "exclusive_classes": False, - "pretrained_vectors": None, - "width": 64, - "embed_size": 2000, - "window_size": 1, - "conv_depth": 2, - "ngram_size": 1, - "dropout": None, + "@architectures": "spacy.TextCatEnsemble.v2", + "tok2vec": DEFAULT_TOK2VEC_MODEL, + "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, + "no_output_layer": False}, }, "value1": 10, } diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 91348b1b3..06d512a32 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -140,7 +140,7 @@ def test_overfitting_IO(): nlp = English() nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"} # Set exclusive labels - config = {"model": {"exclusive_classes": True}} + config = {"model": {"linear_model": {"exclusive_classes": True}}} textcat = 
nlp.add_pipe("textcat", config=config) train_examples = [] for text, annotations in TRAIN_DATA: @@ -192,9 +192,8 @@ def test_overfitting_IO(): {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, - {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None}, - {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None}, - {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None}, + {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}, + {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}, ], diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index e8884e6b2..200d7dcfd 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate from numpy.testing import assert_array_equal import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder -from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier +from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier from spacy.ml.staticvectors import StaticVectors from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES -def get_textcat_kwargs(): +def get_textcat_bow_kwargs(): return { - "width": 64, - "embed_size": 2000, - "pretrained_vectors": None, - "exclusive_classes": False, + "exclusive_classes": True, "ngram_size": 1, - "window_size": 1, - "conv_depth": 2, - "dropout": None, - "nO": 7, + "no_output_layer": False, + "nO": 34, } def get_textcat_cnn_kwargs(): - return { - "tok2vec": test_tok2vec(), - "exclusive_classes": False, - "nO": 13, - } + return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} def get_all_params(model): @@ -105,7 +96,7 @@ def test_multi_hash_embed(): "seed,model_func,kwargs", [ (0, build_Tok2Vec_model, get_tok2vec_kwargs()), - (0, build_text_classifier, get_textcat_kwargs()), + (0, build_bow_text_classifier, get_textcat_bow_kwargs()), (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()), ], ) @@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs): "seed,model_func,kwargs,get_X", [ (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), - (0, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, 
build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), ], ) @@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X): "seed,dropout,model_func,kwargs,get_X", [ (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), - (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), ], ) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 4c1b09849..2682cd0ea 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -334,7 +334,8 @@ def test_roc_auc_score(): score = ROCAUCScore() score.score_set(0.25, 0) score.score_set(0.75, 0) - assert score.score == -float("inf") + with pytest.raises(ValueError): + s = score.score y_true = [1, 1] y_score = [0.25, 0.75] @@ -344,4 +345,5 @@ def test_roc_auc_score(): score = ROCAUCScore() score.score_set(0.25, 1) score.score_set(0.75, 1) - assert score.score == -float("inf") + with pytest.raises(ValueError): + s = score.score diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 9d82ca50a..ff2559d2a 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -51,7 +51,7 @@ def test_readers(): for example in train_corpus(nlp): nlp.update([example], sgd=optimizer) scores = nlp.evaluate(list(dev_corpus(nlp))) - assert scores["cats_score"] + assert scores["cats_score"] == 0.0 # ensure the pipeline runs doc = nlp("Quick test") assert doc.cats diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 7c84caf95..3d79eb78f 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": # Resolve all training-relevant sections using the filled nlp config T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"]] + if not isinstance(T["train_corpus"], str): + raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"]))) + if not isinstance(T["dev_corpus"], str): + raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"]))) train_corpus, dev_corpus = resolve_dot_names(config, dot_names) optimizer = T["optimizer"] # Components that shouldn't be updated during training diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 3157c261a..fe2223017 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline. Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it through -a feed-forward subnetwork to build a mixed representations. The features used +a feed-forward subnetwork to build a mixed representation. The features used can be configured with the `attrs` argument. The suggested attributes are `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some subword information, without construction a fully character-based @@ -516,26 +516,54 @@ several different built-in architectures. 
It is recommended to experiment with different architectures and settings to determine what works best on your specific data and challenge. -### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} +### spacy.TextCatEnsemble.v2 {#TextCatEnsemble} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.TextCatEnsemble.v1" -> exclusive_classes = false -> pretrained_vectors = null -> width = 64 -> embed_size = 2000 -> conv_depth = 2 -> window_size = 1 -> ngram_size = 1 -> dropout = null +> @architectures = "spacy.TextCatEnsemble.v2" > nO = null +> +> [model.linear_model] +> @architectures = "spacy.TextCatBOW.v1" +> exclusive_classes = true +> ngram_size = 1 +> no_output_layer = false +> +> [model.tok2vec] +> @architectures = "spacy.Tok2Vec.v1" +> +> [model.tok2vec.embed] +> @architectures = "spacy.MultiHashEmbed.v1" +> width = 64 +> rows = [2000, 2000, 1000, 1000, 1000, 1000] +> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +> include_static_vectors = false +> +> [model.tok2vec.encode] +> @architectures = "spacy.MaxoutWindowEncoder.v1" +> width = ${model.tok2vec.embed.width} +> window_size = 1 +> maxout_pieces = 3 +> depth = 2 > ``` -Stacked ensemble of a bag-of-words model and a neural network model. The neural -network has an internal CNN Tok2Vec layer and uses attention. +Stacked ensemble of a linear bag-of-words model and a neural network model. The +neural network is built upon a Tok2Vec layer and uses attention. The setting for +whether or not this model should cater for multi-label classification, is taken +from the linear model, where it is stored in `model.attrs["multi_label"]`. + +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~ | +| `tok2vec` | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + + + +The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as argument. | Name | Description | | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + + ### spacy.TextCatCNN.v1 {#TextCatCNN} > #### Example Config diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 0dbc0de33..388d92801 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -174,15 +174,25 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses. 
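The `score_cats` changes below add explicit micro- and macro-averaged PRF
keys. As a quick reference for how the two averages differ, here is a small
sketch mirroring the averaging logic added to `Scorer.score_cats` in this
patch — the per-label counts are invented, and the `prf` helper is a
simplified stand-in for spaCy's `PRFScore`:

```python
# Micro vs. macro averaging over per-label PRF counts (illustrative numbers).
per_type = {
    "POSITIVE": {"tp": 90, "fp": 10, "fn": 10},
    "NEGATIVE": {"tp": 10, "fp": 30, "fn": 30},
}

def prf(tp, fp, fn):
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f

# Micro: pool the counts across all labels, then compute PRF once.
tp = sum(c["tp"] for c in per_type.values())
fp = sum(c["fp"] for c in per_type.values())
fn = sum(c["fn"] for c in per_type.values())
micro_p, micro_r, micro_f = prf(tp, fp, fn)  # all 0.714: tp=100, fp=fn=40

# Macro: compute PRF per label, then average the resulting scores.
scores = [prf(**c) for c in per_type.values()]
macro_p = sum(p for p, _, _ in scores) / len(scores)  # (0.90 + 0.25) / 2 = 0.575
```

Note that the macro AUC in this patch is handled slightly differently: labels
without both classes in the gold annotations contribute 0.0 to the sum but
still count in the denominator, to avoid artificially boosting datasets with
missing labels.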
## Scorer.score_cats {#score_cats tag="staticmethod" new="3"} Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict -containing scores for each label like `Doc.cats`. The reported overall score -depends on the scorer settings: +containing scores for each label like `Doc.cats`. The returned dictionary +contains the following scores: -1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` / - `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall - score), `{attr}_f_per_type`, `{attr}_auc_per_type` -2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f` -3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`; -4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc` +- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across + each label is weighted equally +- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values + across evaluations per label +- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of + scores, keyed by label +- A final `{attr}_score` and corresponding `{attr}_score_desc` (text + description) + +The reported `{attr}_score` depends on the classification properties: + +- **binary exclusive with positive label:** `{attr}_score` is set to the F-score + of the positive label +- **3+ exclusive classes**, macro-averaged F-score: + `{attr}_score = {attr}_macro_f` +- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc` > #### Example > diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index d7b2593e7..aa62a77d4 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -130,16 +130,31 @@ factory = "textcat" labels = [] [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -pretrained_vectors = null -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 -ngram_size = 1 -dropout = 0 +@architectures = "spacy.TextCatEnsemble.v2" nO = null + +[components.textcat.model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[components.textcat.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 64 +rows = [2000, 2000, 1000, 1000, 1000, 1000] +attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +include_static_vectors = false + +[components.textcat.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${components.textcat.model.tok2vec.embed.width} +window_size = 1 +maxout_pieces = 3 +depth = 2 + +[components.textcat.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false ``` spaCy has two additional built-in `textcat` architectures, and you can easily @@ -687,7 +702,7 @@ Before the model can be used, it needs to be [initialized](/usage/training#initialization). This function receives a callback to access the full **training data set**, or a representative sample. This data set can be used to deduce all **relevant labels**. Alternatively, a list of -labels can be provided to `initialize`, or you can call +labels can be provided to `initialize`, or you can call `RelationExtractor.add_label` directly. 
The number of labels defines the output dimensionality of the network, and will be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index a0cf36909..ef44009ae 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1244,15 +1244,10 @@ labels = [] # This function is created and then passed to the "textcat" component as # the argument "model" [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" +@architectures = "spacy.TextCatBOW.v1" exclusive_classes = false -pretrained_vectors = null -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 ngram_size = 1 -dropout = null +no_output_layer = false [components.other_textcat] factory = "textcat" diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 5a42d2172..274ea5989 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -717,7 +717,7 @@ tabular results to a file: ```python ### functions.py import sys -from typing import IO, Tuple, Callable, Dict, Any +from typing import IO, Tuple, Callable, Dict, Any, Optional import spacy from spacy import Language from pathlib import Path @@ -729,7 +729,7 @@ def custom_logger(log_path): stdout: IO=sys.stdout, stderr: IO=sys.stderr ) -> Tuple[Callable, Callable]: - stdout.write(f"Logging to {log_path}\n") + stdout.write(f"Logging to {log_path}\\n") log_file = Path(log_path).open("w", encoding="utf8") log_file.write("step\\t") log_file.write("score\\t") From db24dc56145ff04dc6b4c4bc3e39e316b847ef17 Mon Sep 17 00:00:00 2001 From: walterhenry <55140654+walterhenry@users.noreply.github.com> Date: Mon, 19 Oct 2020 11:11:32 +0200 Subject: [PATCH 02/16] Proofread remarks I think these may the last remarks for the nightly docs. Only two minor things actually. --- website/docs/usage/101/_vectors-similarity.md | 2 +- website/docs/usage/visualizers.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index cf5b70af2..c04198cb5 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -115,7 +115,7 @@ print(french_fries, "<->", burgers, french_fries.similarity(burgers)) Computing similarity scores can be helpful in many situations, but it's also important to maintain **realistic expectations** about what information it can -provide. Words can be related to each over in many ways, so a single +provide. Words can be related to each other in many ways, so a single "similarity" score will always be a **mix of different signals**, and vectors trained on different data can produce very different results that may not be useful for your purpose. Here are some important considerations to keep in mind: diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 73b2d072d..cc73e7e67 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -257,7 +257,7 @@ output_path.open("w", encoding="utf-8").write(svg) Since each visualization is generated as a separate SVG, exporting `.svg` files only works if you're rendering **one single doc** at a time. (This makes sense – after all, each visualization should be a standalone graphic.) So instead of -rendering all `Doc`s at one, loop over them and export them separately. 
+rendering all `Doc`s at once, loop over them and export them separately. From 4300858ecb5aa9941c309dcb11b0a1c13a98c304 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 19 Oct 2020 12:07:46 +0200 Subject: [PATCH 03/16] Include per-type/feat scores in evaluate output --- spacy/cli/evaluate.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 566820283..165bbd399 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -105,15 +105,22 @@ def evaluate( msg.table(results, title="Results") + if "morph_per_feat" in scores: + if scores["morph_per_feat"]: + print_morph_per_feat(msg, scores["morph_per_feat"]) + data["morph_per_feat"] = scores["morph_per_feat"] if "ents_per_type" in scores: if scores["ents_per_type"]: print_ents_per_type(msg, scores["ents_per_type"]) + data["ents_per_type"] = scores["ents_per_type"] if "cats_f_per_type" in scores: if scores["cats_f_per_type"]: print_textcats_f_per_cat(msg, scores["cats_f_per_type"]) + data["cats_f_per_type"] = scores["cats_f_per_type"] if "cats_auc_per_type" in scores: if scores["cats_auc_per_type"]: print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) + data["cats_auc_per_type"] = scores["cats_auc_per_type"] if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] @@ -157,6 +164,19 @@ def render_parses( file_.write(html) +def print_morph_per_feat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: + data = [ + (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") + for k, v in scores.items() + ] + msg.table( + data, + header=("", "P", "R", "F"), + aligns=("l", "r", "r", "r"), + title="MORPH (per feat)", + ) + + def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: data = [ (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") From dd207ca6d014d0b7418e334c6f523eab70e0570e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 19 Oct 2020 13:18:47 +0200 Subject: [PATCH 04/16] Add dep_las_per_type and more generic PRF printer --- spacy/cli/evaluate.py | 40 +++++++++------------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 165bbd399..999c68be3 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -107,15 +107,19 @@ def evaluate( if "morph_per_feat" in scores: if scores["morph_per_feat"]: - print_morph_per_feat(msg, scores["morph_per_feat"]) + print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat") data["morph_per_feat"] = scores["morph_per_feat"] + if "dep_las_per_type" in scores: + if scores["dep_las_per_type"]: + print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type") + data["dep_las_per_type"] = scores["dep_las_per_type"] if "ents_per_type" in scores: if scores["ents_per_type"]: - print_ents_per_type(msg, scores["ents_per_type"]) + print_prf_per_type(msg, scores["ents_per_type"], "NER", "type") data["ents_per_type"] = scores["ents_per_type"] if "cats_f_per_type" in scores: if scores["cats_f_per_type"]: - print_textcats_f_per_cat(msg, scores["cats_f_per_type"]) + print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label") data["cats_f_per_type"] = scores["cats_f_per_type"] if "cats_auc_per_type" in scores: if scores["cats_auc_per_type"]: @@ -164,7 +168,7 @@ def render_parses( file_.write(html) -def print_morph_per_feat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: +def print_prf_per_type(msg: Printer, scores: 
Dict[str, Dict[str, float]], name: str, type: str) -> None: data = [ (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") for k, v in scores.items() @@ -173,33 +177,7 @@ def print_morph_per_feat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> N data, header=("", "P", "R", "F"), aligns=("l", "r", "r", "r"), - title="MORPH (per feat)", - ) - - -def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: - data = [ - (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") - for k, v in scores.items() - ] - msg.table( - data, - header=("", "P", "R", "F"), - aligns=("l", "r", "r", "r"), - title="NER (per type)", - ) - - -def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: - data = [ - (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") - for k, v in scores.items() - ] - msg.table( - data, - header=("", "P", "R", "F"), - aligns=("l", "r", "r", "r"), - title="Textcat F (per label)", + title=f"{name} (per {type})", ) From 563a21834e111728cb8804424d173eb14d90ef85 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 19 Oct 2020 15:03:19 +0200 Subject: [PATCH 05/16] Save raw scores in evaluate output --- spacy/cli/evaluate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 999c68be3..018f81bd0 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -93,6 +93,7 @@ def evaluate( "SPEED": "speed", } results = {} + data = {} for metric, key in metrics.items(): if key in scores: if key == "cats_score": @@ -101,7 +102,7 @@ def evaluate( results[metric] = f"{scores[key]:.0f}" else: results[metric] = f"{scores[key]*100:.2f}" - data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} + data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] msg.table(results, title="Results") From fbe65b257bf084c428322ec28919f37fe7c2c951 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 19 Oct 2020 18:55:55 +0200 Subject: [PATCH 06/16] Convert accuracy numbers on website models page --- website/src/templates/models.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index b9658dacd..17140b072 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -120,7 +120,7 @@ function formatAccuracy(data) { ? 
null : { label, - value: value.toFixed(2), + value: (value * 100).toFixed(2), help: MODEL_META[label], } }) From 56077e7e64090933f94d2e38b74d65eb499001c4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 19 Oct 2020 18:58:15 +0200 Subject: [PATCH 07/16] Add dependency for jinja2 --- requirements.txt | 1 + setup.cfg | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 36f0d1e92..aec511b9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.5.0,<2.0.0 pytokenizations +jinja2>=2.1.0,<3.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index adf0c0e20..6def9adef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,6 +53,7 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.5.0,<2.0.0 pytokenizations + jinja2>=2.1.0,<3.0.0 # Official Python utilities setuptools packaging>=20.0 From 36292967574af48b7b4c8a55a80d10cf27310d48 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 19 Oct 2020 19:04:42 +0200 Subject: [PATCH 08/16] Fix requirements, remove version pins --- requirements.txt | 1 - setup.cfg | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index aec511b9f..36f0d1e92 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,6 @@ requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.5.0,<2.0.0 pytokenizations -jinja2>=2.1.0,<3.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index 6def9adef..e42bb9c57 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,7 +53,7 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.5.0,<2.0.0 pytokenizations - jinja2>=2.1.0,<3.0.0 + jinja2 # Official Python utilities setuptools packaging>=20.0 From 2c9804038da8bf9d6b8032c31ef5cfff4e901718 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 23 Oct 2020 16:11:54 +0200 Subject: [PATCH 09/16] Fix success message [ci skip] --- spacy/cli/init_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index f45097205..da474795e 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -39,7 +39,7 @@ def init_vectors_cli( nlp.to_disk(output_dir) msg.good( "Saved nlp object with vectors to output directory. 
You can now use the " - "path to it in your config as the 'vectors' setting in [initialize.vocab].", + "path to it in your config as the 'vectors' setting in [initialize].", output_dir.resolve(), ) From ace6ae435b0f1dc95f489099252b097929c9b78f Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 26 Oct 2020 23:31:08 +0100 Subject: [PATCH 10/16] set pydantic upper pin to 1.7 for now (#6308) --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 36f0d1e92..c5e136a34 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ pathy numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.5.0,<2.0.0 +pydantic>=1.5.0,<1.7.0 pytokenizations # Official Python utilities setuptools diff --git a/setup.cfg b/setup.cfg index e42bb9c57..762a7e888 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ install_requires = tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 - pydantic>=1.5.0,<2.0.0 + pydantic>=1.5.0,<1.7.0 pytokenizations jinja2 # Official Python utilities From dc816bba9d564ae572af28a17cbf0580ba11db5e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 28 Oct 2020 16:32:46 +0100 Subject: [PATCH 11/16] Fix node name typo in dependency matcher example (#6311) --- website/docs/usage/rule-based-matching.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 131bd8c94..44d0fd388 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1142,7 +1142,7 @@ pattern = [ { "LEFT_ID": "anchor_founded", "REL_OP": ">", - "RIGHT_ID": "subject", + "RIGHT_ID": "founded_subject", "RIGHT_ATTRS": {"DEP": "nsubj"}, } # ... @@ -1212,7 +1212,7 @@ pattern = [ { "LEFT_ID": "anchor_founded", "REL_OP": ">", - "RIGHT_ID": "subject", + "RIGHT_ID": "founded_subject", "RIGHT_ATTRS": {"DEP": "nsubj"}, }, { From 2918923541ea1a16b18501bfce77300c15b72332 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 31 Oct 2020 12:17:06 +0100 Subject: [PATCH 12/16] fix resolving of dot notation (#6326) --- spacy/training/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index b91fb07a8..e5c41c70b 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model from ..ml.models.multi_task import build_cloze_characters_multi_task_model from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain from ..errors import Errors -from ..util import registry, load_model_from_config, dot_to_object +from ..util import registry, load_model_from_config, resolve_dot_names def pretrain( @@ -38,7 +38,7 @@ def pretrain( _config = nlp.config.interpolate() T = registry.resolve(_config["training"], schema=ConfigSchemaTraining) P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) - corpus = dot_to_object(T, P["corpus"]) + corpus = resolve_dot_names(_config, [P["corpus"]])[0] batcher = P["batcher"] model = create_pretraining_model(nlp, P) optimizer = P["optimizer"] From 5d2cb86c34725e1335f87d1b2168a03f08a6464d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sat, 31 Oct 2020 12:20:27 +0100 Subject: [PATCH 13/16] Fix on_match callback for DependencyMatcher (#6313) Fix `DependencyMatcher` so that the callback is called only once per match. 
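The bug was an indentation issue: the loop that invoked `on_match` was nested
inside the loop over pattern keys, so a callback could fire once per registered
key rather than once per match. A minimal standalone sketch of the fixed
behaviour, adapted from the regression test added below — the toy sentence and
its parse are invented for illustration:

```python
from unittest.mock import Mock

import spacy
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc

nlp = spacy.blank("en")
# A hand-built parsed Doc: "The", "quick" and "brown" all attach to "fox".
doc = Doc(
    nlp.vocab,
    words=["The", "quick", "brown", "fox"],
    heads=[3, 3, 3, 3],
    deps=["det", "amod", "amod", "ROOT"],
)
pattern = [{"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}}]
nomatch_pattern = [{"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}}]

matcher = DependencyMatcher(nlp.vocab)
mock = Mock()
matcher.add("pattern", [pattern], on_match=mock)
matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock)
matches = matcher(doc)
assert len(matches) == 1
# With the fix, the callback fires exactly once, for the single real match:
mock.assert_called_once_with(matcher, doc, 0, matches)
```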
--- spacy/matcher/dependencymatcher.pyx | 8 ++++---- spacy/tests/matcher/test_dependency_matcher.py | 5 +++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 067b2167c..02f7c9318 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -286,10 +286,10 @@ cdef class DependencyMatcher: self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees) for matched_tree in matched_trees: matched_key_trees.append((key, matched_tree)) - for i, (match_id, nodes) in enumerate(matched_key_trees): - on_match = self._callbacks.get(match_id) - if on_match is not None: - on_match(self, doc, i, matched_key_trees) + for i, (match_id, nodes) in enumerate(matched_key_trees): + on_match = self._callbacks.get(match_id) + if on_match is not None: + on_match(self, doc, i, matched_key_trees) return matched_key_trees def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees): diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index e18a8f6d8..481187348 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -218,11 +218,16 @@ def test_dependency_matcher_callback(en_vocab, doc): pattern = [ {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}}, ] + nomatch_pattern = [ + {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}}, + ] matcher = DependencyMatcher(en_vocab) mock = Mock() matcher.add("pattern", [pattern], on_match=mock) + matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock) matches = matcher(doc) + assert len(matches) == 1 mock.assert_called_once_with(matcher, doc, 0, matches) # check that matches with and without callback are the same (#4590) From a4b32b955200c48a17bdca46fbf36402603b2f49 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 3 Nov 2020 15:47:18 +0100 Subject: [PATCH 14/16] Handle missing reference values in scorer (#6286) * Handle missing reference values in scorer Handle missing values in reference doc during scoring where it is possible to detect an unset state for the attribute. If no reference docs contain annotation, `None` is returned instead of a score. `spacy evaluate` displays `-` for missing scores and the missing scores are saved as `None`/`null` in the metrics. Attributes without unset states: * `token.head`: relies on `token.dep` to recognize unset values * `doc.cats`: unable to handle missing annotation Additional changes: * add optional `has_annotation` check to `score_scans` to replace `doc.sents` hack * update `score_token_attr_per_feat` to handle missing and empty morph representations * fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START` vs. 
`SENT_START` * Fix import * Update return types --- spacy/cli/evaluate.py | 9 +- spacy/pipeline/attributeruler.py | 6 +- spacy/pipeline/dep_parser.pyx | 5 +- spacy/pipeline/entityruler.py | 4 +- spacy/pipeline/morphologizer.pyx | 7 +- spacy/pipeline/ner.pyx | 11 +- spacy/pipeline/sentencizer.pyx | 5 +- spacy/pipeline/senter.pyx | 5 +- spacy/scorer.py | 257 ++++++++++++-------- spacy/tests/pipeline/test_attributeruler.py | 4 +- spacy/tests/test_scorer.py | 56 +++++ spacy/tokens/doc.pyx | 5 +- website/docs/api/doc.md | 1 + website/docs/api/scorer.md | 77 +++--- 14 files changed, 294 insertions(+), 158 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 018f81bd0..a0ea9fbc9 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -98,10 +98,13 @@ def evaluate( if key in scores: if key == "cats_score": metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" - if key == "speed": - results[metric] = f"{scores[key]:.0f}" + if isinstance(scores[key], (int, float)): + if key == "speed": + results[metric] = f"{scores[key]:.0f}" + else: + results[metric] = f"{scores[key]*100:.2f}" else: - results[metric] = f"{scores[key]*100:.2f}" + results[metric] = "-" data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] msg.table(results, title="Results") diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index e17d3be98..68e26c4be 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -226,6 +226,9 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/tagger#score """ + def morph_key_getter(token, attr): + return getattr(token, attr).key + validate_examples(examples, "AttributeRuler.score") results = {} attrs = set() @@ -237,7 +240,8 @@ class AttributeRuler(Pipe): elif attr == POS: results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) elif attr == MORPH: - results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) + results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs)) elif attr == LEMMA: results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) return results diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index bdef332cc..a9dcd705e 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -155,13 +155,16 @@ cdef class DependencyParser(Parser): DOCS: https://nightly.spacy.io/api/dependencyparser#score """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + validate_examples(examples, "DependencyParser.score") def dep_getter(token, attr): dep = getattr(token, attr) dep = token.vocab.strings.as_string(dep).lower() return dep results = {} - results.update(Scorer.score_spans(examples, "sents", **kwargs)) + results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) kwargs.setdefault("getter", dep_getter) kwargs.setdefault("ignore_labels", ("p", "punct")) results.update(Scorer.score_deps(examples, "dep", **kwargs)) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 382ca338d..2a3b8dd00 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -10,7 +10,7 @@ from ..errors import Errors from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher -from ..scorer import Scorer +from ..scorer 
import get_ner_prf from ..training import validate_examples @@ -340,7 +340,7 @@ class EntityRuler(Pipe): def score(self, examples, **kwargs): validate_examples(examples, "EntityRuler.score") - return Scorer.score_spans(examples, "ents", **kwargs) + return get_ner_prf(examples) def from_bytes( self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index ac111f28b..a03c7daf0 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -251,10 +251,13 @@ class Morphologizer(Tagger): DOCS: https://nightly.spacy.io/api/morphologizer#score """ + def morph_key_getter(token, attr): + return getattr(token, attr).key + validate_examples(examples, "Morphologizer.score") results = {} results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) results.update(Scorer.score_token_attr_per_feat(examples, - "morph", **kwargs)) + "morph", getter=morph_key_getter, **kwargs)) return results diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 6482d6125..0f93b43ac 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser): DOCS: https://nightly.spacy.io/api/entityrecognizer#score """ validate_examples(examples, "EntityRecognizer.score") - score_per_type = get_ner_prf(examples) - totals = PRFScore() - for prf in score_per_type.values(): - totals += prf - return { - "ents_p": totals.precision, - "ents_r": totals.recall, - "ents_f": totals.fscore, - "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, - } + return get_ner_prf(examples) diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 7656b330c..6e8b1c324 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -155,8 +155,11 @@ class Sentencizer(Pipe): DOCS: https://nightly.spacy.io/api/sentencizer#score """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + validate_examples(examples, "Sentencizer.score") - results = Scorer.score_spans(examples, "sents", **kwargs) + results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) del results["sents_per_type"] return results diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 15a21902a..ad777ea58 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -160,7 +160,10 @@ class SentenceRecognizer(Tagger): RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. 
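        Note that the "sents_per_type" entry is removed from the returned
        scores, since sentence spans carry no label types.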
DOCS: https://nightly.spacy.io/api/sentencerecognizer#score """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + validate_examples(examples, "SentenceRecognizer.score") - results = Scorer.score_spans(examples, "sents", **kwargs) + results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) del results["sents_per_type"] return results diff --git a/spacy/scorer.py b/spacy/scorer.py index 273bda898..fe64c23ad 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,9 +1,9 @@ -from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING +from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING import numpy as np from collections import defaultdict from .training import Example -from .tokens import Token, Doc, Span +from .tokens import Token, Doc, Span, MorphAnalysis from .errors import Errors from .util import get_lang_class, SimpleFrozenList from .morphology import Morphology @@ -13,7 +13,8 @@ if TYPE_CHECKING: from .language import Language # noqa: F401 -DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"] +DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat") +MISSING_VALUES = frozenset([None, 0, ""]) class PRFScore: @@ -24,6 +25,9 @@ class PRFScore: self.fp = 0 self.fn = 0 + def __len__(self) -> int: + return self.tp + self.fp + self.fn + def __iadd__(self, other): self.tp += other.tp self.fp += other.fp @@ -94,7 +98,7 @@ class Scorer: self, nlp: Optional["Language"] = None, default_lang: str = "xx", - default_pipeline=DEFAULT_PIPELINE, + default_pipeline: Iterable[str] = DEFAULT_PIPELINE, **cfg, ) -> None: """Initialize the Scorer. @@ -126,13 +130,13 @@ class Scorer: return scores @staticmethod - def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]: + def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]: """Returns accuracy and PRF scores for tokenization. * token_acc: # correct tokens / # gold tokens * token_p/r/f: PRF for token character spans examples (Iterable[Example]): Examples to score - RETURNS (Dict[str, float]): A dictionary containing the scores + RETURNS (Dict[str, Any]): A dictionary containing the scores token_acc/p/r/f. DOCS: https://nightly.spacy.io/api/scorer#score_tokenization @@ -142,6 +146,8 @@ class Scorer: for example in examples: gold_doc = example.reference pred_doc = example.predicted + if gold_doc.has_unknown_spaces: + continue align = example.alignment gold_spans = set() pred_spans = set() @@ -158,12 +164,20 @@ class Scorer: else: acc_score.tp += 1 prf_score.score_set(pred_spans, gold_spans) - return { - "token_acc": acc_score.fscore, - "token_p": prf_score.precision, - "token_r": prf_score.recall, - "token_f": prf_score.fscore, - } + if len(acc_score) > 0: + return { + "token_acc": acc_score.fscore, + "token_p": prf_score.precision, + "token_r": prf_score.recall, + "token_f": prf_score.fscore, + } + else: + return { + "token_acc": None, + "token_p": None, + "token_r": None, + "token_f": None + } @staticmethod def score_token_attr( @@ -171,8 +185,9 @@ class Scorer: attr: str, *, getter: Callable[[Token, str], Any] = getattr, + missing_values: Set[Any] = MISSING_VALUES, **cfg, - ) -> Dict[str, float]: + ) -> Dict[str, Any]: """Returns an accuracy score for a token-level attribute. examples (Iterable[Example]): Examples to score @@ -180,7 +195,7 @@ class Scorer: getter (Callable[[Token, str], Any]): Defaults to getattr. 
If provided, getter(token, attr) should return the value of the attribute for an individual token. - RETURNS (Dict[str, float]): A dictionary containing the accuracy score + RETURNS (Dict[str, Any]): A dictionary containing the accuracy score under the key attr_acc. DOCS: https://nightly.spacy.io/api/scorer#score_token_attr @@ -191,17 +206,27 @@ class Scorer: pred_doc = example.predicted align = example.alignment gold_tags = set() + missing_indices = set() for gold_i, token in enumerate(gold_doc): - gold_tags.add((gold_i, getter(token, attr))) + value = getter(token, attr) + if value not in missing_values: + gold_tags.add((gold_i, getter(token, attr))) + else: + missing_indices.add(gold_i) pred_tags = set() for token in pred_doc: if token.orth_.isspace(): continue if align.x2y.lengths[token.i] == 1: gold_i = align.x2y[token.i].dataXd[0, 0] - pred_tags.add((gold_i, getter(token, attr))) + if gold_i not in missing_indices: + pred_tags.add((gold_i, getter(token, attr))) tag_score.score_set(pred_tags, gold_tags) - return {f"{attr}_acc": tag_score.fscore} + score_key = f"{attr}_acc" + if len(tag_score) == 0: + return {score_key: None} + else: + return {score_key: tag_score.fscore} @staticmethod def score_token_attr_per_feat( @@ -209,8 +234,9 @@ class Scorer: attr: str, *, getter: Callable[[Token, str], Any] = getattr, + missing_values: Set[Any] = MISSING_VALUES, **cfg, - ): + ) -> Dict[str, Any]: """Return PRF scores per feat for a token attribute in UFEATS format. examples (Iterable[Example]): Examples to score @@ -218,7 +244,7 @@ class Scorer: getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. - RETURNS (dict): A dictionary containing the per-feat PRF scores unders + RETURNS (dict): A dictionary containing the per-feat PRF scores under the key attr_per_feat. 
""" per_feat = {} @@ -227,9 +253,11 @@ class Scorer: gold_doc = example.reference align = example.alignment gold_per_feat = {} + missing_indices = set() for gold_i, token in enumerate(gold_doc): - morph = str(getter(token, attr)) - if morph: + value = getter(token, attr) + morph = gold_doc.vocab.strings[value] + if value not in missing_values and morph != Morphology.EMPTY_MORPH: for feat in morph.split(Morphology.FEATURE_SEP): field, values = feat.split(Morphology.FIELD_SEP) if field not in per_feat: @@ -237,27 +265,35 @@ class Scorer: if field not in gold_per_feat: gold_per_feat[field] = set() gold_per_feat[field].add((gold_i, feat)) + else: + missing_indices.add(gold_i) pred_per_feat = {} for token in pred_doc: if token.orth_.isspace(): continue if align.x2y.lengths[token.i] == 1: gold_i = align.x2y[token.i].dataXd[0, 0] - morph = str(getter(token, attr)) - if morph: - for feat in morph.split("|"): - field, values = feat.split("=") - if field not in per_feat: - per_feat[field] = PRFScore() - if field not in pred_per_feat: - pred_per_feat[field] = set() - pred_per_feat[field].add((gold_i, feat)) + if gold_i not in missing_indices: + value = getter(token, attr) + morph = gold_doc.vocab.strings[value] + if value not in missing_values and morph != Morphology.EMPTY_MORPH: + for feat in morph.split(Morphology.FEATURE_SEP): + field, values = feat.split(Morphology.FIELD_SEP) + if field not in per_feat: + per_feat[field] = PRFScore() + if field not in pred_per_feat: + pred_per_feat[field] = set() + pred_per_feat[field].add((gold_i, feat)) for field in per_feat: per_feat[field].score_set( pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) - result = {k: v.to_dict() for k, v in per_feat.items()} - return {f"{attr}_per_feat": result} + score_key = f"{attr}_per_feat" + if any([len(v) for v in per_feat.values()]): + result = {k: v.to_dict() for k, v in per_feat.items()} + return {score_key: result} + else: + return {score_key: None} @staticmethod def score_spans( @@ -265,6 +301,7 @@ class Scorer: attr: str, *, getter: Callable[[Doc, str], Iterable[Span]] = getattr, + has_annotation: Optional[Callable[[Doc], bool]] = None, **cfg, ) -> Dict[str, Any]: """Returns PRF scores for labeled spans. @@ -284,18 +321,10 @@ class Scorer: for example in examples: pred_doc = example.predicted gold_doc = example.reference - # TODO - # This is a temporary hack to work around the problem that the scorer - # fails if you have examples that are not fully annotated for all - # the tasks in your pipeline. For instance, you might have a corpus - # of NER annotations that does not set sentence boundaries, but the - # pipeline includes a parser or senter, and then the score_weights - # are used to evaluate that component. When the scorer attempts - # to read the sentences from the gold document, it fails. 
- try: - list(getter(gold_doc, attr)) - except ValueError: - continue + # Option to handle docs without sents + if has_annotation is not None: + if not has_annotation(gold_doc): + continue # Find all labels in gold and doc labels = set( [k.label_ for k in getter(gold_doc, attr)] @@ -323,13 +352,21 @@ class Scorer: v.score_set(pred_per_type[k], gold_per_type[k]) # Score for all labels score.score_set(pred_spans, gold_spans) - results = { - f"{attr}_p": score.precision, - f"{attr}_r": score.recall, - f"{attr}_f": score.fscore, - f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, - } - return results + if len(score) > 0: + return { + f"{attr}_p": score.precision, + f"{attr}_r": score.recall, + f"{attr}_f": score.fscore, + f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, + } + else: + return { + f"{attr}_p": None, + f"{attr}_r": None, + f"{attr}_f": None, + f"{attr}_per_type": None, + } + @staticmethod def score_cats( @@ -390,9 +427,6 @@ class Scorer: pred_cats = getter(example.predicted, attr) gold_cats = getter(example.reference, attr) - # I think the AUC metric is applicable regardless of whether we're - # doing multi-label classification? Unsure. If not, move this into - # the elif pred_cats and gold_cats block below. for label in labels: pred_score = pred_cats.get(label, 0.0) gold_score = gold_cats.get(label, 0.0) @@ -542,6 +576,7 @@ class Scorer: head_attr: str = "head", head_getter: Callable[[Token, str], Token] = getattr, ignore_labels: Iterable[str] = SimpleFrozenList(), + missing_values: Set[Any] = MISSING_VALUES, **cfg, ) -> Dict[str, Any]: """Returns the UAS, LAS, and LAS per type scores for dependency @@ -566,6 +601,7 @@ class Scorer: unlabelled = PRFScore() labelled = PRFScore() labelled_per_dep = dict() + missing_indices = set() for example in examples: gold_doc = example.reference pred_doc = example.predicted @@ -575,13 +611,16 @@ class Scorer: for gold_i, token in enumerate(gold_doc): dep = getter(token, attr) head = head_getter(token, head_attr) - if dep not in ignore_labels: - gold_deps.add((gold_i, head.i, dep)) - if dep not in labelled_per_dep: - labelled_per_dep[dep] = PRFScore() - if dep not in gold_deps_per_dep: - gold_deps_per_dep[dep] = set() - gold_deps_per_dep[dep].add((gold_i, head.i, dep)) + if dep not in missing_values: + if dep not in ignore_labels: + gold_deps.add((gold_i, head.i, dep)) + if dep not in labelled_per_dep: + labelled_per_dep[dep] = PRFScore() + if dep not in gold_deps_per_dep: + gold_deps_per_dep[dep] = set() + gold_deps_per_dep[dep].add((gold_i, head.i, dep)) + else: + missing_indices.add(gold_i) pred_deps = set() pred_deps_per_dep = {} for token in pred_doc: @@ -591,25 +630,26 @@ class Scorer: gold_i = None else: gold_i = align.x2y[token.i].dataXd[0, 0] - dep = getter(token, attr) - head = head_getter(token, head_attr) - if dep not in ignore_labels and token.orth_.strip(): - if align.x2y.lengths[head.i] == 1: - gold_head = align.x2y[head.i].dataXd[0, 0] - else: - gold_head = None - # None is indistinct, so we can't just add it to the set - # Multiple (None, None) deps are possible - if gold_i is None or gold_head is None: - unlabelled.fp += 1 - labelled.fp += 1 - else: - pred_deps.add((gold_i, gold_head, dep)) - if dep not in labelled_per_dep: - labelled_per_dep[dep] = PRFScore() - if dep not in pred_deps_per_dep: - pred_deps_per_dep[dep] = set() - pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) + if gold_i not in missing_indices: + dep = getter(token, attr) + head = head_getter(token, 
head_attr) + if dep not in ignore_labels and token.orth_.strip(): + if align.x2y.lengths[head.i] == 1: + gold_head = align.x2y[head.i].dataXd[0, 0] + else: + gold_head = None + # None is indistinct, so we can't just add it to the set + # Multiple (None, None) deps are possible + if gold_i is None or gold_head is None: + unlabelled.fp += 1 + labelled.fp += 1 + else: + pred_deps.add((gold_i, gold_head, dep)) + if dep not in labelled_per_dep: + labelled_per_dep[dep] = PRFScore() + if dep not in pred_deps_per_dep: + pred_deps_per_dep[dep] = set() + pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) labelled.score_set(pred_deps, gold_deps) for dep in labelled_per_dep: labelled_per_dep[dep].score_set( @@ -618,29 +658,34 @@ class Scorer: unlabelled.score_set( set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps) ) - return { - f"{attr}_uas": unlabelled.fscore, - f"{attr}_las": labelled.fscore, - f"{attr}_las_per_type": { - k: v.to_dict() for k, v in labelled_per_dep.items() - }, - } + if len(unlabelled) > 0: + return { + f"{attr}_uas": unlabelled.fscore, + f"{attr}_las": labelled.fscore, + f"{attr}_las_per_type": { + k: v.to_dict() for k, v in labelled_per_dep.items() + }, + } + else: + return { + f"{attr}_uas": None, + f"{attr}_las": None, + f"{attr}_las_per_type": None, + } -def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: - """Compute per-entity PRFScore objects for a sequence of examples. The - results are returned as a dictionary keyed by the entity type. You can - add the PRFScore objects to get micro-averaged total. +def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: + """Compute micro-PRF and per-entity PRF scores for a sequence of examples. """ - scores = defaultdict(PRFScore) + score_per_type = defaultdict(PRFScore) for eg in examples: if not eg.y.has_annotation("ENT_IOB"): continue golds = {(e.label_, e.start, e.end) for e in eg.y.ents} align_x2y = eg.alignment.x2y for pred_ent in eg.x.ents: - if pred_ent.label_ not in scores: - scores[pred_ent.label_] = PRFScore() + if pred_ent.label_ not in score_per_type: + score_per_type[pred_ent.label_] = PRFScore() indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel() if len(indices): g_span = eg.y[indices[0] : indices[-1] + 1] @@ -650,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: if all(token.ent_iob != 0 for token in g_span): key = (pred_ent.label_, indices[0], indices[-1] + 1) if key in golds: - scores[pred_ent.label_].tp += 1 + score_per_type[pred_ent.label_].tp += 1 golds.remove(key) else: - scores[pred_ent.label_].fp += 1 + score_per_type[pred_ent.label_].fp += 1 for label, start, end in golds: - scores[label].fn += 1 - return scores + score_per_type[label].fn += 1 + totals = PRFScore() + for prf in score_per_type.values(): + totals += prf + if len(totals) > 0: + return { + "ents_p": totals.precision, + "ents_r": totals.recall, + "ents_f": totals.fscore, + "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, + } + else: + return { + "ents_p": None, + "ents_r": None, + "ents_f": None, + "ents_per_type": None, + } ############################################################################# diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 6c66469cc..02726172b 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts): scores = 
nlp.evaluate(dev_examples) # "cat" is the only correct lemma assert scores["lemma_acc"] == pytest.approx(0.2) - # the empty morphs are correct - assert scores["morph_acc"] == pytest.approx(0.6) + # no morphs are set + assert scores["morph_acc"] == None def test_attributeruler_rule_order(nlp): diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 2682cd0ea..56b276f0b 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -277,6 +277,62 @@ def test_tag_score(tagged_doc): assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) +def test_partial_annotation(en_tokenizer): + pred_doc = en_tokenizer("a b c d e") + pred_doc[0].tag_ = "A" + pred_doc[0].pos_ = "X" + pred_doc[0].set_morph("Feat=Val") + pred_doc[0].dep_ = "dep" + + # unannotated reference + ref_doc = en_tokenizer("a b c d e") + ref_doc.has_unknown_spaces = True + example = Example(pred_doc, ref_doc) + scorer = Scorer() + scores = scorer.score([example]) + for key in scores: + # cats doesn't have an unset state + if key.startswith("cats"): + continue + assert scores[key] == None + + # partially annotated reference, not overlapping with predicted annotation + ref_doc = en_tokenizer("a b c d e") + ref_doc.has_unknown_spaces = True + ref_doc[1].tag_ = "A" + ref_doc[1].pos_ = "X" + ref_doc[1].set_morph("Feat=Val") + ref_doc[1].dep_ = "dep" + example = Example(pred_doc, ref_doc) + scorer = Scorer() + scores = scorer.score([example]) + assert scores["token_acc"] == None + assert scores["tag_acc"] == 0.0 + assert scores["pos_acc"] == 0.0 + assert scores["morph_acc"] == 0.0 + assert scores["dep_uas"] == 1.0 + assert scores["dep_las"] == 0.0 + assert scores["sents_f"] == None + + # partially annotated reference, overlapping with predicted annotation + ref_doc = en_tokenizer("a b c d e") + ref_doc.has_unknown_spaces = True + ref_doc[0].tag_ = "A" + ref_doc[0].pos_ = "X" + ref_doc[1].set_morph("Feat=Val") + ref_doc[1].dep_ = "dep" + example = Example(pred_doc, ref_doc) + scorer = Scorer() + scores = scorer.score([example]) + assert scores["token_acc"] == None + assert scores["tag_acc"] == 1.0 + assert scores["pos_acc"] == 1.0 + assert scores["morph_acc"] == 0.0 + assert scores["dep_uas"] == 1.0 + assert scores["dep_las"] == 0.0 + assert scores["sents_f"] == None + + def test_roc_auc_score(): # Binary classification, toy tests from scikit-learn test suite y_true = [0, 1] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index abc82030d..c824b2752 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -399,14 +399,13 @@ cdef class Doc: return True cdef int i cdef int range_start = 0 + if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: + attr = SENT_START attr = intify_attr(attr) # adjust attributes if attr == HEAD: # HEAD does not have an unset state, so rely on DEP attr = DEP - elif attr == self.vocab.strings["IS_SENT_START"]: - # as in Matcher, allow IS_SENT_START as an alias of SENT_START - attr = SENT_START # special cases for sentence boundaries if attr == SENT_START: if "sents" in self.user_hooks: diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index d511dc889..16bbc2700 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -683,6 +683,7 @@ The L2 norm of the document's vector representation. | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. 
~~Dict[str, Callable]~~ | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | +| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 388d92801..fb48d68cc 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -68,6 +68,8 @@ Scores the tokenization: - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token character spans +Docs with `has_unknown_spaces` are skipped during scoring. + > #### Example > > ```python @@ -81,7 +83,8 @@ Scores the tokenization: ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"} -Scores a single token attribute. +Scores a single token attribute. Tokens with missing values in the reference doc +are skipped during scoring. > #### Example > @@ -90,20 +93,22 @@ Scores a single token attribute. > print(scores["pos_acc"]) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ | ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"} Scores a single token attribute per feature for a token attribute in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) -format. +format. Tokens with missing values in the reference doc are skipped during +scoring. > #### Example > @@ -112,13 +117,14 @@ format. > print(scores["morph_per_feat"]) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. 
~~str~~                                                                                                                                         |
-| _keyword-only_ | |
-| `getter`       | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
-| **RETURNS**    | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~                               |
+| Name             | Description |
+| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| `attr`           | The attribute to score. ~~str~~ |
+| _keyword-only_   | |
+| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
+| **RETURNS**      | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
 
 ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}
 
 Returns PRF scores for labeled or unlabeled spans.
 
 > #### Example
 >
 > ```python
 > scores = Scorer.score_spans(examples, "ents")
 > print(scores["ents_f"])
 > ```
 
-| Name           | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| `attr`         | The attribute to score. ~~str~~ |
-| _keyword-only_ | |
-| `getter`       | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
-| **RETURNS**    | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name             | Description |
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| `attr`           | The attribute to score. ~~str~~ |
+| _keyword-only_   | |
+| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
+| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ |
+| **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 
 ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}
 
-Calculate the UAS, LAS, and LAS per type scores for dependency parses.
+Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens
+with missing values for the `attr` (typically `dep`) are skipped during scoring.
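As an illustration of the skipping behavior, here is a minimal sketch using hand-built docs (the two-token doc and the small getter lambda are invented for illustration; the lambda returns the string label so that the empty string counts as a missing value):

```python
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training import Example

nlp = spacy.blank("en")
# Prediction with both deps set
pred = Doc(nlp.vocab, words=["a", "b"])
pred[0].dep_ = "dep"
pred[1].dep_ = "dep"
# Reference annotates only the first token; the second token's dep stays ""
# and is treated as missing rather than as an error
ref = Doc(nlp.vocab, words=["a", "b"])
ref[0].dep_ = "dep"
example = Example(pred, ref)
scores = Scorer.score_deps(
    [example], "dep", getter=lambda token, attr: getattr(token, attr + "_")
)
print(scores["dep_uas"], scores["dep_las"])  # 1.0 1.0: the unannotated token is skipped
```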
 > #### Example
 >
@@ -160,16 +168,17 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
 > print(scores["dep_uas"], scores["dep_las"])
 > ```
 
-| Name            | Description |
-| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`      | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
-| `attr`          | The attribute to score. ~~str~~ |
-| _keyword-only_  | |
-| `getter`        | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
-| `head_attr`     | The attribute containing the head token. ~~str~~ |
-| `head_getter`   | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
-| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
-| **RETURNS**     | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name             | Description |
+| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| `attr`           | The attribute to score. ~~str~~ |
+| _keyword-only_   | |
+| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `head_attr`      | The attribute containing the head token. ~~str~~ |
+| `head_getter`    | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Token, str], Token]~~ |
+| `ignore_labels`  | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
+| **RETURNS**      | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 
 ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
 

From 1c4df8fd095e8671dff5e760edca1213063a99bc Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 3 Nov 2020 16:24:38 +0100
Subject: [PATCH 15/16] Replace pytokenizations with internal alignment (#6293)

* Replace pytokenizations with internal alignment

Replace pytokenizations with an internal alignment algorithm that is
restricted to allow only differences in whitespace and capitalization.
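As a rough illustration of the restriction (a sketch against the `Alignment.from_strings` API touched in this patch; the token lists below are invented, with expected values matching the alignment tests added further down):

```python
from spacy.training import Alignment

# Differences in whitespace and capitalization still align ...
align = Alignment.from_strings(["Obama", "'s", "podcasts"], ["obama's", "podcasts"])
assert list(align.x2y.lengths) == [1, 1, 1]  # each x token maps to one y token
assert list(align.y2x.lengths) == [2, 1]     # "obama's" covers two x tokens

# ... but any other character difference now raises a ValueError (E949)
try:
    Alignment.from_strings(["obama"], ["trump"])
except ValueError as err:
    print(err)  # the E949 message about unalignable predicted/reference docs
```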
* Rename `spacy.training.align` to `spacy.training.alignment` to contain the `Alignment` dataclass * Implement `get_alignments` in `spacy.training.align` * Refactor trailing whitespace handling * Remove unnecessary exception for empty docs Allow a non-empty whitespace-only doc to be aligned with an empty doc * Remove empty docs exceptions completely --- pyproject.toml | 1 - requirements.txt | 1 - setup.cfg | 1 - setup.py | 1 + spacy/errors.py | 5 +- spacy/tests/training/test_training.py | 118 ++++++++++++++++++++-- spacy/training/__init__.py | 2 +- spacy/training/align.pyx | 66 ++++++++++++ spacy/training/{align.py => alignment.py} | 7 +- spacy/training/example.pyx | 2 +- 10 files changed, 182 insertions(+), 22 deletions(-) create mode 100644 spacy/training/align.pyx rename spacy/training/{align.py => alignment.py} (75%) diff --git a/pyproject.toml b/pyproject.toml index 14a2d7690..0ceda4454 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ requires = [ "murmurhash>=0.28.0,<1.1.0", "thinc>=8.0.0rc0,<8.1.0", "blis>=0.4.0,<0.8.0", - "pytokenizations", "pathy" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index c5e136a34..3a777f163 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,6 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.5.0,<1.7.0 -pytokenizations # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index 762a7e888..95ada08ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,7 +52,6 @@ install_requires = numpy>=1.15.0 requests>=2.13.0,<3.0.0 pydantic>=1.5.0,<1.7.0 - pytokenizations jinja2 # Official Python utilities setuptools diff --git a/setup.py b/setup.py index 604d65745..160d2ed1c 100755 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ MOD_NAMES = [ "spacy.pipeline._parser_internals.stateclass", "spacy.pipeline._parser_internals.transition_system", "spacy.tokenizer", + "spacy.training.align", "spacy.training.gold_io", "spacy.tokens.doc", "spacy.tokens.span", diff --git a/spacy/errors.py b/spacy/errors.py index 2898fbcaa..f4fd3731f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -564,7 +564,10 @@ class Errors: "a string value from {expected} but got: '{arg}'") E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " "a list, but got: {arg_type}") - E949 = ("Can only create an alignment when the texts are the same.") + E949 = ("Unable to align tokens for the predicted and reference docs. It " + "is only possible to align the docs when both texts are the same " + "except for whitespace and capitalization. The predicted tokens " + "start with: {x}. The reference tokens start with: {y}.") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. 
{id2}") E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 07e1aef01..ba485ab45 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -2,6 +2,7 @@ import numpy from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment from spacy.training import biluo_tags_to_spans, iob_to_biluo from spacy.training import Corpus, docs_to_json, Example +from spacy.training.align import get_alignments from spacy.training.converters import json_to_docs from spacy.lang.en import English from spacy.tokens import Doc, DocBin @@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc): assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] -@pytest.mark.skip("Outdated") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ - (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), + (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])), ( ["a", "b", '"', "c"], ['ab"', "c"], - (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), + ([[0], [0], [0], [1]], [[0, 1, 2], [3]]), ), - (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})), + (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])), ( ["ab", "c", "d"], ["a", "b", "cd"], - (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}), + ([[0, 1], [2], [2]], [[0], [0], [1, 2]]), ), ( ["a", "b", "cd"], ["a", "b", "c", "d"], - (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}), + ([[0], [1], [2, 3]], [[0], [1], [2], [2]]), ), - ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})), + ([" ", "a"], ["a"], ([[], [0]], [[1]])), ], ) def test_align(tokens_a, tokens_b, expected): # noqa - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa - assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_a, tokens_b) + assert (a2b, b2a) == expected # noqa # check symmetry - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa - assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_b, tokens_a) # noqa + assert (b2a, a2b) == expected # noqa def test_goldparse_startswith_space(en_tokenizer): @@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer): assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] +def test_goldparse_endswith_space(en_tokenizer): + text = "a\n" + doc = en_tokenizer(text) + gold_words = ["a"] + entities = ["U-DATE"] + deps = ["ROOT"] + heads = [0] + example = Example.from_dict( + doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} + ) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["U-DATE", "O"] + assert example.get_aligned("DEP", as_string=True) == ["ROOT", None] + + def test_gold_constructor(): """Test that the Example constructor works fine""" nlp = English() @@ -676,6 +691,87 @@ def test_alignment_different_texts(): Alignment.from_strings(other_tokens, spacy_tokens) +def test_alignment_spaces(en_vocab): + # single leading whitespace + other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 
1, 1, 1, 2, 2,] + assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6] + + # multiple leading whitespace tokens + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] + assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7] + + # both with leading whitespace, not identical + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7] + + # same leading whitespace, different tokenization + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7] + + # only one with trailing whitespace + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5] + + # different trailing whitespace + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6] + + # same trailing whitespace, different tokenization + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7] + + # differing whitespace is allowed + other_tokens = ["a", " \n ", "b", "c"] + spacy_tokens = ["a", "b", " ", "c"] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.dataXd) == [0, 1, 3] + assert list(align.y2x.dataXd) == [0, 2, 3] + + # other differences in whitespace are allowed + other_tokens = [" ", "a"] + spacy_tokens = [" ", "a", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + + other_tokens = 
["a", " "] + spacy_tokens = ["a", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + + def test_retokenized_docs(doc): a = doc.to_array(["TAG"]) doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 86341dd9a..5111b80dc 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,6 +1,6 @@ from .corpus import Corpus # noqa: F401 from .example import Example, validate_examples, validate_get_examples # noqa: F401 -from .align import Alignment # noqa: F401 +from .alignment import Alignment # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401 diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx new file mode 100644 index 000000000..b9d89f789 --- /dev/null +++ b/spacy/training/align.pyx @@ -0,0 +1,66 @@ +from typing import List, Tuple +from itertools import chain +import re + +from ..errors import Errors + + +def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]: + # Create character-to-token mappings + char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A)))) + char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B)))) + str_a = "".join(A).lower() + str_b = "".join(B).lower() + cdef int len_str_a = len(str_a) + cdef int len_str_b = len(str_b) + # Check that the two texts only differ in whitespace and capitalization + if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \ + len_str_a != len(char_to_token_a) or \ + len_str_b != len(char_to_token_b): + raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10]))) + cdef int char_idx_a = 0 + cdef int char_idx_b = 0 + cdef int token_idx_a = 0 + cdef int token_idx_b = 0 + cdef int prev_token_idx_a = -1 + cdef int prev_token_idx_b = -1 + a2b = [] + b2a = [] + while char_idx_a < len_str_a and char_idx_b < len_str_b: + # Find the current token position from the character position + token_idx_a = char_to_token_a[char_idx_a] + token_idx_b = char_to_token_b[char_idx_b] + # Add a set for the next token if a token boundary has been crossed + if prev_token_idx_a != token_idx_a: + a2b.append(set()) + if prev_token_idx_b != token_idx_b: + b2a.append(set()) + # Process the alignment at the current position + if A[token_idx_a] == B[token_idx_b]: + # Current tokens are identical + a2b[-1].add(token_idx_b) + b2a[-1].add(token_idx_a) + char_idx_a += len(A[token_idx_a]) + char_idx_b += len(B[token_idx_b]) + elif str_a[char_idx_a] == str_b[char_idx_b]: + # Current chars are identical + a2b[-1].add(token_idx_b) + b2a[-1].add(token_idx_a) + char_idx_a += 1 + char_idx_b += 1 + elif str_a[char_idx_a].isspace(): + # Skip unaligned whitespace char in A + char_idx_a += 1 + elif str_b[char_idx_b].isspace(): + # Skip unaligned whitespace char in B + char_idx_b += 1 + else: + # This should never happen + raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10]))) + prev_token_idx_a = token_idx_a + prev_token_idx_b = token_idx_b + # Process unaligned trailing whitespace + a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:]))) + b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:]))) + # Return values as sorted lists per token position + return [sorted(x) for x in a2b], [sorted(x) for x in b2a] diff --git a/spacy/training/align.py b/spacy/training/alignment.py similarity 
index 75% rename from spacy/training/align.py rename to spacy/training/alignment.py index e8f17a667..3e3b60ca6 100644 --- a/spacy/training/align.py +++ b/spacy/training/alignment.py @@ -2,9 +2,8 @@ from typing import List import numpy from thinc.types import Ragged from dataclasses import dataclass -import tokenizations -from ..errors import Errors +from .align import get_alignments @dataclass @@ -20,9 +19,7 @@ class Alignment: @classmethod def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": - if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower(): - raise ValueError(Errors.E949) - x2y, y2x = tokenizations.get_alignments(A, B) + x2y, y2x = get_alignments(A, B) return Alignment.from_indices(x2y=x2y, y2x=y2x) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index a8da49c61..6a556b5e7 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc from ..tokens.span cimport Span from ..tokens.span import Span from ..attrs import IDS -from .align import Alignment +from .alignment import Alignment from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings From 019a1dd5e82dce3bd181dde656e907f55431084f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Nov 2020 18:10:06 +0100 Subject: [PATCH 16/16] Fix v3 overview [ci skip] --- website/docs/usage/v3.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index fe4765285..b25b28a6d 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -433,14 +433,14 @@ The following methods, attributes and commands are new in spaCy v3.0. | Name | Description | | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | -| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | +| [`Token.morph`](/api/token#attributes) | Access a token's morphological analysis. | | [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. | | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. | | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). | | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. | | [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. | | [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. 
| -| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. | +| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. | | [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. | | [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. | | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. | @@ -1032,9 +1032,9 @@ change your names and imports: Thanks to everyone who's been contributing to the spaCy ecosystem by developing and maintaining one of the many awesome [plugins and extensions](/universe). We've tried to make it as easy as possible for you to upgrade your packages for -spaCy v3.0. The most common use case for plugins is providing pipeline components -and extension attributes. When migrating your plugin, double-check the -following: +spaCy v3.0. The most common use case for plugins is providing pipeline +components and extension attributes. When migrating your plugin, double-check +the following: - Use the [`@Language.factory`](/api/language#factory) decorator to register your component and assign it a name. This allows users to refer to your