Remove unused, experimental multi-task components (#11919)

* Remove experimental multi-task components These are incomplete implementations and are not usable in their current state. * Remove orphaned error message * Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928) * Switch ubuntu-latest to ubuntu-20.04 in main tests * Only use 20.04 for 3.6 * Revert "Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)" This reverts commit 77c0fd7b17. Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2026-03-06 21:01:34 +03:00 · 2022-12-08 13:24:45 +01:00 · 2022-12-08 13:24:45 +01:00 · f5aabaf7d6
commit f5aabaf7d6
parent d60997febb
3 changed files with 0 additions and 224 deletions
--- a/setup.py
+++ b/setup.py
@ -38,7 +38,6 @@ MOD_NAMES = [
    "spacy.pipeline.dep_parser",
    "spacy.pipeline._edit_tree_internals.edit_trees",
    "spacy.pipeline.morphologizer",
-    "spacy.pipeline.multitask",
    "spacy.pipeline.ner",
    "spacy.pipeline.pipe",
    "spacy.pipeline.trainable_pipe",
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -243,8 +243,6 @@ class Errors(metaclass=ErrorsWithCodes):
            "https://spacy.io/usage/models")
    E011 = ("Unknown operator: '{op}'. Options: {opts}")
    E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E016 = ("MultitaskObjective target should be function or one of: dep, "
-            "tag, ent, dep_tag_offset, ent_tag.")
    E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
    E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
            "refers to an issue with the `Vocab` or `StringStore`.")
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@ -1,221 +0,0 @@
-# cython: infer_types=True, profile=True, binding=True
-from typing import Optional
-import numpy
-from thinc.api import CosineDistance, to_categorical, Model, Config
-from thinc.api import set_dropout_rate
-
-from ..tokens.doc cimport Doc
-
-from .trainable_pipe import TrainablePipe
-from .tagger import Tagger
-from ..training import validate_examples
-from ..language import Language
-from ._parser_internals import nonproj
-from ..attrs import POS, ID
-from ..errors import Errors
-
-
-default_model_config = """
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v2"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-"""
-DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
-
-
-@Language.factory(
-    "nn_labeller",
-    default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
-)
-def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
-    return MultitaskObjective(nlp.vocab, model, name)
-
-
-class MultitaskObjective(Tagger):
-    """Experimental: Assist training of a parser or tagger, by training a
-    side-objective.
-    """
-
-    def __init__(self, vocab, model, name="nn_labeller", *, target):
-        self.vocab = vocab
-        self.model = model
-        self.name = name
-        if target == "dep":
-            self.make_label = self.make_dep
-        elif target == "tag":
-            self.make_label = self.make_tag
-        elif target == "ent":
-            self.make_label = self.make_ent
-        elif target == "dep_tag_offset":
-            self.make_label = self.make_dep_tag_offset
-        elif target == "ent_tag":
-            self.make_label = self.make_ent_tag
-        elif target == "sent_start":
-            self.make_label = self.make_sent_start
-        elif hasattr(target, "__call__"):
-            self.make_label = target
-        else:
-            raise ValueError(Errors.E016)
-        cfg = {"labels": {}, "target": target}
-        self.cfg = dict(cfg)
-
-    @property
-    def labels(self):
-        return self.cfg.setdefault("labels", {})
-
-    @labels.setter
-    def labels(self, value):
-        self.cfg["labels"] = value
-
-    def set_annotations(self, docs, dep_ids):
-        pass
-
-    def initialize(self, get_examples, nlp=None, labels=None):
-        if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
-            raise ValueError(err)
-        if labels is not None:
-            self.labels = labels
-        else:
-            for example in get_examples():
-                for token in example.y:
-                    label = self.make_label(token)
-                    if label is not None and label not in self.labels:
-                        self.labels[label] = len(self.labels)
-        self.model.initialize()   # TODO: fix initialization by defining X and Y
-
-    def predict(self, docs):
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        scores = self.model.get_ref("softmax")(tokvecs)
-        return tokvecs, scores
-
-    def get_loss(self, examples, scores):
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        docs = [eg.predicted for eg in examples]
-        for i, eg in enumerate(examples):
-            # Handles alignment for tokenization differences
-            doc_annots = eg.get_aligned()  # TODO
-            for j in range(len(eg.predicted)):
-                tok_annots = {key: values[j] for key, values in tok_annots.items()}
-                label = self.make_label(j, tok_annots)
-                if label is None or label not in self.labels:
-                    correct[idx] = guesses[idx]
-                else:
-                    correct[idx] = self.labels[label]
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        loss = (d_scores**2).sum()
-        return float(loss), d_scores
-
-    @staticmethod
-    def make_dep(token):
-        return token.dep_
-
-    @staticmethod
-    def make_tag(token):
-        return token.tag_
-
-    @staticmethod
-    def make_ent(token):
-        if token.ent_iob_ == "O":
-            return "O"
-        else:
-            return token.ent_iob_ + "-" + token.ent_type_
-
-    @staticmethod
-    def make_dep_tag_offset(token):
-        dep = token.dep_
-        tag = token.tag_
-        offset = token.head.i - token.i
-        offset = min(offset, 2)
-        offset = max(offset, -2)
-        return f"{dep}-{tag}:{offset}"
-
-    @staticmethod
-    def make_ent_tag(token):
-        if token.ent_iob_ == "O":
-            ent = "O"
-        else:
-            ent = token.ent_iob_ + "-" + token.ent_type_
-        tag = token.tag_
-        return f"{tag}-{ent}"
-
-    @staticmethod
-    def make_sent_start(token):
-        """A multi-task objective for representing sentence boundaries,
-        using BILU scheme. (O is impossible)
-        """
-        if token.is_sent_start and token.is_sent_end:
-            return "U-SENT"
-        elif token.is_sent_start:
-            return "B-SENT"
-        else:
-            return "I-SENT"
-
-
-class ClozeMultitask(TrainablePipe):
-    def __init__(self, vocab, model, **cfg):
-        self.vocab = vocab
-        self.model = model
-        self.cfg = cfg
-        self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config
-
-    def set_annotations(self, docs, dep_ids):
-        pass
-
-    def initialize(self, get_examples, nlp=None):
-        self.model.initialize()  # TODO: fix initialization by defining X and Y
-        X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.initialize(X)
-
-    def predict(self, docs):
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        vectors = self.model.get_ref("output_layer")(tokvecs)
-        return tokvecs, vectors
-
-    def get_loss(self, examples, vectors, prediction):
-        validate_examples(examples, "ClozeMultitask.get_loss")
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
-        target = vectors[ids]
-        gradient = self.distance.get_grad(prediction, target)
-        loss = self.distance.get_loss(prediction, target)
-        return float(loss), gradient
-
-    def update(self, examples, *, drop=0., sgd=None, losses=None):
-        pass
-
-    def rehearse(self, examples, drop=0., sgd=None, losses=None):
-        if losses is not None and self.name not in losses:
-            losses[self.name] = 0.
-        set_dropout_rate(self.model, drop)
-        validate_examples(examples, "ClozeMultitask.rehearse")
-        docs = [eg.predicted for eg in examples]
-        predictions, bp_predictions = self.model.begin_update()
-        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
-        bp_predictions(d_predictions)
-        if sgd is not None:
-            self.finish_update(sgd)
-        if losses is not None:
-            losses[self.name] += loss
-        return losses
-
-    def add_label(self, label):
-        raise NotImplementedError