diff --git a/spacy/_ml.py b/spacy/_ml.py
index 964b1fa7a..231f6a7a4 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -444,7 +444,46 @@ def getitem(i):
     return layerize(getitem_fwd)
 
 
-def build_tagger_model(nr_class, **cfg):
+@describe.attributes(
+    W=Synapses("Weights matrix",
+        lambda obj: (obj.nO, obj.nI),
+        lambda W, ops: None)
+)
+class MultiSoftmax(Affine):
+    '''Neural network layer that predicts several multi-class attributes at once.
+    For instance, we might predict one class with 6 variables, and another with 5.
+    We predict the 11 neurons required for this, and then softmax them such
+    that columns 0-6 make a probability distribution and columns 6-11 make another.
+    '''
+    name = 'multisoftmax'
+
+    def __init__(self, out_sizes, nI=None, **kwargs):
+        Model.__init__(self, **kwargs)
+        self.out_sizes = out_sizes
+        self.nO = sum(out_sizes)
+        self.nI = nI
+
+    def predict(self, input__BI):
+        output__BO = self.ops.affine(self.W, self.b, input__BI)
+        i = 0
+        for out_size in self.out_sizes:
+            self.ops.softmax(output__BO[:, i : i+out_size], inplace=True)
+            i += out_size
+        return output__BO
+
+    def begin_update(self, input__BI, drop=0.):
+        output__BO = self.predict(input__BI)
+        def finish_update(grad__BO, sgd=None):
+            self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
+            self.d_b += grad__BO.sum(axis=0)
+            grad__BI = self.ops.gemm(grad__BO, self.W)
+            if sgd is not None:
+                sgd(self._mem.weights, self._mem.gradient, key=self.id)
+            return grad__BI
+        return output__BO, finish_update
+
+
+def build_tagger_model(class_nums, **cfg):
     embed_size = util.env_opt('embed_size', 7000)
     if 'token_vector_width' in cfg:
         token_vector_width = cfg['token_vector_width']
@@ -459,7 +498,8 @@ def build_tagger_model(nr_class, **cfg):
     tok2vec = Tok2Vec(token_vector_width, embed_size,
                       subword_features=subword_features,
                       pretrained_vectors=pretrained_vectors)
-    softmax = with_flatten(Softmax(nr_class, token_vector_width))
+    softmax = with_flatten(
+        MultiSoftmax(class_nums, token_vector_width))
     model = (
         tok2vec
         >> softmax
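
The core trick in MultiSoftmax is the per-block column-wise softmax in predict(). Below is a minimal standalone numpy sketch of that idea, independent of thinc's Model/Affine machinery; the function name multi_softmax and the out_sizes=[6, 5] example are illustrative assumptions matching the docstring, not part of the patch.

# Minimal numpy sketch of MultiSoftmax's column-slicing idea (illustrative,
# not the thinc API). Shapes follow the diff's naming convention:
# B = batch size, O = total output width = sum(out_sizes).
import numpy as np

def multi_softmax(scores__BO, out_sizes):
    """Apply a separate softmax to each contiguous block of columns.

    With out_sizes=[6, 5], columns 0-6 of each row are normalised into one
    probability distribution and columns 6-11 into another, exactly as the
    MultiSoftmax docstring describes.
    """
    out = scores__BO.copy()
    i = 0
    for out_size in out_sizes:
        block = out[:, i : i + out_size]                 # view into `out`
        block -= block.max(axis=1, keepdims=True)        # numerical stability
        np.exp(block, out=block)
        block /= block.sum(axis=1, keepdims=True)        # rows sum to 1
        i += out_size
    return out

scores = np.random.randn(3, 11)            # batch of 3, 6 + 5 = 11 raw scores
probs = multi_softmax(scores, [6, 5])
assert np.allclose(probs[:, :6].sum(axis=1), 1.0)   # first block is a distribution
assert np.allclose(probs[:, 6:].sum(axis=1), 1.0)   # second block too

The backward pass in begin_update() needs no per-block logic: because each softmax block reads and writes a disjoint column range, the combined gradient is just the usual affine-layer gradient, which is why finish_update() looks identical to a plain Affine/Softmax layer.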