From a0c899a0fff08e09f7ebabb8e0e50baa4f4b0897 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 10 Nov 2020 13:14:47 +0100 Subject: [PATCH 1/7] Fix textcat + transformer architecture (#6371) * add pooling to textcat TransformerListener * maybe_get_dim in case it's null --- spacy/cli/templates/quickstart_training.jinja | 3 +++ spacy/ml/models/textcat.py | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 1194438de..37983cb1a 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -143,6 +143,9 @@ nO = null @architectures = "spacy-transformers.TransformerListener.v1" grad_factor = 1.0 +[components.textcat.model.tok2vec.pooling] +@layers = "reduce_mean.v1" + [components.textcat.model.linear_model] @architectures = "spacy.TextCatBOW.v1" exclusive_classes = false diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index d4aed2839..2ec036810 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -61,14 +61,14 @@ def build_bow_text_classifier( @registry.architectures.register("spacy.TextCatEnsemble.v2") -def build_text_classifier( +def build_text_classifier_v2( tok2vec: Model[List[Doc], List[Floats2d]], linear_model: Model[List[Doc], Floats2d], nO: Optional[int] = None, ) -> Model[List[Doc], Floats2d]: exclusive_classes = not linear_model.attrs["multi_label"] with Model.define_operators({">>": chain, "|": concatenate}): - width = tok2vec.get_dim("nO") + width = tok2vec.maybe_get_dim("nO") cnn_model = ( tok2vec >> list2ragged() @@ -94,7 +94,7 @@ def build_text_classifier( # TODO: move to legacy @registry.architectures.register("spacy.TextCatEnsemble.v1") -def build_text_classifier( +def build_text_classifier_v1( width: int, embed_size: int, pretrained_vectors: Optional[bool], From a7e7d6c6c902055b66b208c299cfe8b578a497d8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 10 Nov 2020 13:15:09 +0100 Subject: [PATCH 2/7] Ignore misaligned in Morphologizer.get_loss (#6363) Fix bug where `Morphologizer.get_loss` treated misaligned annotation as `EMPTY_MORPH` rather than ignoring it. Remove unneeded default `EMPTY_MORPH` mappings. 
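For illustration, a minimal standalone sketch of the corrected truth-building logic (not part of this patch; `build_truths`, `feats_to_dict` and `add_label` are hypothetical stand-ins for the inline loop, `Morphology.feats_to_dict` and `vocab.morphology.add`):

```python
def build_truths(aligned_pos, aligned_morph, feats_to_dict, add_label):
    truths = []
    for pos, morph in zip(aligned_pos, aligned_morph):
        if pos is None or morph is None:
            # Misaligned token: record None so the loss ignores it,
            # rather than mapping it to EMPTY_MORPH as before.
            truths.append(None)
        else:
            label_dict = feats_to_dict(morph)
            if pos:
                label_dict["POS"] = pos
            truths.append(add_label(label_dict))
    return truths
```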
--- spacy/pipeline/morphologizer.pyx | 19 ++++++++----------- spacy/tests/pipeline/test_morphologizer.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a03c7daf0..305f8f5df 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -92,9 +92,6 @@ class Morphologizer(Tagger): # 2) labels_pos stores a mapping from morph+POS->POS cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}} self.cfg = dict(sorted(cfg.items())) - # add mappings for empty morph - self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH - self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""] @property def labels(self): @@ -201,8 +198,8 @@ class Morphologizer(Tagger): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): morph = self.labels[tag_id] - doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph]) - doc.c[j].pos = self.cfg["labels_pos"][morph] + doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0)) + doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0) def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and @@ -228,12 +225,12 @@ class Morphologizer(Tagger): # doesn't, so if either is None, treat both as None here so that # truths doesn't end up with an unknown morph+POS combination if pos is None or morph is None: - pos = None - morph = None - label_dict = Morphology.feats_to_dict(morph) - if pos: - label_dict[self.POS_FEAT] = pos - label = self.vocab.strings[self.vocab.morphology.add(label_dict)] + label = None + else: + label_dict = Morphology.feats_to_dict(morph) + if pos: + label_dict[self.POS_FEAT] = pos + label = self.vocab.strings[self.vocab.morphology.add(label_dict)] eg_truths.append(label) truths.append(eg_truths) d_scores, loss = loss_func(scores, truths) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 85d1d6c8b..add42e00a 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -116,3 +116,23 @@ def test_overfitting_IO(): no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]] assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + + # Test without POS + nlp.remove_pipe("morphologizer") + nlp.add_pipe("morphologizer") + for example in train_examples: + for token in example.reference: + token.pos_ = "" + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["morphologizer"] < 0.00001 + + # Test the trained model + test_text = "I like blue ham" + doc = nlp(test_text) + gold_morphs = ["Feat=N", "Feat=V", "", ""] + gold_pos_tags = ["", "", "", ""] + assert [str(t.morph) for t in doc] == gold_morphs + assert [t.pos_ for t in doc] == gold_pos_tags From 96726ec1f62ade72b7904b2b21b4a893f93d0ca8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 17 Nov 2020 14:36:44 +0100 Subject: [PATCH 3/7] Fix DocBin init in training example (#6396) --- website/docs/usage/training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 274ea5989..58c846e9d 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -969,7 +969,7 @@ 
import spacy from spacy.tokens import Doc, DocBin nlp = spacy.blank("en") -docbin = DocBin(nlp.vocab) +docbin = DocBin() words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."] spaces = [True, True, True, True, True, True, True, False] ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"] From 165993d8e57f2bd0ea35f4792f414951dc6c4787 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 19 Nov 2020 14:11:38 +0100 Subject: [PATCH 4/7] fix typo in transformer docs (#6404) --- website/docs/api/transformer.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 5754d2238..e31c8ad2c 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters. > nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Description | -| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | -| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ | +| Setting | Description | +| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | +| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py From 26296ab223b809cef5bd1fd997a4112119815864 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 27 Nov 2020 07:39:49 +0100 Subject: [PATCH 5/7] Add error message if DocBin zlib decompress fails (#6394) Add a better error message if DocBin zlib decompress fails, indicating that the data is not in `DocBin` format. 
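A quick sketch of the new behaviour (assuming a spaCy build with this patch applied): bytes that are not zlib-compressed `DocBin` data now raise a `ValueError` carrying the E1014 message instead of an opaque `zlib.error`:

```python
from spacy.tokens import DocBin

docbin = DocBin()
try:
    # Not a .spacy file, e.g. v2 JSON training data read as raw bytes.
    docbin.from_bytes(b'[{"id": 0, "paragraphs": []}]')
except ValueError as err:
    print(err)  # E1014: Error loading DocBin data. It doesn't look like ...
```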
--- spacy/errors.py | 4 ++++ spacy/tokens/_serialize.py | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index f4fd3731f..c2bb36b93 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -712,6 +712,10 @@ class Errors: E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the " "token itself. To set the morph from this MorphAnalysis, set from " "the string value with: `token.set_morph(str(other_morph))`.") + E1014 = ("Error loading DocBin data. It doesn't look like the data is in " + "DocBin (.spacy) format. If your data is in spaCy v2's JSON " + "training format, convert it using `python -m spacy convert " + "file.json .`.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 11eb75821..821f55eb6 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -198,7 +198,10 @@ class DocBin: DOCS: https://nightly.spacy.io/api/docbin#from_bytes """ - msg = srsly.msgpack_loads(zlib.decompress(bytes_data)) + try: + msg = srsly.msgpack_loads(zlib.decompress(bytes_data)) + except zlib.error: + raise ValueError(Errors.E1014) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) lengths = numpy.frombuffer(msg["lengths"], dtype="int32") From d21d2c2e59fd2347fc0280e4388fa7fa50786f8d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Nov 2020 15:15:51 +0800 Subject: [PATCH 6/7] Don't multiply accuracy by 100 --- website/src/templates/models.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 17140b072..b9658dacd 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -120,7 +120,7 @@ function formatAccuracy(data) { ? null : { label, - value: (value * 100).toFixed(2), + value: value.toFixed(2), help: MODEL_META[label], } }) From 9beba7164f5aae9797243bcdb8bd963a95a6f557 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Nov 2020 15:17:14 +0800 Subject: [PATCH 7/7] Make jinja2 top-level import No problem anymore since it's now an official dependency --- requirements.txt | 2 +- spacy/cli/init_config.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3a777f163..074775ae8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.5.0,<1.7.0 +jinja2 # Official Python utilities setuptools packaging>=20.0 @@ -26,4 +27,3 @@ pytest>=4.6.5 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.5.0,<3.6.0 -jinja2 diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 9f73b17ae..ff11f97f6 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -5,6 +5,7 @@ from wasabi import Printer, diff_strings from thinc.api import Config import srsly import re +from jinja2 import Template from .. import util from ..language import DEFAULT_CONFIG_PRETRAIN_PATH @@ -127,10 +128,6 @@ def init_config( ) -> None: is_stdout = str(output_file) == "-" msg = Printer(no_print=is_stdout) - try: - from jinja2 import Template - except ImportError: - msg.fail("This command requires jinja2", "pip install jinja2", exits=1) with TEMPLATE_PATH.open("r") as f: template = Template(f.read()) # Filter out duplicates since tok2vec and transformer are added by template
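For context on this last patch, a minimal sketch of the top-level `Template` usage that `init_config` now relies on (the template string below is illustrative, not the real quickstart template):

```python
from jinja2 import Template

# Render an illustrative stand-in for the quickstart training template.
template = Template("pipeline = {{ names | join(', ') }}")
print(template.render(names=["tok2vec", "textcat"]))
# pipeline = tok2vec, textcat
```

Since jinja2 is now a hard dependency in requirements.txt, the import no longer needs the try/except guard that previously told users to `pip install jinja2`.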