Merge branch 'develop' into nightly.spacy.io

2025-07-24 07:00:04 +03:00 · 2020-11-27 15:17:33 +08:00 · 2020-11-27 15:17:33 +08:00 · 576eeed849
commit 576eeed849
parent 11ac3e0a36 9beba7164f
10 changed files with 50 additions and 26 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -15,6 +15,7 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.5.0,<1.7.0
+jinja2
 # Official Python utilities
 setuptools
 packaging>=20.0
@ -26,4 +27,3 @@ pytest>=4.6.5
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
-jinja2
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@ -5,6 +5,7 @@ from wasabi import Printer, diff_strings
 from thinc.api import Config
 import srsly
 import re
+from jinja2 import Template

 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
@ -127,10 +128,6 @@ def init_config(
 ) -> None:
    is_stdout = str(output_file) == "-"
    msg = Printer(no_print=is_stdout)
-    try:
-        from jinja2 import Template
-    except ImportError:
-        msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
    with TEMPLATE_PATH.open("r") as f:
        template = Template(f.read())
    # Filter out duplicates since tok2vec and transformer are added by template
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -143,6 +143,9 @@ nO = null
@architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0

+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
 [components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -712,6 +712,10 @@ class Errors:
    E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
             "token itself. To set the morph from this MorphAnalysis, set from "
             "the string value with: `token.set_morph(str(other_morph))`.")
+    E1014 = ("Error loading DocBin data. It doesn't look like the data is in "
+             "DocBin (.spacy) format. If your data is in spaCy v2's JSON "
+             "training format, convert it using `python -m spacy convert "
+             "file.json .`.")


 # Deprecated model shortcuts, only used in errors and warnings
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@ -61,14 +61,14 @@ def build_bow_text_classifier(


@registry.architectures.register("spacy.TextCatEnsemble.v2")
-def build_text_classifier(
+def build_text_classifier_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    linear_model: Model[List[Doc], Floats2d],
    nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
    exclusive_classes = not linear_model.attrs["multi_label"]
    with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.get_dim("nO")
+        width = tok2vec.maybe_get_dim("nO")
        cnn_model = (
                tok2vec
                >> list2ragged()
@ -94,7 +94,7 @@ def build_text_classifier(

 # TODO: move to legacy
@registry.architectures.register("spacy.TextCatEnsemble.v1")
-def build_text_classifier(
+def build_text_classifier_v1(
    width: int,
    embed_size: int,
    pretrained_vectors: Optional[bool],
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -92,9 +92,6 @@ class Morphologizer(Tagger):
        # 2) labels_pos stores a mapping from morph+POS->POS
        cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
        self.cfg = dict(sorted(cfg.items()))
-        # add mappings for empty morph
-        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
-        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]

    @property
    def labels(self):
@ -201,8 +198,8 @@ class Morphologizer(Tagger):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
-                doc.c[j].pos = self.cfg["labels_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
+                doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)

    def get_loss(self, examples, scores):
        """Find the loss and gradient of loss for the batch of documents and
@ -228,12 +225,12 @@ class Morphologizer(Tagger):
                # doesn't, so if either is None, treat both as None here so that
                # truths doesn't end up with an unknown morph+POS combination
                if pos is None or morph is None:
-                    pos = None
-                    morph = None
-                label_dict = Morphology.feats_to_dict(morph)
-                if pos:
-                    label_dict[self.POS_FEAT] = pos
-                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    label = None
+                else:
+                    label_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        label_dict[self.POS_FEAT] = pos
+                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
                eg_truths.append(label)
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@ -116,3 +116,23 @@ def test_overfitting_IO():
    no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
+
+    # Test without POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            token.pos_ = ""
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["Feat=N", "Feat=V", "", ""]
+    gold_pos_tags = ["", "", "", ""]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@ -198,7 +198,10 @@ class DocBin:

        DOCS: https://nightly.spacy.io/api/docbin#from_bytes
        """
-        msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
+        try:
+            msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
+        except zlib.error:
+            raise ValueError(Errors.E1014)
        self.attrs = msg["attrs"]
        self.strings = set(msg["strings"])
        lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters.
 > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 > ```

-| Setting                 | Description                                                                                                                                                                                                                                                                                                           |
-| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `max_batch_items`       | Maximum size of a padded batch. Defaults to `4096`. ~~int~~                                                                                                                                                                                                                                                           |
-| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
-| `model`                 | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~                                                                                                                        |
+| Setting                 | Description                                                                                                                                                                                                                                                                                                   |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `max_batch_items`       | Maximum size of a padded batch. Defaults to `4096`. ~~int~~                                                                                                                                                                                                                                                   |
+| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `model`                 | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~                                                                                                                |

 ```python
 https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@ -120,7 +120,7 @@ function formatAccuracy(data) {
                ? null
                : {
                      label,
-                      value: (value * 100).toFixed(2),
+                      value: value.toFixed(2),
                      help: MODEL_META[label],
                  }
        })