Merge pull request #7039 from svlandeg/debug

2025-10-02 09:56:39 +03:00 · 2021-02-13 11:53:41 +11:00 · 2021-02-13 11:53:41 +11:00 · e583050547
commit e583050547
parent f4712a634e 03b4ec7d7f
2 changed files with 78 additions and 8 deletions
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -291,14 +291,16 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
        # of data.
        # When the components batch differently, we don't receive a matching
        # prediction from the upstream, so we can't predict.
-        if not all(doc.tensor.size for doc in inputs):
-            # But we do need to do *something* if the tensor hasn't been set.
-            # The compromise is to at least return data of the right shape,
-            # so the output is valid.
-            width = model.get_dim("nO")
-            outputs = [model.ops.alloc2f(len(doc), width) for doc in inputs]
-        else:
-            outputs = [doc.tensor for doc in inputs]
+        outputs = []
+        width = model.get_dim("nO")
+        for doc in inputs:
+            if doc.tensor.size == 0:
+                # But we do need to do *something* if the tensor hasn't been set.
+                # The compromise is to at least return data of the right shape,
+                # so the output is valid.
+                outputs.append(model.ops.alloc2f(len(doc), width))
+            else:
+                outputs.append(doc.tensor)
        return outputs, lambda dX: []


--- a/spacy/tests/regression/test_issue7029.py
+++ b/spacy/tests/regression/test_issue7029.py
@@ -0,0 +1,68 @@
+from spacy.lang.en import English
+from spacy.training import Example
+from spacy.util import load_config_from_str
+
+
+CONFIG = """
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tok2vec.model.encode:width}
+attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,2500,2500,2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+upstream = "*"
+"""
+
+
+TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
+def test_issue7029():
+    """Test that an empty document doesn't mess up an entire batch.
+    """
+    nlp = English.from_config(load_config_from_str(CONFIG))
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
+    nlp.select_pipes(enable=["tok2vec", "tagger"])
+    docs1 = list(nlp.pipe(texts, batch_size=1))
+    docs2 = list(nlp.pipe(texts, batch_size=4))
+    assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]