Merge pull request #7039 from svlandeg/debug

This commit is contained in:
Ines Montani 2021-02-13 11:53:41 +11:00 committed by GitHub
commit e583050547
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 78 additions and 8 deletions

View File

@ -291,14 +291,16 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
# of data.
# When the components batch differently, we don't receive a matching
# prediction from the upstream, so we can't predict.
if not all(doc.tensor.size for doc in inputs):
# But we do need to do *something* if the tensor hasn't been set.
# The compromise is to at least return data of the right shape,
# so the output is valid.
width = model.get_dim("nO")
outputs = [model.ops.alloc2f(len(doc), width) for doc in inputs]
else:
outputs = [doc.tensor for doc in inputs]
outputs = []
width = model.get_dim("nO")
for doc in inputs:
if doc.tensor.size == 0:
# But we do need to do *something* if the tensor hasn't been set.
# The compromise is to at least return data of the right shape,
# so the output is valid.
outputs.append(model.ops.alloc2f(len(doc), width))
else:
outputs.append(doc.tensor)
return outputs, lambda dX: []

View File

@ -0,0 +1,68 @@
from spacy.lang.en import English
from spacy.training import Example
from spacy.util import load_config_from_str
# Pipeline config used by the regression test below: a "tok2vec" component
# followed by a "tagger" whose model does not embed tokens itself but uses a
# Tok2VecListener (upstream = "*") with the same width as the tok2vec encoder.
CONFIG = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""
# Minimal tagging corpus: (text, annotations) pairs where "tags" holds one
# label per whitespace-separated word of the text.
TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),
]
def test_issue7029():
    """Check that an empty doc in a batch leaves the other docs' tags intact.

    Trains the tok2vec + tagger pipeline described by CONFIG on TRAIN_DATA,
    then processes the same texts (the last one empty) once with
    batch_size=1 and once with batch_size=4, and asserts that the first
    token of every non-empty doc receives the same tag in both runs.
    """
    config = load_config_from_str(CONFIG)
    nlp = English.from_config(config)
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for _ in range(50):
        # The loss values are never inspected, so a fresh dict per step suffices.
        nlp.update(train_examples, sgd=optimizer, losses={})
    texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
    nlp.select_pipes(enable=["tok2vec", "tagger"])
    docs_by_one = list(nlp.pipe(texts, batch_size=1))
    docs_by_four = list(nlp.pipe(texts, batch_size=4))
    # The trailing doc is empty (no doc[0]), so it is left out of the comparison.
    tags_by_one = [doc[0].tag_ for doc in docs_by_one[:-1]]
    tags_by_four = [doc[0].tag_ for doc in docs_by_four[:-1]]
    assert tags_by_one == tags_by_four