prevent None in gold fields (#5425)

* set gold fields to empty list instead of keeping them as None

* add unit test
This commit is contained in:
Sofie Van Landeghem 2020-05-13 22:08:50 +02:00 committed by GitHub
parent 113e7981d0
commit b04738903e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 2 deletions

View File

@ -658,7 +658,15 @@ cdef class GoldParse:
entdoc = None
# avoid allocating memory if the doc does not contain any tokens
if self.length > 0:
if self.length == 0:
self.words = []
self.tags = []
self.heads = []
self.labels = []
self.ner = []
self.morphology = []
else:
if words is None:
words = [token.text for token in doc]
if tags is None:

View File

@ -7,7 +7,7 @@ from spacy.lang.en import English
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.vocab import Vocab
from spacy.syntax.ner import BiluoPushDown
from spacy.gold import GoldParse
from spacy.gold import GoldParse, minibatch
from spacy.tokens import Doc
@ -174,6 +174,31 @@ def test_accept_blocked_token():
assert ner2.moves.is_valid(state2, "U-")
def test_train_empty():
    """Training on data that includes an empty text must not raise.

    This is a smoke test: it passes as long as every ``nlp.update`` call
    returns. Nothing is asserted about the resulting losses.
    """
    # One ordinary annotated example plus an empty text with no entities.
    examples = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("", {"entities": []}),
    ]

    # English pipeline with an NER component (added last) that has the
    # PERSON label registered.
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PERSON")
    nlp.add_pipe(ner, last=True)
    nlp.begin_training()

    # Two passes over the data, each with a fresh loss dict.
    for _ in range(2):
        losses = {}
        for batch in minibatch(examples):
            # Split the (text, annotation) pairs into two parallel tuples.
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, losses=losses)
def test_overwrite_token():
nlp = English()
ner1 = nlp.create_pipe("ner")