From f292f7f0e6756bb54eda2a011a7fa4094fff7c10 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 2 Nov 2016 23:47:46 +0100
Subject: [PATCH] Fix Issue #599, by considering empty documents to be parsed and tagged. Implementation is a bit dodgy.

---
 spacy/tokens/doc.pyx | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 32106da99..1200a0517 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -151,6 +151,11 @@ cdef class Doc:
                 # must be created.
                 self.push_back(
                     self.vocab.get(self.mem, orth), has_space)
+        # Tough to decide on policy for this. Is an empty doc tagged and parsed?
+        # There's no information we'd like to add to it, so I guess so?
+        if self.length == 0:
+            self.is_tagged = True
+            self.is_parsed = True
 
     def __getitem__(self, object i):
         '''
@@ -430,6 +435,10 @@ cdef class Doc:
             yield Span(self, start, self.length)
 
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
+        if self.length == 0:
+            # Flip these to false when we see the first token.
+            self.is_tagged = False
+            self.is_parsed = False
         if self.length == self.max_length:
             self._realloc(self.length * 2)
         cdef TokenC* t = &self.c[self.length]