Fix orths_and_spaces in Doc.__init__

This commit is contained in:
Matthew Honnibal 2016-09-21 14:52:05 +02:00
parent 070af4af9d
commit 2735b6247b

View File

@@ -83,6 +83,23 @@ cdef class Doc:
self._py_tokens = [] self._py_tokens = []
self._vector = None self._vector = None
self.noun_chunks_iterator = CHUNKERS.get(self.vocab.lang) self.noun_chunks_iterator = CHUNKERS.get(self.vocab.lang)
cdef unicode orth
cdef bint has_space
if orths_and_spaces is not None:
for orth_space in orths_and_spaces:
if isinstance(orth_space, unicode):
orth = orth_space
has_space = True
elif isinstance(orth_space, bytes):
raise ValueError(
"orths_and_spaces expects either List(unicode) or "
"List((unicode, bool)). Got bytes instance: %s" % (str(orth_space)))
else:
orth, has_space = orth_space
# Note that we pass self.mem here --- we have ownership, if LexemeC
# must be created.
self.push_back(
<const LexemeC*>self.vocab.get(self.mem, orth), has_space)
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a Token or a Span from the Doc. """Get a Token or a Span from the Doc.