* Fix Issue #14: Improve parsing API

This commit is contained in:
Matthew Honnibal 2015-01-30 18:04:41 +11:00
parent 998b607f65
commit b68f563c2f
2 changed files with 62 additions and 19 deletions

View File

@ -35,6 +35,7 @@ cdef class Tokens:
cdef TokenC* data cdef TokenC* data
cdef list _py_tokens
cdef unicode _string cdef unicode _string
cdef list _tag_strings cdef list _tag_strings
cdef list _dep_strings cdef list _dep_strings

View File

@ -88,6 +88,7 @@ cdef class Tokens:
self.length = 0 self.length = 0
self.is_tagged = False self.is_tagged = False
self.is_parsed = False self.is_parsed = False
self._py_tokens = [None] * self.length
self._tag_strings = [] # These will be set by the POS tagger and parser self._tag_strings = [] # These will be set by the POS tagger and parser
self._dep_strings = [] # The strings are arbitrary and model-specific. self._dep_strings = [] # The strings are arbitrary and model-specific.
@ -114,13 +115,18 @@ cdef class Tokens:
def __getitem__(self, i):
    """Retrieve a token.

    The Python Token objects are created lazily from internal C data, and
    cached in _py_tokens, so repeated lookups of the same position return
    the same Token object.

    Args:
        i (int): Position of the token. A negative value counts back from
            the end of the sequence, as with Python lists.

    Returns:
        token (Token):
    """
    if i < 0:
        # Negative positions are offsets from the end: -1 is the last token.
        # Subtracting a negative i would land past the end of the sequence.
        i = self.length + i
    # NOTE(review): bounds_check is defined elsewhere; presumably it raises
    # for positions outside the sequence (allowing for PADDING) -- confirm it
    # also rejects a position that is still negative after normalisation.
    bounds_check(i, self.length, PADDING)
    if self._py_tokens[i] is None:
        self._py_tokens[i] = Token(self, i)
    return self._py_tokens[i]
def __iter__(self): def __iter__(self):
"""Iterate over the tokens. """Iterate over the tokens.
@ -151,6 +157,7 @@ cdef class Tokens:
t.lex = lex_or_tok t.lex = lex_or_tok
t.idx = idx t.idx = idx
self.length += 1 self.length += 1
self._py_tokens.append(None)
return idx + t.lex.length return idx + t.lex.length
@cython.boundscheck(False) @cython.boundscheck(False)
@ -256,24 +263,59 @@ cdef class Token:
def nbor(self, int i=1):
    """Return the token i positions away from this one in the same sequence.

    Args:
        i (int): Offset relative to this token; defaults to 1 (the next
            token).

    Returns:
        token (Token):
    """
    neighbour_index = self.i + i
    return Token(self._seq, neighbour_index)
def child(self, int i=1): @property
def n_lefts(self):
    """Count the leftward immediate children of the word, in the syntactic
    dependency parse.

    Returns:
        n (int): Number of tokens before this one whose head is this token.

    Raises:
        AttributeError: If the sequence has not been parsed.
    """
    if not self._seq.is_parsed:
        raise AttributeError(_parse_unset_error)
    cdef const TokenC* tokens = self._seq.data
    cdef int i
    # C locals are not zero-initialised, so the counter must start at 0
    # explicitly before it is incremented.
    cdef int n = 0
    # head is stored as an offset from the token's own position, so token i
    # is a child of this word when i + head lands on self.i.
    for i in range(self.i):
        if i + tokens[i].head == self.i:
            n += 1
    return n
@property
def n_rights(self):
    """Count the rightward immediate children of the word, in the syntactic
    dependency parse.

    Returns:
        n (int): Number of tokens after this one whose head is this token.

    Raises:
        AttributeError: If the sequence has not been parsed.
    """
    if not self._seq.is_parsed:
        raise AttributeError(_parse_unset_error)
    cdef const TokenC* tokens = self._seq.data
    cdef int i
    # C locals are not zero-initialised, so the counter must start at 0
    # explicitly before it is incremented.
    cdef int n = 0
    # head is stored as an offset from the token's own position, so token i
    # is a child of this word when i + head lands on self.i.
    for i in range(self.i + 1, self._seq.length):
        if i + tokens[i].head == self.i:
            n += 1
    return n
@property
def lefts(self):
    """Yield the immediate children of this word that precede it, according
    to the syntactic dependency parse.

    Because this is a generator, the parsed-state check runs when iteration
    starts rather than when the attribute is read.

    Raises:
        AttributeError: If the sequence has not been parsed.
    """
    if not self._seq.is_parsed:
        raise AttributeError(_parse_unset_error)
    cdef const TokenC* seq_data = self._seq.data
    cdef int pos
    # Scan every earlier position; a token is a child when its relative
    # head offset points at this word.
    for pos in range(self.i):
        if pos + seq_data[pos].head == self.i:
            yield Token(self._seq, pos)
@property
def rights(self):
    """The rightward immediate children of the word, in the syntactic
    dependency parse.

    Because this is a generator, the parsed-state check runs when iteration
    starts rather than when the attribute is read.

    Raises:
        AttributeError: If the sequence has not been parsed.
    """
    if not self._seq.is_parsed:
        raise AttributeError(_parse_unset_error)
    cdef const TokenC* tokens = self._seq.data
    cdef int i
    # Start strictly after this word, matching n_rights: starting at self.i
    # would make a token whose head offset is 0 report itself as its own
    # right child.
    for i in range(self.i + 1, self._seq.length):
        if i + tokens[i].head == self.i:
            yield Token(self._seq, i)
property head: property head:
"""The token predicted by the parser to be the head of the current token.""" """The token predicted by the parser to be the head of the current token."""
@ -282,7 +324,7 @@ cdef class Token:
msg = _parse_unset_error msg = _parse_unset_error
raise AttributeError(msg) raise AttributeError(msg)
cdef const TokenC* t = &self._seq.data[self.i] cdef const TokenC* t = &self._seq.data[self.i]
return Token(self._seq, self.i + t.head) return self._seq[self.i + t.head]
property whitespace_: property whitespace_:
def __get__(self): def __get__(self):
@ -344,9 +386,9 @@ cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
return 0 return 0
_parse_unset_error = """Text has not been parsed, so cannot access head, child or sibling. _parse_unset_error = """Text has not been parsed, so cannot be accessed.
Check that the parser data is installed. Check that the parser data is installed. Run "python -m spacy.en.download" if not.
Check that the parse=True argument was set in the call to English.__call__ Check whether parse=False in the call to English.__call__
""" """