Mirror of https://github.com/explosion/spaCy.git
* Large refactor of Token objects, making them much thinner. This is to support fast parse-tree navigation.
parent 88170e6295
commit 77d62d0179
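Note: the diff below replaces the old eagerly-copied Token fields with a single borrowed `const TokenC*` plus an offset, created lazily through `Token.cinit` and cached per document. A minimal pure-Python sketch of that pattern (the names `ThinToken`, `make_token`, `py_cache` and `data` are illustrative stand-ins, not spaCy APIs):

    # Sketch only: mirrors the cache-then-construct logic of Token.cinit below.
    # ThinToken, make_token, py_cache and data are hypothetical names.
    class ThinToken:
        def __init__(self, data, offset, array_len):
            self.data = data            # borrowed view of the whole token array
            self.i = offset             # this token's position in that array
            self.array_len = array_len  # bounds for neighbour navigation

    def make_token(py_cache, data, offset, array_len):
        assert 0 <= offset < array_len
        if py_cache[offset] is not None:   # already built once: reuse it
            return py_cache[offset]
        token = ThinToken(data, offset, array_len)
        py_cache[offset] = token           # cache, so doc[i] is doc[i]
        return token

The cache is what keeps repeated indexing cheap and identity-stable; the real implementation does the same check-construct-store dance in Cython.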
spacy/tokens.pxd

@@ -37,8 +37,8 @@ cdef class Tokens:
     cdef list _py_tokens
     cdef unicode _string
-    cdef list _tag_strings
-    cdef list _dep_strings
+    cdef tuple _tag_strings
+    cdef tuple _dep_strings

     cdef public bint is_tagged
     cdef public bint is_parsed
@@ -52,24 +52,35 @@ cdef class Tokens:


 cdef class Token:
-    cdef readonly Tokens _seq
-    cdef readonly int i
-
-    cdef readonly attr_t idx
-    cdef readonly attr_t cluster
-    cdef readonly attr_t length
-    cdef readonly attr_t orth
-    cdef readonly attr_t lower
-    cdef readonly attr_t norm
-    cdef readonly attr_t shape
-    cdef readonly attr_t prefix
-    cdef readonly attr_t suffix
-    cdef readonly float prob
-    cdef readonly float sentiment
-    cdef readonly attr_t flags
-    cdef readonly attr_t lemma
-    cdef readonly univ_pos_t pos
-    cdef readonly attr_t tag
-    cdef readonly attr_t dep
-    cdef readonly ndarray repvec
-    cdef readonly unicode string
+    cdef Vocab vocab
+    cdef Pool mem
+    cdef unicode _string
+
+    cdef const TokenC* c
+    cdef int i
+    cdef int array_len
+
+    cdef list _py
+    cdef tuple _tag_strings
+    cdef tuple _dep_strings
+
+    @staticmethod
+    cdef inline Token cinit(Pool mem, Vocab vocab, unicode string,
+                            const TokenC* token, int offset, int array_len,
+                            list py_tokens, tuple tag_strings, tuple dep_strings):
+        assert offset >= 0 and offset < array_len
+        if py_tokens[offset] is not None:
+            return py_tokens[offset]
+
+        cdef Token self = Token.__new__(Token, mem, vocab, string)
+        self.c = token
+        self.i = offset
+        self.array_len = array_len
+
+        self._py = py_tokens
+        self._tag_strings = tag_strings
+        self._dep_strings = dep_strings
+        py_tokens[offset] = self
+        return self

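Note: this .pxd hunk is the heart of the refactor. Token now stores only the pointer `c`, its offset `i`, the array length, and shared metadata, and `cinit` is a `cdef inline` static factory, so the fast path avoids Python-level construction. Because `TokenC.head` holds a relative offset, finding a head is plain pointer arithmetic; an index-based sketch (the `heads` list and `head_index` helper are hypothetical):

    # heads[i] holds the offset from token i to its head, as TokenC.head does.
    # Mirrors `self.c + self.c.head` in the new code below.
    def head_index(heads, i):
        return i + heads[i]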
spacy/tokens.pyx
@@ -88,31 +88,11 @@ cdef class Tokens:
         self.length = 0
         self.is_tagged = False
         self.is_parsed = False
-        self._py_tokens = [None] * self.length
-        self._tag_strings = [] # These will be set by the POS tagger and parser
-        self._dep_strings = [] # The strings are arbitrary and model-specific.
+        self._py_tokens = []
+        self._tag_strings = tuple() # These will be set by the POS tagger and parser
+        self._dep_strings = tuple() # The strings are arbitrary and model-specific.

-    def sentences(self):
-        cdef int i
-        sentences = []
-        cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
-        cdef attr_t period = self.vocab.strings['.']
-        cdef attr_t question = self.vocab.strings['?']
-        cdef attr_t exclamation = self.vocab.strings['!']
-        spans = []
-        start = None
-        for i in range(self.length):
-            if start is None:
-                start = i
-            if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
-                    self.data[i].lex.orth == question:
-                spans.append((start, i+1))
-                start = None
-        if start is not None:
-            spans.append((start, self.length))
-        return spans
-
-    def __getitem__(self, i):
+    def __getitem__(self, object i):
         """Retrieve a token.

         The Python Token objects are created lazily from internal C data, and
@@ -124,9 +104,9 @@ cdef class Tokens:
         if i < 0:
             i = self.length - i
         bounds_check(i, self.length, PADDING)
-        if self._py_tokens[i] is None:
-            self._py_tokens[i] = Token(self, i)
-        return self._py_tokens[i]
+        return Token.cinit(self.mem, self.vocab, self._string,
+                           &self.data[i], i, self.length,
+                           self._py_tokens, self._tag_strings, self._dep_strings)

     def __iter__(self):
         """Iterate over the tokens.
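Note: with `__getitem__` delegating to `Token.cinit`, indexing is idempotent: the first access builds the wrapper, later accesses return the cached object, and `__iter__` in the next hunk goes through the same path. Reusing the hypothetical `make_token` sketch from above:

    n = 4
    py_cache = [None] * n
    data = object()                      # stands in for the C token array
    t0 = make_token(py_cache, data, 0, n)
    assert make_token(py_cache, data, 0, n) is t0   # same object both times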
@@ -135,7 +115,9 @@ cdef class Tokens:
         token (Token):
         """
         for i in range(self.length):
-            yield self[i]
+            yield Token.cinit(self.mem, self.vocab, self._string,
+                              &self.data[i], i, self.length,
+                              self._py_tokens, self._tag_strings, self._dep_strings)

     def __len__(self):
         return self.length
@@ -157,6 +139,8 @@ cdef class Tokens:
             self._py_tokens.append(None)
         return idx + t.lex.length

+
+
     @cython.boundscheck(False)
     cpdef long[:,:] to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy ndarray
@@ -224,64 +208,125 @@ cdef class Tokens:
         for i in range(self.length, self.max_length + PADDING):
             self.data[i].lex = &EMPTY_LEXEME

+    @property
+    def sents(self):
+        """This is really only a place-holder for a proper solution."""
+        cdef int i
+        sentences = []
+        cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
+        cdef attr_t period = self.vocab.strings['.']
+        cdef attr_t question = self.vocab.strings['?']
+        cdef attr_t exclamation = self.vocab.strings['!']
+        spans = []
+        start = None
+        for i in range(self.length):
+            if start is None:
+                start = i
+            if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
+                    self.data[i].lex.orth == question:
+                spans.append((start, i+1))
+                start = None
+        if start is not None:
+            spans.append((start, self.length))
+        return spans
+

-@cython.freelist(64)
 cdef class Token:
     """An individual token."""
-    def __cinit__(self, Tokens tokens, int i):
-        self._seq = tokens
-        self.i = i
-        cdef const TokenC* t = &tokens.data[i]
-        self.idx = t.idx
-        self.cluster = t.lex.cluster
-        self.length = t.lex.length
-        self.orth = t.lex.orth
-        self.lower = t.lex.lower
-        self.norm = t.lex.norm
-        self.shape = t.lex.shape
-        self.prefix = t.lex.prefix
-        self.suffix = t.lex.suffix
-        self.prob = t.lex.prob
-        self.sentiment = t.lex.sentiment
-        self.flags = t.lex.flags
-        self.lemma = t.lemma
-        self.pos = t.pos
-        self.tag = t.tag
-        self.dep = t.dep
-        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
-        cdef int next_idx = (t+1).idx
-        if next_idx <= self.idx:
-            next_idx = self.idx + self.length
-        self.string = tokens._string[self.idx:next_idx]
+    def __cinit__(self, Pool mem, Vocab vocab, unicode string):
+        self.mem = mem
+        self.vocab = vocab
+        self._string = string

     def __len__(self):
-        return self._seq.data[self.i].lex.length
+        return self.c.lex.length

     def nbor(self, int i=1):
-        return Token(self._seq, self.i + i)
+        return Token.cinit(self.mem, self.vocab, self._string,
+                           self.c, self.i, self.array_len,
+                           self._py, self._tag_strings, self._dep_strings)

+    @property
+    def string(self):
+        cdef int next_idx = (self.c + 1).idx
+        if next_idx < self.c.idx:
+            next_idx = self.c.idx + self.c.lex.length
+        return self._string[self.c.idx:next_idx]
+
+    @property
+    def idx(self):
+        return self.c.idx
+
+    @property
+    def cluster(self):
+        return self.c.lex.cluster
+
+    @property
+    def orth(self):
+        return self.c.lex.orth
+
+    @property
+    def lower(self):
+        return self.c.lex.lower
+
+    @property
+    def norm(self):
+        return self.c.lex.norm
+
+    @property
+    def shape(self):
+        return self.c.lex.shape
+
+    @property
+    def prefix(self):
+        return self.c.lex.prefix
+
+    @property
+    def suffix(self):
+        return self.c.lex.suffix
+
+    @property
+    def lemma(self):
+        return self.c.lemma
+
+    @property
+    def pos(self):
+        return self.c.pos
+
+    @property
+    def tag(self):
+        return self.c.tag
+
+    @property
+    def dep(self):
+        return self.c.dep
+
+    @property
+    def repvec(self):
+        return numpy.asarray(<float[:300,]> self.c.lex.repvec)
+
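Note: every attribute that used to be a `readonly` field copied in `__cinit__` is now read through `self.c` on demand, so a Token can be created before tagging or parsing and never goes stale. A dict-based sketch of that read-through behaviour (hypothetical names, not spaCy APIs):

    class ThinTokenView:
        def __init__(self, data, i):
            self.data = data
            self.i = i

        @property
        def tag(self):
            return self.data[self.i]["tag"]   # read on access, never copied

    tokens = [{"tag": 0}]
    view = ThinTokenView(tokens, 0)
    tokens[0]["tag"] = 42                     # e.g. the tagger runs later
    assert view.tag == 42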
     @property
     def n_lefts(self):
-        if not self._seq.is_parsed:
-            msg = _parse_unset_error
-            raise AttributeError(msg)
-        cdef const TokenC* tokens = self._seq.data
         cdef int n = 0
-        for i in range(self.i):
-            if i + tokens[i].head == self.i:
-                n += 1
+        cdef const TokenC* ptr = self.c - self.i
+        while ptr != self.c:
+            if ptr + ptr.head == self.c:
+                n += 1
+            ptr += 1
         return n

     @property
     def n_rights(self):
-        if not self._seq.is_parsed:
-            msg = _parse_unset_error
-            raise AttributeError(msg)
-        cdef const TokenC* tokens = self._seq.data
         cdef int n = 0
-        for i in range(self.i+1, self._seq.length):
-            if (i + tokens[i].head) == self.i:
-                n += 1
+        cdef const TokenC* ptr = self.c + (self.array_len - self.i)
+        while ptr != self.c:
+            if ptr + ptr.head == self.c:
+                n += 1
+            ptr -= 1
         return n

     @property
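Note: `n_lefts` and `n_rights` now walk the raw token array with pointers instead of indexing through `self._seq` (and drop the old `is_parsed` guard). The quantity computed is unchanged; in index form (hypothetical `heads` list):

    # A left child of i is any j < i whose head offset lands exactly on i.
    def n_lefts(heads, i):
        return sum(1 for j in range(i) if j + heads[j] == i)

    def n_rights(heads, i):
        return sum(1 for j in range(i + 1, len(heads)) if j + heads[j] == i)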
@@ -289,99 +334,94 @@ cdef class Token:
         """The leftward immediate children of the word, in the syntactic
         dependency parse.
         """
-        if not self._seq.is_parsed:
-            msg = _parse_unset_error
-            raise AttributeError(msg)
-        cdef const TokenC* tokens = self._seq.data
-        cdef int i
-        for i in range(self.i):
-            if i + tokens[i].head == self.i:
-                yield Token(self._seq, i)
+        cdef const TokenC* ptr = self.c - self.i
+        while ptr < self.c:
+            # If this head is still to the right of us, we can skip to it
+            # No token that's between this token and this head could be our
+            # child.
+            if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
+                ptr += ptr.head
+            elif ptr + ptr.head == self.c:
+                yield Token.cinit(self.mem, self.vocab, self._string,
+                                  ptr, self.i, self.array_len,
+                                  self._py, self._tag_strings, self._dep_strings)
+                ptr += 1
+            else:
+                ptr += 1

     @property
     def rights(self):
         """The rightward immediate children of the word, in the syntactic
         dependency parse."""
-        if not self._seq.is_parsed:
-            msg = _parse_unset_error
-            raise AttributeError(msg)
-        cdef const TokenC* tokens = self._seq.data
-        cdef int i
-        for i in range(self.i, self._seq.length):
-            if i + tokens[i].head == self.i:
-                yield Token(self._seq, i)
+        cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
+        while ptr > self.c:
+            # If this head is still to the right of us, we can skip to it
+            # No token that's between this token and this head could be our
+            # child.
+            if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
+                ptr += ptr.head
+            elif ptr + ptr.head == self.c:
+                yield Token.cinit(self.mem, self.vocab, self._string,
+                                  ptr, self.i, self.array_len,
+                                  self._py, self._tag_strings, self._dep_strings)
+                ptr -= 1
+            else:
+                ptr -= 1

-    property head:
-        """The token predicted by the parser to be the head of the current token."""
-        def __get__(self):
-            if not self._seq.is_parsed:
-                msg = _parse_unset_error
-                raise AttributeError(msg)
-            cdef const TokenC* t = &self._seq.data[self.i]
-            return self._seq[self.i + t.head]
+    @property
+    def head(self):
+        """The token predicted by the parser to be the head of the current token."""
+        return Token.cinit(self.mem, self.vocab, self._string,
+                           self.c + self.c.head, self.i, self.array_len,
+                           self._py, self._tag_strings, self._dep_strings)

-    property whitespace_:
-        def __get__(self):
-            return self.string[self.length:]
+    @property
+    def whitespace_(self):
+        return self.string[self.c.lex.length:]

-    property orth_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.orth]
+    @property
+    def orth_(self):
+        return self.vocab.strings[self.c.lex.orth]

-    property lower_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.lower]
+    @property
+    def lower_(self):
+        return self.vocab.strings[self.c.lex.lower]

-    property norm_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.norm]
+    @property
+    def norm_(self):
+        return self.vocab.strings[self.c.lex.norm]

-    property shape_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.shape]
+    @property
+    def shape_(self):
+        return self.vocab.strings[self.c.lex.shape]

-    property prefix_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.prefix]
+    @property
+    def prefix_(self):
+        return self.vocab.strings[self.c.lex.prefix]

-    property suffix_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.suffix]
+    @property
+    def suffix_(self):
+        return self.vocab.strings[self.c.lex.suffix]

-    property lemma_:
-        def __get__(self):
-            cdef const TokenC* t = &self._seq.data[self.i]
-            if t.lemma == 0:
-                return self.string
-            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
-            return py_ustr
+    @property
+    def lemma_(self):
+        return self.vocab.strings[self.c.lemma]

-    property pos_:
-        def __get__(self):
-            id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
-            return id_to_string[self.pos]
+    @property
+    def pos_(self):
+        return _pos_id_to_string[self.c.pos]

-    property tag_:
-        def __get__(self):
-            return self._seq._tag_strings[self.tag]
+    @property
+    def tag_(self):
+        return self._tag_strings[self.c.tag]

-    property dep_:
-        def __get__(self):
-            return self._seq._dep_strings[self.dep]
+    @property
+    def dep_(self):
+        return self._dep_strings[self.c.dep]


-cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
-    cdef int i
-    for i in range(32):
-        if bits & (1 << i):
-            n -= 1
-            if n < 1:
-                return i
-    return 0
+_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}


 _parse_unset_error = """Text has not been parsed, so cannot be accessed.
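Note: the new `lefts` and `rights` generators add the skip optimization the inline comments describe: assuming a projective parse, if a candidate token's head lies strictly between the candidate and this token, no token in that span can be our child, so the walk jumps straight to the head. An index-based sketch of the leftward walk (hypothetical `heads` list):

    def lefts(heads, i):
        j = 0
        while j < i:
            if heads[j] >= 1 and j + heads[j] < i:
                j += heads[j]          # skip the sealed span up to j's head
            else:
                if j + heads[j] == i:
                    yield j            # immediate left child of i
                j += 1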