mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
* Various tweaks to Tokens class
This commit is contained in:
parent
5928d158ce
commit
9cd0b6b3e9
|
@ -10,6 +10,8 @@ from .typedefs cimport LEMMA
|
||||||
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from .typedefs cimport POS, LEMMA
|
from .typedefs cimport POS, LEMMA
|
||||||
|
|
||||||
|
from unidecode import unidecode
|
||||||
|
|
||||||
cimport numpy
|
cimport numpy
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
@ -127,6 +129,9 @@ cdef class Tokens:
|
||||||
cdef const TokenC* last = &self.data[self.length - 1]
|
cdef const TokenC* last = &self.data[self.length - 1]
|
||||||
return self._string[:last.idx + last.lex.length]
|
return self._string[:last.idx + last.lex.length]
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return unidecode(unicode(self))
|
||||||
|
|
||||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
self._realloc(self.length * 2)
|
self._realloc(self.length * 2)
|
||||||
|
@ -161,7 +166,7 @@ cdef class Tokens:
|
||||||
output[i, j] = get_token_attr(&self.data[i], feature)
|
output[i, j] = get_token_attr(&self.data[i], feature)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def count_by(self, attr_id_t attr_id):
|
def count_by(self, attr_id_t attr_id, exclude=None):
|
||||||
"""Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
"""Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||||
by the values of the given attribute ID.
|
by the values of the given attribute ID.
|
||||||
|
|
||||||
|
@ -182,6 +187,8 @@ cdef class Tokens:
|
||||||
|
|
||||||
cdef PreshCounter counts = PreshCounter(2 ** 8)
|
cdef PreshCounter counts = PreshCounter(2 ** 8)
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
|
if exclude is not None and exclude(self[i]):
|
||||||
|
continue
|
||||||
attr = get_token_attr(&self.data[i], attr_id)
|
attr = get_token_attr(&self.data[i], attr_id)
|
||||||
counts.inc(attr, 1)
|
counts.inc(attr, 1)
|
||||||
return dict(counts)
|
return dict(counts)
|
||||||
|
@ -204,11 +211,7 @@ cdef class Tokens:
|
||||||
|
|
||||||
@cython.freelist(64)
|
@cython.freelist(64)
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token.
|
"""An individual token."""
|
||||||
|
|
||||||
Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
|
|
||||||
object.
|
|
||||||
"""
|
|
||||||
def __init__(self, Tokens tokens, int i):
|
def __init__(self, Tokens tokens, int i):
|
||||||
self._seq = tokens
|
self._seq = tokens
|
||||||
self.i = i
|
self.i = i
|
||||||
|
@ -228,11 +231,7 @@ cdef class Token:
|
||||||
self.lemma = t.lemma
|
self.lemma = t.lemma
|
||||||
self.tag = t.tag
|
self.tag = t.tag
|
||||||
self.dep = t.dep
|
self.dep = t.dep
|
||||||
|
self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
|
||||||
#self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32)
|
|
||||||
#for i in range(300):
|
|
||||||
# self.vec[i] = t.lex.vec[i]
|
|
||||||
self.vec = numpy.asarray(<float[:300,]> t.lex.vec)
|
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
|
@ -253,7 +252,7 @@ cdef class Token:
|
||||||
return self._seq.data[self.i].lex.length
|
return self._seq.data[self.i].lex.length
|
||||||
|
|
||||||
def check_flag(self, attr_id_t flag):
|
def check_flag(self, attr_id_t flag):
|
||||||
return False
|
return self.flags & (1 << flag)
|
||||||
|
|
||||||
def is_pos(self, univ_tag_t pos):
|
def is_pos(self, univ_tag_t pos):
|
||||||
return self.tag == pos
|
return self.tag == pos
|
||||||
|
|
Loading…
Reference in New Issue
Block a user