* Various tweaks to Tokens class

This commit is contained in:
Matthew Honnibal 2015-01-22 02:05:37 +11:00
parent 5928d158ce
commit 9cd0b6b3e9

View File

@ -10,6 +10,8 @@ from .typedefs cimport LEMMA
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA from .typedefs cimport POS, LEMMA
from unidecode import unidecode
cimport numpy cimport numpy
import numpy import numpy
@ -127,6 +129,9 @@ cdef class Tokens:
cdef const TokenC* last = &self.data[self.length - 1] cdef const TokenC* last = &self.data[self.length - 1]
return self._string[:last.idx + last.lex.length] return self._string[:last.idx + last.lex.length]
def __str__(self):
    """Return the result of passing ``unicode(self)`` through ``unidecode``."""
    # NOTE(review): presumably this keeps str() output ASCII-only on
    # Python 2, where __str__ is expected to return bytes -- confirm.
    as_unicode = unicode(self)
    return unidecode(as_unicode)
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
if self.length == self.max_length: if self.length == self.max_length:
self._realloc(self.length * 2) self._realloc(self.length * 2)
@ -161,7 +166,7 @@ cdef class Tokens:
output[i, j] = get_token_attr(&self.data[i], feature) output[i, j] = get_token_attr(&self.data[i], feature)
return output return output
def count_by(self, attr_id_t attr_id): def count_by(self, attr_id_t attr_id, exclude=None):
"""Produce a dict of {attribute (int): count (ints)} frequencies, keyed """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
by the values of the given attribute ID. by the values of the given attribute ID.
@ -182,6 +187,8 @@ cdef class Tokens:
cdef PreshCounter counts = PreshCounter(2 ** 8) cdef PreshCounter counts = PreshCounter(2 ** 8)
for i in range(self.length): for i in range(self.length):
if exclude is not None and exclude(self[i]):
continue
attr = get_token_attr(&self.data[i], attr_id) attr = get_token_attr(&self.data[i], attr_id)
counts.inc(attr, 1) counts.inc(attr, 1)
return dict(counts) return dict(counts)
@ -204,11 +211,7 @@ cdef class Tokens:
@cython.freelist(64) @cython.freelist(64)
cdef class Token: cdef class Token:
"""An individual token. """An individual token."""
Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
object.
"""
def __init__(self, Tokens tokens, int i): def __init__(self, Tokens tokens, int i):
self._seq = tokens self._seq = tokens
self.i = i self.i = i
@ -228,11 +231,7 @@ cdef class Token:
self.lemma = t.lemma self.lemma = t.lemma
self.tag = t.tag self.tag = t.tag
self.dep = t.dep self.dep = t.dep
self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
#self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32)
#for i in range(300):
# self.vec[i] = t.lex.vec[i]
self.vec = numpy.asarray(<float[:300,]> t.lex.vec)
def __unicode__(self): def __unicode__(self):
cdef const TokenC* t = &self._seq.data[self.i] cdef const TokenC* t = &self._seq.data[self.i]
@ -253,7 +252,7 @@ cdef class Token:
return self._seq.data[self.i].lex.length return self._seq.data[self.i].lex.length
def check_flag(self, attr_id_t flag): def check_flag(self, attr_id_t flag):
return False return self.flags & (1 << flag)
def is_pos(self, univ_tag_t pos): def is_pos(self, univ_tag_t pos):
return self.tag == pos return self.tag == pos