* Various tweaks to Tokens class

2025-07-18 20:22:25 +03:00 · 2015-01-22 02:05:37 +11:00 · 2015-01-22 02:05:37 +11:00 · 9cd0b6b3e9
commit 9cd0b6b3e9
parent 5928d158ce
1 changed files with 11 additions and 12 deletions
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -10,6 +10,8 @@ from .typedefs cimport LEMMA
 from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA
 from unidecode import unidecode
 cimport numpy
 import numpy
@ -127,6 +129,9 @@ cdef class Tokens:
        cdef const TokenC* last = &self.data[self.length - 1]
        return self._string[:last.idx + last.lex.length]
    def __str__(self):
        return unidecode(unicode(self))
    cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
        if self.length == self.max_length:
            self._realloc(self.length * 2)
@ -161,7 +166,7 @@ cdef class Tokens:
                output[i, j] = get_token_attr(&self.data[i], feature)
        return output
-    def count_by(self, attr_id_t attr_id):
+    def count_by(self, attr_id_t attr_id, exclude=None):
        """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
        by the values of the given attribute ID.
@ -182,6 +187,8 @@ cdef class Tokens:
        cdef PreshCounter counts = PreshCounter(2 ** 8)
        for i in range(self.length):
            if exclude is not None and exclude(self[i]):
                continue
            attr = get_token_attr(&self.data[i], attr_id)
            counts.inc(attr, 1)
        return dict(counts)
@ -204,11 +211,7 @@ cdef class Tokens:
@cython.freelist(64)
 cdef class Token:
-    """An individual token.
+    """An individual token."""
    Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
    object.
    """
    def __init__(self, Tokens tokens, int i):
        self._seq = tokens
        self.i = i
@ -228,11 +231,7 @@ cdef class Token:
        self.lemma = t.lemma
        self.tag = t.tag
        self.dep = t.dep
-
+        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
        #self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32)
        #for i in range(300):
        #    self.vec[i] = t.lex.vec[i]
        self.vec = numpy.asarray(<float[:300,]> t.lex.vec)
    def __unicode__(self):
        cdef const TokenC* t = &self._seq.data[self.i]
@ -253,7 +252,7 @@ cdef class Token:
        return self._seq.data[self.i].lex.length
    def check_flag(self, attr_id_t flag):
-        return False
+        return self.flags & (1 << flag)
    def is_pos(self, univ_tag_t pos):
        return self.tag == pos