* Fixed group_by, removed idea of general attr_of function.

Matthew Honnibal 2014-08-22 00:02:37 +02:00
parent 811b7a6b91
commit 07ecf5d2f4
4 changed files with 16 additions and 50 deletions

View File

@@ -10,30 +10,14 @@ from spacy.tokens cimport Tokens
 cimport cython
-ctypedef fused AttrType:
-    ClusterID
-    StringHash
-    cython.char
-cdef enum AttrName:
-    LEX
-    FIRST
-    LENGTH
-    CLUSTER
-    NORM
-    SHAPE
-    LAST3
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
     cdef int set_orth(self, unicode word, Lexeme* lex) except -1
-    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *
 cdef English EN
 cpdef LexID lookup(unicode word) except 0
 cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)

View File

@@ -76,27 +76,6 @@ cdef class English(spacy.Language):
             i += 1
         return i
-    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *:
-        cdef Lexeme* w = <Lexeme*>lex_id
-        if attr == LEX:
-            return <AttrType>w.lex
-        elif attr == FIRST:
-            return w.string[0]
-        elif attr == LENGTH:
-            return w.length
-        elif attr == CLUSTER:
-            return w.cluster
-        elif attr == NORM:
-            return w.string_views[0]
-        elif attr == SHAPE:
-            return w.string_views[1]
-        elif attr == LAST3:
-            return w.string_views[2]
-        else:
-            raise AttributeError(attr)
 cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
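With the fused-type attr_of dispatcher removed, callers read the Lexeme fields directly, as the group_by change further down does. A minimal Cython sketch of that direct-access pattern, assuming the Lexeme layout cimported above (the lex hash plus the string_views array); the helper name view_key is illustrative, not part of the library:

from spacy.lexeme cimport Lexeme, LexID
from spacy.spacy cimport StringHash

cdef StringHash view_key(LexID lex_id, size_t view_idx):
    # View 0 is the lexeme's own hash; views 1..3 index the string_views
    # array (NORM, SHAPE, LAST3 in the removed AttrName enum).
    cdef Lexeme* w = <Lexeme*>lex_id
    if view_idx == 0:
        return w.lex
    return w.string_views[view_idx - 1]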

View File

@@ -1,5 +1,5 @@
 from libcpp.vector cimport vector
-from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport LexID
 from spacy.lexeme cimport Lexeme
 from cython.operator cimport dereference as deref
@@ -8,10 +8,10 @@ from spacy.spacy cimport Language
 cdef class Tokens:
     cdef Language lang
-    cdef vector[Lexeme_addr]* vctr
+    cdef vector[LexID]* vctr
     cdef size_t length
-    cpdef int append(self, Lexeme_addr token)
+    cpdef int append(self, LexID token)
     cpdef int extend(self, Tokens other) except -1
     cpdef object group_by(self, size_t attr)

View File

@@ -9,14 +9,14 @@ from spacy.spacy cimport StringHash
 cdef class Tokens:
     def __cinit__(self, Language lang):
         self.lang = lang
-        self.vctr = new vector[Lexeme_addr]()
+        self.vctr = new vector[LexID]()
         self.length = 0
     def __dealloc__(self):
         del self.vctr
     def __iter__(self):
-        cdef vector[Lexeme_addr].iterator it = self.vctr[0].begin()
+        cdef vector[LexID].iterator it = self.vctr[0].begin()
         while it != self.vctr[0].end():
             yield deref(it)
             inc(it)
@@ -27,16 +27,16 @@ cdef class Tokens:
     def __len__(self):
         return self.length
-    cpdef int append(self, Lexeme_addr token):
+    cpdef int append(self, LexID token):
         self.vctr[0].push_back(token)
         self.length += 1
     cpdef int extend(self, Tokens other) except -1:
-        cdef Lexeme_addr el
+        cdef LexID el
         for el in other:
             self.append(el)
-    cpdef object group_by(self, size_t attr):
+    cpdef object group_by(self, size_t view_idx):
         '''Group tokens that share the property attr into Tokens instances, and
         return a list of them. Returns a tuple of three lists:
@@ -63,9 +63,12 @@
         cdef list hashes = []
         cdef StringHash key
-        cdef Lexeme_addr t
+        cdef LexID t
         for t in self.vctr[0]:
-            key = self.lang.attr_of(t, attr)
+            if view_idx == 0:
+                key = (<Lexeme*>t).lex
+            else:
+                key = (<Lexeme*>t).string_views[view_idx - 1]
             if key in indices:
                 groups[indices[key]].append(t)
             else:
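The grouping strategy visible in the hunk above keeps a dict from key to group index, so each token is placed in O(1) while groups retain first-seen order. A minimal pure-Python sketch of that pattern (function and variable names are illustrative only, not the Tokens API):

def group_by_key(items, key_of):
    indices = {}   # key -> position in `groups`
    groups = []    # one list per distinct key, in first-seen order
    keys = []      # the key for each group, parallel to `groups`
    for item in items:
        key = key_of(item)
        if key in indices:
            groups[indices[key]].append(item)
        else:
            indices[key] = len(groups)
            keys.append(key)
            groups.append([item])
    return groups, keys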
@@ -78,7 +81,7 @@
     cpdef dict count_by(self, size_t attr):
         counts = {}
-        cdef Lexeme_addr t
+        cdef LexID t
         cdef StringHash key
         for t in self.vctr[0]:
             #key = attr_of(t, attr)
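In the count_by hunk the old attr_of call is left commented out; a minimal sketch of the dict-counting pattern it implies, reusing the illustrative key_of callable from the previous sketch (names are not part of the library):

def count_by_key(items, key_of):
    counts = {}
    for item in items:
        key = key_of(item)
        if key not in counts:
            counts[key] = 0
        counts[key] += 1
    return counts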