* Upd from spacy

This commit is contained in:
Matthew Honnibal 2014-07-23 17:35:18 +01:00
parent 87bf205b82
commit a895fe5ddb
5 changed files with 51 additions and 17 deletions

View File

@@ -1,8 +1,17 @@
from .lexeme import lex_of
from .lexeme import sic_of
from .tokens import Tokens

# Don't know how to get the enum Python visible :(
SIC = 0
LEX = 1
NORM = 2
SHAPE = 3
LAST3 = 4

# Public package API. NOTE(review): this deliberately lists the objects
# themselves rather than their names (the file's existing convention) —
# the stale duplicate `__all__ = [lex_of, sic_of]` assignment is removed.
__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
"""

View File

@@ -38,11 +38,13 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
# ...Unless we're at 0
return i == 0
if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
return False
# Don't count commas as punct if the next char is a number
if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
return False
-    # Don't count periods as punct if the next char is a number
-    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
+    # Don't count periods as punct if the next char is not whitespace
+    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
return False
return not word[i].isalnum()

View File

@@ -16,12 +16,12 @@ from . import util
from os import path
cimport cython
def get_normalized(unicode lex, size_t length):
return lex.lower()
#if lex.isdigit():
# return '!YEAR' if length == 4 else '!DIGIT'
#else:
# return lex.lower()
if lex.isalpha() and lex.islower():
return lex
else:
return get_word_shape(lex, length)
def get_word_shape(lex, length):
@@ -55,7 +55,6 @@ def set_orth_flags(lex, length):
return 0
cdef class Language:
def __cinit__(self, name):
self.name = name

View File

@@ -14,5 +14,5 @@ cdef class Tokens:
cpdef int append(self, Lexeme_addr token)
cpdef int extend(self, Tokens other) except -1
-    cpdef list group_by(self, StringAttr attr)
+    cpdef object group_by(self, StringAttr attr)
cpdef dict count_by(self, StringAttr attr)

View File

@@ -37,21 +37,45 @@ cdef class Tokens:
for el in other:
self.append(el)
cpdef list group_by(self, StringAttr attr):
cpdef object group_by(self, StringAttr attr):
'''Group tokens that share the property attr into Tokens instances, and
return a list of them. Returns a tuple of three lists:
(string names, hashes, tokens)
The lists are aligned, so the ith entry in string names is the string
that the ith entry in hashes unhashes to, which the Tokens instance
is grouped by.
You can then use count_by or group_by on the Tokens
for further processing. Calling group_by and then asking the length
of the Tokens objects is equivalent to count_by, but somewhat slower.
'''
# Implementation here is working around some of the constraints in
# Cython about what type of thing can go in what type of container.
# Long story short, it's pretty hard to get a Python object like
# Tokens into a vector or array. If we really need this to run faster,
# we can be tricky and get the Python list access out of the loop. What
# we'd do is store pointers to the underlying vectors.
# So far, speed isn't mattering here.
cdef dict indices = {}
cdef vector[vector[Lexeme_addr]] groups = vector[vector[Lexeme_addr]]()
cdef list groups = []
cdef list names = []
cdef list hashes = []
cdef StringHash key
cdef Lexeme_addr t
for t in self.vctr[0]:
key = attr_of(t, attr)
if key in indices:
groups[indices[key]].push_back(t)
groups[indices[key]].append(t)
else:
indices[key] = groups.size()
groups.push_back(vector[Lexeme_addr]())
groups.back().push_back(t)
return groups
indices[key] = len(groups)
groups.append(Tokens(self.lang))
names.append(self.lang.unhash(key))
hashes.append(key)
groups[-1].append(t)
return names, hashes, groups
cpdef dict count_by(self, StringAttr attr):
counts = {}