* Upd from spacy

This commit is contained in:
Matthew Honnibal 2014-07-23 17:35:18 +01:00
parent 87bf205b82
commit a895fe5ddb
5 changed files with 51 additions and 17 deletions

View File

@@ -1,8 +1,17 @@
from .lexeme import lex_of from .lexeme import lex_of
from .lexeme import sic_of from .lexeme import sic_of
from .tokens import Tokens
__all__ = [lex_of, sic_of] # Don't know how to get the enum Python visible :(
SIC = 0
LEX = 1
NORM = 2
SHAPE = 3
LAST3 = 4
__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
""" """

View File

@@ -38,11 +38,13 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
# ...Unless we're at 0 # ...Unless we're at 0
return i == 0 return i == 0
if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
return False
# Don't count commas as punct if the next char is a number # Don't count commas as punct if the next char is a number
if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
return False return False
# Don't count periods as punct if the next char is a number # Don't count periods as punct if the next char is not whitespace
if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
return False return False
return not word[i].isalnum() return not word[i].isalnum()

View File

@@ -16,12 +16,12 @@ from . import util
from os import path from os import path
cimport cython cimport cython
def get_normalized(unicode lex, size_t length): def get_normalized(unicode lex, size_t length):
return lex.lower() if lex.isalpha() and lex.islower():
#if lex.isdigit(): return lex
# return '!YEAR' if length == 4 else '!DIGIT' else:
#else: return get_word_shape(lex, length)
# return lex.lower()
def get_word_shape(lex, length): def get_word_shape(lex, length):
@@ -55,7 +55,6 @@ def set_orth_flags(lex, length):
return 0 return 0
cdef class Language: cdef class Language:
def __cinit__(self, name): def __cinit__(self, name):
self.name = name self.name = name

View File

@@ -14,5 +14,5 @@ cdef class Tokens:
cpdef int append(self, Lexeme_addr token) cpdef int append(self, Lexeme_addr token)
cpdef int extend(self, Tokens other) except -1 cpdef int extend(self, Tokens other) except -1
cpdef list group_by(self, StringAttr attr) cpdef object group_by(self, StringAttr attr)
cpdef dict count_by(self, StringAttr attr) cpdef dict count_by(self, StringAttr attr)

View File

@@ -37,21 +37,45 @@ cdef class Tokens:
for el in other: for el in other:
self.append(el) self.append(el)
cpdef list group_by(self, StringAttr attr): cpdef object group_by(self, StringAttr attr):
'''Group tokens that share the property attr into Tokens instances, and
return a list of them. Returns a tuple of three lists:
(string names, hashes, tokens)
The lists are aligned, so the ith entry in string names is the string
that the ith entry in hashes unhashes to, which the Tokens instance
is grouped by.
You can then use count_by or group_by on the Tokens
for further processing. Calling group_by and then asking the length
of the Tokens objects is equivalent to count_by, but somewhat slower.
'''
# Implementation here is working around some of the constraints in
# Cython about what type of thing can go in what type of container.
# Long story short, it's pretty hard to get a Python object like
# Tokens into a vector or array. If we really need this to run faster,
# we can be tricky and get the Python list access out of the loop. What
# we'd do is store pointers to the underlying vectors.
# So far, speed isn't mattering here.
cdef dict indices = {} cdef dict indices = {}
cdef vector[vector[Lexeme_addr]] groups = vector[vector[Lexeme_addr]]() cdef list groups = []
cdef list names = []
cdef list hashes = []
cdef StringHash key cdef StringHash key
cdef Lexeme_addr t cdef Lexeme_addr t
for t in self.vctr[0]: for t in self.vctr[0]:
key = attr_of(t, attr) key = attr_of(t, attr)
if key in indices: if key in indices:
groups[indices[key]].push_back(t) groups[indices[key]].append(t)
else: else:
indices[key] = groups.size() indices[key] = len(groups)
groups.push_back(vector[Lexeme_addr]()) groups.append(Tokens(self.lang))
groups.back().push_back(t) names.append(self.lang.unhash(key))
return groups hashes.append(key)
groups[-1].append(t)
return names, hashes, groups
cpdef dict count_by(self, StringAttr attr): cpdef dict count_by(self, StringAttr attr):
counts = {} counts = {}