* 710k words per second for counts

2025-08-09 06:34:54 +03:00 · 2014-07-07 19:12:19 +02:00 · 2014-07-07 19:12:19 +02:00 · 80b36f9f27
commit 80b36f9f27
parent 057c21969b
4 changed files with 33 additions and 7 deletions
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -41,6 +41,7 @@ cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)

 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
+cpdef StringHash shape_of(size_t lex_id) except 0
 #cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
 #                         int split, size_t length)
                         
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -54,6 +54,10 @@ cpdef StringHash norm_of(size_t lex_id) except 0:
    return (<Lexeme*>lex_id).orth.norm


+cpdef StringHash shape_of(size_t lex_id) except 0:
+    return (<Lexeme*>lex_id).orth.shape
+
+
 cpdef ClusterID cluster_of(size_t lex_id):
    '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
    gives an integer representation of the cluster ID of the word, 
@ -94,7 +98,7 @@ cpdef double prob_of(size_t lex_id):
    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
-    pass
+    return (<Lexeme*>lex_id).dist.prob


 cpdef StringHash last3_of(size_t lex_id):
@ -105,8 +109,7 @@ cpdef StringHash last3_of(size_t lex_id):
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
-    cdef Lexeme* w = <Lexeme*>lex_id
-    return w.orth.last3 if w.orth != NULL else 0
+    return (<Lexeme*>lex_id).orth.last3


 cpdef bint is_oft_upper(size_t lex_id):
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@ -25,7 +25,30 @@ def get_normalized(unicode lex, size_t length):


 def get_word_shape(lex, length):
-    return lex
+    shape = ""
+    last = ""
+    shape_char = ""
+    seq = 0
+    for c in lex:
+        if c.isalpha():
+            if c.isupper():
+                shape_char = "X"
+            else:
+                shape_char = "x"
+        elif c.isdigit():
+            shape_char = "d"
+        else:
+            shape_char = c
+        if shape_char == last:
+            seq += 1
+        else:
+            seq = 0
+            last = shape_char
+        if seq < 3:
+            shape += shape_char
+    assert shape
+    return shape
+


 def set_orth_flags(lex, length):
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc


 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport norm_of
+from spacy.lexeme cimport norm_of, shape_of
 from spacy.spacy cimport StringHash


@ -45,9 +45,8 @@ cdef class Tokens:
        cdef Lexeme_addr t
        cdef StringHash key
        for t in self.vctr[0]:
-            key = norm_of(t)
+            key = (<Lexeme*>t).lex
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts
-