* 710k words per second for counts

2025-10-21 19:24:39 +03:00 · 2014-07-07 19:12:19 +02:00 · 2014-07-07 19:12:19 +02:00 · 80b36f9f27
commit 80b36f9f27
parent 057c21969b
4 changed files with 33 additions and 7 deletions
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -41,6 +41,7 @@ cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0
 #cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
 #                         int split, size_t length)
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -54,6 +54,10 @@ cpdef StringHash norm_of(size_t lex_id) except 0:
    return (<Lexeme*>lex_id).orth.norm
 cpdef StringHash shape_of(size_t lex_id) except 0:
    '''Access the `shape' field of the `orth' struct of the Lexeme pointed to
    by lex_id.

    lex_id is reinterpreted as a Lexeme* address, so it must be a value that
    really is the address of a live Lexeme (presumably one obtained from a
    lookup function -- confirm against callers).
    '''
    return (<Lexeme*>lex_id).orth.shape
 cpdef ClusterID cluster_of(size_t lex_id):
    '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
    gives an integer representation of the cluster ID of the word, 
@ -94,7 +98,7 @@ cpdef double prob_of(size_t lex_id):
    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
-    pass
+    return (<Lexeme*>lex_id).dist.prob
 cpdef StringHash last3_of(size_t lex_id):
@ -105,8 +109,7 @@ cpdef StringHash last3_of(size_t lex_id):
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
-    cdef Lexeme* w = <Lexeme*>lex_id
+    return (<Lexeme*>lex_id).orth.last3
    return w.orth.last3 if w.orth != NULL else 0
 cpdef bint is_oft_upper(size_t lex_id):
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@ -25,7 +25,30 @@ def get_normalized(unicode lex, size_t length):
 def get_word_shape(lex, length):
     """Return a coarse orthographic "shape" string for the word *lex*.

     Every character is mapped to a class symbol: ``X`` for an uppercase
     letter, ``x`` for any other alphabetic character, ``d`` for a digit,
     and the character itself for everything else.  A run of identical
     symbols is truncated to its first three, so ``"Hello"`` maps to
     ``"Xxxx"`` and ``"1234567"`` maps to ``"ddd"``.

     Args:
         lex: The word to describe; iterated character by character.
         length: Unused here; kept so the signature stays unchanged.

     Returns:
         The non-empty shape string.

     Raises:
         ValueError: If *lex* yields no characters, so no shape exists.
     """
     symbols = []
     previous = ""
     run = 0  # how many times `previous` has repeated since it first appeared
     for c in lex:
         if c.isalpha():
             symbol = "X" if c.isupper() else "x"
         elif c.isdigit():
             symbol = "d"
         else:
             symbol = c
         if symbol == previous:
             run += 1
         else:
             run = 0
             previous = symbol
         # Keep only the first three symbols of any run.
         if run < 3:
             symbols.append(symbol)
     # An explicit raise (rather than `assert`) keeps the non-empty guarantee
     # even when assertions are compiled out or disabled.
     if not symbols:
         raise ValueError("cannot compute the shape of an empty string")
     return "".join(symbols)
 def set_orth_flags(lex, length):
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport norm_of
+from spacy.lexeme cimport norm_of, shape_of
 from spacy.spacy cimport StringHash
@ -45,9 +45,8 @@ cdef class Tokens:
        cdef Lexeme_addr t
        cdef StringHash key
        for t in self.vctr[0]:
-            key = norm_of(t)
+            key = (<Lexeme*>t).lex
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts