* 710k words per second for counts

This commit is contained in:
Matthew Honnibal 2014-07-07 19:12:19 +02:00
parent 057c21969b
commit 80b36f9f27
4 changed files with 33 additions and 7 deletions

View File

@ -41,6 +41,7 @@ cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
#cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
# int split, size_t length)

View File

@ -54,6 +54,10 @@ cpdef StringHash norm_of(size_t lex_id) except 0:
return (<Lexeme*>lex_id).orth.norm
cpdef StringHash shape_of(size_t lex_id) except 0:
    '''Access the `shape' field of the orth data of the Lexeme pointed to by
    lex_id.

    lex_id is reinterpreted as a Lexeme pointer; no validity check is done
    here, so the caller must pass the address of a live Lexeme.
    '''
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth.shape
cpdef ClusterID cluster_of(size_t lex_id):
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
gives an integer representation of the cluster ID of the word,
@ -94,7 +98,7 @@ cpdef double prob_of(size_t lex_id):
>>> prob_of(lookup(u'world'))
-20.10340371976182
'''
pass
return (<Lexeme*>lex_id).dist.prob
cpdef StringHash last3_of(size_t lex_id):
@ -105,8 +109,7 @@ cpdef StringHash last3_of(size_t lex_id):
>>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
[u'llo', u'!']
'''
cdef Lexeme* w = <Lexeme*>lex_id
return w.orth.last3 if w.orth != NULL else 0
return (<Lexeme*>lex_id).orth.last3
cpdef bint is_oft_upper(size_t lex_id):

View File

@ -25,7 +25,30 @@ def get_normalized(unicode lex, size_t length):
def get_word_shape(lex, length):
    """Return a coarse orthographic "shape" string for ``lex``.

    Each character is mapped to a shape symbol:

    * alphabetic and upper-case -> ``"X"``
    * alphabetic otherwise      -> ``"x"``
    * digit                     -> ``"d"``
    * anything else             -> the character itself

    Runs of the same shape symbol are truncated: only the first three
    symbols of a run are kept, so long words collapse to short shapes.

    >>> get_word_shape(u'Hello', 5)
    'Xxxx'
    >>> get_word_shape(u'1984', 4)
    'ddd'

    Args:
        lex: The string to describe.
        length: Accepted for signature compatibility; not used here.

    Returns:
        The non-empty shape string.

    Raises:
        AssertionError: If the resulting shape is empty (e.g. ``lex`` is
            empty).
    """
    pieces = []
    last = ""
    seq = 0
    for c in lex:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            # A new run starts: reset the repeat counter and remember the
            # symbol that opened it.
            seq = 0
            last = shape_char
        # seq counts repeats *after* the first symbol of a run, so this keeps
        # at most three symbols per run.
        if seq < 3:
            pieces.append(shape_char)
    shape = "".join(pieces)
    assert shape
    return shape
def set_orth_flags(lex, length):

View File

@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport norm_of
from spacy.lexeme cimport norm_of, shape_of
from spacy.spacy cimport StringHash
@ -45,9 +45,8 @@ cdef class Tokens:
cdef Lexeme_addr t
cdef StringHash key
for t in self.vctr[0]:
key = norm_of(t)
key = (<Lexeme*>t).lex
if key not in counts:
counts[key] = 0
counts[key] += 1
return counts