mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* 710k words per second for counts
This commit is contained in:
parent
057c21969b
commit
80b36f9f27
|
@ -41,6 +41,7 @@ cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
|
|||
|
||||
cpdef StringHash lex_of(size_t lex_id) except 0
|
||||
cpdef StringHash norm_of(size_t lex_id) except 0
|
||||
cpdef StringHash shape_of(size_t lex_id) except 0
|
||||
#cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
|
||||
# int split, size_t length)
|
||||
|
||||
|
|
|
@ -54,6 +54,10 @@ cpdef StringHash norm_of(size_t lex_id) except 0:
|
|||
return (<Lexeme*>lex_id).orth.norm
|
||||
|
||||
|
||||
# Return the `orth.shape` field (a StringHash) of the Lexeme that `lex_id`
# points to.
#
# `lex_id` is an address stored in a size_t: it is cast straight to a Lexeme*
# and dereferenced with no NULL or validity check.
# NOTE(review): `except 0` presumably makes a return value of 0 signal a
# propagated exception -- confirm a legitimate shape hash can never be 0.
cpdef StringHash shape_of(size_t lex_id) except 0:
|
||||
return (<Lexeme*>lex_id).orth.shape
|
||||
|
||||
|
||||
cpdef ClusterID cluster_of(size_t lex_id):
|
||||
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
|
||||
gives an integer representation of the cluster ID of the word,
|
||||
|
@ -94,7 +98,7 @@ cpdef double prob_of(size_t lex_id):
|
|||
>>> prob_of(lookup(u'world'))
|
||||
-20.10340371976182
|
||||
'''
|
||||
pass
|
||||
return (<Lexeme*>lex_id).dist.prob
|
||||
|
||||
|
||||
cpdef StringHash last3_of(size_t lex_id):
|
||||
|
@ -105,8 +109,7 @@ cpdef StringHash last3_of(size_t lex_id):
|
|||
>>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
|
||||
[u'llo', u'!']
|
||||
'''
|
||||
cdef Lexeme* w = <Lexeme*>lex_id
|
||||
return w.orth.last3 if w.orth != NULL else 0
|
||||
return (<Lexeme*>lex_id).orth.last3
|
||||
|
||||
|
||||
cpdef bint is_oft_upper(size_t lex_id):
|
||||
|
|
|
@ -25,7 +25,30 @@ def get_normalized(unicode lex, size_t length):
|
|||
|
||||
|
||||
def get_word_shape(lex, length):
    """Return a coarse orthographic shape string for ``lex``.

    Each character is mapped to a shape symbol: ``"X"`` for an upper-case
    letter, ``"x"`` for any other alphabetic character, ``"d"`` for a digit,
    and the character itself otherwise. Runs of the same shape symbol are
    truncated to at most three, so ``"Hello"`` becomes ``"Xxxx"`` and
    ``"2014"`` becomes ``"ddd"``.

    Args:
        lex: The string to describe.
        length: Unused by this function; kept so the signature stays
            unchanged for existing callers.

    Returns:
        The shape string; never empty.

    Raises:
        AssertionError: If the computed shape is empty, i.e. ``lex`` is empty.
    """
    # NOTE: the previous stub `return lex` at the top of the body made all of
    # the logic below unreachable; it has been removed.
    symbols = []
    last = ""
    seq = 0  # how many times `last` has repeated since it first appeared
    for c in lex:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
            last = shape_char
        # Keep only the first three symbols of any run.
        if seq < 3:
            symbols.append(shape_char)
    shape = "".join(symbols)
    # Internal sanity check: an empty shape means an empty string was passed.
    assert shape, "cannot compute the shape of an empty string"
    return shape
|
||||
|
||||
|
||||
|
||||
def set_orth_flags(lex, length):
|
||||
|
|
|
@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
|
|||
|
||||
|
||||
from spacy.lexeme cimport Lexeme
|
||||
from spacy.lexeme cimport norm_of
|
||||
from spacy.lexeme cimport norm_of, shape_of
|
||||
from spacy.spacy cimport StringHash
|
||||
|
||||
|
||||
|
@ -45,9 +45,8 @@ cdef class Tokens:
|
|||
cdef Lexeme_addr t
|
||||
cdef StringHash key
|
||||
for t in self.vctr[0]:
|
||||
key = norm_of(t)
|
||||
key = (<Lexeme*>t).lex
|
||||
if key not in counts:
|
||||
counts[key] = 0
|
||||
counts[key] += 1
|
||||
return counts
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user