mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-02 23:03:41 +03:00
* 710k words per second for counts
This commit is contained in:
parent
057c21969b
commit
80b36f9f27
|
@ -41,6 +41,7 @@ cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
|
||||||
|
|
||||||
cpdef StringHash lex_of(size_t lex_id) except 0
|
cpdef StringHash lex_of(size_t lex_id) except 0
|
||||||
cpdef StringHash norm_of(size_t lex_id) except 0
|
cpdef StringHash norm_of(size_t lex_id) except 0
|
||||||
|
cpdef StringHash shape_of(size_t lex_id) except 0
|
||||||
#cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
|
#cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
|
||||||
# int split, size_t length)
|
# int split, size_t length)
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,10 @@ cpdef StringHash norm_of(size_t lex_id) except 0:
|
||||||
return (<Lexeme*>lex_id).orth.norm
|
return (<Lexeme*>lex_id).orth.norm
|
||||||
|
|
||||||
|
|
||||||
|
cpdef StringHash shape_of(size_t lex_id) except 0:
|
||||||
|
return (<Lexeme*>lex_id).orth.shape
|
||||||
|
|
||||||
|
|
||||||
cpdef ClusterID cluster_of(size_t lex_id):
|
cpdef ClusterID cluster_of(size_t lex_id):
|
||||||
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
|
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
|
||||||
gives an integer representation of the cluster ID of the word,
|
gives an integer representation of the cluster ID of the word,
|
||||||
|
@ -94,7 +98,7 @@ cpdef double prob_of(size_t lex_id):
|
||||||
>>> prob_of(lookup(u'world'))
|
>>> prob_of(lookup(u'world'))
|
||||||
-20.10340371976182
|
-20.10340371976182
|
||||||
'''
|
'''
|
||||||
pass
|
return (<Lexeme*>lex_id).dist.prob
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash last3_of(size_t lex_id):
|
cpdef StringHash last3_of(size_t lex_id):
|
||||||
|
@ -105,8 +109,7 @@ cpdef StringHash last3_of(size_t lex_id):
|
||||||
>>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
|
>>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
|
||||||
[u'llo', u'!']
|
[u'llo', u'!']
|
||||||
'''
|
'''
|
||||||
cdef Lexeme* w = <Lexeme*>lex_id
|
return (<Lexeme*>lex_id).orth.last3
|
||||||
return w.orth.last3 if w.orth != NULL else 0
|
|
||||||
|
|
||||||
|
|
||||||
cpdef bint is_oft_upper(size_t lex_id):
|
cpdef bint is_oft_upper(size_t lex_id):
|
||||||
|
|
|
@ -25,7 +25,30 @@ def get_normalized(unicode lex, size_t length):
|
||||||
|
|
||||||
|
|
||||||
def get_word_shape(lex, length):
|
def get_word_shape(lex, length):
|
||||||
return lex
|
shape = ""
|
||||||
|
last = ""
|
||||||
|
shape_char = ""
|
||||||
|
seq = 0
|
||||||
|
for c in lex:
|
||||||
|
if c.isalpha():
|
||||||
|
if c.isupper():
|
||||||
|
shape_char = "X"
|
||||||
|
else:
|
||||||
|
shape_char = "x"
|
||||||
|
elif c.isdigit():
|
||||||
|
shape_char = "d"
|
||||||
|
else:
|
||||||
|
shape_char = c
|
||||||
|
if shape_char == last:
|
||||||
|
seq += 1
|
||||||
|
else:
|
||||||
|
seq = 0
|
||||||
|
last = shape_char
|
||||||
|
if seq < 3:
|
||||||
|
shape += shape_char
|
||||||
|
assert shape
|
||||||
|
return shape
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def set_orth_flags(lex, length):
|
def set_orth_flags(lex, length):
|
||||||
|
|
|
@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
|
||||||
|
|
||||||
|
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
from spacy.lexeme cimport norm_of
|
from spacy.lexeme cimport norm_of, shape_of
|
||||||
from spacy.spacy cimport StringHash
|
from spacy.spacy cimport StringHash
|
||||||
|
|
||||||
|
|
||||||
|
@ -45,9 +45,8 @@ cdef class Tokens:
|
||||||
cdef Lexeme_addr t
|
cdef Lexeme_addr t
|
||||||
cdef StringHash key
|
cdef StringHash key
|
||||||
for t in self.vctr[0]:
|
for t in self.vctr[0]:
|
||||||
key = norm_of(t)
|
key = (<Lexeme*>t).lex
|
||||||
if key not in counts:
|
if key not in counts:
|
||||||
counts[key] = 0
|
counts[key] = 0
|
||||||
counts[key] += 1
|
counts[key] += 1
|
||||||
return counts
|
return counts
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user