* Add length property

This commit is contained in:
Matthew Honnibal 2014-08-02 21:26:44 +01:00
parent 18fb76b2c4
commit 6319ff0f22
3 changed files with 30 additions and 1 deletions

View File

@ -16,6 +16,7 @@ cdef struct Orthography:
StringHash shape StringHash shape
StringHash norm StringHash norm
size_t length
Py_UNICODE first Py_UNICODE first
Bits8 flags Bits8 flags
@ -45,6 +46,7 @@ cdef enum StringAttr:
NORM NORM
SHAPE SHAPE
LAST3 LAST3
LENGTH
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
@ -54,3 +56,4 @@ cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0 cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0 cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0 cpdef StringHash last3_of(size_t lex_id) except 0
cpdef StringHash length_of(size_t lex_id)

View File

@ -32,6 +32,8 @@ cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
return shape_of(lex_id) return shape_of(lex_id)
elif attr == LAST3: elif attr == LAST3:
return last3_of(lex_id) return last3_of(lex_id)
elif attr == LENGTH:
return length_of(lex_id)
else: else:
raise StandardError raise StandardError
@ -118,9 +120,20 @@ cpdef Py_UNICODE first_of(size_t lex_id):
>>> unhash(first_of(lex_id)) >>> unhash(first_of(lex_id))
u'H' u'H'
''' '''
if (<Lexeme*>lex_id).orth == NULL:
return 0
return (<Lexeme*>lex_id).orth.first return (<Lexeme*>lex_id).orth.first
cpdef StringHash length_of(size_t lex_id):
    '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
    the length of the string hashed by lex_of.

    lex_id is the address of a Lexeme struct, passed as an integer.  If the
    Lexeme has no orthography record attached (its `orth' pointer is NULL),
    0 is returned instead of dereferencing it.
    '''
    # Cast once and reuse the typed pointer for both the NULL check and the read.
    cdef Lexeme* word = <Lexeme*>lex_id
    if word.orth == NULL:
        return 0
    # NOTE(review): attr_of is declared `except 0' and forwards this value for
    # LENGTH, so a genuine length of 0 may look like its error return there --
    # confirm.
    return word.orth.length
cpdef double prob_of(size_t lex_id): cpdef double prob_of(size_t lex_id):
'''Access the `prob' field of the Lexeme pointed to by lex_id, which stores '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
the smoothed unigram log probability of the word, as estimated from a large the smoothed unigram log probability of the word, as estimated from a large

View File

@ -4,7 +4,7 @@ import pytest
from spacy.en import lookup, unhash from spacy.en import lookup, unhash
from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of, length_of
from spacy.lexeme import shape_of from spacy.lexeme import shape_of
@pytest.fixture @pytest.fixture
@ -14,3 +14,16 @@ def C3P0():
def test_shape(C3P0): def test_shape(C3P0):
assert unhash(shape_of(C3P0)) == "XdXd" assert unhash(shape_of(C3P0)) == "XdXd"
def test_length():
    '''Check length_of against the expected length of several looked-up strings.'''
    # Cases are checked in order; the empty string covers the zero-length case.
    expected_lengths = (
        ('the', 3),
        ('', 0),
        ("n't", 3),
        ("'s", 2),
        ('Xxxx', 4),
    )
    for string, expected in expected_lengths:
        assert length_of(lookup(string)) == expected