diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 50417e65a..17ea473f9 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -16,6 +16,7 @@ cdef struct Orthography:
     StringHash shape
     StringHash norm
 
+    size_t length
     Py_UNICODE first
     Bits8 flags
 
@@ -45,6 +46,7 @@ cdef enum StringAttr:
     NORM
     SHAPE
     LAST3
+    LENGTH
 
 
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
@@ -54,3 +56,4 @@ cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0
 cpdef StringHash last3_of(size_t lex_id) except 0
+cpdef StringHash length_of(size_t lex_id)
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index e769a6bee..430033db0 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -32,6 +32,8 @@ cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
         return shape_of(lex_id)
     elif attr == LAST3:
         return last3_of(lex_id)
+    elif attr == LENGTH:
+        return length_of(lex_id)
     else:
         raise StandardError
 
@@ -118,9 +120,27 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
+    if (<Lexeme*>lex_id).orth == NULL:
+        return 0
     return (<Lexeme*>lex_id).orth.first
 
 
+cpdef StringHash length_of(size_t lex_id):
+    '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
+    the length of the string hashed by lex_of.
+
+    Returns 0 when the Lexeme has no Orthography record (orth is NULL).
+
+    NOTE(review): attr_of is declared `except 0', so a zero length reached via
+    attr_of(lex_id, LENGTH) looks like its error value -- confirm the intended
+    handling of empty strings.
+    '''
+    cdef Lexeme* word = <Lexeme*>lex_id
+    if word.orth == NULL:
+        return 0
+    return word.orth.length
+
+
 cpdef double prob_of(size_t lex_id):
     '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
     the smoothed unigram log probability of the word, as estimated from a large
diff --git a/tests/test_orth.py b/tests/test_orth.py
index 8d9939f4c..f13fa90bf 100644
--- a/tests/test_orth.py
+++ b/tests/test_orth.py
@@ -4,7 +4,7 @@ import pytest
 
 from spacy.en import lookup, unhash
 
-from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of
+from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of, length_of
 from spacy.lexeme import shape_of
 
 @pytest.fixture
@@ -14,3 +14,16 @@ def C3P0():
 
 def test_shape(C3P0):
     assert unhash(shape_of(C3P0)) == "XdXd"
+
+
+def test_length():
+    t = lookup('the')
+    assert length_of(t) == 3
+    t = lookup('')
+    assert length_of(t) == 0
+    t = lookup("n't")
+    assert length_of(t) == 3
+    t = lookup("'s")
+    assert length_of(t) == 2
+    t = lookup('Xxxx')
+    assert length_of(t) == 4