# cython: embedsignature=True

from spacy.lexeme cimport Lexeme


def get_normalized(unicode lex):
    """Return the normalized form of a word string.

    A string that is entirely alphabetic and entirely lower-case is returned
    unchanged; every other string is replaced by its word shape (see
    get_word_shape).
    """
    if lex.isalpha() and lex.islower():
        return lex
    return get_word_shape(lex)


def get_word_shape(unicode lex):
    """Return a coarse "shape" string describing the characters of lex.

    Each character is mapped to a shape character: upper-case letters to "X",
    other alphabetic characters to "x", digits to "d", and anything else to
    itself. Runs of the same shape character are truncated, so at most four
    consecutive copies are emitted (e.g. u'255667' -> 'dddd').

    Raises AssertionError if the resulting shape is empty, i.e. when lex is
    the empty string.
    """
    shape = []
    last = ""
    seq = 0
    for c in lex:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            # A new run starts: reset the repeat counter.
            seq = 0
            last = shape_char
        # seq counts repeats *after* the first character of a run, so this
        # keeps the first character plus at most three repeats.
        if seq < 3:
            shape.append(shape_char)
    assert shape
    return "".join(shape)


cpdef bint is_alpha(LexID lex_id) except *:
    """Give the result of unicode.isalpha() for a Lexeme ID.

    >>> is_alpha(lookup(u'Hello'))
    True
    >>> is_alpha(lookup(u'العرب'))
    True
    >>> is_alpha(lookup(u'10'))
    False
    """
    # lex_id is treated as the address of a Lexeme struct; test one flag bit.
    return (<Lexeme*>lex_id).orth_flags & (1 << IS_ALPHA)


cpdef bint is_digit(LexID lex_id) except *:
    """Give the result of unicode.isdigit() for a Lexeme ID.

    >>> is_digit(lookup(u'10'))
    True
    >>> is_digit(lookup(u'๐'))
    True
    >>> is_digit(lookup(u'one'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & (1 << IS_DIGIT)


cpdef bint is_punct(LexID lex_id) except *:
    """Give the result of checking whether all characters belong to a
    punctuation unicode data category for a Lexeme ID.

    >>> is_punct(lookup(u'.'))
    True
    >>> is_punct(lookup(u'⁒'))
    True
    >>> is_punct(lookup(u' '))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & (1 << IS_PUNCT)


cpdef bint is_space(LexID lex_id) except *:
    r"""Give the result of unicode.isspace() for a Lexeme ID.

    >>> is_space(lookup(u'\t'))
    True
    >>> is_space(lookup(u''))
    True
    >>> is_space(lookup(u'Hi\n'))
    False
    """
    # NOTE(review): the second doctest passes an empty literal; a whitespace
    # character may have been lost from the example -- confirm.
    return (<Lexeme*>lex_id).orth_flags & (1 << IS_SPACE)


cpdef bint is_lower(LexID lex_id) except *:
    """Give the result of unicode.islower() for a Lexeme ID.

    >>> is_lower(lookup(u'hi'))
    True
    >>> is_lower(lookup())
    True
    >>> is_lower(lookup(u'10'))
    False
    """
    # NOTE(review): the second doctest calls lookup() with no argument; the
    # example looks truncated -- confirm the intended input.
    return (<Lexeme*>lex_id).orth_flags & (1 << IS_LOWER)


cpdef bint is_upper(LexID lex_id) except *:
    """Give the result of unicode.isupper() for a Lexeme ID.

    >>> is_upper(lookup(u'HI'))
    True
    >>> is_upper(lookup(u'H10'))
    True
    >>> is_upper(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & (1 << IS_UPPER)


cpdef bint is_title(LexID lex_id) except *:
    """Give the result of unicode.istitle() for a Lexeme ID.

    >>> is_title(lookup(u'Hi'))
    True
    >>> is_title(lookup(u'Hi1'))
    True
    >>> is_title(lookup(u'1'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & (1 << IS_TITLE)


cpdef bint is_ascii(LexID lex_id) except *:
    """Give the result of checking whether all characters in the string are
    ascii.

    >>> is_ascii(lookup(u'Hi'))
    True
    >>> is_ascii(lookup(u' '))
    True
    >>> is_title(lookup(u''))
    False
    """
    # NOTE(review): the third doctest calls is_title on an empty literal; it
    # presumably was meant to show is_ascii on a non-ASCII string -- confirm.
    return (<Lexeme*>lex_id).orth_flags & (1 << IS_ASCII)


cpdef StringHash norm_of(LexID lex_id) except 0:
    """Return the hash of a normalized version of the string.

    >>> unhash(norm_of(lookup(u'Hi')))
    u'hi'
    >>> unhash(norm_of(lookup(u'255667')))
    u'shape=dddd'
    >>> unhash(norm_of(lookup(u'...')))
    u'...'
    """
    # NOTE(review): the 'shape=' prefix in the example is not produced by
    # anything visible in this file -- verify against the code that fills NORM.
    return (<Lexeme*>lex_id).string_views[NORM]


cpdef StringHash shape_of(LexID lex_id) except 0:
    """Return the hash of the string shape.

    >>> unhash(shape_of(lookup(u'Hi')))
    u'Xx'
    >>> unhash(shape_of(lookup(u'255667')))
    u'dddd'
    >>> unhash(shape_of(lookup(u'...')))
    u'...'
    """
    return (<Lexeme*>lex_id).string_views[SHAPE]


cpdef StringHash last3_of(LexID lex_id) except 0:
    """Access the `last3' field of the Lexeme pointed to by lex_id, which
    stores the hash of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    """
    return (<Lexeme*>lex_id).string_views[LAST3]