diff --git a/spacy/en.pxd b/spacy/en.pxd index f6dc782f0..c3c605f1f 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -3,42 +3,5 @@ from spacy.word cimport Lexeme from spacy.tokens cimport Tokens -cdef class EnglishTokens(Tokens): - cpdef size_t canon(self, size_t i) - cpdef size_t shape(self, size_t i) - cpdef size_t non_sparse(self, size_t i) - cpdef size_t asciied(self, size_t i) - cpdef unicode canon_string(self, size_t i) - cpdef unicode shape_string(self, size_t i) - cpdef unicode non_sparse_string(self, size_t i) - cpdef unicode asciied_string(self, size_t i) - cpdef bint is_alpha(self, size_t i) - cpdef bint is_ascii(self, size_t i) - cpdef bint is_digit(self, size_t i) - cpdef bint is_lower(self, size_t i) - cpdef bint is_punct(self, size_t i) - cpdef bint is_space(self, size_t i) - cpdef bint is_title(self, size_t i) - cpdef bint is_upper(self, size_t i) - cpdef bint can_adj(self, size_t i) - cpdef bint can_adp(self, size_t i) - cpdef bint can_adv(self, size_t i) - cpdef bint can_conj(self, size_t i) - cpdef bint can_det(self, size_t i) - cpdef bint can_noun(self, size_t i) - cpdef bint can_num(self, size_t i) - cpdef bint can_pdt(self, size_t i) - cpdef bint can_pos(self, size_t i) - cpdef bint can_pron(self, size_t i) - cpdef bint can_prt(self, size_t i) - cpdef bint can_punct(self, size_t i) - cpdef bint can_verb(self, size_t i) - cpdef bint oft_lower(self, size_t i) - cpdef bint oft_title(self, size_t i) - cpdef bint oft_upper(self, size_t i) - - cdef class English(Language): cdef int _split_one(self, Py_UNICODE* characters, size_t length) - - diff --git a/spacy/en.pyx b/spacy/en.pyx index 6f801d96e..88b6d32c8 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -45,187 +45,10 @@ from spacy.lexeme cimport lexeme_check_flag from spacy.lexeme cimport lexeme_string_view from spacy._hashing cimport PointerHash -from spacy import util from spacy import orth -cdef enum Flags: - Flag_IsAlpha - Flag_IsAscii - Flag_IsDigit - Flag_IsLower - Flag_IsPunct - Flag_IsSpace 
- Flag_IsTitle - Flag_IsUpper - - Flag_CanAdj - Flag_CanAdp - Flag_CanAdv - Flag_CanConj - Flag_CanDet - Flag_CanNoun - Flag_CanNum - Flag_CanPdt - Flag_CanPos - Flag_CanPron - Flag_CanPrt - Flag_CanPunct - Flag_CanVerb - - Flag_OftLower - Flag_OftTitle - Flag_OftUpper - Flag_N - - -cdef enum Views: - View_CanonForm - View_WordShape - View_NonSparse - View_Asciied - View_N - - -# Assign the flag and view functions by enum value. -# This is verbose, but it ensures we don't get nasty order sensitivities. -STRING_VIEW_FUNCS = [None] * View_N -STRING_VIEW_FUNCS[View_CanonForm] = orth.canon_case -STRING_VIEW_FUNCS[View_WordShape] = orth.word_shape -STRING_VIEW_FUNCS[View_NonSparse] = orth.non_sparse -STRING_VIEW_FUNCS[View_Asciied] = orth.asciied - -FLAG_FUNCS = [None] * Flag_N -FLAG_FUNCS[Flag_IsAlpha] = orth.is_alpha -FLAG_FUNCS[Flag_IsAscii] = orth.is_ascii -FLAG_FUNCS[Flag_IsDigit] = orth.is_digit -FLAG_FUNCS[Flag_IsLower] = orth.is_lower -FLAG_FUNCS[Flag_IsPunct] = orth.is_punct -FLAG_FUNCS[Flag_IsSpace] = orth.is_space -FLAG_FUNCS[Flag_IsTitle] = orth.is_title -FLAG_FUNCS[Flag_IsUpper] = orth.is_upper - -FLAG_FUNCS[Flag_CanAdj] = orth.can_tag('ADJ') -FLAG_FUNCS[Flag_CanAdp] = orth.can_tag('ADP') -FLAG_FUNCS[Flag_CanAdv] = orth.can_tag('ADV') -FLAG_FUNCS[Flag_CanConj] = orth.can_tag('CONJ') -FLAG_FUNCS[Flag_CanDet] = orth.can_tag('DET') -FLAG_FUNCS[Flag_CanNoun] = orth.can_tag('NOUN') -FLAG_FUNCS[Flag_CanNum] = orth.can_tag('NUM') -FLAG_FUNCS[Flag_CanPdt] = orth.can_tag('PDT') -FLAG_FUNCS[Flag_CanPos] = orth.can_tag('POS') -FLAG_FUNCS[Flag_CanPron] = orth.can_tag('PRON') -FLAG_FUNCS[Flag_CanPrt] = orth.can_tag('PRT') -FLAG_FUNCS[Flag_CanPunct] = orth.can_tag('PUNCT') -FLAG_FUNCS[Flag_CanVerb] = orth.can_tag('VERB') - -FLAG_FUNCS[Flag_OftLower] = orth.oft_case('lower', 0.7) -FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7) -FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7) - - -cdef class EnglishTokens(Tokens): - # Provide accessor methods for the 
features supported by the language. - # Without these, clients have to use the underlying string_view and check_flag - # methods, which requires them to know the IDs. - cpdef unicode canon_string(self, size_t i): - return lexeme_string_view(self.lexemes[i], View_CanonForm) - - cpdef unicode shape_string(self, size_t i): - return lexeme_string_view(self.lexemes[i], View_WordShape) - - cpdef unicode non_sparse_string(self, size_t i): - return lexeme_string_view(self.lexemes[i], View_NonSparse) - - cpdef unicode asciied_string(self, size_t i): - return lexeme_string_view(self.lexemes[i], View_Asciied) - - cpdef size_t canon(self, size_t i): - return id(self.lexemes[i].views[View_CanonForm]) - - cpdef size_t shape(self, size_t i): - return id(self.lexemes[i].views[View_WordShape]) - - cpdef size_t non_sparse(self, size_t i): - return id(self.lexemes[i].views[View_NonSparse]) - - cpdef size_t asciied(self, size_t i): - return id(self.lexemes[i].views[View_Asciied]) - - cpdef bint is_alpha(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_IsAlpha) - - cpdef bint is_ascii(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_IsAscii) - - cpdef bint is_digit(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_IsDigit) - - cpdef bint is_lower(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_IsLower) - - cpdef bint is_punct(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_IsPunct) - - cpdef bint is_space(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_IsSpace) - - cpdef bint is_title(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_IsTitle) - - cpdef bint is_upper(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_IsUpper) - - cpdef bint can_adj(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanAdj) - - cpdef bint can_adp(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanAdp) - - cpdef bint can_adv(self, size_t 
i): - return lexeme_check_flag(self.lexemes[i], Flag_CanAdv) - - cpdef bint can_conj(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanConj) - - cpdef bint can_det(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanDet) - - cpdef bint can_noun(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanNoun) - - cpdef bint can_num(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanNum) - - cpdef bint can_pdt(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanPdt) - - cpdef bint can_pos(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanPos) - - cpdef bint can_pron(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanPron) - - cpdef bint can_prt(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanPrt) - - cpdef bint can_punct(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanPunct) - - cpdef bint can_verb(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_CanVerb) - - cpdef bint oft_lower(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_OftLower) - - cpdef bint oft_title(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_OftTitle) - - cpdef bint oft_upper(self, size_t i): - return lexeme_check_flag(self.lexemes[i], Flag_OftUpper) - - cdef class English(Language): """English tokenizer, tightly coupled to lexicon. @@ -233,20 +56,6 @@ cdef class English(Language): name (unicode): The two letter code used by Wikipedia for the language. lexicon (Lexicon): The lexicon. Exposes the lookup method. 
""" - fl_is_alpha = Flag_IsAlpha - fl_is_digit = Flag_IsDigit - v_shape = View_WordShape - def __cinit__(self, name, user_string_features, user_flag_features): - self.cache = PointerHash(2 ** 25) - self.specials = PointerHash(2 ** 16) - lang_data = util.read_lang_data(name) - rules, words, probs, clusters, case_stats, tag_stats = lang_data - self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats, - STRING_VIEW_FUNCS + user_string_features, - FLAG_FUNCS + user_flag_features) - self._load_special_tokenization(rules) - self.tokens_class = EnglishTokens - cdef int _split_one(self, Py_UNICODE* characters, size_t length): if length == 1: return 1 @@ -275,13 +84,15 @@ cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length): # Don't count appostrophes as punct if the next char is a letter if characters[i] == "'" and i < (length - 1) and char_i1.isalpha(): return i == 0 - if characters[i] == "-" and i < (length - 1) and characters[i+1] == '-': + if characters[i] == "-": return False + #and i < (length - 1) and characters[i+1] == '-': + #return False # Don't count commas as punct if the next char is a number if characters[i] == "," and i < (length - 1) and char_i1.isdigit(): return False # Don't count periods as punct if the next char is not whitespace - if characters[i] == "." 
and i < (length - 1) and not char_i1.isspace(): + if characters[i] == ".": return False return not char_i.isalnum() diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 6503002e4..3ea29e53d 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -38,7 +38,6 @@ cdef class Language: cdef PointerHash cache cdef PointerHash specials cpdef readonly Lexicon lexicon - cpdef readonly object tokens_class cpdef Tokens tokenize(self, unicode text) cpdef Lexeme lookup(self, unicode text) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 2d5654071..96e052569 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -20,7 +20,87 @@ from spacy.lexeme cimport LexemeC, lexeme_init from murmurhash.mrmr cimport hash64 from spacy._hashing cimport PointerHash -from spacy._hashing cimport Cell +from spacy import orth +from spacy import util + + +cdef enum Flags: + Flag_IsAlpha + Flag_IsAscii + Flag_IsDigit + Flag_IsLower + Flag_IsPunct + Flag_IsSpace + Flag_IsTitle + Flag_IsUpper + + Flag_CanAdj + Flag_CanAdp + Flag_CanAdv + Flag_CanConj + Flag_CanDet + Flag_CanNoun + Flag_CanNum + Flag_CanPdt + Flag_CanPos + Flag_CanPron + Flag_CanPrt + Flag_CanPunct + Flag_CanVerb + + Flag_OftLower + Flag_OftTitle + Flag_OftUpper + Flag_N + + +cdef enum Views: + View_CanonForm + View_WordShape + View_NonSparse + View_Asciied + View_N + + + +# Assign the flag and view functions by enum value. +# This is verbose, but it ensures we don't get nasty order sensitivities. 
+STRING_VIEW_FUNCS = [None] * View_N +STRING_VIEW_FUNCS[View_CanonForm] = orth.canon_case +STRING_VIEW_FUNCS[View_WordShape] = orth.word_shape +STRING_VIEW_FUNCS[View_NonSparse] = orth.non_sparse +STRING_VIEW_FUNCS[View_Asciied] = orth.asciied + +FLAG_FUNCS = [None] * Flag_N +FLAG_FUNCS[Flag_IsAlpha] = orth.is_alpha +FLAG_FUNCS[Flag_IsAscii] = orth.is_ascii +FLAG_FUNCS[Flag_IsDigit] = orth.is_digit +FLAG_FUNCS[Flag_IsLower] = orth.is_lower +FLAG_FUNCS[Flag_IsPunct] = orth.is_punct +FLAG_FUNCS[Flag_IsSpace] = orth.is_space +FLAG_FUNCS[Flag_IsTitle] = orth.is_title +FLAG_FUNCS[Flag_IsUpper] = orth.is_upper + +FLAG_FUNCS[Flag_CanAdj] = orth.can_tag('ADJ') +FLAG_FUNCS[Flag_CanAdp] = orth.can_tag('ADP') +FLAG_FUNCS[Flag_CanAdv] = orth.can_tag('ADV') +FLAG_FUNCS[Flag_CanConj] = orth.can_tag('CONJ') +FLAG_FUNCS[Flag_CanDet] = orth.can_tag('DET') +FLAG_FUNCS[Flag_CanNoun] = orth.can_tag('NOUN') +FLAG_FUNCS[Flag_CanNum] = orth.can_tag('NUM') +FLAG_FUNCS[Flag_CanPdt] = orth.can_tag('PDT') +FLAG_FUNCS[Flag_CanPos] = orth.can_tag('POS') +FLAG_FUNCS[Flag_CanPron] = orth.can_tag('PRON') +FLAG_FUNCS[Flag_CanPrt] = orth.can_tag('PRT') +FLAG_FUNCS[Flag_CanPunct] = orth.can_tag('PUNCT') +FLAG_FUNCS[Flag_CanVerb] = orth.can_tag('VERB') + +FLAG_FUNCS[Flag_OftLower] = orth.oft_case('lower', 0.7) +FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7) +FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7) + + + cdef class Language: """Base class for language-specific tokenizers. 
@@ -36,20 +116,20 @@ cdef class Language: The language's name is used to look up default data-files, found in data/lexeme) +_unicodes = set() cdef void string_from_unicode(String* s, unicode uni): - string_from_slice(s, uni, 0, len(uni)) + global _unicodes + _unicodes.add(uni) + cdef Py_UNICODE* c_uni = uni + string_from_slice(s, c_uni, 0, len(uni)) cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 4d5d08f52..9291be95a 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,17 +1,15 @@ from libc.stdlib cimport calloc, free - cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster, list views, set flags): cdef LexemeC* lexeme = calloc(1, sizeof(LexemeC)) lexeme.cluster = cluster lexeme.prob = prob - lexeme.length = len(string) - lexeme.string = intern_and_encode(string) - + lexeme.string = intern_and_encode(string, &lexeme.length) lexeme.views = calloc(len(views), sizeof(char*)) + cdef size_t length = 0 for i, string in enumerate(views): - lexeme.views[i] = intern_and_encode(string, &length) for active_flag in flags: lexeme.flags |= (1 << active_flag) @@ -24,9 +22,11 @@ cdef int lexeme_free(LexemeC* lexeme) except -1: cdef set _strings = set() -cdef char* intern_and_encode(unicode string): +cdef char* intern_and_encode(unicode string, size_t* length): global _strings - cdef bytes utf8_string = intern(string.encode('utf8')) + cdef bytes encoded = string.encode('utf8') + cdef bytes utf8_string = intern(encoded) + length[0] = len(utf8_string) _strings.add(utf8_string) return utf8_string diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 383c79f9e..b8d03632e 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -5,7 +5,7 @@ cdef class Tokens: cdef size_t size cdef LexemeC** lexemes - cdef int push_back(self, LexemeC* lexeme) except -1 + cdef int push_back(self, LexemeC* lexeme) except -1 cpdef size_t 
id(self, size_t i) cpdef unicode string(self, size_t i) @@ -13,3 +13,38 @@ cdef class Tokens: cpdef size_t cluster(self, size_t i) cpdef bint check_flag(self, size_t i, size_t flag_id) cpdef unicode string_view(self, size_t i, size_t view_id) + + cpdef size_t canon(self, size_t i) + cpdef size_t shape(self, size_t i) + cpdef size_t non_sparse(self, size_t i) + cpdef size_t asciied(self, size_t i) + cpdef unicode canon_string(self, size_t i) + cpdef unicode shape_string(self, size_t i) + cpdef unicode non_sparse_string(self, size_t i) + cpdef unicode asciied_string(self, size_t i) + cpdef bint is_alpha(self, size_t i) + cpdef bint is_ascii(self, size_t i) + cpdef bint is_digit(self, size_t i) + cpdef bint is_lower(self, size_t i) + cpdef bint is_punct(self, size_t i) + cpdef bint is_space(self, size_t i) + cpdef bint is_title(self, size_t i) + cpdef bint is_upper(self, size_t i) + cpdef bint can_adj(self, size_t i) + cpdef bint can_adp(self, size_t i) + cpdef bint can_adv(self, size_t i) + cpdef bint can_conj(self, size_t i) + cpdef bint can_det(self, size_t i) + cpdef bint can_noun(self, size_t i) + cpdef bint can_num(self, size_t i) + cpdef bint can_pdt(self, size_t i) + cpdef bint can_pos(self, size_t i) + cpdef bint can_pron(self, size_t i) + cpdef bint can_prt(self, size_t i) + cpdef bint can_punct(self, size_t i) + cpdef bint can_verb(self, size_t i) + cpdef bint oft_lower(self, size_t i) + cpdef bint oft_title(self, size_t i) + cpdef bint oft_upper(self, size_t i) + + diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 78f3dbea1..48e4216ac 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,3 +1,4 @@ +# cython: profile=True from libc.stdlib cimport calloc, free, realloc from spacy.word cimport Lexeme @@ -5,6 +6,44 @@ from spacy.lexeme cimport lexeme_check_flag from spacy.lexeme cimport lexeme_string_view +cdef enum Flags: + Flag_IsAlpha + Flag_IsAscii + Flag_IsDigit + Flag_IsLower + Flag_IsPunct + Flag_IsSpace + Flag_IsTitle + Flag_IsUpper + + 
Flag_CanAdj + Flag_CanAdp + Flag_CanAdv + Flag_CanConj + Flag_CanDet + Flag_CanNoun + Flag_CanNum + Flag_CanPdt + Flag_CanPos + Flag_CanPron + Flag_CanPrt + Flag_CanPunct + Flag_CanVerb + + Flag_OftLower + Flag_OftTitle + Flag_OftUpper + Flag_N + + +cdef enum Views: + View_CanonForm + View_WordShape + View_NonSparse + View_Asciied + View_N + + cdef class Tokens: """A sequence of references to Lexeme objects. @@ -52,8 +91,9 @@ cdef class Tokens: self.length += 1 cpdef unicode string(self, size_t i): - cdef bytes byte_string = self.lexemes[i].string - return byte_string.decode('utf8') + cdef bytes utf8_string = self.lexemes[i].string[:self.lexemes[i].length] + cdef unicode string = utf8_string.decode('utf8') + return string cpdef size_t id(self, size_t i): return id(self.lexemes[i].string) @@ -69,3 +109,104 @@ cdef class Tokens: cpdef unicode string_view(self, size_t i, size_t view_id): return lexeme_string_view(self.lexemes[i], view_id) + + # Provide accessor methods for the features supported by the language. + # Without these, clients have to use the underlying string_view and check_flag + # methods, which requires them to know the IDs. 
+ cpdef unicode canon_string(self, size_t i): + return lexeme_string_view(self.lexemes[i], View_CanonForm) + + cpdef unicode shape_string(self, size_t i): + return lexeme_string_view(self.lexemes[i], View_WordShape) + + cpdef unicode non_sparse_string(self, size_t i): + return lexeme_string_view(self.lexemes[i], View_NonSparse) + + cpdef unicode asciied_string(self, size_t i): + return lexeme_string_view(self.lexemes[i], View_Asciied) + + cpdef size_t canon(self, size_t i): + return id(self.lexemes[i].views[View_CanonForm]) + + cpdef size_t shape(self, size_t i): + return id(self.lexemes[i].views[View_WordShape]) + + cpdef size_t non_sparse(self, size_t i): + return id(self.lexemes[i].views[View_NonSparse]) + + cpdef size_t asciied(self, size_t i): + return id(self.lexemes[i].views[View_Asciied]) + + cpdef bint is_alpha(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_IsAlpha) + + cpdef bint is_ascii(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_IsAscii) + + cpdef bint is_digit(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_IsDigit) + + cpdef bint is_lower(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_IsLower) + + cpdef bint is_punct(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_IsPunct) + + cpdef bint is_space(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_IsSpace) + + cpdef bint is_title(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_IsTitle) + + cpdef bint is_upper(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_IsUpper) + + cpdef bint can_adj(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanAdj) + + cpdef bint can_adp(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanAdp) + + cpdef bint can_adv(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanAdv) + + cpdef bint can_conj(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanConj) + + cpdef 
bint can_det(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanDet) + + cpdef bint can_noun(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanNoun) + + cpdef bint can_num(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanNum) + + cpdef bint can_pdt(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanPdt) + + cpdef bint can_pos(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanPos) + + cpdef bint can_pron(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanPron) + + cpdef bint can_prt(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanPrt) + + cpdef bint can_punct(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanPunct) + + cpdef bint can_verb(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_CanVerb) + + cpdef bint oft_lower(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_OftLower) + + cpdef bint oft_title(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_OftTitle) + + cpdef bint oft_upper(self, size_t i): + return lexeme_check_flag(self.lexemes[i], Flag_OftUpper) + +