# cython: profile=True
# cython: embedsignature=True

from libc.stdlib cimport calloc, free


# Python-visible enum for POS tags
PUNCT = 0
CONJ = 1
NUM = 2
X = 3
DET = 4
ADP = 5
ADJ = 6
ADV = 7
VERB = 8
NOUN = 9
PDT = 10
POS = 11
PRON = 12
PRT = 13


# Bit positions inside ``dist_flags`` used by the CasedWord case-statistics
# checks below.
DEF OFT_UPPER = 1
DEF OFT_TITLE = 2


cdef class Word:
    """A lexical type.

    Attributes:
        string (bytes):
            A utf8-encoded byte-string for the word.

        lex (StringHash):
            A hash of the word. (Not assigned by the constructor in this
            file.)

        length (size_t):
            ``len(string)`` as computed on the byte-string passed to the
            constructor.
            NOTE(review): earlier documentation called this the "(unicode)
            length"; for non-ASCII words the byte length differs -- confirm
            which one callers expect.

        prob (double):
            An estimate of the word's unigram log probability.

            Probabilities are calculated from a large text corpus, and smoothed
            using simple Good-Turing. Estimates are read from
            data/en/probabilities, and can be replaced using
            spacy.en.load_probabilities.

        cluster (int):
            An integer representation of the word's Brown cluster.

            A Brown cluster is an address into a binary tree, which gives some
            (noisy) information about the word's distributional context.

            >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
            >>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
            ["100111110110", "100111100100", "01010111011001", "100111110110"]

            The clusterings are unideal, but often slightly useful.
            "pineapple" and "apple" share a long prefix, indicating a similar
            meaning, while "dapple" is totally different. On the other hand,
            "scalable" receives the same cluster ID as "pineapple", which is
            not what we'd like.
    """
    # Initialisation lives in __init__ rather than __cinit__: Cython calls a
    # base class's __cinit__ automatically with the *subclass's* constructor
    # arguments and forbids calling it explicitly, so a subclass with a
    # different signature (CasedWord) could never be constructed.
    def __init__(self, bytes string, list string_views, prob=0.0, cluster=0,
                 orth_flags=0, dist_flags=0, possible_tags=0):
        """Store the word's string, statistics and flags, and hash its views.

        Args:
            string: The word as a byte-string.
            string_views: Alternate string forms of the word; each one is
                hashed and stored so it can be read back with get_view().
            prob: Unigram log probability estimate.
            cluster: Brown cluster ID.
            orth_flags: Bit-field of orthographic features.
            dist_flags: Bit-field of distributional features.
            possible_tags: Bit-field of tags the word may receive.

        Raises:
            MemoryError: If the view-hash array cannot be allocated.
        """
        cdef size_t n_views = len(string_views)
        cdef size_t i

        self.string = string
        self.length = len(string)
        # These were previously accepted but silently dropped, leaving every
        # flag check reading a zero bit-field.
        self.prob = prob
        self.cluster = cluster
        self.orth_flags = orth_flags
        self.dist_flags = dist_flags
        self.possible_tags = possible_tags

        # Guard against a second __init__ call leaking the previous array.
        if self.string_views != NULL:
            free(self.string_views)
            self.string_views = NULL

        self.string_views = <StringHash*>calloc(n_views, sizeof(StringHash))
        # calloc may legitimately return NULL for a zero-sized request.
        if self.string_views == NULL and n_views != 0:
            raise MemoryError()

        # Views may be bytes or unicode (CasedWord passes a bytes suffix), so
        # the element is deliberately not typed.
        for i in range(n_views):
            self.string_views[i] = hash(string_views[i])

    def __dealloc__(self):
        # free(NULL) is a no-op, so this is safe even if __init__ never ran.
        free(self.string_views)

    cpdef StringHash get_view(self, size_t i) except 0:
        """Return the stored hash of the i-th string view.

        NOTE(review): the number of views is not stored on the instance, so
        ``i`` cannot be bounds-checked here; callers must pass a valid index.
        """
        return self.string_views[i]

    cpdef bint check_orth_flag(self, OrthFlags flag) except *:
        """Access the value of one of the pre-computed boolean orthographic
        features.

        Meanings depend on the language-specific orthographic features being
        loaded. The suggested features for latin-alphabet languages are: TODO
        """
        return self.orth_flags & (1 << flag)

    cpdef bint check_dist_flag(self, DistFlags flag) except *:
        """Access the value of one of the pre-computed boolean distribution
        features.

        Meanings depend on the language-specific distributional features being
        loaded. The suggested features for latin-alphabet languages are: TODO
        """
        return self.dist_flags & (1 << flag)

    cpdef bint can_tag(self, TagFlags flag) except *:
        """Check whether the word often receives a particular tag in a large
        text corpus. "Often" is chosen by heuristic.
        """
        return self.possible_tags & (1 << flag)


cdef class CasedWord(Word):
    """A Word whose views are its normalized form, word shape and 3-byte
    suffix, with convenience accessors for case and orthographic flags.
    """
    def __init__(self, bytes string):
        """Build the three string views for ``string`` and initialise the Word.

        Args:
            string: The word as a byte-string.
        """
        # NOTE(review): `get_normaized` looks like a typo for `get_normalized`;
        # it is defined outside this file, so the name is left untouched here.
        string_views = [get_normaized(string), get_word_shape(string),
                        string[-3:]]
        Word.__init__(self, string, string_views)

    cpdef bint is_often_uppered(self) except *:
        """Check the OFT_UPPER distributional flag for the word.

        The OFT_UPPER flag records whether a lower-cased version of the word
        is found in all-upper case frequently in a large sample of text, where
        "frequently" is defined as P >= 0.95 (chosen for high mutual
        information for POS tagging).

        Case statistics are estimated from a large text corpus. Estimates are
        read from data/en/case_stats, and can be replaced using
        spacy.en.load_case_stats.

        >>> is_often_uppered(lookup(u'nato'))
        True
        >>> is_often_uppered(lookup(u'the'))
        False
        """
        return self.dist_flags & (1 << OFT_UPPER)

    cpdef bint is_often_titled(self) except *:
        """Check the OFT_TITLE distributional flag for the word.

        The OFT_TITLE flag records whether a lower-cased version of the word
        is found title-cased (see string.istitle) frequently in a large sample
        of text, where "frequently" is defined as P >= 0.3 (chosen for high
        mutual information for POS tagging).

        Case statistics are estimated from a large text corpus. Estimates are
        read from data/en/case_stats, and can be replaced using
        spacy.en.load_case_stats.

        >>> is_often_titled(lookup(u'john'))
        True
        >>> is_often_titled(lookup(u'Bill'))
        False
        """
        return self.dist_flags & (1 << OFT_TITLE)

    cpdef bint is_alpha(self) except *:
        """Check whether all characters in the word's string are alphabetic.

        Should match the :py:func:`unicode.isalpha()` function.

        >>> is_alpha(lookup(u'Hello'))
        True
        >>> is_alpha(lookup(u'العرب'))
        True
        >>> is_alpha(lookup(u'10'))
        False
        """
        return self.orth_flags & (1 << IS_ALPHA)

    cpdef bint is_digit(self) except *:
        """Check whether all characters in the word's string are numeric.

        Should match the :py:func:`unicode.isdigit()` function.

        >>> is_digit(lookup(u'10'))
        True
        >>> is_digit(lookup(u'๐'))
        True
        >>> is_digit(lookup(u'one'))
        False
        """
        return self.orth_flags & (1 << IS_DIGIT)

    cpdef bint is_punct(self) except *:
        """Check whether all characters belong to a punctuation unicode data
        category for a Lexeme ID.

        >>> is_punct(lookup(u'.'))
        True
        >>> is_punct(lookup(u'⁒'))
        True
        >>> is_punct(lookup(u' '))
        False
        """
        return self.orth_flags & (1 << IS_PUNCT)

    cpdef bint is_space(self) except *:
        """Give the result of unicode.isspace() for a Lexeme ID.

        >>> is_space(lookup(u'\\t'))
        True
        >>> is_space(lookup(u''))
        True
        >>> is_space(lookup(u'Hi\\n'))
        False
        """
        return self.orth_flags & (1 << IS_SPACE)

    cpdef bint is_lower(self) except *:
        """Give the result of unicode.islower() for a Lexeme ID.

        >>> is_lower(lookup(u'hi'))
        True
        >>> is_lower(lookup())
        True
        >>> is_lower(lookup(u'10'))
        False
        """
        return self.orth_flags & (1 << IS_LOWER)

    cpdef bint is_upper(self) except *:
        """Give the result of unicode.isupper() for a Lexeme ID.

        >>> is_upper(lookup(u'HI'))
        True
        >>> is_upper(lookup(u'H10'))
        True
        >>> is_upper(lookup(u'10'))
        False
        """
        return self.orth_flags & (1 << IS_UPPER)

    cpdef bint is_title(self) except *:
        """Give the result of unicode.istitle() for a Lexeme ID.

        >>> is_title(lookup(u'Hi'))
        True
        >>> is_title(lookup(u'Hi1'))
        True
        >>> is_title(lookup(u'1'))
        False
        """
        return self.orth_flags & (1 << IS_TITLE)

    cpdef bint is_ascii(self) except *:
        """Give the result of checking whether all characters in the string
        are ascii.

        >>> is_ascii(lookup(u'Hi'))
        True
        >>> is_ascii(lookup(u' '))
        True
        >>> is_ascii(lookup(u''))
        False
        """
        return self.orth_flags & (1 << IS_ASCII)