diff --git a/spacy/word.pxd b/spacy/word.pxd index 8ca2140e9..90c9c941a 100644 --- a/spacy/word.pxd +++ b/spacy/word.pxd @@ -17,4 +17,4 @@ cdef class Lexeme: cdef readonly flag_t flags cpdef bint check_flag(self, size_t flag_id) except * - cpdef int set_flag(self, size_t flag_id) except -1 + cpdef unicode string_view(self, size_t view_id) diff --git a/spacy/word.pyx b/spacy/word.pyx index 6a59a1081..dcf2f0f1b 100644 --- a/spacy/word.pyx +++ b/spacy/word.pyx @@ -5,20 +5,17 @@ from libc.stdlib cimport calloc, free, realloc cdef class Lexeme: - """A lexical type. + """A lexical type --- a word, punctuation symbol, whitespace sequence, etc + keyed by a case-sensitive unicode string. All tokens with the same string, + e.g. all instances of "dog", ",", "NASA" etc should be mapped to the same + Lexeme. - Clients should avoid instantiating Lexemes directly, and instead use get_lexeme - from a language module, e.g. spacy.en.get_lexeme . This allows us to use only - one Lexeme object per lexical type. + You should avoid instantiating Lexemes directly, and instead use the + :py:meth:`space.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup` + methods on the global object exposed by the language you're working with, + e.g. :py:data:`spacy.en.EN`. Attributes: - id (view_id_t): - A unique ID of the word's string. - - Implemented as the memory-address of the string, - as we use Python's string interning to guarantee that only one copy - of each string is seen. - string (unicode): The unicode string. @@ -34,7 +31,7 @@ cdef class Lexeme: simple Good-Turing. Estimates are read from data/en/probabilities, and can be replaced using spacy.en.load_probabilities. - cluster (int): + cluster (size_t): An integer representation of the word's Brown cluster. A Brown cluster is an address into a binary tree, which gives some (noisy) @@ -62,18 +59,43 @@ cdef class Lexeme: for i, flag_feature in enumerate(flag_features): if flag_feature(string, prob, case_stats, tag_stats): - self.set_flag(i) + self.flags |= (1 << i) def __dealloc__(self): pass cpdef bint check_flag(self, size_t flag_id) except *: - """Access the value of one of the pre-computed boolean distribution features. + """Lexemes may store language-specific boolean features in a bit-field, + with values accessed by providing an ID constant to this function. - Meanings depend on the language-specific distributional features being loaded. - The suggested features for latin-alphabet languages are: TODO + The ID constants are exposed as global variables in the language module, + e.g. + + >>> from spacy.en import EN + >>> lexeme = EN.lookup(u'Nasa') + >>> lexeme.check_flag(EN.IS_UPPER) + False + >>> lexeme.check_flag(EN.OFT_UPPER) + True """ return self.flags & (1 << flag_id) - cpdef int set_flag(self, size_t flag_id) except -1: - self.flags |= (1 << flag_id) + cpdef unicode string_view(self, size_t view_id): + """Lexemes may store language-specific string-view features, obtained + by transforming the string, possibly in light of distributional information. + The string-view features are accessed by providing an ID constant to this + function. + + The ID constants are exposed as global variables in the language module, + e.g. + + >>> from spacy.en import EN + >>> lexeme = EN.lookup(u'Nasa') + >>> lexeme.string_view(EN.CANON_CASED) + u'NASA' + >>> lexeme.string_view(EN.SHAPE) + u'Xxxx' + >>> lexeme.string_view(EN.NON_SPARSE) + u'Xxxx' + """ + return self.views[view_id]