* More docs

2025-06-27 00:13:07 +03:00 · 2014-08-29 03:01:40 +02:00 · 2014-08-29 03:01:40 +02:00 · 4e5b2d47e2
commit 4e5b2d47e2
parent 5233f110c4
2 changed files with 41 additions and 19 deletions
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@ -17,4 +17,4 @@ cdef class Lexeme:
    cdef readonly flag_t flags
    cpdef bint check_flag(self, size_t flag_id) except *
-    cpdef int set_flag(self, size_t flag_id) except -1
+    cpdef unicode string_view(self, size_t view_id)
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@ -5,20 +5,17 @@
 from libc.stdlib cimport calloc, free, realloc
 cdef class Lexeme:
-    """A lexical type.
+    """A lexical type --- a word, punctuation symbol, whitespace sequence, etc. ---
    keyed by a case-sensitive unicode string. All tokens with the same string,
    e.g. all instances of "dog", ",", "NASA" etc should be mapped to the same
    Lexeme.
-    Clients should avoid instantiating Lexemes directly, and instead use get_lexeme
+    You should avoid instantiating Lexemes directly, and instead use the
-    from a language module, e.g. spacy.en.get_lexeme . This allows us to use only
+    :py:meth:`spacy.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup`
-    one Lexeme object per lexical type.
+    methods on the global object exposed by the language you're working with,
    e.g. :py:data:`spacy.en.EN`.
    Attributes:
        id (view_id_t):
            A unique ID of the word's string.
            Implemented as the memory-address of the string,
            as we use Python's string interning to guarantee that only one copy
            of each string is seen.
        string (unicode):
            The unicode string.
@ -34,7 +31,7 @@ cdef class Lexeme:
            simple Good-Turing.  Estimates are read from data/en/probabilities, and
            can be replaced using spacy.en.load_probabilities.
-        cluster (int):
+        cluster (size_t):
            An integer representation of the word's Brown cluster.
            A Brown cluster is an address into a binary tree, which gives some (noisy)
@ -62,18 +59,43 @@ cdef class Lexeme:
        for i, flag_feature in enumerate(flag_features):
            if flag_feature(string, prob, case_stats, tag_stats):
-                self.set_flag(i)
+                self.flags |= (1 << i)
    def __dealloc__(self):
        pass
    cpdef bint check_flag(self, size_t flag_id) except *:
-        """Access the value of one of the pre-computed boolean distribution features.
+        """Lexemes may store language-specific boolean features in a bit-field,
        with values accessed by providing an ID constant to this function.
-        Meanings depend on the language-specific distributional features being loaded.
+        The ID constants are exposed as global variables in the language module,
-        The suggested features for latin-alphabet languages are: TODO
+        e.g.
        >>> from spacy.en import EN
        >>> lexeme = EN.lookup(u'Nasa')
        >>> lexeme.check_flag(EN.IS_UPPER)
        False
        >>> lexeme.check_flag(EN.OFT_UPPER)
        True
        """
        return self.flags & (1 << flag_id)
-    cpdef int set_flag(self, size_t flag_id) except -1:
+    cpdef unicode string_view(self, size_t view_id):
-        self.flags |= (1 << flag_id)
+        """Lexemes may store language-specific string-view features, obtained
        by transforming the string, possibly in light of distributional information.
        The string-view features are accessed by providing an ID constant to this
        function.
        The ID constants are exposed as global variables in the language module,
        e.g.
        >>> from spacy.en import EN
        >>> lexeme = EN.lookup(u'Nasa')
        >>> lexeme.string_view(EN.CANON_CASED)
        u'NASA'
        >>> lexeme.string_view(EN.SHAPE)
        u'Xxxx'
        >>> lexeme.string_view(EN.NON_SPARSE)
        u'Xxxx'
        """
        return self.views[view_id]