* More docs

2026-02-21 22:50:57 +03:00 · 2014-08-29 03:01:40 +02:00 · 2014-08-29 03:01:40 +02:00 · 4e5b2d47e2
commit 4e5b2d47e2
parent 5233f110c4
2 changed files with 41 additions and 19 deletions
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@ -17,4 +17,4 @@ cdef class Lexeme:
    cdef readonly flag_t flags

    cpdef bint check_flag(self, size_t flag_id) except *
-    cpdef int set_flag(self, size_t flag_id) except -1
+    cpdef unicode string_view(self, size_t view_id)
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@ -5,20 +5,17 @@
 from libc.stdlib cimport calloc, free, realloc

 cdef class Lexeme:
-    """A lexical type.
+    """A lexical type --- a word, punctuation symbol, whitespace sequence, etc.,
+    keyed by a case-sensitive unicode string. All tokens with the same string,
+    e.g. all instances of "dog", ",", "NASA", etc., should be mapped to the same
+    Lexeme.

-    Clients should avoid instantiating Lexemes directly, and instead use get_lexeme
-    from a language module, e.g. spacy.en.get_lexeme . This allows us to use only
-    one Lexeme object per lexical type.
+    You should avoid instantiating Lexemes directly, and instead use the
+    :py:meth:`spacy.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup`
+    methods on the global object exposed by the language you're working with,
+    e.g. :py:data:`spacy.en.EN`.

    Attributes:
-        id (view_id_t):
-            A unique ID of the word's string.
-
-            Implemented as the memory-address of the string,
-            as we use Python's string interning to guarantee that only one copy
-            of each string is seen.
-
        string (unicode):
            The unicode string.
            
@ -34,7 +31,7 @@ cdef class Lexeme:
            simple Good-Turing.  Estimates are read from data/en/probabilities, and
            can be replaced using spacy.en.load_probabilities.
        
-        cluster (int):
+        cluster (size_t):
            An integer representation of the word's Brown cluster.

            A Brown cluster is an address into a binary tree, which gives some (noisy)
@ -62,18 +59,43 @@ cdef class Lexeme:

        for i, flag_feature in enumerate(flag_features):
            if flag_feature(string, prob, case_stats, tag_stats):
-                self.set_flag(i)
+                self.flags |= (1 << i)

    def __dealloc__(self):
        pass

    cpdef bint check_flag(self, size_t flag_id) except *:
-        """Access the value of one of the pre-computed boolean distribution features.
+        """Lexemes may store language-specific boolean features in a bit-field,
+        with values accessed by providing an ID constant to this function.

-        Meanings depend on the language-specific distributional features being loaded.
-        The suggested features for latin-alphabet languages are: TODO
+        The ID constants are exposed as global variables in the language module,
+        e.g.
+
+        >>> from spacy.en import EN
+        >>> lexeme = EN.lookup(u'Nasa')
+        >>> lexeme.check_flag(EN.IS_UPPER)
+        False
+        >>> lexeme.check_flag(EN.OFT_UPPER)
+        True
        """
        return self.flags & (1 << flag_id)

-    cpdef int set_flag(self, size_t flag_id) except -1:
-        self.flags |= (1 << flag_id)
+    cpdef unicode string_view(self, size_t view_id):
+        """Lexemes may store language-specific string-view features, obtained
+        by transforming the string, possibly in light of distributional information.
+        The string-view features are accessed by providing an ID constant to this
+        function.
+
+        The ID constants are exposed as global variables in the language module,
+        e.g.
+
+        >>> from spacy.en import EN
+        >>> lexeme = EN.lookup(u'Nasa')
+        >>> lexeme.string_view(EN.CANON_CASED)
+        u'NASA'
+        >>> lexeme.string_view(EN.SHAPE)
+        u'Xxxx'
+        >>> lexeme.string_view(EN.NON_SPARSE)
+        u'Xxxx'
+        """
+        return self.views[view_id]