Update docstrings and API docs for Lexeme

2026-01-10 10:41:14 +03:00 · 2017-05-20 15:13:42 +02:00 · 2017-05-20 15:13:42 +02:00 · 27de0834b2
commit 27de0834b2
parent 7ed8a92ed1
2 changed files with 197 additions and 135 deletions
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


 cdef class Lexeme:
-    """
-    An entry in the vocabulary.  A Lexeme has no string context --- it's a
+    """An entry in the vocabulary. A `Lexeme` has no string context – it's a
    word-type, as opposed to a word token.  It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).
    """
    def __init__(self, Vocab vocab, int orth):
-        """
-        Create a Lexeme object.
+        """Create a Lexeme object.

-        Arguments:
-            vocab (Vocab): The parent vocabulary
-            orth (int): The orth id of the lexeme.
+        vocab (Vocab): The parent vocabulary.
+        orth (int): The orth id of the lexeme.
        RETURNS (Lexeme): The newly constructed object.
        """
        self.vocab = vocab
@@ -82,35 +79,28 @@ cdef class Lexeme:
        return self.c.orth

    def set_flag(self, attr_id_t flag_id, bint value):
-        """
-        Change the value of a boolean flag.
+        """Change the value of a boolean flag.

-        Arguments:
-            flag_id (int): The attribute ID of the flag to set.
-            value (bool): The new value of the flag.
+        flag_id (int): The attribute ID of the flag to set.
+        value (bool): The new value of the flag.
        """
        Lexeme.c_set_flag(self.c, flag_id, value)

    def check_flag(self, attr_id_t flag_id):
-        """
-        Check the value of a boolean flag.
+        """Check the value of a boolean flag.

-        Arguments:
-            flag_id (int): The attribute ID of the flag to query.
-        Returns (bool): The value of the flag.
+        flag_id (int): The attribute ID of the flag to query.
+        RETURNS (bool): The value of the flag.
        """
        return True if Lexeme.c_check_flag(self.c, flag_id) else False

    def similarity(self, other):
-        """
-        Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """Compute a semantic similarity estimate. Defaults to cosine over
+        vectors.

-        Arguments:
-            other:
-                The object to compare with. By default, accepts Doc, Span,
-                Token and Lexeme objects.
-        Returns:
-            score (float): A scalar similarity score. Higher is more similar.
+        other (object): The object to compare with. By default, accepts `Doc`,
+            `Span`, `Token` and `Lexeme` objects.
+        RETURNS (float): A scalar similarity score. Higher is more similar.
        """
        if self.vector_norm == 0 or other.vector_norm == 0:
            return 0.0
@@ -140,6 +130,11 @@ cdef class Lexeme:
        self.orth = self.c.orth

    property has_vector:
+        """A boolean value indicating whether a word vector is associated with
+        the object.
+
+        RETURNS (bool): Whether a word vector is associated with the object.
+        """
        def __get__(self):
            cdef int i
            for i in range(self.vocab.vectors_length):
@@ -149,6 +144,10 @@ cdef class Lexeme:
                return False

    property vector_norm:
+        """The L2 norm of the lexeme's vector representation.
+
+        RETURNS (float): The L2 norm of the vector representation.
+        """
        def __get__(self):
            return self.c.l2_norm

@@ -156,6 +155,11 @@ cdef class Lexeme:
            self.c.l2_norm = value

    property vector:
+        """A real-valued meaning representation.
+
+        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
+            representing the lexeme's semantics.
+        """
        def __get__(self):
            cdef int length = self.vocab.vectors_length
            if length == 0:
@@ -196,6 +200,14 @@ cdef class Lexeme:
        def __get__(self):
            return self.vocab.strings[self.c.orth]

+    property text:
+        """A unicode representation of the lexeme text.
+
+        RETURNS (unicode): The original verbatim text of the lexeme.
+        """
+        def __get__(self):
+            return self.orth_
+
    property lower:
        def __get__(self): return self.c.lower
        def __set__(self, int x): self.c.lower = x
--- a/website/docs/api/lexeme.jade
+++ b/website/docs/api/lexeme.jade
@@ -2,7 +2,154 @@

 include ../../_includes/_mixins

-p An entry in the vocabulary.
+p
+    |  An entry in the vocabulary. A #[code Lexeme] has no string context – it's
+    |  a word-type, as opposed to a word token. It therefore has no
+    |  part-of-speech tag, dependency parse, or lemma (lemmatization depends
+    |  on the part-of-speech tag).
+
++h(2, "init") Lexeme.__init__
+    +tag method
+
+p Create a #[code Lexeme] object.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell The parent vocabulary.
+
+    +row
+        +cell #[code orth]
+        +cell int
+        +cell The orth id of the lexeme.
+
+    +footrow
+        +cell returns
+        +cell #[code Lexeme]
+        +cell The newly constructed object.
+
++h(2, "set_flag") Lexeme.set_flag
+    +tag method
+
+p Change the value of a boolean flag.
+
++aside-code("Example").
+    COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
+    nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code flag_id]
+        +cell int
+        +cell The attribute ID of the flag to set.
+
+    +row
+        +cell #[code value]
+        +cell bool
+        +cell The new value of the flag.
+
++h(2, "check_flag") Lexeme.check_flag
+    +tag method
+
+p Check the value of a boolean flag.
+
++aside-code("Example").
+    is_my_library = lambda text: text in ['spaCy', 'Thinc']
+    MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
+    assert nlp.vocab[u'spaCy'].check_flag(MY_LIBRARY) == True
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code flag_id]
+        +cell int
+        +cell The attribute ID of the flag to query.
+
+    +footrow
+        +cell returns
+        +cell bool
+        +cell The value of the flag.
+
++h(2, "similarity") Lexeme.similarity
+    +tag method
+    +tag-model("vectors")
+
+p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
++aside-code("Example").
+    apple = nlp.vocab[u'apple']
+    orange = nlp.vocab[u'orange']
+    apple_orange = apple.similarity(orange)
+    orange_apple = orange.similarity(apple)
+    assert apple_orange == orange_apple
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code other]
+        +cell -
+        +cell
+            |  The object to compare with. By default, accepts #[code Doc],
+            |  #[code Span], #[code Token] and #[code Lexeme] objects.
+
+    +footrow
+        +cell returns
+        +cell float
+        +cell A scalar similarity score. Higher is more similar.
+
+
++h(2, "has_vector") Lexeme.has_vector
+    +tag property
+    +tag-model("vectors")
+
+p
+    |  A boolean value indicating whether a word vector is associated with the
+    |  lexeme.
+
++aside-code("Example").
+    apple = nlp.vocab[u'apple']
+    assert apple.has_vector
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell returns
+        +cell bool
+        +cell Whether the lexeme has vector data attached.
+
++h(2, "vector") Lexeme.vector
+    +tag property
+    +tag-model("vectors")
+
+p A real-valued meaning representation.
+
++aside-code("Example").
+    apple = nlp.vocab[u'apple']
+    assert apple.vector.dtype == 'float32'
+    assert apple.vector.shape == (300,)
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell returns
+        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell A 1D numpy array representing the lexeme's semantics.
+
++h(2, "vector_norm") Lexeme.vector_norm
+    +tag property
+    +tag-model("vectors")
+
+p The L2 norm of the lexeme's vector representation.
+
++aside-code("Example").
+    apple = nlp.vocab[u'apple']
+    pasta = nlp.vocab[u'pasta']
+    apple.vector_norm # 7.1346845626831055
+    pasta.vector_norm # 7.759851932525635
+    assert apple.vector_norm != pasta.vector_norm
+
++table(["Name", "Type", "Description"])
+    +footrow
+        +cell returns
+        +cell float
+        +cell The L2 norm of the vector representation.

 +h(2, "attributes") Attributes

@@ -12,6 +159,16 @@ p An entry in the vocabulary.
        +cell #[code Vocab]
        +cell

+    +row
+        +cell #[code text]
+        +cell unicode
+        +cell Verbatim text content.
+
+    +row
+        +cell #[code lex_id]
+        +cell int
+        +cell ID of the lexeme's lexical type.
+
    +row
        +cell #[code lower]
        +cell int
@@ -124,116 +281,9 @@ p An entry in the vocabulary.
    +row
        +cell #[code prob]
        +cell float
-        +cell Smoothed log probability estimate of token's type.
+        +cell Smoothed log probability estimate of lexeme's type.

    +row
        +cell #[code sentiment]
        +cell float
-        +cell A scalar value indicating the positivity or negativity of the token.
-    +row
-        +cell #[code lex_id]
-        +cell int
-        +cell ID of the token's lexical type.
-
-    +row
-        +cell #[code text]
-        +cell unicode
-        +cell Verbatim text content.
-
-+h(2, "init") Lexeme.__init__
-    +tag method
-
-p Create a #[code Lexeme] object.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code vocab]
-        +cell #[code Vocab]
-        +cell The parent vocabulary.
-
-    +row
-        +cell #[code orth]
-        +cell int
-        +cell The orth id of the lexeme.
-
-    +footrow
-        +cell returns
-        +cell #[code Lexeme]
-        +cell The newly constructed object.
-
-+h(2, "set_flag") Lexeme.set_flag
-    +tag method
-
-p Change the value of a boolean flag.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code flag_id]
-        +cell int
-        +cell The attribute ID of the flag to set.
-
-    +row
-        +cell #[code value]
-        +cell bool
-        +cell The new value of the flag.
-
-    +footrow
-        +cell returns
-        +cell #[code None]
-        +cell -
-
-+h(2, "check_flag") Lexeme.check_flag
-    +tag method
-
-p Check the value of a boolean flag.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code flag_id]
-        +cell int
-        +cell The attribute ID of the flag to query.
-
-    +footrow
-        +cell returns
-        +cell bool
-        +cell The value of the flag.
-
-+h(2, "similarity") Lexeme.similarity
-    +tag method
-
-p Compute a semantic similarity estimate. Defaults to cosine over vectors.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code other]
-        +cell -
-        +cell
-            |  The object to compare with. By default, accepts #[code Doc],
-            |  #[code Span], #[code Token] and #[code Lexeme] objects.
-
-    +footrow
-        +cell returns
-        +cell float
-        +cell A scalar similarity score. Higher is more similar.
-
-+h(2, "vector") Lexeme.vector
-    +tag property
-
-p A real-valued meaning representation.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
-        +cell A real-valued meaning representation.
-
-+h(2, "has_vector") Lexeme.has_vector
-    +tag property
-
-p A boolean value indicating whether a word vector is associated with the object.
-
-+table(["Name", "Type", "Description"])
-    +footrow
-        +cell returns
-        +cell bool
-        +cell Whether a word vector is associated with the object.
+        +cell A scalar value indicating the positivity or negativity of the lexeme.