Add Token.lex

2025-11-06 02:47:29 +03:00 · 2020-08-10 16:43:52 +02:00 · 2020-08-10 16:43:52 +02:00 · c099f6eece
commit c099f6eece
parent 933a7cf8d1
3 changed files with 85 additions and 70 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -2,6 +2,7 @@ import pytest
 import numpy
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
+from spacy.lexeme import Lexeme
 from spacy.lang.en import English
 from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH

@@ -389,3 +390,11 @@ def test_doc_lang(en_vocab):
    assert doc.lang == en_vocab.strings["en"]
    assert doc[0].lang_ == "en"
    assert doc[0].lang == en_vocab.strings["en"]
+
+
+def test_token_lexeme(en_vocab):
+    """Test that tokens expose their lexeme."""
+    token = Doc(en_vocab, words=["Hello", "world"])[0]
+    assert isinstance(token.lex, Lexeme)
+    assert token.lex.text == token.text
+    assert en_vocab[token.orth] == token.lex
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -226,6 +226,11 @@ cdef class Token:
            cdef hash_t key = self.vocab.morphology.add(features)
            self.c.morph = key

+    @property
+    def lex(self):
+        """RETURNS (Lexeme): The underlying lexeme."""
+        return self.vocab[self.c.lex.orth]
+
    @property
    def lex_id(self):
        """RETURNS (int): Sequential ID of the token's lexical type."""
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -393,8 +393,9 @@ The L2 norm of the token's vector representation.
 ## Attributes {#attributes}

 | Name                                         | Type                    | Description                                                                                                                                                                                                                                                    |
-| -------------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------------------------------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `doc`                                        | `Doc`                   | The parent document.                                                                                                                                                                                                                                           |
+| `lex` <Tag variant="new">3</Tag>             | [`Lexeme`](/api/lexeme) | The underlying lexeme.                                                                                                                                                                                                                                         |
 | `sent` <Tag variant="new">2.0.12</Tag>       | `Span`                  | The sentence span that this token is a part of.                                                                                                                                                                                                                |
 | `text`                                       | str                     | Verbatim text content.                                                                                                                                                                                                                                         |
 | `text_with_ws`                               | str                     | Text content, with trailing space character if present.                                                                                                                                                                                                        |