Add Token.lex

This commit is contained in:
Ines Montani 2020-08-10 16:43:52 +02:00
parent 933a7cf8d1
commit c099f6eece
3 changed files with 85 additions and 70 deletions

View File

@ -2,6 +2,7 @@ import pytest
import numpy import numpy
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.lexeme import Lexeme
from spacy.lang.en import English from spacy.lang.en import English
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
@ -389,3 +390,11 @@ def test_doc_lang(en_vocab):
assert doc.lang == en_vocab.strings["en"] assert doc.lang == en_vocab.strings["en"]
assert doc[0].lang_ == "en" assert doc[0].lang_ == "en"
assert doc[0].lang == en_vocab.strings["en"] assert doc[0].lang == en_vocab.strings["en"]
def test_token_lexeme(en_vocab):
    """Check that Token.lex returns the Lexeme entry backing the token."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    first = doc[0]
    # The attribute must hand back a Lexeme object...
    assert isinstance(first.lex, Lexeme)
    # ...whose text matches the token's own text...
    assert first.lex.text == first.text
    # ...and which compares equal to the vocab entry looked up by orth ID.
    assert en_vocab[first.orth] == first.lex

View File

@ -226,6 +226,11 @@ cdef class Token:
cdef hash_t key = self.vocab.morphology.add(features) cdef hash_t key = self.vocab.morphology.add(features)
self.c.morph = key self.c.morph = key
@property
def lex(self):
    """Look up the vocabulary entry for this token's orth ID.

    RETURNS (Lexeme): The underlying lexeme.
    """
    orth_id = self.c.lex.orth
    return self.vocab[orth_id]
@property @property
def lex_id(self): def lex_id(self):
"""RETURNS (int): Sequential ID of the token's lexical type.""" """RETURNS (int): Sequential ID of the token's lexical type."""

View File

@ -393,8 +393,9 @@ The L2 norm of the token's vector representation.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Type | Description |
| -------------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------------------------------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. | | `doc` | `Doc` | The parent document. |
| `lex` <Tag variant="new">3</Tag> | [`Lexeme`](/api/lexeme) | The underlying lexeme. |
| `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. | | `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. |
| `text` | str | Verbatim text content. | | `text` | str | Verbatim text content. |
| `text_with_ws` | str | Text content, with trailing space character if present. | | `text_with_ws` | str | Text content, with trailing space character if present. |