spaCy/spacy/lexeme.pyx
2023-09-28 17:09:41 +02:00

522 lines
16 KiB
Cython
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# cython: embedsignature=True
# cython: profile=False
# Compiler crashes on memory view coercion without this. Should report bug.
cimport numpy as np
from libc.string cimport memset
np.import_array()
import warnings
import numpy
from thinc.api import get_array_module
from .attrs cimport (
IS_ALPHA,
IS_ASCII,
IS_BRACKET,
IS_CURRENCY,
IS_DIGIT,
IS_LEFT_PUNCT,
IS_LOWER,
IS_PUNCT,
IS_QUOTE,
IS_RIGHT_PUNCT,
IS_SPACE,
IS_STOP,
IS_TITLE,
IS_UPPER,
LIKE_EMAIL,
LIKE_NUM,
LIKE_URL,
)
from .typedefs cimport attr_t, flags_t
from .attrs import intify_attrs
from .errors import Errors, Warnings
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK
cdef class Lexeme:
"""An entry in the vocabulary. A `Lexeme` has no string context it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the
part-of-speech tag).
DOCS: https://spacy.io/api/lexeme
"""
def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object.
vocab (Vocab): The parent vocabulary
orth (uint64): The orth id of the lexeme.
Returns (Lexeme): The newly constructd object.
"""
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
def __richcmp__(self, other, int op):
if other is None:
if op == 0 or op == 1 or op == 2:
return False
else:
return True
if isinstance(other, Lexeme):
a = self.orth
b = other.orth
elif isinstance(other, long):
a = self.orth
b = other
elif isinstance(other, str):
a = self.orth_
b = other
else:
a = 0
b = 1
if op == 2: # ==
return a == b
elif op == 3: # !=
return a != b
elif op == 0: # <
return a < b
elif op == 1: # <=
return a <= b
elif op == 4: # >
return a > b
elif op == 5: # >=
return a >= b
else:
raise NotImplementedError(op)
def __hash__(self):
return self.c.orth
def set_attrs(self, **attrs):
cdef attr_id_t attr
attrs = intify_attrs(attrs)
for attr, value in attrs.items():
# skip PROB, e.g. from lexemes.jsonl
if isinstance(value, float):
continue
elif isinstance(value, (int, long)):
Lexeme.set_struct_attr(self.c, attr, value)
else:
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
def set_flag(self, attr_id_t flag_id, bint value):
"""Change the value of a boolean flag.
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
"""
Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id):
"""Check the value of a boolean flag.
flag_id (int): The attribute ID of the flag to query.
RETURNS (bool): The value of the flag.
"""
return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other):
"""Compute a semantic similarity estimate. Defaults to cosine over
vectors.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
# Return 1.0 similarity for matches
if hasattr(other, "orth"):
if self.c.orth == other.orth:
return 1.0
elif (
hasattr(other, "__len__") and len(other) == 1
and hasattr(other[0], "orth")
and self.c.orth == other[0].orth
):
return 1.0
if self.vector_norm == 0 or other.vector_norm == 0:
warnings.warn(Warnings.W008.format(obj="Lexeme"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
@property
def has_vector(self):
"""RETURNS (bool): Whether a word vector is associated with the object.
"""
return self.vocab.has_vector(self.c.orth)
@property
def vector_norm(self):
"""RETURNS (float): The L2 norm of the vector representation."""
vector = self.vector
return numpy.sqrt((vector**2).sum())
property vector:
"""A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics.
"""
def __get__(self):
cdef int length = self.vocab.vectors_length
if length == 0:
raise ValueError(Errors.E010)
return self.vocab.get_vector(self.c.orth)
def __set__(self, vector):
if len(vector) != self.vocab.vectors_length:
raise ValueError(Errors.E073.format(new_length=len(vector),
length=self.vocab.vectors_length))
self.vocab.set_vector(self.c.orth, vector)
property rank:
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.id
def __set__(self, value):
self.c.id = value
property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme."""
def __get__(self):
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
return sentiment_table.get(self.c.orth, 0.0)
def __set__(self, float x):
if "lexeme_sentiment" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_sentiment")
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
sentiment_table[self.c.orth] = x
@property
def orth_(self):
"""RETURNS (str): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
return self.vocab.strings[self.c.orth]
@property
def text(self):
"""RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_
property lower:
"""RETURNS (uint64): Lowercase form of the lexeme."""
def __get__(self):
return self.c.lower
def __set__(self, attr_t x):
self.c.lower = x
property norm:
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.c.norm
def __set__(self, attr_t x):
if "lexeme_norm" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_norm")
norm_table = self.vocab.lookups.get_table("lexeme_norm")
norm_table[self.c.orth] = self.vocab.strings[x]
self.c.norm = x
property shape:
"""RETURNS (uint64): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
return self.c.shape
def __set__(self, attr_t x):
self.c.shape = x
property prefix:
"""RETURNS (uint64): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
return self.c.prefix
def __set__(self, attr_t x):
self.c.prefix = x
property suffix:
"""RETURNS (uint64): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
return self.c.suffix
def __set__(self, attr_t x):
self.c.suffix = x
property cluster:
"""RETURNS (int): Brown cluster ID."""
def __get__(self):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
return cluster_table.get(self.c.orth, 0)
def __set__(self, int x):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
cluster_table[self.c.orth] = x
property lang:
"""RETURNS (uint64): Language of the parent vocabulary."""
def __get__(self):
return self.c.lang
def __set__(self, attr_t x):
self.c.lang = x
property prob:
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
type."""
def __get__(self):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
default_oov_prob = settings_table.get("oov_prob", -20.0)
return prob_table.get(self.c.orth, default_oov_prob)
def __set__(self, float x):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
prob_table[self.c.orth] = x
property lower_:
"""RETURNS (str): Lowercase form of the word."""
def __get__(self):
return self.vocab.strings[self.c.lower]
def __set__(self, str x):
self.c.lower = self.vocab.strings.add(x)
property norm_:
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.vocab.strings[self.c.norm]
def __set__(self, str x):
self.norm = self.vocab.strings.add(x)
property shape_:
"""RETURNS (str): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
return self.vocab.strings[self.c.shape]
def __set__(self, str x):
self.c.shape = self.vocab.strings.add(x)
property prefix_:
"""RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
return self.vocab.strings[self.c.prefix]
def __set__(self, str x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
"""RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
return self.vocab.strings[self.c.suffix]
def __set__(self, str x):
self.c.suffix = self.vocab.strings.add(x)
property lang_:
"""RETURNS (str): Language of the parent vocabulary."""
def __get__(self):
return self.vocab.strings[self.c.lang]
def __set__(self, str x):
self.c.lang = self.vocab.strings.add(x)
property flags:
"""RETURNS (uint64): Container of the lexeme's binary flags."""
def __get__(self):
return self.c.flags
def __set__(self, flags_t x):
self.c.flags = x
@property
def is_oov(self):
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
return self.orth not in self.vocab.vectors
property is_stop:
"""RETURNS (bool): Whether the lexeme is a stop word."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_STOP)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
property is_alpha:
"""RETURNS (bool): Whether the lexeme consists of alphabetic
characters. Equivalent to `lexeme.text.isalpha()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ALPHA)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
property is_ascii:
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ASCII)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
property is_digit:
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
to `lexeme.text.isdigit()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_DIGIT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
property is_lower:
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
`lexeme.text.islower()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LOWER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
property is_upper:
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_UPPER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
property is_title:
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
`lexeme.text.istitle()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_TITLE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
property is_punct:
"""RETURNS (bool): Whether the lexeme is punctuation."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
property is_space:
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
Equivalent to `lexeme.text.isspace()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
property is_bracket:
"""RETURNS (bool): Whether the lexeme is a bracket."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
property is_quote:
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
property is_left_punct:
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
property is_right_punct:
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property is_currency:
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
property like_url:
"""RETURNS (bool): Whether the lexeme resembles a URL."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
property like_num:
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
"10", "ten", etc.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_NUM)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
property like_email:
"""RETURNS (bool): Whether the lexeme resembles an email address."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)