spaCy/spacy/lexeme.pyx
2024-05-14 17:42:48 +02:00

543 lines
16 KiB
Cython
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# cython: embedsignature=True
# cython: profile=False
# Compiler crashes on memory view coercion without this. Should report bug.
cimport numpy as np
from libc.string cimport memset
np.import_array()
import warnings
import numpy
from thinc.api import get_array_module
from .attrs cimport (
IS_ALPHA,
IS_ASCII,
IS_BRACKET,
IS_CURRENCY,
IS_DIGIT,
IS_LEFT_PUNCT,
IS_LOWER,
IS_PUNCT,
IS_QUOTE,
IS_RIGHT_PUNCT,
IS_SPACE,
IS_STOP,
IS_TITLE,
IS_UPPER,
LIKE_EMAIL,
LIKE_NUM,
LIKE_URL,
)
from .typedefs cimport attr_t, flags_t
from .attrs import intify_attrs
from .errors import Errors, Warnings
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK
cdef class Lexeme:
"""An entry in the vocabulary. A `Lexeme` has no string context it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the
part-of-speech tag).
DOCS: https://spacy.io/api/lexeme
"""
def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object.
vocab (Vocab): The parent vocabulary
orth (uint64): The orth id of the lexeme.
Returns (Lexeme): The newly constructd object.
"""
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
def __richcmp__(self, other, int op):
if other is None:
if op == 0 or op == 1 or op == 2:
return False
else:
return True
if isinstance(other, Lexeme):
a = self.orth
b = other.orth
elif isinstance(other, long):
a = self.orth
b = other
elif isinstance(other, str):
a = self.orth_
b = other
else:
a = 0
b = 1
if op == 2: # ==
return a == b
elif op == 3: # !=
return a != b
elif op == 0: # <
return a < b
elif op == 1: # <=
return a <= b
elif op == 4: # >
return a > b
elif op == 5: # >=
return a >= b
else:
raise NotImplementedError(op)
def __hash__(self):
return self.c.orth
def set_attrs(self, **attrs):
cdef attr_id_t attr
attrs = intify_attrs(attrs)
for attr, value in attrs.items():
# skip PROB, e.g. from lexemes.jsonl
if isinstance(value, float):
continue
elif isinstance(value, (int, long)):
Lexeme.set_struct_attr(self.c, attr, value)
else:
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
def set_flag(self, attr_id_t flag_id, bint value):
"""Change the value of a boolean flag.
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
"""
Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id):
"""Check the value of a boolean flag.
flag_id (int): The attribute ID of the flag to query.
RETURNS (bool): The value of the flag.
"""
return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other):
"""Compute a semantic similarity estimate. Defaults to cosine over
vectors.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
# Return 1.0 similarity for matches
if hasattr(other, "orth"):
if self.c.orth == other.orth:
return 1.0
elif (
hasattr(other, "__len__") and len(other) == 1
and hasattr(other[0], "orth")
and self.c.orth == other[0].orth
):
return 1.0
if self.vector_norm == 0 or other.vector_norm == 0:
warnings.warn(Warnings.W008.format(obj="Lexeme"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
@property
def has_vector(self):
"""RETURNS (bool): Whether a word vector is associated with the object.
"""
return self.vocab.has_vector(self.c.orth)
@property
def vector_norm(self):
"""RETURNS (float): The L2 norm of the vector representation."""
vector = self.vector
return numpy.sqrt((vector**2).sum())
@property
def vector(self):
"""A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics.
"""
cdef int length = self.vocab.vectors_length
if length == 0:
raise ValueError(Errors.E010)
return self.vocab.get_vector(self.c.orth)
@vector.setter
def vector(self, vector):
if len(vector) != self.vocab.vectors_length:
raise ValueError(Errors.E073.format(new_length=len(vector),
length=self.vocab.vectors_length))
self.vocab.set_vector(self.c.orth, vector)
@property
def rank(self):
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors."""
return self.c.id
@rank.setter
def rank(self, value):
self.c.id = value
@property
def orth_(self):
"""RETURNS (str): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
return self.vocab.strings[self.c.orth]
@property
def text(self):
"""RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_
@property
def lower(self):
"""RETURNS (uint64): Lowercase form of the lexeme."""
return self.c.lower
@lower.setter
def lower(self, attr_t x):
self.c.lower = x
@property
def norm(self):
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
return self.c.norm
@norm.setter
def norm(self, attr_t x):
if "lexeme_norm" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_norm")
norm_table = self.vocab.lookups.get_table("lexeme_norm")
norm_table[self.c.orth] = self.vocab.strings[x]
self.c.norm = x
@property
def shape(self):
"""RETURNS (uint64): Transform of the word's string, to show
orthographic features.
"""
return self.c.shape
@shape.setter
def shape(self, attr_t x):
self.c.shape = x
@property
def prefix(self):
"""RETURNS (uint64): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
return self.c.prefix
@prefix.setter
def prefix(self, attr_t x):
self.c.prefix = x
@property
def suffix(self):
"""RETURNS (uint64): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
return self.c.suffix
@suffix.setter
def suffix(self, attr_t x):
self.c.suffix = x
@property
def cluster(self):
"""RETURNS (int): Brown cluster ID."""
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
return cluster_table.get(self.c.orth, 0)
@cluster.setter
def cluster(self, int x):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
cluster_table[self.c.orth] = x
@property
def lang(self):
"""RETURNS (uint64): Language of the parent vocabulary."""
return self.c.lang
@lang.setter
def lang(self, attr_t x):
self.c.lang = x
@property
def prob(self):
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
type."""
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
default_oov_prob = settings_table.get("oov_prob", -20.0)
return prob_table.get(self.c.orth, default_oov_prob)
@prob.setter
def prob(self, float x):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
prob_table[self.c.orth] = x
@property
def lower_(self):
"""RETURNS (str): Lowercase form of the word."""
return self.vocab.strings[self.c.lower]
@lower_.setter
def lower_(self, str x):
self.c.lower = self.vocab.strings.add(x)
@property
def norm_(self):
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
return self.vocab.strings[self.c.norm]
@norm_.setter
def norm_(self, str x):
self.norm = self.vocab.strings.add(x)
@property
def shape_(self):
"""RETURNS (str): Transform of the word's string, to show
orthographic features.
"""
return self.vocab.strings[self.c.shape]
@shape_.setter
def shape_(self, str x):
self.c.shape = self.vocab.strings.add(x)
@property
def prefix_(self):
"""RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
return self.vocab.strings[self.c.prefix]
@prefix_.setter
def prefix_(self, str x):
self.c.prefix = self.vocab.strings.add(x)
@property
def suffix_(self):
"""RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
return self.vocab.strings[self.c.suffix]
@suffix_.setter
def suffix_(self, str x):
self.c.suffix = self.vocab.strings.add(x)
@property
def lang_(self):
"""RETURNS (str): Language of the parent vocabulary."""
return self.vocab.strings[self.c.lang]
@lang_.setter
def lang_(self, str x):
self.c.lang = self.vocab.strings.add(x)
@property
def flags(self):
"""RETURNS (uint64): Container of the lexeme's binary flags."""
return self.c.flags
@flags.setter
def flags(self, flags_t x):
self.c.flags = x
@property
def is_oov(self):
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
return self.orth not in self.vocab.vectors
@property
def is_stop(self):
"""RETURNS (bool): Whether the lexeme is a stop word."""
return Lexeme.c_check_flag(self.c, IS_STOP)
@is_stop.setter
def is_stop(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
@property
def is_alpha(self):
"""RETURNS (bool): Whether the lexeme consists of alphabetic
characters. Equivalent to `lexeme.text.isalpha()`.
"""
return Lexeme.c_check_flag(self.c, IS_ALPHA)
@is_alpha.setter
def is_alpha(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
@property
def is_ascii(self):
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
"""
return Lexeme.c_check_flag(self.c, IS_ASCII)
@is_ascii.setter
def is_ascii(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
@property
def is_digit(self):
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
to `lexeme.text.isdigit()`.
"""
return Lexeme.c_check_flag(self.c, IS_DIGIT)
@is_digit.setter
def is_digit(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
@property
def is_lower(self):
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
`lexeme.text.islower()`.
"""
return Lexeme.c_check_flag(self.c, IS_LOWER)
@is_lower.setter
def is_lower(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
@property
def is_upper(self):
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`.
"""
return Lexeme.c_check_flag(self.c, IS_UPPER)
@is_upper.setter
def is_upper(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
@property
def is_title(self):
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
`lexeme.text.istitle()`.
"""
return Lexeme.c_check_flag(self.c, IS_TITLE)
@is_title.setter
def is_title(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
@property
def is_punct(self):
"""RETURNS (bool): Whether the lexeme is punctuation."""
return Lexeme.c_check_flag(self.c, IS_PUNCT)
@is_punct.setter
def is_punct(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
@property
def is_space(self):
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
Equivalent to `lexeme.text.isspace()`.
"""
return Lexeme.c_check_flag(self.c, IS_SPACE)
@is_space.setter
def is_space(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
@property
def is_bracket(self):
"""RETURNS (bool): Whether the lexeme is a bracket."""
return Lexeme.c_check_flag(self.c, IS_BRACKET)
@is_bracket.setter
def is_bracket(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
@property
def is_quote(self):
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
return Lexeme.c_check_flag(self.c, IS_QUOTE)
@is_quote.setter
def is_quote(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
@property
def is_left_punct(self):
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
@is_left_punct.setter
def is_left_punct(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
@property
def is_right_punct(self):
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
@is_right_punct.setter
def is_right_punct(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
@property
def is_currency(self):
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
@is_currency.setter
def is_currency(self, bint x):
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
@property
def like_url(self):
"""RETURNS (bool): Whether the lexeme resembles a URL."""
return Lexeme.c_check_flag(self.c, LIKE_URL)
@like_url.setter
def like_url(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
@property
def like_num(self):
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
"10", "ten", etc.
"""
return Lexeme.c_check_flag(self.c, LIKE_NUM)
@like_num.setter
def like_num(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
@property
def like_email(self):
"""RETURNS (bool): Whether the lexeme resembles an email address."""
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
@like_email.setter
def like_email(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)