mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-16 06:37:04 +03:00
0f01f46e02
* Replace all basestring references with unicode `basestring` was a compatability type introduced by Cython to make dealing with utf-8 strings in Python2 easier. In Python3 it is equivalent to the unicode (or str) type. I replaced all references to basestring with unicode, since that was used elsewhere, but we could also just replace them with str, which shoudl also be equivalent. All tests pass locally. * Replace all references to unicode type with str Since we only support python3 this is simpler. * Remove all references to unicode type This removes all references to the unicode type across the codebase and replaces them with `str`, which makes it more drastic than the prior commits. In order to make this work importing `unicode_literals` had to be removed, and one explicit unicode literal also had to be removed (it is unclear why this is necessary in Cython with language level 3, but without doing it there were errors about implicit conversion). When `unicode` is used as a type in comments it was also edited to be `str`. Additionally `coding: utf8` headers were removed from a few files.
502 lines
16 KiB
Cython
502 lines
16 KiB
Cython
# cython: embedsignature=True
|
||
# Compiler crashes on memory view coercion without this. Should report bug.
|
||
from cython.view cimport array as cvarray
|
||
from libc.string cimport memset
|
||
cimport numpy as np
|
||
np.import_array()
|
||
|
||
import numpy
|
||
from thinc.api import get_array_module
|
||
import warnings
|
||
|
||
from .typedefs cimport attr_t, flags_t
|
||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
|
||
from .attrs cimport IS_CURRENCY
|
||
|
||
from .attrs import intify_attrs
|
||
from .errors import Errors, Warnings
|
||
|
||
|
||
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
|
||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||
EMPTY_LEXEME.id = OOV_RANK
|
||
|
||
|
||
cdef class Lexeme:
|
||
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a
|
||
word-type, as opposed to a word token. It therefore has no part-of-speech
|
||
tag, dependency parse, or lemma (lemmatization depends on the
|
||
part-of-speech tag).
|
||
|
||
DOCS: https://spacy.io/api/lexeme
|
||
"""
|
||
def __init__(self, Vocab vocab, attr_t orth):
|
||
"""Create a Lexeme object.
|
||
|
||
vocab (Vocab): The parent vocabulary
|
||
orth (uint64): The orth id of the lexeme.
|
||
Returns (Lexeme): The newly constructd object.
|
||
"""
|
||
self.vocab = vocab
|
||
self.orth = orth
|
||
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
|
||
if self.c.orth != orth:
|
||
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
|
||
|
||
def __richcmp__(self, other, int op):
|
||
if other is None:
|
||
if op == 0 or op == 1 or op == 2:
|
||
return False
|
||
else:
|
||
return True
|
||
if isinstance(other, Lexeme):
|
||
a = self.orth
|
||
b = other.orth
|
||
elif isinstance(other, long):
|
||
a = self.orth
|
||
b = other
|
||
elif isinstance(other, str):
|
||
a = self.orth_
|
||
b = other
|
||
else:
|
||
a = 0
|
||
b = 1
|
||
if op == 2: # ==
|
||
return a == b
|
||
elif op == 3: # !=
|
||
return a != b
|
||
elif op == 0: # <
|
||
return a < b
|
||
elif op == 1: # <=
|
||
return a <= b
|
||
elif op == 4: # >
|
||
return a > b
|
||
elif op == 5: # >=
|
||
return a >= b
|
||
else:
|
||
raise NotImplementedError(op)
|
||
|
||
def __hash__(self):
|
||
return self.c.orth
|
||
|
||
def set_attrs(self, **attrs):
|
||
cdef attr_id_t attr
|
||
attrs = intify_attrs(attrs)
|
||
for attr, value in attrs.items():
|
||
# skip PROB, e.g. from lexemes.jsonl
|
||
if isinstance(value, float):
|
||
continue
|
||
elif isinstance(value, (int, long)):
|
||
Lexeme.set_struct_attr(self.c, attr, value)
|
||
else:
|
||
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
|
||
|
||
def set_flag(self, attr_id_t flag_id, bint value):
|
||
"""Change the value of a boolean flag.
|
||
|
||
flag_id (int): The attribute ID of the flag to set.
|
||
value (bool): The new value of the flag.
|
||
"""
|
||
Lexeme.c_set_flag(self.c, flag_id, value)
|
||
|
||
def check_flag(self, attr_id_t flag_id):
|
||
"""Check the value of a boolean flag.
|
||
|
||
flag_id (int): The attribute ID of the flag to query.
|
||
RETURNS (bool): The value of the flag.
|
||
"""
|
||
return True if Lexeme.c_check_flag(self.c, flag_id) else False
|
||
|
||
def similarity(self, other):
|
||
"""Compute a semantic similarity estimate. Defaults to cosine over
|
||
vectors.
|
||
|
||
other (object): The object to compare with. By default, accepts `Doc`,
|
||
`Span`, `Token` and `Lexeme` objects.
|
||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||
"""
|
||
# Return 1.0 similarity for matches
|
||
if hasattr(other, "orth"):
|
||
if self.c.orth == other.orth:
|
||
return 1.0
|
||
elif hasattr(other, "__len__") and len(other) == 1 \
|
||
and hasattr(other[0], "orth"):
|
||
if self.c.orth == other[0].orth:
|
||
return 1.0
|
||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||
warnings.warn(Warnings.W008.format(obj="Lexeme"))
|
||
return 0.0
|
||
vector = self.vector
|
||
xp = get_array_module(vector)
|
||
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
|
||
|
||
@property
|
||
def has_vector(self):
|
||
"""RETURNS (bool): Whether a word vector is associated with the object.
|
||
"""
|
||
return self.vocab.has_vector(self.c.orth)
|
||
|
||
@property
|
||
def vector_norm(self):
|
||
"""RETURNS (float): The L2 norm of the vector representation."""
|
||
vector = self.vector
|
||
return numpy.sqrt((vector**2).sum())
|
||
|
||
property vector:
|
||
"""A real-valued meaning representation.
|
||
|
||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||
representing the lexeme's semantics.
|
||
"""
|
||
def __get__(self):
|
||
cdef int length = self.vocab.vectors_length
|
||
if length == 0:
|
||
raise ValueError(Errors.E010)
|
||
return self.vocab.get_vector(self.c.orth)
|
||
|
||
def __set__(self, vector):
|
||
if len(vector) != self.vocab.vectors_length:
|
||
raise ValueError(Errors.E073.format(new_length=len(vector),
|
||
length=self.vocab.vectors_length))
|
||
self.vocab.set_vector(self.c.orth, vector)
|
||
|
||
property rank:
|
||
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
|
||
to index into tables, e.g. for word vectors."""
|
||
def __get__(self):
|
||
return self.c.id
|
||
|
||
def __set__(self, value):
|
||
self.c.id = value
|
||
|
||
property sentiment:
|
||
"""RETURNS (float): A scalar value indicating the positivity or
|
||
negativity of the lexeme."""
|
||
def __get__(self):
|
||
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
|
||
return sentiment_table.get(self.c.orth, 0.0)
|
||
|
||
def __set__(self, float x):
|
||
if "lexeme_sentiment" not in self.vocab.lookups:
|
||
self.vocab.lookups.add_table("lexeme_sentiment")
|
||
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
|
||
sentiment_table[self.c.orth] = x
|
||
|
||
@property
|
||
def orth_(self):
|
||
"""RETURNS (str): The original verbatim text of the lexeme
|
||
(identical to `Lexeme.text`). Exists mostly for consistency with
|
||
the other attributes."""
|
||
return self.vocab.strings[self.c.orth]
|
||
|
||
@property
|
||
def text(self):
|
||
"""RETURNS (str): The original verbatim text of the lexeme."""
|
||
return self.orth_
|
||
|
||
property lower:
|
||
"""RETURNS (str): Lowercase form of the lexeme."""
|
||
def __get__(self):
|
||
return self.c.lower
|
||
|
||
def __set__(self, attr_t x):
|
||
self.c.lower = x
|
||
|
||
property norm:
|
||
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
|
||
lexeme text.
|
||
"""
|
||
def __get__(self):
|
||
return self.c.norm
|
||
|
||
def __set__(self, attr_t x):
|
||
if "lexeme_norm" not in self.vocab.lookups:
|
||
self.vocab.lookups.add_table("lexeme_norm")
|
||
norm_table = self.vocab.lookups.get_table("lexeme_norm")
|
||
norm_table[self.c.orth] = self.vocab.strings[x]
|
||
self.c.norm = x
|
||
|
||
property shape:
|
||
"""RETURNS (uint64): Transform of the word's string, to show
|
||
orthographic features.
|
||
"""
|
||
def __get__(self):
|
||
return self.c.shape
|
||
|
||
def __set__(self, attr_t x):
|
||
self.c.shape = x
|
||
|
||
property prefix:
|
||
"""RETURNS (uint64): Length-N substring from the start of the word.
|
||
Defaults to `N=1`.
|
||
"""
|
||
def __get__(self):
|
||
return self.c.prefix
|
||
|
||
def __set__(self, attr_t x):
|
||
self.c.prefix = x
|
||
|
||
property suffix:
|
||
"""RETURNS (uint64): Length-N substring from the end of the word.
|
||
Defaults to `N=3`.
|
||
"""
|
||
def __get__(self):
|
||
return self.c.suffix
|
||
|
||
def __set__(self, attr_t x):
|
||
self.c.suffix = x
|
||
|
||
property cluster:
|
||
"""RETURNS (int): Brown cluster ID."""
|
||
def __get__(self):
|
||
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
||
return cluster_table.get(self.c.orth, 0)
|
||
|
||
def __set__(self, int x):
|
||
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
||
cluster_table[self.c.orth] = x
|
||
|
||
property lang:
|
||
"""RETURNS (uint64): Language of the parent vocabulary."""
|
||
def __get__(self):
|
||
return self.c.lang
|
||
|
||
def __set__(self, attr_t x):
|
||
self.c.lang = x
|
||
|
||
property prob:
|
||
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
|
||
type."""
|
||
def __get__(self):
|
||
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
||
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
|
||
default_oov_prob = settings_table.get("oov_prob", -20.0)
|
||
return prob_table.get(self.c.orth, default_oov_prob)
|
||
|
||
def __set__(self, float x):
|
||
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
||
prob_table[self.c.orth] = x
|
||
|
||
property lower_:
|
||
"""RETURNS (str): Lowercase form of the word."""
|
||
def __get__(self):
|
||
return self.vocab.strings[self.c.lower]
|
||
|
||
def __set__(self, str x):
|
||
self.c.lower = self.vocab.strings.add(x)
|
||
|
||
property norm_:
|
||
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
|
||
lexeme text.
|
||
"""
|
||
def __get__(self):
|
||
return self.vocab.strings[self.c.norm]
|
||
|
||
def __set__(self, str x):
|
||
self.norm = self.vocab.strings.add(x)
|
||
|
||
property shape_:
|
||
"""RETURNS (str): Transform of the word's string, to show
|
||
orthographic features.
|
||
"""
|
||
def __get__(self):
|
||
return self.vocab.strings[self.c.shape]
|
||
|
||
def __set__(self, str x):
|
||
self.c.shape = self.vocab.strings.add(x)
|
||
|
||
property prefix_:
|
||
"""RETURNS (str): Length-N substring from the start of the word.
|
||
Defaults to `N=1`.
|
||
"""
|
||
def __get__(self):
|
||
return self.vocab.strings[self.c.prefix]
|
||
|
||
def __set__(self, str x):
|
||
self.c.prefix = self.vocab.strings.add(x)
|
||
|
||
property suffix_:
|
||
"""RETURNS (str): Length-N substring from the end of the word.
|
||
Defaults to `N=3`.
|
||
"""
|
||
def __get__(self):
|
||
return self.vocab.strings[self.c.suffix]
|
||
|
||
def __set__(self, str x):
|
||
self.c.suffix = self.vocab.strings.add(x)
|
||
|
||
property lang_:
|
||
"""RETURNS (str): Language of the parent vocabulary."""
|
||
def __get__(self):
|
||
return self.vocab.strings[self.c.lang]
|
||
|
||
def __set__(self, str x):
|
||
self.c.lang = self.vocab.strings.add(x)
|
||
|
||
property flags:
|
||
"""RETURNS (uint64): Container of the lexeme's binary flags."""
|
||
def __get__(self):
|
||
return self.c.flags
|
||
|
||
def __set__(self, flags_t x):
|
||
self.c.flags = x
|
||
|
||
@property
|
||
def is_oov(self):
|
||
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
|
||
return self.orth not in self.vocab.vectors
|
||
|
||
property is_stop:
|
||
"""RETURNS (bool): Whether the lexeme is a stop word."""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_STOP)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_STOP, x)
|
||
|
||
property is_alpha:
|
||
"""RETURNS (bool): Whether the lexeme consists of alphabetic
|
||
characters. Equivalent to `lexeme.text.isalpha()`.
|
||
"""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_ALPHA)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
|
||
|
||
property is_ascii:
|
||
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
|
||
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
|
||
"""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_ASCII)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_ASCII, x)
|
||
|
||
property is_digit:
|
||
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
|
||
to `lexeme.text.isdigit()`.
|
||
"""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_DIGIT)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
|
||
|
||
property is_lower:
|
||
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
|
||
`lexeme.text.islower()`.
|
||
"""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_LOWER)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_LOWER, x)
|
||
|
||
property is_upper:
|
||
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
|
||
`lexeme.text.isupper()`.
|
||
"""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_UPPER)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_UPPER, x)
|
||
|
||
property is_title:
|
||
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
|
||
`lexeme.text.istitle()`.
|
||
"""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_TITLE)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_TITLE, x)
|
||
|
||
property is_punct:
|
||
"""RETURNS (bool): Whether the lexeme is punctuation."""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_PUNCT)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
|
||
|
||
property is_space:
|
||
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
|
||
Equivalent to `lexeme.text.isspace()`.
|
||
"""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_SPACE)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
||
|
||
property is_bracket:
|
||
"""RETURNS (bool): Whether the lexeme is a bracket."""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
||
|
||
property is_quote:
|
||
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
||
|
||
property is_left_punct:
|
||
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
||
|
||
property is_right_punct:
|
||
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||
|
||
property is_currency:
|
||
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
||
|
||
property like_url:
|
||
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||
|
||
property like_num:
|
||
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
||
"10", "ten", etc.
|
||
"""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
||
|
||
property like_email:
|
||
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
||
def __get__(self):
|
||
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
||
|
||
def __set__(self, bint x):
|
||
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|