spaCy/spacy/lexeme.pyx

# cython: embedsignature=True
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64

from libc.string cimport memset

from .orth cimport word_shape
from .typedefs cimport attr_t, flags_t
import numpy


# Runs once at module initialisation: zero every byte of EMPTY_LEXEME
# (declared outside this file) so all of its fields start from a known
# all-zero state.
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
                              const float* empty_vec) except -1:
    """Fill in a LexemeC struct from a dict of lexical properties.

    lex: Struct to populate; its fields are overwritten in place.
    props: Must contain 'length', 'orth', 'lower', 'norm', 'shape', 'prefix',
        'suffix', 'cluster', 'prob', 'sentiment' and 'flags'.  May contain
        'senses', an iterable of non-negative integer sense IDs; each ID
        becomes one set bit in lex.senses.
    string_store: Looked up with each string-valued property; the result of
        the lookup is what gets stored on the struct.
    empty_vec: Pointer stored as lex.repvec.  It is only stored, not copied.

    Returns 0 on success.  Raises KeyError if a required key is missing, and
    ValueError if a sense ID does not fit in the lex.senses bit set.  When an
    exception is raised, fields assigned before the failure keep their new
    values.
    """
    lex.length = props['length']
    lex.orth = string_store[props['orth']]
    lex.lower = string_store[props['lower']]
    lex.norm = string_store[props['norm']]
    lex.shape = string_store[props['shape']]
    lex.prefix = string_store[props['prefix']]
    lex.suffix = string_store[props['suffix']]

    lex.cluster = props['cluster']
    lex.prob = props['prob']
    lex.sentiment = props['sentiment']

    lex.flags = props['flags']

    cdef flags_t one = 1
    cdef flags_t sense_id = 0
    # Number of bits available in the bit set; shifting by this many bits or
    # more is undefined behaviour in C, so it has to be rejected up front.
    cdef flags_t n_bits = sizeof(flags_t) * 8
    lex.senses = 0
    for _sense_id in props.get('senses', []):
        sense_id = _sense_id
        if sense_id >= n_bits:
            raise ValueError(
                "Sense ID %s is out of range: must be smaller than %s"
                % (_sense_id, n_bits))
        lex.senses |= one << sense_id
    lex.repvec = empty_vec
    return 0


cdef class Lexeme:
    """A vocabulary entry, i.e. a word *type* rather than a word token.

    Because a Lexeme carries no string context, it has no part-of-speech tag,
    no dependency parse and no lemma (lemmatization depends on the
    part-of-speech tag).
    """
    def __cinit__(self, int vec_size):
        # The bare ndarray constructor allocates without filling, so the
        # vector's contents are unspecified until something writes to it.
        vec_shape = (vec_size,)
        self.repvec = numpy.ndarray(shape=vec_shape, dtype=numpy.float32)

    @property
    def has_repvec(self):
        """True when self.l2_norm is non-zero."""
        return self.l2_norm != 0

    cpdef bint check(self, attr_id_t flag_id) except -1:
        """Return whether bit number flag_id is set in self.flags."""
        cdef flags_t mask = 1
        mask <<= flag_id
        return (self.flags & mask) != 0

    cpdef bint has_sense(self, flags_t flag_id) except -1:
        """Return whether bit number flag_id is set in self.senses."""
        cdef flags_t mask = 1
        mask <<= flag_id
        return (self.senses & mask) != 0