mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-01 04:46:38 +03:00
a5cd203284
* Reduce stored lexemes data, move feats to lookups * Move non-derivable lexemes features (`norm / cluster / prob`) to `spacy-lookups-data` as lookups * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in lookups only * Remove serialization of lexemes data as `vocab/lexemes.bin` * Remove `SerializedLexemeC` * Remove `Lexeme.to_bytes/from_bytes` * Modify normalization exception loading: * Always create `Vocab.lookups` table `lexeme_norm` for normalization exceptions * Load base exceptions from `lang.norm_exceptions`, but load language-specific exceptions from lookups * Set `lex_attr_getter[NORM]` including new lookups table in `BaseDefaults.create_vocab()` and when deserializing `Vocab` * Remove all cached lexemes when deserializing vocab to override existing normalizations with the new normalizations (as a replacement for the previous step that replaced all lexemes data with the deserialized data) * Skip English normalization test Skip English normalization test because the data is now in `spacy-lookups-data`. * Remove norm exceptions Moved to spacy-lookups-data. * Move norm exceptions test to spacy-lookups-data * Load extra lookups from spacy-lookups-data lazily Load extra lookups (currently for cluster and prob) lazily from the entry point `lg_extra` as `Vocab.lookups_extra`. * Skip creating lexeme cache on load To improve model loading times, do not create the full lexeme cache when loading. The lexemes will be created on demand when processing. * Identify numeric values in Lexeme.set_attrs() With the removal of a special case for `PROB`, also identify `float` to avoid trying to convert it with the `StringStore`. 
* Skip lexeme cache init in from_bytes * Unskip and update lookups tests for python3.6+ * Update vocab pickle to include lookups_extra * Update vocab serialization tests Check strings rather than lexemes since lexemes aren't initialized automatically, account for addition of "_SP". * Re-skip lookups test because of python3.5 * Skip PROB/float values in Lexeme.set_attrs * Convert is_oov from lexeme flag to lex in vectors Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether the lexeme has a vector. Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
89 lines
2.6 KiB
Cython
89 lines
2.6 KiB
Cython
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
|
|
from .attrs cimport attr_id_t
|
|
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
|
|
|
|
from .structs cimport LexemeC
|
|
from .strings cimport StringStore
|
|
from .vocab cimport Vocab
|
|
|
|
from numpy cimport ndarray
|
|
|
|
|
|
# Module-level LexemeC struct value declared for C-level sharing.
# NOTE(review): presumably a blank placeholder lexeme; its initialisation is
# not visible here -- confirm in the implementation module.
cdef LexemeC EMPTY_LEXEME
|
|
# Module-level attr_t value declared for C-level sharing.
# NOTE(review): presumably the rank given to out-of-vocabulary lexemes; the
# value is assigned elsewhere -- confirm in the implementation module.
cdef attr_t OOV_RANK
|
|
|
|
cdef class Lexeme:
    # Extension type wrapping a pointer to a LexemeC struct, together with
    # inline C-level helpers for reading and writing the struct's attributes
    # and boolean flags.

    # Pointer to the wrapped struct; this view never allocates or frees it.
    cdef LexemeC* c
    # Vocab passed in at construction time (readable from Python).
    cdef readonly Vocab vocab
    # Copy of the wrapped struct's `orth` field (readable from Python).
    cdef readonly attr_t orth

    @staticmethod
    cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
        # Build a Lexeme wrapper around an existing LexemeC pointer.
        #
        # lex:           struct to wrap; only the pointer is stored.
        # vocab:         vocabulary stored on the wrapper.
        # vector_length: not referenced by this function; kept so existing
        #                call sites keep compiling.
        #
        # Returns the newly created wrapper.
        cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
        self.c = lex
        self.vocab = vocab
        self.orth = lex.orth
        # The function is declared to return a Lexeme; without this statement
        # it fell off the end and handed None back to the caller.
        return self

    @staticmethod
    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
        # Write `value` into the field of `lex` selected by attribute id
        # `name`.
        #
        # Ids smaller than the bit width of flags_t are treated as flag ids
        # and set or clear the matching bit of `lex.flags`. Ids without a
        # branch below (ORTH and LENGTH among them) are ignored silently.
        if name < (sizeof(flags_t) * 8):
            Lexeme.c_set_flag(lex, name, value)
        elif name == ID:
            lex.id = value
        elif name == LOWER:
            lex.lower = value
        elif name == NORM:
            lex.norm = value
        elif name == SHAPE:
            lex.shape = value
        elif name == PREFIX:
            lex.prefix = value
        elif name == SUFFIX:
            lex.suffix = value
        elif name == LANG:
            lex.lang = value

    @staticmethod
    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        # Read the field of `lex` selected by attribute id `feat_name`.
        #
        # Ids smaller than the bit width of flags_t are treated as flag ids
        # and yield 1 or 0 depending on the matching bit of `lex.flags`.
        # Ids without a branch below yield 0.
        if feat_name < (sizeof(flags_t) * 8):
            if Lexeme.c_check_flag(lex, feat_name):
                return 1
            return 0
        elif feat_name == ID:
            return lex.id
        elif feat_name == ORTH:
            return lex.orth
        elif feat_name == LOWER:
            return lex.lower
        elif feat_name == NORM:
            return lex.norm
        elif feat_name == SHAPE:
            return lex.shape
        elif feat_name == PREFIX:
            return lex.prefix
        elif feat_name == SUFFIX:
            return lex.suffix
        elif feat_name == LENGTH:
            return lex.length
        elif feat_name == LANG:
            return lex.lang
        else:
            return 0

    @staticmethod
    cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
        # Return True when bit `flag_id` of `lexeme.flags` is set.
        #
        # `one` is typed as flags_t so the shift is carried out at the full
        # width of the flags field.
        cdef flags_t one = 1
        return (lexeme.flags & (one << flag_id)) != 0

    @staticmethod
    cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) nogil:
        # Set bit `flag_id` of `lex.flags` when `value` is true, otherwise
        # clear it.
        #
        # The signature declares a bint result but the body has no return
        # statement; the one caller in this class (set_struct_attr) discards
        # the result.
        cdef flags_t one = 1
        if value:
            lex.flags |= one << flag_id
        else:
            lex.flags &= ~(one << flag_id)
|