Mirror of https://github.com/explosion/spaCy.git, synced 2025-07-01 02:13:07 +03:00.
Commit cad0cca4e3 ("* Tmp"); parent 0f2cb74433.
@@ -80,7 +80,6 @@ class English(object):
                  Packer=None,
                  load_vectors=True
                  ):
 
         self.data_dir = data_dir
 
         if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):
spacy/lexeme.pxd (126 changed lines)
@@ -8,97 +8,53 @@ from .strings cimport StringStore
 from numpy cimport ndarray
 
 
 cdef LexemeC EMPTY_LEXEME
 
 
-cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
-                              const float* empty_vec) except -1
 
 cdef class Lexeme:
-    cdef readonly ndarray repvec
-    cdef readonly flags_t flags
-    cdef readonly attr_t id
-    cdef readonly attr_t length
+    cdef LexemeC* c
+    cdef readonly Vocab vocab
 
     cdef readonly attr_t orth
-    cdef readonly attr_t lower
-    cdef readonly attr_t norm
-    cdef readonly attr_t shape
-    cdef readonly attr_t prefix
-    cdef readonly attr_t suffix
 
-    cdef readonly unicode orth_
-    cdef readonly unicode lower_
-    cdef readonly unicode norm_
-    cdef readonly unicode shape_
-    cdef readonly unicode prefix_
-    cdef readonly unicode suffix_
+    cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
+        lex.length = props['length']
+        lex.orth = vocab.strings[props['orth']]
+        lex.lower = vocab.strings[props['lower']]
+        lex.norm = vocab.strings[props['norm']]
+        lex.shape = vocab.strings[props['shape']]
+        lex.prefix = vocab.strings[props['prefix']]
+        lex.suffix = vocab.strings[props['suffix']]
 
-    cdef readonly attr_t cluster
-    cdef readonly float prob
-    cdef readonly float sentiment
-    cdef readonly float l2_norm
+        lex.cluster = props['cluster']
+        lex.prob = props['prob']
+        lex.sentiment = props['sentiment']
+
+        lex.flags = props['flags']
+        lex.repvec = empty_vec
 
-    # Workaround for an apparent bug in the way the decorator is handled ---
-    # TODO: post bug report / patch to Cython.
     @staticmethod
-    cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length):
-        cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length)
-        for i in range(repvec_length):
-            py.repvec[i] = ptr.repvec[i]
-        py.l2_norm = ptr.l2_norm
-        py.flags = ptr.flags
-        py.id = ptr.id
-        py.length = ptr.length
-
-        py.orth = ptr.orth
-        py.lower = ptr.lower
-        py.norm = ptr.norm
-        py.shape = ptr.shape
-        py.prefix = ptr.prefix
-        py.suffix = ptr.suffix
-
-        py.orth_ = strings[ptr.orth]
-        py.lower_ = strings[ptr.lower]
-        py.norm_ = strings[ptr.norm]
-        py.shape_ = strings[ptr.shape]
-        py.prefix_ = strings[ptr.prefix]
-        py.suffix_ = strings[ptr.suffix]
-
-        py.cluster = ptr.cluster
-        py.prob = ptr.prob
-        py.sentiment = ptr.sentiment
-        return py
-
-    cpdef bint check_flag(self, attr_id_t flag_id) except -1
-
-
-cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
-    return lexeme.flags & (1 << flag_id)
-
-
-cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
-    if feat_name < (sizeof(flags_t) * 8):
-        return check_flag(lex, feat_name)
-    elif feat_name == ID:
-        return lex.id
-    elif feat_name == ORTH:
-        return lex.orth
-    elif feat_name == LOWER:
-        return lex.lower
-    elif feat_name == NORM:
-        return lex.norm
-    elif feat_name == SHAPE:
-        return lex.shape
-    elif feat_name == PREFIX:
-        return lex.prefix
-    elif feat_name == SUFFIX:
-        return lex.suffix
-    elif feat_name == LENGTH:
-        return lex.length
-    elif feat_name == CLUSTER:
-        return lex.cluster
-    else:
-        return 0
+    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
+        if feat_name < (sizeof(flags_t) * 8):
+            return Lexeme.check_flag(lex, feat_name)
+        elif feat_name == ID:
+            return lex.id
+        elif feat_name == ORTH:
+            return lex.orth
+        elif feat_name == LOWER:
+            return lex.lower
+        elif feat_name == NORM:
+            return lex.norm
+        elif feat_name == SHAPE:
+            return lex.shape
+        elif feat_name == PREFIX:
+            return lex.prefix
+        elif feat_name == SUFFIX:
+            return lex.suffix
+        elif feat_name == LENGTH:
+            return lex.length
+        elif feat_name == CLUSTER:
+            return lex.cluster
+        else:
+            return 0
+
+    cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
+        return lexeme.flags & (1 << flag_id)
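The lexeme.pxd change above replaces a wrapper that copies every struct field into Python attributes at construction (the old from_ptr) with a thin view that holds a LexemeC pointer plus the owning Vocab and reads through them on demand. A pure-Python sketch of the difference, with invented names (the LexemeC class below is a stand-in object, not spaCy's C struct):

from dataclasses import dataclass


@dataclass
class LexemeC:                      # stand-in for the C struct
    orth: int = 0
    lower: int = 0
    length: int = 0


class CopyingLexeme:
    """Old design: every field is copied into the wrapper at creation,
    so later changes to the struct are invisible to the wrapper."""
    def __init__(self, ptr: LexemeC):
        self.orth = ptr.orth
        self.lower = ptr.lower
        self.length = ptr.length


class ViewLexeme:
    """New design: keep one reference and read through it on demand."""
    def __init__(self, ptr: LexemeC):
        self._c = ptr               # analogue of `cdef LexemeC* c`

    @property
    def orth(self):
        return self._c.orth


struct = LexemeC(orth=42)
old, new = CopyingLexeme(struct), ViewLexeme(struct)
struct.orth = 99
assert old.orth == 42               # stale copy
assert new.orth == 99               # live view

The view design leaves a single source of truth: writes through the wrapper (see the property setters in lexeme.pyx below) are visible to every other consumer of the struct.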
109
spacy/lexeme.pyx
109
spacy/lexeme.pyx
|
@@ -17,70 +17,105 @@ from .attrs cimport IS_OOV
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 
 
-cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
-                              const float* empty_vec) except -1:
-    lex.length = props['length']
-    lex.orth = string_store[props['orth']]
-    lex.lower = string_store[props['lower']]
-    lex.norm = string_store[props['norm']]
-    lex.shape = string_store[props['shape']]
-    lex.prefix = string_store[props['prefix']]
-    lex.suffix = string_store[props['suffix']]
-
-    lex.cluster = props['cluster']
-    lex.prob = props['prob']
-    lex.sentiment = props['sentiment']
-
-    lex.flags = props['flags']
-    lex.repvec = empty_vec
-
-
 cdef class Lexeme:
     """An entry in the vocabulary. A Lexeme has no string context --- it's a
     word-type, as opposed to a word token. It therefore has no part-of-speech
     tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
     tag).
     """
-    def __cinit__(self, int vec_size):
-        self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
+    def __init__(self, Vocab vocab, int orth):
+        self.vocab = vocab
+        self.orth = orth
+        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
 
-    @property
-    def has_repvec(self):
-        return self.l2_norm != 0
+    property orth:
+        def __get__(self):
+            return self.c.orth
+
+    property lower:
+        def __get__(self): return self.c.lower
+        def __set__(self, int x): self.c.lower = x
+
+    property norm:
+        def __get__(self): return self.c.norm
+        def __set__(self, int x): self.c.norm = x
 
-    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-        cdef flags_t one = 1
-        return self.flags & (one << flag_id)
+    property shape:
+        def __get__(self): return self.c.shape
+        def __set__(self, int x): self.c.shape = x
+
+    property prefix:
+        def __get__(self): return self.c.prefix
+        def __set__(self, int x): self.c.prefix = x
+
+    property suffix:
+        def __get__(self): return self.c.suffix
+        def __set__(self, int x): self.c.suffix = x
+
+    property orth_:
+        def __get__(self):
+            return self.vocab.strings[self.c.orth]
+
+    property lower_:
+        def __get__(self): return self.vocab.strings[self.c.lower]
+        def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
+
+    property norm_:
+        def __get__(self): return self.vocab.strings[self.c.norm]
+        def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
+
+    property shape_:
+        def __get__(self): return self.vocab.strings[self.c.shape]
+        def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
+
+    property prefix_:
+        def __get__(self): return self.vocab.strings[self.c.prefix]
+        def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]
+
+    property suffix_:
+        def __get__(self): return self.vocab.strings[self.c.suffix]
+        def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
 
     property is_oov:
-        def __get__(self): return self.check_flag(IS_OOV)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)
 
     property is_alpha:
-        def __get__(self): return self.check_flag(IS_ALPHA)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)
 
     property is_ascii:
-        def __get__(self): return self.check_flag(IS_ASCII)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)
 
     property is_digit:
-        def __get__(self): return self.check_flag(IS_DIGIT)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)
 
     property is_lower:
-        def __get__(self): return self.check_flag(IS_LOWER)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)
 
     property is_title:
-        def __get__(self): return self.check_flag(IS_TITLE)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)
 
     property is_punct:
-        def __get__(self): return self.check_flag(IS_PUNCT)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)
 
     property is_space:
-        def __get__(self): return self.check_flag(IS_SPACE)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)
 
     property like_url:
-        def __get__(self): return self.check_flag(LIKE_URL)
+        def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
 
     property like_num:
-        def __get__(self): return self.check_flag(LIKE_NUM)
+        def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
 
     property like_email:
-        def __get__(self): return self.check_flag(LIKE_EMAIL)
+        def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
+        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)
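Every is_*/like_* property above maps to one bit of the lexeme's flags bitfield: the getters test the bit via Lexeme.check_flag (whose body appears in the pxd diff), while the setters go through Lexeme.set_flag, whose body is not shown in this diff. A minimal sketch of that scheme, assuming set_flag sets or clears the bit; the flag IDs here are invented for illustration (the real ones live in spacy.attrs):

IS_ALPHA, IS_DIGIT, LIKE_NUM = 1, 5, 13   # example bit positions, not spaCy's


def check_flag(flags: int, flag_id: int) -> bool:
    # mirrors `lexeme.flags & (1 << flag_id)` in the pxd
    return bool(flags & (1 << flag_id))


def set_flag(flags: int, flag_id: int, value: bool) -> int:
    # set or clear one bit; a plausible reading of Lexeme.set_flag,
    # which this diff does not show
    if value:
        return flags | (1 << flag_id)
    return flags & ~(1 << flag_id)


flags = 0
flags = set_flag(flags, LIKE_NUM, True)
assert check_flag(flags, LIKE_NUM)
flags = set_flag(flags, LIKE_NUM, False)
assert not check_flag(flags, LIKE_NUM)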
@@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
 
+from libcpp.vector cimport vector
+
 try:
     import ujson as json
 except ImportError:
@@ -96,28 +98,26 @@ def map_attr_name(attr):
 
 cdef class Matcher:
     cdef Pool mem
-    cdef Pattern** patterns
+    cdef vector[Pattern*] patterns
     cdef readonly int n_patterns
 
     def __init__(self, vocab, patterns):
        self.mem = Pool()
-        n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()])
-        self.patterns = <Pattern**>self.mem.alloc(n_patterns, sizeof(Pattern*))
-        cdef int i = 0
         for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
-            if isinstance(entity_key, basestring):
-                entity_key = vocab.strings[entity_key]
-            if isinstance(etype, basestring):
-                etype = vocab.strings[etype]
-            elif etype is None:
-                etype = -1
-            # TODO: Do something more clever about multiple patterns for single
-            # entity
-            for spec in specs:
-                spec = _convert_strings(spec, vocab.strings)
-                self.patterns[i] = init_pattern(self.mem, spec, etype)
-                i += 1
-        self.n_patterns = len(patterns)
+            self.add(entity_key, etype, attrs, specs)
+
+    def add(self, entity_key, etype, attrs, specs):
+        if isinstance(entity_key, basestring):
+            entity_key = vocab.strings[entity_key]
+        if isinstance(etype, basestring):
+            etype = vocab.strings[etype]
+        elif etype is None:
+            etype = -1
+        # TODO: Do something more clever about multiple patterns for single
+        # entity
+        for spec in specs:
+            spec = _convert_strings(spec, vocab.strings)
+            self.patterns.push_back(init_pattern(self.mem, spec, etype))
 
     @classmethod
     def from_dir(cls, vocab, data_dir):
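Switching the pattern store from a preallocated Pattern** array with a manual write index to a growable vector[Pattern*] is what lets registration move into the new add() method: push_back grows the container, so patterns no longer need to be counted up front and can in principle be added after construction. A toy Python analogue of the refactor, with invented names and list.append standing in for push_back (note that add() in the diff still reads a bare vocab that is never bound, one of the loose ends of this "Tmp" commit):

class ToyMatcher:
    def __init__(self, patterns):
        self.patterns = []          # analogue of vector[Pattern*]
        for key, (etype, attrs, specs) in sorted(patterns.items()):
            self.add(key, etype, attrs, specs)

    def add(self, key, etype, attrs, specs):
        # one stored pattern per spec; the container grows as needed
        for spec in specs:
            self.patterns.append((key, etype, spec))


m = ToyMatcher({"acme": ("ORG", {}, [[{"orth": "ACME"}]])})
m.add("spacy", "PRODUCT", {}, [[{"lower": "spacy"}]])   # post-hoc addition
assert len(m.patterns) == 2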
@@ -108,6 +108,11 @@ cdef class StringStore:
         else:
             raise TypeError(type(string_or_id))
 
+    def __iter__(self):
+        cdef int i
+        for i in range(self.size):
+            yield self[i]
+
     cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
         # 0 means missing, but we don't bother offsetting the index.
         key = hash64(chars, length * sizeof(char), 0)
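The StringStore hunk layers the standard Python iteration protocol over the existing index lookup: __iter__ simply yields self[i] for each index below self.size. A toy stand-in showing the contract:

class ToyStringStore:
    def __init__(self, strings):
        self._strings = list(strings)

    @property
    def size(self):
        return len(self._strings)

    def __getitem__(self, i):
        return self._strings[i]

    def __iter__(self):
        # same shape as the __iter__ added in the diff above
        for i in range(self.size):
            yield self[i]


store = ToyStringStore(["the", "quick", "fox"])
assert list(store) == ["the", "quick", "fox"]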
@@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
-    def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
-                 pos_tags=None, oov_prob=-30):
-        if oov_prob is None:
-            oov_prob = -30
+    def __init__(self, data_dir=None, get_lex_attr=None):
         self.mem = Pool()
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
         self.pos_tags = pos_tags if pos_tags is not None else {}
 
-        self.lexeme_props_getter = get_lex_props
+        self.get_lex_attr = get_lex_attr
         self.repvec_length = 0
         self.length = 0
         self._add_lex_to_vocab(0, &EMPTY_LEXEME)
         if data_dir is not None:
             if not path.exists(data_dir):
                 raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if data_dir is not None:
             if not path.isdir(data_dir):
                 raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
             self.load_lexemes(path.join(data_dir, 'strings.txt'),
@@ -63,7 +59,6 @@ cdef class Vocab:
 
         self._serializer = None
         self.data_dir = data_dir
-        self.oov_prob = oov_prob
 
     property serializer:
         def __get__(self):
@@ -91,18 +86,8 @@ cdef class Vocab:
         lex = <LexemeC*>self._by_hash.get(key)
         if lex != NULL:
             return lex
-        cdef bint is_oov = mem is not self.mem
-        if len(string) < 3:
-            mem = self.mem
-        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
-        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
-        if is_oov:
-            lex.id = 0
         else:
-            self._add_lex_to_vocab(key, lex)
-        assert lex != NULL, string
-        return lex
+            return self._new_lexeme(mem, string)
 
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@@ -114,18 +99,21 @@ cdef class Vocab:
         lex = <LexemeC*>self._by_orth.get(orth)
         if lex != NULL:
             return lex
-        cdef unicode string = self.strings[orth]
+        else:
+            return self._new_lexeme(mem, self.strings[orth])
+
+    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
         cdef bint is_oov = mem is not self.mem
         if len(string) < 3:
             mem = self.mem
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
-        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
+        for attr, func in self.lex_attr_getters.items():
+            Lexeme.set_struct_attr(lex, attr, func(string))
         if is_oov:
             lex.id = 0
         else:
-            self._add_lex_to_vocab(hash_string(string), lex)
-        assert lex != NULL, orth
+            self._add_lex_to_vocab(key, lex)
+        assert lex != NULL, string
         return lex
 
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
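The Vocab hunks trade the single get_lex_props callback, which built a whole props dict for set_lex_struct_props to copy field by field, for a table of per-attribute getter functions that _new_lexeme loops over. (The diff stores the callback as get_lex_attr in __init__ but reads self.lex_attr_getters in _new_lexeme, and the new _add_lex_to_vocab call references an undefined key; more "Tmp" loose ends.) A sketch of the getter-table idea, with invented attribute IDs and getters:

LOWER, LENGTH, IS_DIGIT = 2, 3, 4   # hypothetical attr IDs

lex_attr_getters = {
    LOWER: lambda s: s.lower(),
    LENGTH: len,
    IS_DIGIT: str.isdigit,
}


def new_lexeme(string):
    lex = {}                          # stand-in for the LexemeC struct
    for attr, func in lex_attr_getters.items():
        lex[attr] = func(string)      # analogue of Lexeme.set_struct_attr
    return lex


assert new_lexeme("Hello") == {LOWER: "hello", LENGTH: 5, IS_DIGIT: False}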
@@ -171,15 +159,6 @@ cdef class Vocab:
                              "int --> Lexeme" % str(type(id_or_string)))
         return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
 
-    def __setitem__(self, unicode string, dict props):
-        cdef hash_t key = hash_string(string)
-        cdef LexemeC* lex
-        lex = <LexemeC*>self._by_hash.get(key)
-        if lex == NULL:
-            lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
-        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
-        self._add_lex_to_vocab(key, lex)
-
     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)