mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Tmp
This commit is contained in:
parent
0f2cb74433
commit
cad0cca4e3
|
@ -80,7 +80,6 @@ class English(object):
|
|||
Packer=None,
|
||||
load_vectors=True
|
||||
):
|
||||
|
||||
self.data_dir = data_dir
|
||||
|
||||
if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):
|
||||
|
|
126
spacy/lexeme.pxd
126
spacy/lexeme.pxd
|
@ -8,97 +8,53 @@ from .strings cimport StringStore
|
|||
from numpy cimport ndarray
|
||||
|
||||
|
||||
|
||||
cdef LexemeC EMPTY_LEXEME
|
||||
|
||||
|
||||
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
|
||||
const float* empty_vec) except -1
|
||||
|
||||
cdef class Lexeme:
|
||||
cdef readonly ndarray repvec
|
||||
|
||||
cdef readonly flags_t flags
|
||||
cdef readonly attr_t id
|
||||
cdef readonly attr_t length
|
||||
|
||||
cdef LexemeC* c
|
||||
cdef readonly Vocab vocab
|
||||
cdef readonly attr_t orth
|
||||
cdef readonly attr_t lower
|
||||
cdef readonly attr_t norm
|
||||
cdef readonly attr_t shape
|
||||
cdef readonly attr_t prefix
|
||||
cdef readonly attr_t suffix
|
||||
|
||||
cdef readonly unicode orth_
|
||||
cdef readonly unicode lower_
|
||||
cdef readonly unicode norm_
|
||||
cdef readonly unicode shape_
|
||||
cdef readonly unicode prefix_
|
||||
cdef readonly unicode suffix_
|
||||
cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
|
||||
lex.length = props['length']
|
||||
lex.orth = vocab.strings[props['orth']]
|
||||
lex.lower = vocab.strings[props['lower']]
|
||||
lex.norm = vocab.strings[props['norm']]
|
||||
lex.shape = vocab.strings[props['shape']]
|
||||
lex.prefix = vocab.strings[props['prefix']]
|
||||
lex.suffix = vocab.strings[props['suffix']]
|
||||
|
||||
cdef readonly attr_t cluster
|
||||
cdef readonly float prob
|
||||
cdef readonly float sentiment
|
||||
cdef readonly float l2_norm
|
||||
lex.cluster = props['cluster']
|
||||
lex.prob = props['prob']
|
||||
lex.sentiment = props['sentiment']
|
||||
|
||||
lex.flags = props['flags']
|
||||
lex.repvec = empty_vec
|
||||
|
||||
# Workaround for an apparent bug in the way the decorator is handled ---
|
||||
# TODO: post bug report / patch to Cython.
|
||||
@staticmethod
|
||||
cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length):
|
||||
cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length)
|
||||
for i in range(repvec_length):
|
||||
py.repvec[i] = ptr.repvec[i]
|
||||
py.l2_norm = ptr.l2_norm
|
||||
py.flags = ptr.flags
|
||||
py.id = ptr.id
|
||||
py.length = ptr.length
|
||||
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||
if feat_name < (sizeof(flags_t) * 8):
|
||||
return Lexeme.check_flag(lex, feat_name)
|
||||
elif feat_name == ID:
|
||||
return lex.id
|
||||
elif feat_name == ORTH:
|
||||
return lex.orth
|
||||
elif feat_name == LOWER:
|
||||
return lex.lower
|
||||
elif feat_name == NORM:
|
||||
return lex.norm
|
||||
elif feat_name == SHAPE:
|
||||
return lex.shape
|
||||
elif feat_name == PREFIX:
|
||||
return lex.prefix
|
||||
elif feat_name == SUFFIX:
|
||||
return lex.suffix
|
||||
elif feat_name == LENGTH:
|
||||
return lex.length
|
||||
elif feat_name == CLUSTER:
|
||||
return lex.cluster
|
||||
else:
|
||||
return 0
|
||||
|
||||
py.orth = ptr.orth
|
||||
py.lower = ptr.lower
|
||||
py.norm = ptr.norm
|
||||
py.shape = ptr.shape
|
||||
py.prefix = ptr.prefix
|
||||
py.suffix = ptr.suffix
|
||||
|
||||
py.orth_ = strings[ptr.orth]
|
||||
py.lower_ = strings[ptr.lower]
|
||||
py.norm_ = strings[ptr.norm]
|
||||
py.shape_ = strings[ptr.shape]
|
||||
py.prefix_ = strings[ptr.prefix]
|
||||
py.suffix_ = strings[ptr.suffix]
|
||||
|
||||
py.cluster = ptr.cluster
|
||||
py.prob = ptr.prob
|
||||
py.sentiment = ptr.sentiment
|
||||
return py
|
||||
|
||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1
|
||||
|
||||
|
||||
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
||||
return lexeme.flags & (1 << flag_id)
|
||||
|
||||
|
||||
cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||
if feat_name < (sizeof(flags_t) * 8):
|
||||
return check_flag(lex, feat_name)
|
||||
elif feat_name == ID:
|
||||
return lex.id
|
||||
elif feat_name == ORTH:
|
||||
return lex.orth
|
||||
elif feat_name == LOWER:
|
||||
return lex.lower
|
||||
elif feat_name == NORM:
|
||||
return lex.norm
|
||||
elif feat_name == SHAPE:
|
||||
return lex.shape
|
||||
elif feat_name == PREFIX:
|
||||
return lex.prefix
|
||||
elif feat_name == SUFFIX:
|
||||
return lex.suffix
|
||||
elif feat_name == LENGTH:
|
||||
return lex.length
|
||||
elif feat_name == CLUSTER:
|
||||
return lex.cluster
|
||||
else:
|
||||
return 0
|
||||
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
||||
return lexeme.flags & (1 << flag_id)
|
||||
|
|
109
spacy/lexeme.pyx
109
spacy/lexeme.pyx
|
@ -17,70 +17,105 @@ from .attrs cimport IS_OOV
|
|||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
|
||||
|
||||
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
|
||||
const float* empty_vec) except -1:
|
||||
lex.length = props['length']
|
||||
lex.orth = string_store[props['orth']]
|
||||
lex.lower = string_store[props['lower']]
|
||||
lex.norm = string_store[props['norm']]
|
||||
lex.shape = string_store[props['shape']]
|
||||
lex.prefix = string_store[props['prefix']]
|
||||
lex.suffix = string_store[props['suffix']]
|
||||
|
||||
lex.cluster = props['cluster']
|
||||
lex.prob = props['prob']
|
||||
lex.sentiment = props['sentiment']
|
||||
|
||||
lex.flags = props['flags']
|
||||
lex.repvec = empty_vec
|
||||
|
||||
|
||||
cdef class Lexeme:
|
||||
"""An entry in the vocabulary. A Lexeme has no string context --- it's a
|
||||
word-type, as opposed to a word token. It therefore has no part-of-speech
|
||||
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
||||
tag).
|
||||
"""
|
||||
def __cinit__(self, int vec_size):
|
||||
self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
|
||||
def __init__(self, Vocab vocab, int orth):
|
||||
self.vocab = vocab
|
||||
self.orth = orth
|
||||
self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
|
||||
|
||||
@property
|
||||
def has_repvec(self):
|
||||
return self.l2_norm != 0
|
||||
property orth:
|
||||
def __get__(self):
|
||||
return self.c.orth
|
||||
|
||||
property lower:
|
||||
def __get__(self): return self.c.lower
|
||||
def __set__(self, int x): self.c.lower = x
|
||||
|
||||
property norm:
|
||||
def __get__(self): return self.c.norm
|
||||
def __set__(self, int x): self.c.norm = x
|
||||
|
||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
||||
cdef flags_t one = 1
|
||||
return self.flags & (one << flag_id)
|
||||
property shape:
|
||||
def __get__(self): return self.c.shape
|
||||
def __set__(self, int x): self.c.shape = x
|
||||
|
||||
property prefix:
|
||||
def __get__(self): return self.c.prefix
|
||||
def __set__(self, int x): self.c.prefix = x
|
||||
|
||||
property suffix:
|
||||
def __get__(self): return self.c.suffix
|
||||
def __set__(self, int x): self.c.suffix = x
|
||||
|
||||
property orth_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.orth]
|
||||
|
||||
property lower_:
|
||||
def __get__(self): return self.vocab.strings[self.c.lower]
|
||||
def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
|
||||
|
||||
property norm_:
|
||||
def __get__(self): return self.c.norm
|
||||
def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
|
||||
|
||||
property shape_:
|
||||
def __get__(self): return self.vocab.strings[self.c.shape]
|
||||
def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
|
||||
|
||||
property prefix_:
|
||||
def __get__(self): return self.c.prefix
|
||||
def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]
|
||||
|
||||
property suffix_:
|
||||
def __get__(self): return self.c.suffix
|
||||
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
|
||||
|
||||
property is_oov:
|
||||
def __get__(self): return self.check_flag(IS_OOV)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)
|
||||
|
||||
property is_alpha:
|
||||
def __get__(self): return self.check_flag(IS_ALPHA)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)
|
||||
|
||||
property is_ascii:
|
||||
def __get__(self): return self.check_flag(IS_ASCII)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)
|
||||
|
||||
property is_digit:
|
||||
def __get__(self): return self.check_flag(IS_DIGIT)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)
|
||||
|
||||
property is_lower:
|
||||
def __get__(self): return self.check_flag(IS_LOWER)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)
|
||||
|
||||
property is_title:
|
||||
def __get__(self): return self.check_flag(IS_TITLE)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)
|
||||
|
||||
property is_punct:
|
||||
def __get__(self): return self.check_flag(IS_PUNCT)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)
|
||||
|
||||
property is_space:
|
||||
def __get__(self): return self.check_flag(IS_SPACE)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)
|
||||
|
||||
property like_url:
|
||||
def __get__(self): return self.check_flag(LIKE_URL)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
|
||||
|
||||
property like_num:
|
||||
def __get__(self): return self.check_flag(LIKE_NUM)
|
||||
def __get__(self): return Lexeme.like_num(self.c, IKE_NUM)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
|
||||
|
||||
property like_email:
|
||||
def __get__(self): return self.check_flag(LIKE_EMAIL)
|
||||
def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
|
||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)
|
||||
|
|
|
@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr
|
|||
from .tokens.doc cimport Doc
|
||||
from .vocab cimport Vocab
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError:
|
||||
|
@ -96,28 +98,26 @@ def map_attr_name(attr):
|
|||
|
||||
cdef class Matcher:
|
||||
cdef Pool mem
|
||||
cdef Pattern** patterns
|
||||
cdef vector[Pattern*] patterns
|
||||
cdef readonly int n_patterns
|
||||
|
||||
def __init__(self, vocab, patterns):
|
||||
self.mem = Pool()
|
||||
n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()])
|
||||
self.patterns = <Pattern**>self.mem.alloc(n_patterns, sizeof(Pattern*))
|
||||
cdef int i = 0
|
||||
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
|
||||
if isinstance(entity_key, basestring):
|
||||
entity_key = vocab.strings[entity_key]
|
||||
if isinstance(etype, basestring):
|
||||
etype = vocab.strings[etype]
|
||||
elif etype is None:
|
||||
etype = -1
|
||||
# TODO: Do something more clever about multiple patterns for single
|
||||
# entity
|
||||
for spec in specs:
|
||||
spec = _convert_strings(spec, vocab.strings)
|
||||
self.patterns[i] = init_pattern(self.mem, spec, etype)
|
||||
i += 1
|
||||
self.n_patterns = len(patterns)
|
||||
self.add(entity_key, etype, attrs, specs)
|
||||
|
||||
def add(self, entity_key, etype, attrs, specs):
|
||||
if isinstance(entity_key, basestring):
|
||||
entity_key = vocab.strings[entity_key]
|
||||
if isinstance(etype, basestring):
|
||||
etype = vocab.strings[etype]
|
||||
elif etype is None:
|
||||
etype = -1
|
||||
# TODO: Do something more clever about multiple patterns for single
|
||||
# entity
|
||||
for spec in specs:
|
||||
spec = _convert_strings(spec, vocab.strings)
|
||||
self.patterns.push_back(init_pattern(self.mem, spec, etype))
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, vocab, data_dir):
|
||||
|
|
|
@ -108,6 +108,11 @@ cdef class StringStore:
|
|||
else:
|
||||
raise TypeError(type(string_or_id))
|
||||
|
||||
def __iter__(self):
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
yield self[i]
|
||||
|
||||
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
key = hash64(chars, length * sizeof(char), 0)
|
||||
|
|
|
@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
|
|||
cdef class Vocab:
|
||||
'''A map container for a language's LexemeC structs.
|
||||
'''
|
||||
def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
|
||||
pos_tags=None, oov_prob=-30):
|
||||
if oov_prob is None:
|
||||
oov_prob = -30
|
||||
def __init__(self, data_dir=None, get_lex_attr=None):
|
||||
self.mem = Pool()
|
||||
self._by_hash = PreshMap()
|
||||
self._by_orth = PreshMap()
|
||||
self.strings = StringStore()
|
||||
self.pos_tags = pos_tags if pos_tags is not None else {}
|
||||
|
||||
self.lexeme_props_getter = get_lex_props
|
||||
|
||||
self.get_lex_attr = get_lex_attr
|
||||
self.repvec_length = 0
|
||||
self.length = 0
|
||||
self._add_lex_to_vocab(0, &EMPTY_LEXEME)
|
||||
if data_dir is not None:
|
||||
if not path.exists(data_dir):
|
||||
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
||||
if data_dir is not None:
|
||||
if not path.isdir(data_dir):
|
||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||
self.load_lexemes(path.join(data_dir, 'strings.txt'),
|
||||
|
@ -63,7 +59,6 @@ cdef class Vocab:
|
|||
|
||||
self._serializer = None
|
||||
self.data_dir = data_dir
|
||||
self.oov_prob = oov_prob
|
||||
|
||||
property serializer:
|
||||
def __get__(self):
|
||||
|
@ -91,18 +86,8 @@ cdef class Vocab:
|
|||
lex = <LexemeC*>self._by_hash.get(key)
|
||||
if lex != NULL:
|
||||
return lex
|
||||
cdef bint is_oov = mem is not self.mem
|
||||
if len(string) < 3:
|
||||
mem = self.mem
|
||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
|
||||
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
||||
if is_oov:
|
||||
lex.id = 0
|
||||
else:
|
||||
self._add_lex_to_vocab(key, lex)
|
||||
assert lex != NULL, string
|
||||
return lex
|
||||
return self._new_lexeme(mem, string)
|
||||
|
||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
|
@ -114,18 +99,21 @@ cdef class Vocab:
|
|||
lex = <LexemeC*>self._by_orth.get(orth)
|
||||
if lex != NULL:
|
||||
return lex
|
||||
cdef unicode string = self.strings[orth]
|
||||
else:
|
||||
return self._new_lexeme(mem, self.strings[orth])
|
||||
|
||||
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
|
||||
cdef bint is_oov = mem is not self.mem
|
||||
if len(string) < 3:
|
||||
mem = self.mem
|
||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
|
||||
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
||||
for attr, func in self.lex_attr_getters.items():
|
||||
Lexeme.set_struct_attr(lex, attr, func(string))
|
||||
if is_oov:
|
||||
lex.id = 0
|
||||
else:
|
||||
self._add_lex_to_vocab(hash_string(string), lex)
|
||||
assert lex != NULL, orth
|
||||
self._add_lex_to_vocab(key, lex)
|
||||
assert lex != NULL, string
|
||||
return lex
|
||||
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
||||
|
@ -171,15 +159,6 @@ cdef class Vocab:
|
|||
"int --> Lexeme" % str(type(id_or_string)))
|
||||
return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
|
||||
|
||||
def __setitem__(self, unicode string, dict props):
|
||||
cdef hash_t key = hash_string(string)
|
||||
cdef LexemeC* lex
|
||||
lex = <LexemeC*>self._by_hash.get(key)
|
||||
if lex == NULL:
|
||||
lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
||||
self._add_lex_to_vocab(key, lex)
|
||||
|
||||
def dump(self, loc):
|
||||
if path.exists(loc):
|
||||
assert not path.isdir(loc)
|
||||
|
|
Loading…
Reference in New Issue
Block a user