This commit is contained in:
Matthew Honnibal 2015-08-22 22:04:34 +02:00
parent 0f2cb74433
commit cad0cca4e3
6 changed files with 147 additions and 173 deletions

View File

@ -80,7 +80,6 @@ class English(object):
Packer=None,
load_vectors=True
):
self.data_dir = data_dir
if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):

View File

@ -8,97 +8,53 @@ from .strings cimport StringStore
from numpy cimport ndarray
cdef LexemeC EMPTY_LEXEME
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
const float* empty_vec) except -1
cdef class Lexeme:
cdef readonly ndarray repvec
cdef readonly flags_t flags
cdef readonly attr_t id
cdef readonly attr_t length
cdef LexemeC* c
cdef readonly Vocab vocab
cdef readonly attr_t orth
cdef readonly attr_t lower
cdef readonly attr_t norm
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly unicode orth_
cdef readonly unicode lower_
cdef readonly unicode norm_
cdef readonly unicode shape_
cdef readonly unicode prefix_
cdef readonly unicode suffix_
cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
lex.length = props['length']
lex.orth = vocab.strings[props['orth']]
lex.lower = vocab.strings[props['lower']]
lex.norm = vocab.strings[props['norm']]
lex.shape = vocab.strings[props['shape']]
lex.prefix = vocab.strings[props['prefix']]
lex.suffix = vocab.strings[props['suffix']]
cdef readonly attr_t cluster
cdef readonly float prob
cdef readonly float sentiment
cdef readonly float l2_norm
lex.cluster = props['cluster']
lex.prob = props['prob']
lex.sentiment = props['sentiment']
lex.flags = props['flags']
lex.repvec = empty_vec
# Workaround for an apparent bug in the way the decorator is handled ---
# TODO: post bug report / patch to Cython.
@staticmethod
cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length):
cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length)
for i in range(repvec_length):
py.repvec[i] = ptr.repvec[i]
py.l2_norm = ptr.l2_norm
py.flags = ptr.flags
py.id = ptr.id
py.length = ptr.length
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
    # Map a numeric attribute ID onto the matching LexemeC struct field.
    # IDs smaller than the flag width (sizeof(flags_t) * 8 bits) are bit-flag
    # attributes, answered via Lexeme.check_flag; named IDs return the struct
    # field directly. Runs nogil, so unknown IDs return 0 instead of raising.
    if feat_name < (sizeof(flags_t) * 8):
        return Lexeme.check_flag(lex, feat_name)
    elif feat_name == ID:
        return lex.id
    elif feat_name == ORTH:
        return lex.orth
    elif feat_name == LOWER:
        return lex.lower
    elif feat_name == NORM:
        return lex.norm
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:
        return lex.suffix
    elif feat_name == LENGTH:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
    else:
        # Unrecognised attribute ID: 0 is the "missing" sentinel here.
        return 0
py.orth = ptr.orth
py.lower = ptr.lower
py.norm = ptr.norm
py.shape = ptr.shape
py.prefix = ptr.prefix
py.suffix = ptr.suffix
py.orth_ = strings[ptr.orth]
py.lower_ = strings[ptr.lower]
py.norm_ = strings[ptr.norm]
py.shape_ = strings[ptr.shape]
py.prefix_ = strings[ptr.prefix]
py.suffix_ = strings[ptr.suffix]
py.cluster = ptr.cluster
py.prob = ptr.prob
py.sentiment = ptr.sentiment
return py
cpdef bint check_flag(self, attr_id_t flag_id) except -1
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Test a single boolean lexical attribute stored as bit `flag_id`
    # in the lexeme's flags bitfield.
    return lexeme.flags & (1 << flag_id)
cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
    # Same dispatch as get_struct_attr, but using the module-level check_flag
    # helper for bit-flag IDs. NOTE(review): this duplicates get_struct_attr
    # almost exactly — presumably old/new sides of the same refactor; confirm
    # which one callers use before consolidating.
    if feat_name < (sizeof(flags_t) * 8):
        return check_flag(lex, feat_name)
    elif feat_name == ID:
        return lex.id
    elif feat_name == ORTH:
        return lex.orth
    elif feat_name == LOWER:
        return lex.lower
    elif feat_name == NORM:
        return lex.norm
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:
        return lex.suffix
    elif feat_name == LENGTH:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
    else:
        # Unrecognised attribute ID: 0 is the "missing" sentinel here.
        return 0
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Test bit `flag_id` in the lexeme's flags bitfield.
    # NOTE(review): byte-identical duplicate of the check_flag defined above —
    # likely an artifact of the diff; only one definition can exist in the file.
    return lexeme.flags & (1 << flag_id)

View File

@ -17,70 +17,105 @@ from .attrs cimport IS_OOV
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
        const float* empty_vec) except -1:
    # Populate a LexemeC struct from a dict of precomputed lexical properties.
    # String-valued attrs (orth, lower, norm, shape, prefix, suffix) are
    # interned through the StringStore, so the struct holds integer IDs.
    # `empty_vec` is installed as the repvec until real vectors are loaded.
    # Raises (propagates as -1 per `except -1`) on a missing dict key.
    lex.length = props['length']
    lex.orth = string_store[props['orth']]
    lex.lower = string_store[props['lower']]
    lex.norm = string_store[props['norm']]
    lex.shape = string_store[props['shape']]
    lex.prefix = string_store[props['prefix']]
    lex.suffix = string_store[props['suffix']]
    lex.cluster = props['cluster']
    lex.prob = props['prob']
    lex.sentiment = props['sentiment']
    lex.flags = props['flags']
    lex.repvec = empty_vec
cdef class Lexeme:
"""An entry in the vocabulary. A Lexeme has no string context --- it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag).
"""
def __cinit__(self, int vec_size):
    # C-level constructor: allocate the word-vector buffer up front so the
    # repvec ndarray exists before any Python-level __init__ runs.
    self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
def __init__(self, Vocab vocab, int orth):
    """Create a Lexeme view onto the vocabulary entry with string ID `orth`.

    The Lexeme does not own its data: it keeps a pointer into the
    vocab-owned LexemeC struct.
    """
    self.vocab = vocab
    self.orth = orth
    # Double cast strips constness so the pointer can be stored in a
    # non-const field; the struct itself stays owned by the vocab.
    self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
@property
def has_repvec(self):
return self.l2_norm != 0
property orth:
def __get__(self):
return self.c.orth
property lower:
def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x
property norm:
def __get__(self): return self.c.norm
def __set__(self, int x): self.c.norm = x
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
cdef flags_t one = 1
return self.flags & (one << flag_id)
property shape:
def __get__(self): return self.c.shape
def __set__(self, int x): self.c.shape = x
property prefix:
def __get__(self): return self.c.prefix
def __set__(self, int x): self.c.prefix = x
property suffix:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
property orth_:
def __get__(self):
return self.vocab.strings[self.c.orth]
property lower_:
def __get__(self): return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
property norm_:
def __get__(self): return self.c.norm
def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
property shape_:
def __get__(self): return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
property prefix_:
def __get__(self): return self.c.prefix
def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]
property suffix_:
def __get__(self): return self.c.suffix
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
property is_oov:
def __get__(self): return self.check_flag(IS_OOV)
def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)
property is_alpha:
def __get__(self): return self.check_flag(IS_ALPHA)
def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)
property is_ascii:
def __get__(self): return self.check_flag(IS_ASCII)
def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)
property is_digit:
def __get__(self): return self.check_flag(IS_DIGIT)
def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)
property is_lower:
def __get__(self): return self.check_flag(IS_LOWER)
def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)
property is_title:
def __get__(self): return self.check_flag(IS_TITLE)
def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)
property is_punct:
def __get__(self): return self.check_flag(IS_PUNCT)
def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)
property is_space:
def __get__(self): return self.check_flag(IS_SPACE)
def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)
property like_url:
def __get__(self): return self.check_flag(LIKE_URL)
def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
property like_num:
def __get__(self): return self.check_flag(LIKE_NUM)
def __get__(self): return Lexeme.like_num(self.c, IKE_NUM)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
property like_email:
def __get__(self): return self.check_flag(LIKE_EMAIL)
def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)

View File

@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab
from libcpp.vector cimport vector
try:
import ujson as json
except ImportError:
@ -96,28 +98,26 @@ def map_attr_name(attr):
cdef class Matcher:
cdef Pool mem
cdef Pattern** patterns
cdef vector[Pattern*] patterns
cdef readonly int n_patterns
def __init__(self, vocab, patterns):
self.mem = Pool()
n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()])
self.patterns = <Pattern**>self.mem.alloc(n_patterns, sizeof(Pattern*))
cdef int i = 0
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
if isinstance(entity_key, basestring):
entity_key = vocab.strings[entity_key]
if isinstance(etype, basestring):
etype = vocab.strings[etype]
elif etype is None:
etype = -1
# TODO: Do something more clever about multiple patterns for single
# entity
for spec in specs:
spec = _convert_strings(spec, vocab.strings)
self.patterns[i] = init_pattern(self.mem, spec, etype)
i += 1
self.n_patterns = len(patterns)
self.add(entity_key, etype, attrs, specs)
def add(self, entity_key, etype, attrs, specs):
if isinstance(entity_key, basestring):
entity_key = vocab.strings[entity_key]
if isinstance(etype, basestring):
etype = vocab.strings[etype]
elif etype is None:
etype = -1
# TODO: Do something more clever about multiple patterns for single
# entity
for spec in specs:
spec = _convert_strings(spec, vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype))
@classmethod
def from_dir(cls, vocab, data_dir):

View File

@ -108,6 +108,11 @@ cdef class StringStore:
else:
raise TypeError(type(string_or_id))
def __iter__(self):
    """Yield each stored string by index, from 0 up to self.size - 1.

    Delegates to __getitem__, so each yielded value is whatever self[i]
    returns for an integer key.
    """
    cdef int i
    for i in range(self.size):
        yield self[i]
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
# 0 means missing, but we don't bother offsetting the index.
key = hash64(chars, length * sizeof(char), 0)

View File

@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
pos_tags=None, oov_prob=-30):
if oov_prob is None:
oov_prob = -30
def __init__(self, data_dir=None, get_lex_attr=None):
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
self.pos_tags = pos_tags if pos_tags is not None else {}
self.lexeme_props_getter = get_lex_props
self.get_lex_attr = get_lex_attr
self.repvec_length = 0
self.length = 0
self._add_lex_to_vocab(0, &EMPTY_LEXEME)
if data_dir is not None:
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if data_dir is not None:
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.load_lexemes(path.join(data_dir, 'strings.txt'),
@ -63,7 +59,6 @@ cdef class Vocab:
self._serializer = None
self.data_dir = data_dir
self.oov_prob = oov_prob
property serializer:
def __get__(self):
@ -91,18 +86,8 @@ cdef class Vocab:
lex = <LexemeC*>self._by_hash.get(key)
if lex != NULL:
return lex
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(key, lex)
assert lex != NULL, string
return lex
return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@ -114,18 +99,21 @@ cdef class Vocab:
lex = <LexemeC*>self._by_orth.get(orth)
if lex != NULL:
return lex
cdef unicode string = self.strings[orth]
else:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
for attr, func in self.lex_attr_getters.items():
Lexeme.set_struct_attr(lex, attr, func(string))
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(hash_string(string), lex)
assert lex != NULL, orth
self._add_lex_to_vocab(key, lex)
assert lex != NULL, string
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
@ -171,15 +159,6 @@ cdef class Vocab:
"int --> Lexeme" % str(type(id_or_string)))
return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
def __setitem__(self, unicode string, dict props):
cdef hash_t key = hash_string(string)
cdef LexemeC* lex
lex = <LexemeC*>self._by_hash.get(key)
if lex == NULL:
lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
self._add_lex_to_vocab(key, lex)
def dump(self, loc):
if path.exists(loc):
assert not path.isdir(loc)