This commit is contained in:
Matthew Honnibal 2015-08-22 22:04:34 +02:00
parent 0f2cb74433
commit cad0cca4e3
6 changed files with 147 additions and 173 deletions

View File

@ -80,7 +80,6 @@ class English(object):
Packer=None, Packer=None,
load_vectors=True load_vectors=True
): ):
self.data_dir = data_dir self.data_dir = data_dir
if path.exists(path.join(data_dir, 'vocab', 'oov_prob')): if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):

View File

@ -8,97 +8,53 @@ from .strings cimport StringStore
from numpy cimport ndarray from numpy cimport ndarray
cdef LexemeC EMPTY_LEXEME cdef LexemeC EMPTY_LEXEME
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
const float* empty_vec) except -1
cdef class Lexeme: cdef class Lexeme:
cdef readonly ndarray repvec cdef LexemeC* c
cdef readonly Vocab vocab
cdef readonly flags_t flags
cdef readonly attr_t id
cdef readonly attr_t length
cdef readonly attr_t orth cdef readonly attr_t orth
cdef readonly attr_t lower
cdef readonly attr_t norm
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly unicode orth_ cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
cdef readonly unicode lower_ lex.length = props['length']
cdef readonly unicode norm_ lex.orth = vocab.strings[props['orth']]
cdef readonly unicode shape_ lex.lower = vocab.strings[props['lower']]
cdef readonly unicode prefix_ lex.norm = vocab.strings[props['norm']]
cdef readonly unicode suffix_ lex.shape = vocab.strings[props['shape']]
lex.prefix = vocab.strings[props['prefix']]
lex.suffix = vocab.strings[props['suffix']]
cdef readonly attr_t cluster lex.cluster = props['cluster']
cdef readonly float prob lex.prob = props['prob']
cdef readonly float sentiment lex.sentiment = props['sentiment']
cdef readonly float l2_norm
lex.flags = props['flags']
lex.repvec = empty_vec
# Workaround for an apparent bug in the way the decorator is handled ---
# TODO: post bug report / patch to Cython.
@staticmethod @staticmethod
cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length): cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length) if feat_name < (sizeof(flags_t) * 8):
for i in range(repvec_length): return Lexeme.check_flag(lex, feat_name)
py.repvec[i] = ptr.repvec[i] elif feat_name == ID:
py.l2_norm = ptr.l2_norm return lex.id
py.flags = ptr.flags elif feat_name == ORTH:
py.id = ptr.id return lex.orth
py.length = ptr.length elif feat_name == LOWER:
return lex.lower
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:
return lex.prefix
elif feat_name == SUFFIX:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
else:
return 0
py.orth = ptr.orth cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
py.lower = ptr.lower return lexeme.flags & (1 << flag_id)
py.norm = ptr.norm
py.shape = ptr.shape
py.prefix = ptr.prefix
py.suffix = ptr.suffix
py.orth_ = strings[ptr.orth]
py.lower_ = strings[ptr.lower]
py.norm_ = strings[ptr.norm]
py.shape_ = strings[ptr.shape]
py.prefix_ = strings[ptr.prefix]
py.suffix_ = strings[ptr.suffix]
py.cluster = ptr.cluster
py.prob = ptr.prob
py.sentiment = ptr.sentiment
return py
cpdef bint check_flag(self, attr_id_t flag_id) except -1
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Report whether bit `flag_id` is set in the lexeme's `flags` bit-field.
    #
    # The mask is built from a flags_t-typed 1 (the same idiom the
    # Python-level Lexeme.check_flag uses) so the shift is carried out at the
    # width of `flags` rather than at the width of an untyped integer literal.
    cdef flags_t one = 1
    # Compare against 0 explicitly so the result is always 0 or 1, whichever
    # bit position was tested.
    return (lexeme.flags & (one << flag_id)) != 0
cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
    # Fetch one attribute of a lexeme struct, selected by attribute ID.
    #
    # IDs below the bit-width of flags_t are looked up as boolean flags via
    # check_flag; the remaining recognised IDs map onto struct fields.
    if feat_name < (sizeof(flags_t) * 8):
        return check_flag(lex, feat_name)
    if feat_name == ID:
        return lex.id
    if feat_name == ORTH:
        return lex.orth
    if feat_name == LOWER:
        return lex.lower
    if feat_name == NORM:
        return lex.norm
    if feat_name == SHAPE:
        return lex.shape
    if feat_name == PREFIX:
        return lex.prefix
    if feat_name == SUFFIX:
        return lex.suffix
    if feat_name == LENGTH:
        return lex.length
    if feat_name == CLUSTER:
        return lex.cluster
    # Any other attribute ID yields 0.
    return 0

View File

@ -17,70 +17,105 @@ from .attrs cimport IS_OOV
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
                              const float* empty_vec) except -1:
    # Fill in the fields of `lex` from the `props` dict.
    #
    # lex: the struct to populate (modified in place).
    # props: must contain the keys 'length', 'orth', 'lower', 'norm', 'shape',
    #     'prefix', 'suffix', 'cluster', 'prob', 'sentiment' and 'flags';
    #     a missing key raises KeyError.
    # string_store: indexed with each string-valued property; the value it
    #     returns is what gets stored on the struct.
    # empty_vec: pointer assigned to lex.repvec.
    cdef StringStore strings = string_store

    lex.length = props['length']

    # String-valued properties go through the string store.
    lex.orth = strings[props['orth']]
    lex.lower = strings[props['lower']]
    lex.norm = strings[props['norm']]
    lex.shape = strings[props['shape']]
    lex.prefix = strings[props['prefix']]
    lex.suffix = strings[props['suffix']]

    # Remaining values are copied across as given.
    lex.cluster = props['cluster']
    lex.prob = props['prob']
    lex.sentiment = props['sentiment']

    lex.flags = props['flags']
    lex.repvec = empty_vec
cdef class Lexeme: cdef class Lexeme:
"""An entry in the vocabulary. A Lexeme has no string context --- it's a """An entry in the vocabulary. A Lexeme has no string context --- it's a
word-type, as opposed to a word token. It therefore has no part-of-speech word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag). tag).
""" """
def __init__(self, Vocab vocab, int orth):
    """Bind this Lexeme to the vocabulary entry identified by `orth`.

    vocab: the Vocab whose LexemeC struct this object wraps.
    orth: integer ID passed to vocab.get_by_orth to locate the struct.
    """
    self.vocab = vocab
    self.orth = orth
    # Vocab.get_by_orth is declared as (Pool mem, attr_t orth), so the pool
    # has to be supplied. Passing the vocab's own pool means a struct created
    # by this lookup is not treated as out-of-vocabulary (Vocab checks
    # `mem is not self.mem` for that).
    self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
@property property orth:
def has_repvec(self): def __get__(self):
return self.l2_norm != 0 return self.c.orth
property lower:
def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x
property norm:
def __get__(self): return self.c.norm
def __set__(self, int x): self.c.norm = x
cpdef bint check_flag(self, attr_id_t flag_id) except -1: property shape:
cdef flags_t one = 1 def __get__(self): return self.c.shape
return self.flags & (one << flag_id) def __set__(self, int x): self.c.shape = x
property prefix:
def __get__(self): return self.c.prefix
def __set__(self, int x): self.c.prefix = x
property suffix:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
property orth_:
def __get__(self):
return self.vocab.strings[self.c.orth]
property lower_:
def __get__(self): return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
property norm_:
    # String form of the `norm` attribute. The struct holds only the integer
    # ID, so both directions go through the vocab's string store (as lower_
    # and shape_ do).
    def __get__(self):
        return self.vocab.strings[self.c.norm]

    def __set__(self, unicode x):
        self.c.norm = self.vocab.strings[x]
property shape_:
def __get__(self): return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
property prefix_:
    # String form of the `prefix` attribute. The struct holds only the
    # integer ID, so both directions go through the vocab's string store (as
    # lower_ and shape_ do).
    def __get__(self):
        return self.vocab.strings[self.c.prefix]

    def __set__(self, unicode x):
        self.c.prefix = self.vocab.strings[x]
property suffix_:
    # String form of the `suffix` attribute. The struct holds only the
    # integer ID, so both directions go through the vocab's string store (as
    # lower_ and shape_ do).
    def __get__(self):
        return self.vocab.strings[self.c.suffix]

    def __set__(self, unicode x):
        self.c.suffix = self.vocab.strings[x]
property is_oov: property is_oov:
def __get__(self): return self.check_flag(IS_OOV) def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)
property is_alpha: property is_alpha:
def __get__(self): return self.check_flag(IS_ALPHA) def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)
property is_ascii: property is_ascii:
def __get__(self): return self.check_flag(IS_ASCII) def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)
property is_digit: property is_digit:
def __get__(self): return self.check_flag(IS_DIGIT) def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)
property is_lower: property is_lower:
def __get__(self): return self.check_flag(IS_LOWER) def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)
property is_title: property is_title:
def __get__(self): return self.check_flag(IS_TITLE) def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)
property is_punct: property is_punct:
def __get__(self): return self.check_flag(IS_PUNCT) def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)
property is_space: property is_space:
def __get__(self): return self.check_flag(IS_SPACE) def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)
property like_url: property like_url:
def __get__(self): return self.check_flag(LIKE_URL) def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
property like_num:
    # Boolean LIKE_NUM flag, read and written on the underlying struct in
    # the same way as the other flag properties of this class.
    def __get__(self):
        return Lexeme.check_flag(self.c, LIKE_NUM)

    def __set__(self, attr_id_t x):
        Lexeme.set_flag(self.c, LIKE_NUM, x)
property like_email: property like_email:
def __get__(self): return self.check_flag(LIKE_EMAIL) def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)

View File

@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .vocab cimport Vocab from .vocab cimport Vocab
from libcpp.vector cimport vector
try: try:
import ujson as json import ujson as json
except ImportError: except ImportError:
@ -96,28 +98,26 @@ def map_attr_name(attr):
cdef class Matcher: cdef class Matcher:
cdef Pool mem cdef Pool mem
cdef Pattern** patterns cdef vector[Pattern*] patterns
cdef readonly int n_patterns cdef readonly int n_patterns
def __init__(self, vocab, patterns): def __init__(self, vocab, patterns):
self.mem = Pool() self.mem = Pool()
n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()])
self.patterns = <Pattern**>self.mem.alloc(n_patterns, sizeof(Pattern*))
cdef int i = 0
for entity_key, (etype, attrs, specs) in sorted(patterns.items()): for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
if isinstance(entity_key, basestring): self.add(entity_key, etype, attrs, specs)
entity_key = vocab.strings[entity_key]
if isinstance(etype, basestring): def add(self, entity_key, etype, attrs, specs):
etype = vocab.strings[etype] if isinstance(entity_key, basestring):
elif etype is None: entity_key = vocab.strings[entity_key]
etype = -1 if isinstance(etype, basestring):
# TODO: Do something more clever about multiple patterns for single etype = vocab.strings[etype]
# entity elif etype is None:
for spec in specs: etype = -1
spec = _convert_strings(spec, vocab.strings) # TODO: Do something more clever about multiple patterns for single
self.patterns[i] = init_pattern(self.mem, spec, etype) # entity
i += 1 for spec in specs:
self.n_patterns = len(patterns) spec = _convert_strings(spec, vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype))
@classmethod @classmethod
def from_dir(cls, vocab, data_dir): def from_dir(cls, vocab, data_dir):

View File

@ -108,6 +108,11 @@ cdef class StringStore:
else: else:
raise TypeError(type(string_or_id)) raise TypeError(type(string_or_id))
def __iter__(self):
    # Yield self[i] for every index i from 0 up to, but not including,
    # self.size. The bound is read once, before the first item is produced.
    cdef int idx = 0
    limit = self.size
    while idx < limit:
        yield self[idx]
        idx += 1
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL: cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
# 0 means missing, but we don't bother offsetting the index. # 0 means missing, but we don't bother offsetting the index.
key = hash64(chars, length * sizeof(char), 0) key = hash64(chars, length * sizeof(char), 0)

View File

@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab: cdef class Vocab:
'''A map container for a language's LexemeC structs. '''A map container for a language's LexemeC structs.
''' '''
def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True, def __init__(self, data_dir=None, get_lex_attr=None):
pos_tags=None, oov_prob=-30):
if oov_prob is None:
oov_prob = -30
self.mem = Pool() self.mem = Pool()
self._by_hash = PreshMap() self._by_hash = PreshMap()
self._by_orth = PreshMap() self._by_orth = PreshMap()
self.strings = StringStore() self.strings = StringStore()
self.pos_tags = pos_tags if pos_tags is not None else {} self.pos_tags = pos_tags if pos_tags is not None else {}
self.lexeme_props_getter = get_lex_props self.get_lex_attr = get_lex_attr
self.repvec_length = 0 self.repvec_length = 0
self.length = 0 self.length = 0
self._add_lex_to_vocab(0, &EMPTY_LEXEME) self._add_lex_to_vocab(0, &EMPTY_LEXEME)
if data_dir is not None: if data_dir is not None:
if not path.exists(data_dir): if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if data_dir is not None:
if not path.isdir(data_dir): if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.load_lexemes(path.join(data_dir, 'strings.txt'), self.load_lexemes(path.join(data_dir, 'strings.txt'),
@ -63,7 +59,6 @@ cdef class Vocab:
self._serializer = None self._serializer = None
self.data_dir = data_dir self.data_dir = data_dir
self.oov_prob = oov_prob
property serializer: property serializer:
def __get__(self): def __get__(self):
@ -91,18 +86,8 @@ cdef class Vocab:
lex = <LexemeC*>self._by_hash.get(key) lex = <LexemeC*>self._by_hash.get(key)
if lex != NULL: if lex != NULL:
return lex return lex
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov:
lex.id = 0
else: else:
self._add_lex_to_vocab(key, lex) return self._new_lexeme(mem, string)
assert lex != NULL, string
return lex
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@ -114,18 +99,21 @@ cdef class Vocab:
lex = <LexemeC*>self._by_orth.get(orth) lex = <LexemeC*>self._by_orth.get(orth)
if lex != NULL: if lex != NULL:
return lex return lex
cdef unicode string = self.strings[orth] else:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
    # Allocate a LexemeC for `string`, fill in its attributes and, unless it
    # is out-of-vocabulary, register it with the vocab.
    #
    # mem: pool to allocate from. A pool other than self.mem marks the lexeme
    #     as out-of-vocabulary: it is given id 0 and is not registered.
    # string: the word's text, passed to each attribute getter.
    #
    # The OOV decision is taken before the pool is switched below, so a short
    # string requested with a foreign pool is allocated from self.mem but is
    # still handled as OOV.
    cdef LexemeC* lex
    cdef bint is_oov = mem is not self.mem
    if len(string) < 3:
        mem = self.mem
    lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
    # __init__ stores the getters under self.get_lex_attr.
    # NOTE(review): assumes it is a dict mapping attribute ID -> callable
    # taking the string; confirm against callers (the default is None).
    for attr, func in self.get_lex_attr.items():
        Lexeme.set_struct_attr(lex, attr, func(string))
    if is_oov:
        lex.id = 0
    else:
        # No precomputed key reaches this method, so hash the string here.
        self._add_lex_to_vocab(hash_string(string), lex)
    assert lex != NULL, string
    return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
@ -171,15 +159,6 @@ cdef class Vocab:
"int --> Lexeme" % str(type(id_or_string))) "int --> Lexeme" % str(type(id_or_string)))
return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
def __setitem__(self, unicode string, dict props):
    # Store `props` as the lexeme data for `string`.
    #
    # The struct found under the string's hash is reused when there is one;
    # otherwise a new one is allocated from self.mem. Either way the struct
    # is filled from `props` and then handed to _add_lex_to_vocab.
    cdef hash_t hashed = hash_string(string)
    cdef LexemeC* entry = <LexemeC*>self._by_hash.get(hashed)
    if entry == NULL:
        entry = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
    set_lex_struct_props(entry, props, self.strings, EMPTY_VEC)
    self._add_lex_to_vocab(hashed, entry)
def dump(self, loc): def dump(self, loc):
if path.exists(loc): if path.exists(loc):
assert not path.isdir(loc) assert not path.isdir(loc)