From cad0cca4e3b7c50f45e1e1084d7d3d2fbc6db7ae Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Aug 2015 22:04:34 +0200 Subject: [PATCH] * Tmp --- spacy/en/__init__.py | 1 - spacy/lexeme.pxd | 126 ++++++++++++++----------------------------- spacy/lexeme.pyx | 109 ++++++++++++++++++++++++------------- spacy/matcher.pyx | 34 ++++++------ spacy/strings.pyx | 5 ++ spacy/vocab.pyx | 45 +++++----------- 6 files changed, 147 insertions(+), 173 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index c81630a72..a04b615da 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -80,7 +80,6 @@ class English(object): Packer=None, load_vectors=True ): - self.data_dir = data_dir if path.exists(path.join(data_dir, 'vocab', 'oov_prob')): diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index f7b210281..321f7c616 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -8,97 +8,53 @@ from .strings cimport StringStore from numpy cimport ndarray - cdef LexemeC EMPTY_LEXEME - -cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings, - const float* empty_vec) except -1 - cdef class Lexeme: - cdef readonly ndarray repvec - - cdef readonly flags_t flags - cdef readonly attr_t id - cdef readonly attr_t length - + cdef LexemeC* c + cdef readonly Vocab vocab cdef readonly attr_t orth - cdef readonly attr_t lower - cdef readonly attr_t norm - cdef readonly attr_t shape - cdef readonly attr_t prefix - cdef readonly attr_t suffix - cdef readonly unicode orth_ - cdef readonly unicode lower_ - cdef readonly unicode norm_ - cdef readonly unicode shape_ - cdef readonly unicode prefix_ - cdef readonly unicode suffix_ + cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: + lex.length = props['length'] + lex.orth = vocab.strings[props['orth']] + lex.lower = vocab.strings[props['lower']] + lex.norm = vocab.strings[props['norm']] + lex.shape = vocab.strings[props['shape']] + lex.prefix = vocab.strings[props['prefix']] + 
lex.suffix = vocab.strings[props['suffix']] - cdef readonly attr_t cluster - cdef readonly float prob - cdef readonly float sentiment - cdef readonly float l2_norm + lex.cluster = props['cluster'] + lex.prob = props['prob'] + lex.sentiment = props['sentiment'] + + lex.flags = props['flags'] + lex.repvec = empty_vec - # Workaround for an apparent bug in the way the decorator is handled --- - # TODO: post bug report / patch to Cython. @staticmethod - cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length): - cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length) - for i in range(repvec_length): - py.repvec[i] = ptr.repvec[i] - py.l2_norm = ptr.l2_norm - py.flags = ptr.flags - py.id = ptr.id - py.length = ptr.length + cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: + if feat_name < (sizeof(flags_t) * 8): + return Lexeme.check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == ORTH: + return lex.orth + elif feat_name == LOWER: + return lex.lower + elif feat_name == NORM: + return lex.norm + elif feat_name == SHAPE: + return lex.shape + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + else: + return 0 - py.orth = ptr.orth - py.lower = ptr.lower - py.norm = ptr.norm - py.shape = ptr.shape - py.prefix = ptr.prefix - py.suffix = ptr.suffix - - py.orth_ = strings[ptr.orth] - py.lower_ = strings[ptr.lower] - py.norm_ = strings[ptr.norm] - py.shape_ = strings[ptr.shape] - py.prefix_ = strings[ptr.prefix] - py.suffix_ = strings[ptr.suffix] - - py.cluster = ptr.cluster - py.prob = ptr.prob - py.sentiment = ptr.sentiment - return py - - cpdef bint check_flag(self, attr_id_t flag_id) except -1 - - -cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: - return lexeme.flags & (1 << flag_id) - - -cdef inline attr_t 
get_attr(const LexemeC* lex, attr_id_t feat_name) nogil: - if feat_name < (sizeof(flags_t) * 8): - return check_flag(lex, feat_name) - elif feat_name == ID: - return lex.id - elif feat_name == ORTH: - return lex.orth - elif feat_name == LOWER: - return lex.lower - elif feat_name == NORM: - return lex.norm - elif feat_name == SHAPE: - return lex.shape - elif feat_name == PREFIX: - return lex.prefix - elif feat_name == SUFFIX: - return lex.suffix - elif feat_name == LENGTH: - return lex.length - elif feat_name == CLUSTER: - return lex.cluster - else: - return 0 + cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: + return lexeme.flags & (1 << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 07f151114..f0b3303f1 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -17,70 +17,105 @@ from .attrs cimport IS_OOV memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) -cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store, - const float* empty_vec) except -1: - lex.length = props['length'] - lex.orth = string_store[props['orth']] - lex.lower = string_store[props['lower']] - lex.norm = string_store[props['norm']] - lex.shape = string_store[props['shape']] - lex.prefix = string_store[props['prefix']] - lex.suffix = string_store[props['suffix']] - - lex.cluster = props['cluster'] - lex.prob = props['prob'] - lex.sentiment = props['sentiment'] - - lex.flags = props['flags'] - lex.repvec = empty_vec - - cdef class Lexeme: """An entry in the vocabulary. A Lexeme has no string context --- it's a word-type, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag). 
""" - def __cinit__(self, int vec_size): - self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32) + def __init__(self, Vocab vocab, int orth): + self.vocab = vocab + self.orth = orth + self.c = vocab.get_by_orth(orth) - @property - def has_repvec(self): - return self.l2_norm != 0 + property orth: + def __get__(self): + return self.c.orth + + property lower: + def __get__(self): return self.c.lower + def __set__(self, int x): self.c.lower = x + + property norm: + def __get__(self): return self.c.norm + def __set__(self, int x): self.c.norm = x - cpdef bint check_flag(self, attr_id_t flag_id) except -1: - cdef flags_t one = 1 - return self.flags & (one << flag_id) + property shape: + def __get__(self): return self.c.shape + def __set__(self, int x): self.c.shape = x + + property prefix: + def __get__(self): return self.c.prefix + def __set__(self, int x): self.c.prefix = x + + property suffix: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x + + property orth_: + def __get__(self): + return self.vocab.strings[self.c.orth] + + property lower_: + def __get__(self): return self.vocab.strings[self.c.lower] + def __set__(self, unicode x): self.c.lower = self.vocab.strings[x] + + property norm_: + def __get__(self): return self.c.norm + def __set__(self, unicode x): self.c.norm = self.vocab.strings[x] + + property shape_: + def __get__(self): return self.vocab.strings[self.c.shape] + def __set__(self, unicode x): self.c.shape = self.vocab.strings[x] + + property prefix_: + def __get__(self): return self.c.prefix + def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x] + + property suffix_: + def __get__(self): return self.c.suffix + def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x] property is_oov: - def __get__(self): return self.check_flag(IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) property 
is_alpha: - def __get__(self): return self.check_flag(IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x) property is_ascii: - def __get__(self): return self.check_flag(IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x) property is_digit: - def __get__(self): return self.check_flag(IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x) property is_lower: - def __get__(self): return self.check_flag(IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x) property is_title: - def __get__(self): return self.check_flag(IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x) property is_punct: - def __get__(self): return self.check_flag(IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x) property is_space: - def __get__(self): return self.check_flag(IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x) property like_url: - def __get__(self): return self.check_flag(LIKE_URL) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return self.check_flag(LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: - def __get__(self): return self.check_flag(LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c, 
LIKE_EMAIL) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index ee2ceaecc..72473b073 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab +from libcpp.vector cimport vector + try: import ujson as json except ImportError: @@ -96,28 +98,26 @@ def map_attr_name(attr): cdef class Matcher: cdef Pool mem - cdef Pattern** patterns + cdef vector[Pattern*] patterns cdef readonly int n_patterns def __init__(self, vocab, patterns): self.mem = Pool() - n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()]) - self.patterns = self.mem.alloc(n_patterns, sizeof(Pattern*)) - cdef int i = 0 for entity_key, (etype, attrs, specs) in sorted(patterns.items()): - if isinstance(entity_key, basestring): - entity_key = vocab.strings[entity_key] - if isinstance(etype, basestring): - etype = vocab.strings[etype] - elif etype is None: - etype = -1 - # TODO: Do something more clever about multiple patterns for single - # entity - for spec in specs: - spec = _convert_strings(spec, vocab.strings) - self.patterns[i] = init_pattern(self.mem, spec, etype) - i += 1 - self.n_patterns = len(patterns) + self.add(vocab, entity_key, etype, attrs, specs) + + def add(self, vocab, entity_key, etype, attrs, specs): + if isinstance(entity_key, basestring): + entity_key = vocab.strings[entity_key] + if isinstance(etype, basestring): + etype = vocab.strings[etype] + elif etype is None: + etype = -1 + # TODO: Do something more clever about multiple patterns for single + # entity + for spec in specs: + spec = _convert_strings(spec, vocab.strings) + self.patterns.push_back(init_pattern(self.mem, spec, etype)) @classmethod def from_dir(cls, vocab, data_dir): diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b35ed2ccb..c187a6aa6 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -108,6 +108,11 @@ cdef class 
StringStore: else: raise TypeError(type(string_or_id)) + def __iter__(self): + cdef int i + for i in range(self.size): + yield self[i] + cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. key = hash64(chars, length * sizeof(char), 0) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ac2e11e11..dcb7d575c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True, - pos_tags=None, oov_prob=-30): - if oov_prob is None: - oov_prob = -30 + def __init__(self, data_dir=None, get_lex_attr=None): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() self.pos_tags = pos_tags if pos_tags is not None else {} - - self.lexeme_props_getter = get_lex_props + + self.get_lex_attr = get_lex_attr self.repvec_length = 0 self.length = 0 self._add_lex_to_vocab(0, &EMPTY_LEXEME) if data_dir is not None: if not path.exists(data_dir): raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) - if data_dir is not None: if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) self.load_lexemes(path.join(data_dir, 'strings.txt'), @@ -63,7 +59,6 @@ cdef class Vocab: self._serializer = None self.data_dir = data_dir - self.oov_prob = oov_prob property serializer: def __get__(self): @@ -91,18 +86,8 @@ cdef class Vocab: lex = self._by_hash.get(key) if lex != NULL: return lex - cdef bint is_oov = mem is not self.mem - if len(string) < 3: - mem = self.mem - lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) - if is_oov: - lex.id = 0 else: - self._add_lex_to_vocab(key, lex) - assert lex != NULL, string - return lex + return self._new_lexeme(mem, string) cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme @@ -114,18 +99,21 @@ cdef class Vocab: lex = self._by_orth.get(orth) if lex != NULL: return lex - cdef unicode string = self.strings[orth] + else: + return self._new_lexeme(mem, self.strings[orth]) + + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef bint is_oov = mem is not self.mem if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) + for attr, func in self.get_lex_attr.items(): + Lexeme.set_struct_attr(lex, attr, func(string)) if is_oov: lex.id = 0 else: - self._add_lex_to_vocab(hash_string(string), lex) - assert lex != NULL, orth + self._add_lex_to_vocab(hash_string(string), lex) + assert lex != NULL, string return lex cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: @@ -171,15 +159,6 @@ cdef class Vocab: "int --> Lexeme" % str(type(id_or_string))) return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) - def __setitem__(self, unicode string, dict props): - cdef hash_t key = hash_string(string) - cdef LexemeC* lex - 
lex = self._by_hash.get(key) - if lex == NULL: - lex = self.mem.alloc(sizeof(LexemeC), 1) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) - self._add_lex_to_vocab(key, lex) - def dump(self, loc): if path.exists(loc): assert not path.isdir(loc)