From d2fc104a26b8832162847b946d0d3973e95cfaaa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 19:45:15 +0200 Subject: [PATCH] * Begin merge of Gazetteer and DE branches --- spacy/lexeme.pxd | 50 +++++++++++++++++++++++++++++------------- spacy/lexeme.pyx | 30 +++++++++++-------------- spacy/matcher.pyx | 7 +++--- spacy/tokens/doc.pyx | 5 ++--- spacy/tokens/token.pyx | 26 +++++++++++----------- spacy/vocab.pxd | 1 + spacy/vocab.pyx | 12 +++++----- 7 files changed, 74 insertions(+), 57 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 321f7c616..130966765 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE from .structs cimport LexemeC from .strings cimport StringStore +from .vocab cimport Vocab from numpy cimport ndarray @@ -15,21 +16,31 @@ cdef class Lexeme: cdef readonly Vocab vocab cdef readonly attr_t orth - cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: - lex.length = props['length'] - lex.orth = vocab.strings[props['orth']] - lex.lower = vocab.strings[props['lower']] - lex.norm = vocab.strings[props['norm']] - lex.shape = vocab.strings[props['shape']] - lex.prefix = vocab.strings[props['prefix']] - lex.suffix = vocab.strings[props['suffix']] - - lex.cluster = props['cluster'] - lex.prob = props['prob'] - lex.sentiment = props['sentiment'] - - lex.flags = props['flags'] - lex.repvec = empty_vec + @staticmethod + cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length): + cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth) + self.c = lex + self.vocab = vocab + self.orth = lex.orth + + @staticmethod + cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: + if name < (sizeof(flags_t) * 8): + Lexeme.set_flag(lex, name, value) + elif name == ID: + lex.id = value + elif name == LOWER: + lex.lower = value + elif name == NORM: + lex.norm = value + elif name == SHAPE: + lex.shape = value + elif name == PREFIX: + lex.prefix = value + elif name == SUFFIX: + lex.suffix = value + elif name == CLUSTER: + lex.cluster = value @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: @@ -56,5 +67,14 @@ cdef class Lexeme: else: return 0 + @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + @staticmethod + cdef inline bint set_flag(LexemeC* lexeme, attr_id_t flag_id, bint value) nogil: + cdef flags_t one = 1 + if value: + lexeme.flags |= one << flag_id + else: + lexeme.flags &= ~(one << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f0b3303f1..832f4fec7 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -26,12 +26,8 @@ cdef class Lexeme: def __init__(self, Vocab vocab, int orth): self.vocab = vocab self.orth = orth - self.c = vocab.get_by_orth(orth) + self.c = vocab.get_by_orth(vocab.mem, orth) - property orth: - def __get__(self): - return self.c.orth - property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x @@ -78,44 +74,44 @@ cdef class Lexeme: property is_oov: def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x) property is_alpha: def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x) + def 
__set__(self, bint x): Lexeme.set_flag(self.c, IS_ALPHA, x) property is_ascii: def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ASCII, x) property is_digit: def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_DIGIT, x) property is_lower: def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_LOWER, x) property is_title: def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_TITLE, x) property is_punct: def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_PUNCT, x) property is_space: def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_SPACE, x) property like_url: def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 72473b073..caafe6498 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -102,21 +102,22 @@ cdef class Matcher: cdef readonly int n_patterns def __init__(self, vocab, patterns): + self.vocab = vocab self.mem = Pool() for entity_key, (etype, attrs, specs) in sorted(patterns.items()): self.add(entity_key, etype, attrs, specs) def add(self, entity_key, etype, attrs, specs): if isinstance(entity_key, basestring): - entity_key = vocab.strings[entity_key] + entity_key = self.vocab.strings[entity_key] if isinstance(etype, basestring): - etype = vocab.strings[etype] + etype = self.vocab.strings[etype] elif etype is None: etype = -1 # TODO: Do something more clever about multiple patterns for single # entity for spec in specs: - spec = _convert_strings(spec, vocab.strings) + spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) @classmethod diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 955e9b45f..4ba0d675a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -5,6 +5,7 @@ from libc.stdint cimport uint32_t import numpy import struct +from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t @@ -13,8 +14,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES 
from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t -from ..lexeme cimport check_flag -from ..lexeme cimport get_attr as get_lex_attr from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -48,7 +47,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: elif feat_name == ENT_TYPE: return token.ent_type else: - return get_lex_attr(token.lex, feat_name) + return Lexeme.get_struct_attr(token.lex, feat_name) cdef class Doc: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index cc50fdd08..2fa1366a1 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,5 @@ from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free -from ..lexeme cimport check_flag # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray cimport numpy as np @@ -9,6 +8,7 @@ np.import_array() import numpy +from ..lexeme cimport Lexeme from ..parts_of_speech import UNIV_POS_NAMES from ..attrs cimport LEMMA @@ -42,7 +42,7 @@ cdef class Token: return self.string cpdef bint check_flag(self, attr_id_t flag_id) except -1: - return check_flag(self.c.lex, flag_id) + return Lexeme.check_flag(self.c.lex, flag_id) def nbor(self, int i=1): return self.doc[self.i+i] @@ -286,37 +286,37 @@ cdef class Token: return self.vocab.strings[self.c.dep] property is_oov: - def __get__(self): return check_flag(self.c.lex, IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) property is_alpha: - def __get__(self): return check_flag(self.c.lex, IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) property is_ascii: - def __get__(self): return check_flag(self.c.lex, IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII) property is_digit: - def __get__(self): return check_flag(self.c.lex, IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT) property is_lower: - def __get__(self): return check_flag(self.c.lex, IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER) property is_title: - def __get__(self): return check_flag(self.c.lex, IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE) property is_punct: - def __get__(self): return check_flag(self.c.lex, IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT) property is_space: - def __get__(self): return check_flag(self.c.lex, IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE) property like_url: - def __get__(self): return check_flag(self.c.lex, LIKE_URL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL) property like_num: - def __get__(self): return check_flag(self.c.lex, LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM) property like_email: - def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL) _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 2503cdcee..710a1b5ec 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -37,6 +37,7 @@ cdef class Vocab: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef PreshMap _by_hash cdef 
PreshMap _by_orth diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index dcb7d575c..2d67e59f2 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,7 +12,6 @@ import math import json from .lexeme cimport EMPTY_LEXEME -from .lexeme cimport set_lex_struct_props from .lexeme cimport Lexeme from .strings cimport hash_string from .orth cimport word_shape @@ -36,12 +35,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, data_dir=None, get_lex_attr=None): + def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.pos_tags = pos_tags if pos_tags is not None else {} + #self.pos_tags = pos_tags if pos_tags is not None else {} + self.pos_tags = {} self.get_lex_attr = get_lex_attr self.repvec_length = 0 @@ -112,7 +112,7 @@ cdef class Vocab: if is_oov: lex.id = 0 else: - self._add_lex_to_vocab(key, lex) + self._add_lex_to_vocab(hash_string(string), lex) assert lex != NULL, string return lex @@ -125,7 +125,7 @@ cdef class Vocab: cdef attr_t orth cdef size_t addr for orth, addr in self._by_orth.items(): - yield Lexeme.from_ptr(addr, self.strings, self.repvec_length) + yield Lexeme.from_ptr(addr, self, self.repvec_length) def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously @@ -157,7 +157,7 @@ cdef class Vocab: raise ValueError("Vocab unable to map type: " "%s. Maps unicode --> Lexeme or " "int --> Lexeme" % str(type(id_or_string))) - return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) + return Lexeme.from_ptr(lexeme, self, self.repvec_length) def dump(self, loc): if path.exists(loc):
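
The patch above moves the flag helpers onto `Lexeme` as inline static methods: `set_flag`/`check_flag` keep every boolean lexical attribute as one bit of a single `flags_t` word, and `set_struct_attr` routes attribute IDs below `sizeof(flags_t) * 8` into that bit field while larger IDs (`LOWER`, `NORM`, `SHAPE`, ...) go to dedicated struct fields, all without holding the GIL. Below is a minimal plain-Python sketch of that layout; the class name, the example attribute IDs and `NUM_FLAG_BITS` are illustrative stand-ins, not spaCy's API.

```python
# Illustrative sketch of the bit-flag storage behind Lexeme.set_flag/check_flag.
# FlagLexeme, NUM_FLAG_BITS and the ID values are hypothetical, not spaCy API.

IS_ALPHA, IS_DIGIT, IS_LOWER, LIKE_NUM = 1, 2, 5, 13   # example flag IDs
LOWER, NORM, SHAPE = 70, 71, 72                        # example non-flag attribute IDs
NUM_FLAG_BITS = 64                                     # sizeof(flags_t) * 8 in the C struct


class FlagLexeme:
    def __init__(self):
        self.flags = 0          # packed boolean attributes, one bit per flag
        self.lower = 0          # non-flag attributes get dedicated fields
        self.norm = 0
        self.shape = 0

    def set_flag(self, flag_id, value):
        # Mirrors Lexeme.set_flag: set or clear one bit in the flags word.
        if value:
            self.flags |= 1 << flag_id
        else:
            self.flags &= ~(1 << flag_id)

    def check_flag(self, flag_id):
        # Mirrors Lexeme.check_flag: mask-and-test a single bit.
        return bool(self.flags & (1 << flag_id))

    def set_struct_attr(self, name, value):
        # Mirrors Lexeme.set_struct_attr: small IDs are flags, larger IDs are fields.
        if name < NUM_FLAG_BITS:
            self.set_flag(name, value)
        elif name == LOWER:
            self.lower = value
        elif name == NORM:
            self.norm = value
        elif name == SHAPE:
            self.shape = value


lex = FlagLexeme()
lex.set_struct_attr(IS_ALPHA, True)
lex.set_struct_attr(LOWER, 1234)
assert lex.check_flag(IS_ALPHA) and not lex.check_flag(LIKE_NUM)
assert lex.lower == 1234
```

Packing the booleans into one integer keeps the per-lexeme struct small and makes each flag read a single mask-and-test, which is why the Cython versions in the patch can be declared `nogil`.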
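The `vocab.pyx` hunks also show `Lexeme.from_ptr` now receiving the whole `Vocab` rather than just its `StringStore`, and `Vocab.__getitem__` dispatching on whether it was given an integer orth ID or a unicode string, backed by the `_by_hash`/`_by_orth` maps declared in `vocab.pxd`. A rough standalone sketch of that double-keyed lookup, using plain dicts in place of `PreshMap` and hypothetical names (`ToyVocab`, `ToyLexeme`, `_intern`):

```python
# Sketch of a vocab that can be indexed by orth ID or by string.
# ToyVocab, ToyLexeme and _intern are hypothetical names, not spaCy's API.

class ToyLexeme:
    def __init__(self, orth, text):
        self.orth = orth    # integer ID of the string
        self.text = text


class ToyVocab:
    def __init__(self):
        self._strings = {}   # string -> orth ID (stand-in for StringStore)
        self._by_orth = {}   # orth ID -> ToyLexeme (stand-in for PreshMap _by_orth)

    def _intern(self, string):
        orth = self._strings.setdefault(string, len(self._strings) + 1)
        if orth not in self._by_orth:
            self._by_orth[orth] = ToyLexeme(orth, string)
        return orth

    def __getitem__(self, id_or_string):
        # Mirrors the int-or-unicode dispatch in Vocab.__getitem__.
        if isinstance(id_or_string, str):
            return self._by_orth[self._intern(id_or_string)]
        elif isinstance(id_or_string, int):
            return self._by_orth[id_or_string]
        raise ValueError(
            "ToyVocab maps unicode -> lexeme or int -> lexeme, got %s"
            % type(id_or_string))


vocab = ToyVocab()
lex = vocab["hello"]           # string lookup interns and creates the entry
assert vocab[lex.orth] is lex  # integer lookup returns the same object
```

Interning on the string path while treating an unknown integer ID as a plain miss appears to match the behaviour the hunk's docstring describes for previously unseen strings.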