* Tmp

2025-07-22 22:20:08 +03:00 · 2015-08-22 22:04:34 +02:00 · 2015-08-22 22:04:34 +02:00 · cad0cca4e3
commit cad0cca4e3
parent 0f2cb74433
6 changed files with 147 additions and 173 deletions
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -80,7 +80,6 @@ class English(object):
      Packer=None,
      load_vectors=True
    ):
        self.data_dir = data_dir
        if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -8,97 +8,53 @@ from .strings cimport StringStore
 from numpy cimport ndarray
 cdef LexemeC EMPTY_LEXEME
 cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
                              const float* empty_vec) except -1
 cdef class Lexeme:
-    cdef readonly ndarray repvec
+    cdef LexemeC* c
-
+    cdef readonly Vocab vocab
    cdef readonly flags_t flags
    cdef readonly attr_t id
    cdef readonly attr_t length
    cdef readonly attr_t orth
    cdef readonly attr_t lower
    cdef readonly attr_t norm
    cdef readonly attr_t shape
    cdef readonly attr_t prefix
    cdef readonly attr_t suffix
-    cdef readonly unicode orth_
+    cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
-    cdef readonly unicode lower_
+        lex.length = props['length']
-    cdef readonly unicode norm_
+        lex.orth = vocab.strings[props['orth']]
-    cdef readonly unicode shape_
+        lex.lower = vocab.strings[props['lower']]
-    cdef readonly unicode prefix_
+        lex.norm = vocab.strings[props['norm']]
-    cdef readonly unicode suffix_
+        lex.shape = vocab.strings[props['shape']]
        lex.prefix = vocab.strings[props['prefix']]
        lex.suffix = vocab.strings[props['suffix']]
-    cdef readonly attr_t cluster
+        lex.cluster = props['cluster']
-    cdef readonly float prob
+        lex.prob = props['prob']
-    cdef readonly float sentiment
+        lex.sentiment = props['sentiment']
-    cdef readonly float l2_norm
+
        lex.flags = props['flags']
        lex.repvec = empty_vec
    # Workaround for an apparent bug in the way the decorator is handled ---
    # TODO: post bug report / patch to Cython.
    @staticmethod
-    cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length):
+    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
-        cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length)
+        if feat_name < (sizeof(flags_t) * 8):
-        for i in range(repvec_length):
+            return Lexeme.check_flag(lex, feat_name)
-            py.repvec[i] = ptr.repvec[i]
+        elif feat_name == ID:
-        py.l2_norm = ptr.l2_norm
+            return lex.id
-        py.flags = ptr.flags
+        elif feat_name == ORTH:
-        py.id = ptr.id
+            return lex.orth
-        py.length = ptr.length
+        elif feat_name == LOWER:
            return lex.lower
        elif feat_name == NORM:
            return lex.norm
        elif feat_name == SHAPE:
            return lex.shape
        elif feat_name == PREFIX:
            return lex.prefix
        elif feat_name == SUFFIX:
            return lex.suffix
        elif feat_name == LENGTH:
            return lex.length
        elif feat_name == CLUSTER:
            return lex.cluster
        else:
            return 0
-        py.orth = ptr.orth
+    cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
-        py.lower = ptr.lower
+        return lexeme.flags & (1 << flag_id)
        py.norm = ptr.norm
        py.shape = ptr.shape
        py.prefix = ptr.prefix
        py.suffix = ptr.suffix
        py.orth_ = strings[ptr.orth]
        py.lower_ = strings[ptr.lower]
        py.norm_ = strings[ptr.norm]
        py.shape_ = strings[ptr.shape]
        py.prefix_ = strings[ptr.prefix]
        py.suffix_ = strings[ptr.suffix]
        py.cluster = ptr.cluster
        py.prob = ptr.prob
        py.sentiment = ptr.sentiment
        return py
    cpdef bint check_flag(self, attr_id_t flag_id) except -1
 cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    return lexeme.flags & (1 << flag_id)
 cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
    if feat_name < (sizeof(flags_t) * 8):
        return check_flag(lex, feat_name)
    elif feat_name == ID:
        return lex.id
    elif feat_name == ORTH:
        return lex.orth
    elif feat_name == LOWER:
        return lex.lower
    elif feat_name == NORM:
        return lex.norm
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:
        return lex.suffix
    elif feat_name == LENGTH:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
    else:
        return 0
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -17,70 +17,105 @@ from .attrs cimport IS_OOV
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
                              const float* empty_vec) except -1:
    lex.length = props['length']
    lex.orth = string_store[props['orth']]
    lex.lower = string_store[props['lower']]
    lex.norm = string_store[props['norm']]
    lex.shape = string_store[props['shape']]
    lex.prefix = string_store[props['prefix']]
    lex.suffix = string_store[props['suffix']]
    lex.cluster = props['cluster']
    lex.prob = props['prob']
    lex.sentiment = props['sentiment']
    lex.flags = props['flags']
    lex.repvec = empty_vec
 cdef class Lexeme:
    """An entry in the vocabulary.  A Lexeme has no string context --- it's a
    word-type, as opposed to a word token.  It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).
    """
-    def __cinit__(self, int vec_size):
+    def __init__(self, Vocab vocab, int orth):
-        self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
+        self.vocab = vocab
        self.orth = orth
        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
-    @property
+    property orth:
-    def has_repvec(self):
+        def __get__(self): 
-        return self.l2_norm != 0
+            return self.c.orth
    property lower:
        def __get__(self): return self.c.lower
        def __set__(self, int x): self.c.lower = x
    property norm:
        def __get__(self): return self.c.norm
        def __set__(self, int x): self.c.norm = x
-    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
+    property shape:
-        cdef flags_t one = 1
+        def __get__(self): return self.c.shape
-        return self.flags & (one << flag_id)
+        def __set__(self, int x): self.c.shape = x
    property prefix:
        def __get__(self): return self.c.prefix
        def __set__(self, int x): self.c.prefix = x
    property suffix:
        def __get__(self): return self.c.suffix
        def __set__(self, int x): self.c.suffix = x
    property orth_:
        def __get__(self):
            return self.vocab.strings[self.c.orth]
    property lower_:
        def __get__(self): return self.vocab.strings[self.c.lower]
        def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
    property norm_:
        def __get__(self): return self.c.norm
        def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
    property shape_:
        def __get__(self): return self.vocab.strings[self.c.shape]
        def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
    property prefix_:
        def __get__(self): return self.c.prefix
        def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]
    property suffix_:
        def __get__(self): return self.c.suffix
        def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
    property is_oov:
-        def __get__(self): return self.check_flag(IS_OOV)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)
    property is_alpha:
-        def __get__(self): return self.check_flag(IS_ALPHA)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)
    property is_ascii:
-        def __get__(self): return self.check_flag(IS_ASCII)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)
    property is_digit:
-        def __get__(self): return self.check_flag(IS_DIGIT)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)
    property is_lower:
-        def __get__(self): return self.check_flag(IS_LOWER)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)
    property is_title:
-        def __get__(self): return self.check_flag(IS_TITLE)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)
    property is_punct:
-        def __get__(self): return self.check_flag(IS_PUNCT)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)
    property is_space: 
-        def __get__(self): return self.check_flag(IS_SPACE)
+        def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)
    property like_url:
-        def __get__(self): return self.check_flag(LIKE_URL)
+        def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
    property like_num:
-        def __get__(self): return self.check_flag(LIKE_NUM)
+        def __get__(self): return Lexeme.like_num(self.c, IKE_NUM)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
    property like_email:
-        def __get__(self): return self.check_flag(LIKE_EMAIL)
+        def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
 from libcpp.vector cimport vector
 try:
    import ujson as json
 except ImportError:
@@ -96,28 +98,26 @@ def map_attr_name(attr):
 cdef class Matcher:
    cdef Pool mem
-    cdef Pattern** patterns
+    cdef vector[Pattern*] patterns
    cdef readonly int n_patterns
    def __init__(self, vocab, patterns):
        self.mem = Pool()
        n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()])
        self.patterns = <Pattern**>self.mem.alloc(n_patterns, sizeof(Pattern*))
        cdef int i = 0
        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
-            if isinstance(entity_key, basestring):
+            self.add(entity_key, etype, attrs, specs)
-                entity_key = vocab.strings[entity_key]
+
-            if isinstance(etype, basestring):
+    def add(self, entity_key, etype, attrs, specs):
-                etype = vocab.strings[etype]
+        if isinstance(entity_key, basestring):
-            elif etype is None:
+            entity_key = vocab.strings[entity_key]
-                etype = -1
+        if isinstance(etype, basestring):
-            # TODO: Do something more clever about multiple patterns for single
+            etype = vocab.strings[etype]
-            # entity
+        elif etype is None:
-            for spec in specs:
+            etype = -1
-                spec = _convert_strings(spec, vocab.strings)
+        # TODO: Do something more clever about multiple patterns for single
-                self.patterns[i] = init_pattern(self.mem, spec, etype)
+        # entity
-                i += 1
+        for spec in specs:
-        self.n_patterns = len(patterns)
+            spec = _convert_strings(spec, vocab.strings)
            self.patterns.push_back(init_pattern(self.mem, spec, etype))
    @classmethod
    def from_dir(cls, vocab, data_dir):
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -108,6 +108,11 @@ cdef class StringStore:
        else:
            raise TypeError(type(string_or_id))
    def __iter__(self):
        cdef int i
        for i in range(self.size):
            yield self[i]
    cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
        # 0 means missing, but we don't bother offsetting the index.
        key = hash64(chars, length * sizeof(char), 0)
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
 cdef class Vocab:
    '''A map container for a language's LexemeC structs.
    '''
-    def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
+    def __init__(self, data_dir=None, get_lex_attr=None):
                 pos_tags=None, oov_prob=-30):
        if oov_prob is None:
            oov_prob = -30
        self.mem = Pool()
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
        self.strings = StringStore()
        self.pos_tags = pos_tags if pos_tags is not None else {}
-
+        
-        self.lexeme_props_getter = get_lex_props
+        self.get_lex_attr = get_lex_attr
        self.repvec_length = 0
        self.length = 0
        self._add_lex_to_vocab(0, &EMPTY_LEXEME)
        if data_dir is not None:
            if not path.exists(data_dir):
                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
        if data_dir is not None:
            if not path.isdir(data_dir):
                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
            self.load_lexemes(path.join(data_dir, 'strings.txt'),
@@ -63,7 +59,6 @@ cdef class Vocab:
        self._serializer = None
        self.data_dir = data_dir
        self.oov_prob = oov_prob
    property serializer:
        def __get__(self):
@@ -91,18 +86,8 @@ cdef class Vocab:
        lex = <LexemeC*>self._by_hash.get(key)
        if lex != NULL:
            return lex
        cdef bint is_oov = mem is not self.mem
        if len(string) < 3:
            mem = self.mem
        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
        props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
        if is_oov:
            lex.id = 0
        else:
-            self._add_lex_to_vocab(key, lex)
+            return self._new_lexeme(mem, string)
        assert lex != NULL, string
        return lex
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@@ -114,18 +99,21 @@ cdef class Vocab:
        lex = <LexemeC*>self._by_orth.get(orth)
        if lex != NULL:
            return lex
-        cdef unicode string = self.strings[orth]
+        else:
            return self._new_lexeme(mem, self.strings[orth])
    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
        cdef bint is_oov = mem is not self.mem
        if len(string) < 3:
            mem = self.mem
        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
+        for attr, func in self.lex_attr_getters.items():
-        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
+            Lexeme.set_struct_attr(lex, attr, func(string))
        if is_oov:
            lex.id = 0
        else:
-            self._add_lex_to_vocab(hash_string(string), lex)
+            self._add_lex_to_vocab(key, lex)
-        assert lex != NULL, orth
+        assert lex != NULL, string
        return lex
    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
@@ -171,15 +159,6 @@ cdef class Vocab:
                "int --> Lexeme" % str(type(id_or_string)))
        return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
    def __setitem__(self, unicode string, dict props):
        cdef hash_t key = hash_string(string)
        cdef LexemeC* lex
        lex = <LexemeC*>self._by_hash.get(key)
        if lex == NULL:
            lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
        self._add_lex_to_vocab(key, lex)
    def dump(self, loc):
        if path.exists(loc):
            assert not path.isdir(loc)