This commit is contained in:
Matthew Honnibal 2015-08-22 22:04:34 +02:00
parent 0f2cb74433
commit cad0cca4e3
6 changed files with 147 additions and 173 deletions

View File

@ -80,7 +80,6 @@ class English(object):
Packer=None,
load_vectors=True
):
self.data_dir = data_dir
if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):

View File

@ -8,97 +8,53 @@ from .strings cimport StringStore
from numpy cimport ndarray
cdef LexemeC EMPTY_LEXEME
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
const float* empty_vec) except -1
cdef class Lexeme:
cdef readonly ndarray repvec
cdef readonly flags_t flags
cdef readonly attr_t id
cdef readonly attr_t length
cdef LexemeC* c
cdef readonly Vocab vocab
cdef readonly attr_t orth
cdef readonly attr_t lower
cdef readonly attr_t norm
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly unicode orth_
cdef readonly unicode lower_
cdef readonly unicode norm_
cdef readonly unicode shape_
cdef readonly unicode prefix_
cdef readonly unicode suffix_
cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
lex.length = props['length']
lex.orth = vocab.strings[props['orth']]
lex.lower = vocab.strings[props['lower']]
lex.norm = vocab.strings[props['norm']]
lex.shape = vocab.strings[props['shape']]
lex.prefix = vocab.strings[props['prefix']]
lex.suffix = vocab.strings[props['suffix']]
cdef readonly attr_t cluster
cdef readonly float prob
cdef readonly float sentiment
cdef readonly float l2_norm
lex.cluster = props['cluster']
lex.prob = props['prob']
lex.sentiment = props['sentiment']
lex.flags = props['flags']
lex.repvec = empty_vec
# Workaround for an apparent bug in the way the decorator is handled ---
# TODO: post bug report / patch to Cython.
@staticmethod
cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length):
cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length)
for i in range(repvec_length):
py.repvec[i] = ptr.repvec[i]
py.l2_norm = ptr.l2_norm
py.flags = ptr.flags
py.id = ptr.id
py.length = ptr.length
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
    # Map a numeric attribute ID onto the matching LexemeC struct field.
    # IDs smaller than the flag width (sizeof(flags_t) * 8 bits) are bit-flag
    # attributes, answered via Lexeme.check_flag; named IDs return the struct
    # field directly. Runs nogil, so unknown IDs return 0 instead of raising.
    if feat_name < (sizeof(flags_t) * 8):
        return Lexeme.check_flag(lex, feat_name)
    elif feat_name == ID:
        return lex.id
    elif feat_name == ORTH:
        return lex.orth
    elif feat_name == LOWER:
        return lex.lower
    elif feat_name == NORM:
        return lex.norm
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:
        return lex.suffix
    elif feat_name == LENGTH:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
    else:
        # Unrecognised attribute ID: 0 is the "missing" sentinel here.
        return 0
py.orth = ptr.orth
py.lower = ptr.lower
py.norm = ptr.norm
py.shape = ptr.shape
py.prefix = ptr.prefix
py.suffix = ptr.suffix
py.orth_ = strings[ptr.orth]
py.lower_ = strings[ptr.lower]
py.norm_ = strings[ptr.norm]
py.shape_ = strings[ptr.shape]
py.prefix_ = strings[ptr.prefix]
py.suffix_ = strings[ptr.suffix]
py.cluster = ptr.cluster
py.prob = ptr.prob
py.sentiment = ptr.sentiment
return py
cpdef bint check_flag(self, attr_id_t flag_id) except -1
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Test a single boolean lexical attribute stored as bit `flag_id`
    # in the lexeme's flags bitfield.
    return lexeme.flags & (1 << flag_id)
cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
    # Same dispatch as get_struct_attr, but using the module-level check_flag
    # helper for bit-flag IDs. NOTE(review): this duplicates get_struct_attr
    # almost exactly — presumably old/new sides of the same refactor; confirm
    # which one callers use before consolidating.
    if feat_name < (sizeof(flags_t) * 8):
        return check_flag(lex, feat_name)
    elif feat_name == ID:
        return lex.id
    elif feat_name == ORTH:
        return lex.orth
    elif feat_name == LOWER:
        return lex.lower
    elif feat_name == NORM:
        return lex.norm
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:
        return lex.suffix
    elif feat_name == LENGTH:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
    else:
        # Unrecognised attribute ID: 0 is the "missing" sentinel here.
        return 0
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Test bit `flag_id` in the lexeme's flags bitfield.
    # NOTE(review): byte-identical duplicate of the check_flag defined above —
    # likely an artifact of the diff; only one definition can exist in the file.
    return lexeme.flags & (1 << flag_id)

View File

@ -17,70 +17,105 @@ from .attrs cimport IS_OOV
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
        const float* empty_vec) except -1:
    # Populate a LexemeC struct from a dict of precomputed lexical properties.
    # String-valued attrs (orth, lower, norm, shape, prefix, suffix) are
    # interned through the StringStore, so the struct holds integer IDs.
    # `empty_vec` is installed as the repvec until real vectors are loaded.
    # Raises (propagates as -1 per `except -1`) on a missing dict key.
    lex.length = props['length']
    lex.orth = string_store[props['orth']]
    lex.lower = string_store[props['lower']]
    lex.norm = string_store[props['norm']]
    lex.shape = string_store[props['shape']]
    lex.prefix = string_store[props['prefix']]
    lex.suffix = string_store[props['suffix']]
    lex.cluster = props['cluster']
    lex.prob = props['prob']
    lex.sentiment = props['sentiment']
    lex.flags = props['flags']
    lex.repvec = empty_vec
cdef class Lexeme:
"""An entry in the vocabulary. A Lexeme has no string context --- it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag).
"""
def __cinit__(self, int vec_size):
    # C-level constructor: allocate the word-vector buffer up front so the
    # repvec ndarray exists before any Python-level __init__ runs.
    self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
def __init__(self, Vocab vocab, int orth):
    """Create a Lexeme view onto the vocabulary entry with string ID `orth`.

    The Lexeme does not own its data: it keeps a pointer into the
    vocab-owned LexemeC struct.
    """
    self.vocab = vocab
    self.orth = orth
    # Double cast strips constness so the pointer can be stored in a
    # non-const field; the struct itself stays owned by the vocab.
    self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
@property
def has_repvec(self):
return self.l2_norm != 0
property orth:
def __get__(self):
return self.c.orth
property lower:
def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x
property norm:
def __get__(self): return self.c.norm
def __set__(self, int x): self.c.norm = x
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
cdef flags_t one = 1
return self.flags & (one << flag_id)
property shape:
def __get__(self): return self.c.shape
def __set__(self, int x): self.c.shape = x
property prefix:
def __get__(self): return self.c.prefix
def __set__(self, int x): self.c.prefix = x
property suffix:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
property orth_:
def __get__(self):
return self.vocab.strings[self.c.orth]
property lower_:
def __get__(self): return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
property norm_:
def __get__(self): return self.c.norm
def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
property shape_:
def __get__(self): return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
property prefix_:
def __get__(self): return self.c.prefix
def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]
property suffix_:
def __get__(self): return self.c.suffix
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
property is_oov:
def __get__(self): return self.check_flag(IS_OOV)
def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)
property is_alpha:
def __get__(self): return self.check_flag(IS_ALPHA)
def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)
property is_ascii:
def __get__(self): return self.check_flag(IS_ASCII)
def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)
property is_digit:
def __get__(self): return self.check_flag(IS_DIGIT)
def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)
property is_lower:
def __get__(self): return self.check_flag(IS_LOWER)
def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)
property is_title:
def __get__(self): return self.check_flag(IS_TITLE)
def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)
property is_punct:
def __get__(self): return self.check_flag(IS_PUNCT)
def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)
property is_space:
def __get__(self): return self.check_flag(IS_SPACE)
def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)
property like_url:
def __get__(self): return self.check_flag(LIKE_URL)
def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
property like_num:
def __get__(self): return self.check_flag(LIKE_NUM)
def __get__(self): return Lexeme.like_num(self.c, IKE_NUM)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
property like_email:
def __get__(self): return self.check_flag(LIKE_EMAIL)
def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)

View File

@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab
from libcpp.vector cimport vector
try:
import ujson as json
except ImportError:
@ -96,28 +98,26 @@ def map_attr_name(attr):
cdef class Matcher:
cdef Pool mem
cdef Pattern** patterns
cdef vector[Pattern*] patterns
cdef readonly int n_patterns
def __init__(self, vocab, patterns):
self.mem = Pool()
n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()])
self.patterns = <Pattern**>self.mem.alloc(n_patterns, sizeof(Pattern*))
cdef int i = 0
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
if isinstance(entity_key, basestring):
entity_key = vocab.strings[entity_key]
if isinstance(etype, basestring):
etype = vocab.strings[etype]
elif etype is None:
etype = -1
# TODO: Do something more clever about multiple patterns for single
# entity
for spec in specs:
spec = _convert_strings(spec, vocab.strings)
self.patterns[i] = init_pattern(self.mem, spec, etype)
i += 1
self.n_patterns = len(patterns)
self.add(entity_key, etype, attrs, specs)
def add(self, entity_key, etype, attrs, specs):
if isinstance(entity_key, basestring):
entity_key = vocab.strings[entity_key]
if isinstance(etype, basestring):
etype = vocab.strings[etype]
elif etype is None:
etype = -1
# TODO: Do something more clever about multiple patterns for single
# entity
for spec in specs:
spec = _convert_strings(spec, vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype))
@classmethod
def from_dir(cls, vocab, data_dir):

View File

@ -108,6 +108,11 @@ cdef class StringStore:
else:
raise TypeError(type(string_or_id))
def __iter__(self):
    """Yield each stored string by index, from 0 up to self.size - 1.

    Delegates to __getitem__, so each yielded value is whatever self[i]
    returns for an integer key.
    """
    cdef int i
    for i in range(self.size):
        yield self[i]
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
# 0 means missing, but we don't bother offsetting the index.
key = hash64(chars, length * sizeof(char), 0)

View File

@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
pos_tags=None, oov_prob=-30):
if oov_prob is None:
oov_prob = -30
def __init__(self, data_dir=None, get_lex_attr=None):
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
self.pos_tags = pos_tags if pos_tags is not None else {}
self.lexeme_props_getter = get_lex_props
self.get_lex_attr = get_lex_attr
self.repvec_length = 0
self.length = 0
self._add_lex_to_vocab(0, &EMPTY_LEXEME)
if data_dir is not None:
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if data_dir is not None:
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.load_lexemes(path.join(data_dir, 'strings.txt'),
@ -63,7 +59,6 @@ cdef class Vocab:
self._serializer = None
self.data_dir = data_dir
self.oov_prob = oov_prob
property serializer:
def __get__(self):
@ -91,18 +86,8 @@ cdef class Vocab:
lex = <LexemeC*>self._by_hash.get(key)
if lex != NULL:
return lex
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(key, lex)
assert lex != NULL, string
return lex
return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@ -114,18 +99,21 @@ cdef class Vocab:
lex = <LexemeC*>self._by_orth.get(orth)
if lex != NULL:
return lex
cdef unicode string = self.strings[orth]
else:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
for attr, func in self.lex_attr_getters.items():
Lexeme.set_struct_attr(lex, attr, func(string))
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(hash_string(string), lex)
assert lex != NULL, orth
self._add_lex_to_vocab(key, lex)
assert lex != NULL, string
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
@ -171,15 +159,6 @@ cdef class Vocab:
"int --> Lexeme" % str(type(id_or_string)))
return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
def __setitem__(self, unicode string, dict props):
cdef hash_t key = hash_string(string)
cdef LexemeC* lex
lex = <LexemeC*>self._by_hash.get(key)
if lex == NULL:
lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
self._add_lex_to_vocab(key, lex)
def dump(self, loc):
if path.exists(loc):
assert not path.isdir(loc)