* Begin merge of Gazetteer and DE branches

2025-10-19 02:04:19 +03:00 · 2015-09-06 19:45:15 +02:00 · 2015-09-06 19:45:15 +02:00 · d2fc104a26
commit d2fc104a26
parent dbf8dce109
7 changed files with 74 additions and 57 deletions
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE

 from .structs cimport LexemeC
 from .strings cimport StringStore
+from .vocab cimport Vocab

 from numpy cimport ndarray

@ -15,21 +16,31 @@ cdef class Lexeme:
    cdef readonly Vocab vocab
    cdef readonly attr_t orth

-    cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
-        lex.length = props['length']
-        lex.orth = vocab.strings[props['orth']]
-        lex.lower = vocab.strings[props['lower']]
-        lex.norm = vocab.strings[props['norm']]
-        lex.shape = vocab.strings[props['shape']]
-        lex.prefix = vocab.strings[props['prefix']]
-        lex.suffix = vocab.strings[props['suffix']]
+    @staticmethod
+    cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
+        # Wrap an already-existing LexemeC struct in a Lexeme object.
+        #   lex           -- pointer to the struct the new object will expose
+        #   vocab         -- the Vocab stored on the returned Lexeme
+        #   vector_length -- accepted for the caller's convenience; not used here
+        # The instance is created with __new__ and its fields are filled in
+        # directly from the pointer that was passed in.
+        cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
+        self.c = lex
+        self.vocab = vocab
+        self.orth = lex.orth
+        # A cdef function with an object return type yields None when it falls
+        # off the end, so the wrapper must be returned explicitly.
+        return self
    
-        lex.cluster = props['cluster']
-        lex.prob = props['prob']
-        lex.sentiment = props['sentiment']
-
-        lex.flags = props['flags']
-        lex.repvec = empty_vec
+    @staticmethod
+    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
+        # Write `value` into the LexemeC member selected by the attribute ID
+        # `name`. IDs smaller than the bit-width of flags_t are treated as
+        # boolean flag bits and delegated to set_flag.
+        cdef bint is_flag = name < (sizeof(flags_t) * 8)
+        if is_flag:
+            Lexeme.set_flag(lex, name, value)
+            return
+        # Every other recognised ID maps onto one struct field; IDs not listed
+        # below leave the struct unchanged.
+        if name == ID:
+            lex.id = value
+        elif name == LOWER:
+            lex.lower = value
+        elif name == NORM:
+            lex.norm = value
+        elif name == SHAPE:
+            lex.shape = value
+        elif name == PREFIX:
+            lex.prefix = value
+        elif name == SUFFIX:
+            lex.suffix = value
+        elif name == CLUSTER:
+            lex.cluster = value

    @staticmethod
    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@ -56,5 +67,14 @@ cdef class Lexeme:
        else:
            return 0

+    @staticmethod
    cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
-        return lexeme.flags & (1 << flag_id)
+        # Report whether bit `flag_id` is set in lexeme.flags.
+        # The mask is built from a flags_t-typed 1, the same way set_flag
+        # builds it: shifting a plain C int literal overflows once flag_id
+        # reaches the width of int, so high flag IDs would be misread.
+        cdef flags_t one = 1
+        return (lexeme.flags & (one << flag_id)) != 0
+
+    @staticmethod
+    cdef inline bint set_flag(LexemeC* lexeme, attr_id_t flag_id, bint value) nogil:
+        # Set (value true) or clear (value false) bit `flag_id` of
+        # lexeme.flags, leaving all other bits as they were.
+        # The 1 is typed as flags_t so the shift happens at that width.
+        cdef flags_t mask = 1
+        mask = mask << flag_id
+        if not value:
+            lexeme.flags &= ~mask
+        else:
+            lexeme.flags |= mask
+        # NOTE(review): the signature declares a bint result but nothing is
+        # returned -- confirm whether any caller relies on the return value.
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -26,11 +26,7 @@ cdef class Lexeme:
    def __init__(self, Vocab vocab, int orth):
        self.vocab = vocab
        self.orth = orth
-        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
-
-    property orth:
-        def __get__(self): 
-            return self.c.orth
+        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)

    property lower:
        def __get__(self): return self.c.lower
@ -78,44 +74,44 @@ cdef class Lexeme:

    property is_oov:
        def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x)

    property is_alpha:
        def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ALPHA, x)
    
    property is_ascii:
        def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ASCII, x)

    property is_digit:
        def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_DIGIT, x)

    property is_lower:
        def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_LOWER, x)

    property is_title:
        def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_TITLE, x)

    property is_punct:
        def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_PUNCT, x)

    property is_space: 
        def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_SPACE, x)

    property like_url:
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_URL, x)
    
    property like_num:
-        def __get__(self): return Lexeme.like_num(self.c, IKE_NUM)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
+        def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_NUM, x)

    property like_email:
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
-        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)
+        def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -102,21 +102,22 @@ cdef class Matcher:
    cdef readonly int n_patterns

    def __init__(self, vocab, patterns):
+        self.vocab = vocab
        self.mem = Pool()
        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
            self.add(entity_key, etype, attrs, specs)

    def add(self, entity_key, etype, attrs, specs):
        if isinstance(entity_key, basestring):
-            entity_key = vocab.strings[entity_key]
+            entity_key = self.vocab.strings[entity_key]
        if isinstance(etype, basestring):
-            etype = vocab.strings[etype]
+            etype = self.vocab.strings[etype]
        elif etype is None:
            etype = -1
        # TODO: Do something more clever about multiple patterns for single
        # entity
        for spec in specs:
-            spec = _convert_strings(spec, vocab.strings)
+            spec = _convert_strings(spec, self.vocab.strings)
            self.patterns.push_back(init_pattern(self.mem, spec, etype))

    @classmethod
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -5,6 +5,7 @@ from libc.stdint cimport uint32_t
 import numpy
 import struct

+from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
@ -13,8 +14,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech import UNIV_POS_NAMES
 from ..parts_of_speech cimport CONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
-from ..lexeme cimport check_flag
-from ..lexeme cimport get_attr as get_lex_attr
 from .spans cimport Span
 from .token cimport Token
 from ..serialize.bits cimport BitArray
@ -48,7 +47,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
    elif feat_name == ENT_TYPE:
        return token.ent_type
    else:
-        return get_lex_attr(token.lex, feat_name)
+        return Lexeme.get_struct_attr(token.lex, feat_name)


 cdef class Doc:
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -1,6 +1,5 @@
 from libc.string cimport memcpy
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
-from ..lexeme cimport check_flag
 # Compiler crashes on memory view coercion without this. Should report bug.
 from cython.view cimport array as cvarray
 cimport numpy as np
@ -9,6 +8,7 @@ np.import_array()
 import numpy


+from ..lexeme cimport Lexeme
 from ..parts_of_speech import UNIV_POS_NAMES

 from ..attrs cimport LEMMA
@ -42,7 +42,7 @@ cdef class Token:
        return self.string

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-        return check_flag(self.c.lex, flag_id)
+        return Lexeme.check_flag(self.c.lex, flag_id)

    def nbor(self, int i=1):
        return self.doc[self.i+i]
@ -286,37 +286,37 @@ cdef class Token:
            return self.vocab.strings[self.c.dep]

    property is_oov:
-        def __get__(self): return check_flag(self.c.lex, IS_OOV)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)

    property is_alpha:
-        def __get__(self): return check_flag(self.c.lex, IS_ALPHA)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)

    property is_ascii:
-        def __get__(self): return check_flag(self.c.lex, IS_ASCII)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII)

    property is_digit:
-        def __get__(self): return check_flag(self.c.lex, IS_DIGIT)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT)

    property is_lower:
-        def __get__(self): return check_flag(self.c.lex, IS_LOWER)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER)

    property is_title:
-        def __get__(self): return check_flag(self.c.lex, IS_TITLE)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE)

    property is_punct:
-        def __get__(self): return check_flag(self.c.lex, IS_PUNCT)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT)

    property is_space: 
-        def __get__(self): return check_flag(self.c.lex, IS_SPACE)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE)

    property like_url:
-        def __get__(self): return check_flag(self.c.lex, LIKE_URL)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL)

    property like_num:
-        def __get__(self): return check_flag(self.c.lex, LIKE_NUM)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM)

    property like_email:
-        def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL)


 _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -37,6 +37,7 @@ cdef class Vocab:
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
    
    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL

    cdef PreshMap _by_hash
    cdef PreshMap _by_orth
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -12,7 +12,6 @@ import math
 import json

 from .lexeme cimport EMPTY_LEXEME
-from .lexeme cimport set_lex_struct_props
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .orth cimport word_shape
@ -36,12 +35,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
 cdef class Vocab:
    '''A map container for a language's LexemeC structs.
    '''
-    def __init__(self, data_dir=None, get_lex_attr=None):
+    def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False):
        self.mem = Pool()
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
        self.strings = StringStore()
-        self.pos_tags = pos_tags if pos_tags is not None else {}
+        #self.pos_tags = pos_tags if pos_tags is not None else {}
+        self.pos_tags = {}
        
        self.get_lex_attr = get_lex_attr
        self.repvec_length = 0
@ -112,7 +112,7 @@ cdef class Vocab:
        if is_oov:
            lex.id = 0
        else:
-            self._add_lex_to_vocab(key, lex)
+            self._add_lex_to_vocab(hash_string(string), lex)
        assert lex != NULL, string
        return lex

@ -125,7 +125,7 @@ cdef class Vocab:
        cdef attr_t orth
        cdef size_t addr
        for orth, addr in self._by_orth.items():
-            yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length)
+            yield Lexeme.from_ptr(<LexemeC*>addr, self, self.repvec_length)

    def __getitem__(self,  id_or_string):
        '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
@ -157,7 +157,7 @@ cdef class Vocab:
            raise ValueError("Vocab unable to map type: "
                "%s. Maps unicode --> Lexeme or "
                "int --> Lexeme" % str(type(id_or_string)))
-        return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
+        return Lexeme.from_ptr(<LexemeC*><void*>lexeme, self, self.repvec_length)

    def dump(self, loc):
        if path.exists(loc):