* Tmp. Refactoring, introducing a Lexeme PyObject.

2025-07-21 21:49:49 +03:00 · 2015-01-12 11:23:44 +11:00 · 2015-01-12 11:23:44 +11:00 · 46da3d74d2
commit 46da3d74d2
parent ce2edd6312
8 changed files with 91 additions and 65 deletions
--- a/spacy/en/attrs.pxd
+++ b/spacy/en/attrs.pxd
@ -3,9 +3,9 @@ from ..typedefs cimport FLAG8, FLAG9
 from ..typedefs cimport ID as _ID
 from ..typedefs cimport SIC as _SIC
 from ..typedefs cimport SHAPE as _SHAPE
-from ..typedefs cimport DENSE as _DENSE
+from ..typedefs cimport NORM1 as _NORM1
 from ..typedefs cimport NORM2 as _NORM2
 from ..typedefs cimport CLUSTER as _CLUSTER
 from ..typedefs cimport SHAPE as _SHAPE
 from ..typedefs cimport PREFIX as _PREFIX
 from ..typedefs cimport SUFFIX as _SUFFIX
 from ..typedefs cimport LEMMA as _LEMMA
@ -28,7 +28,8 @@ cpdef enum:
    ID = _ID
    SIC = _SIC
    SHAPE = _SHAPE
-    DENSE = _DENSE
+    NORM1 = _NORM1
    NORM2 = _NORM2
    PREFIX = _PREFIX
    SUFFIX = _SUFFIX
    CLUSTER = _CLUSTER
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@ -77,7 +77,6 @@ cpdef enum:
    P2_suffix
    P2_pos
    P2_lemma
    P2_pos_type
    P1_sic
    P1_cluster
@ -86,7 +85,6 @@ cpdef enum:
    P1_suffix
    P1_pos
    P1_lemma
    P1_pos_type
    W_sic
    W_cluster
@ -95,7 +93,6 @@ cpdef enum:
    W_suffix
    W_pos
    W_lemma
    W_pos_type
    N1_sic
    N1_cluster
@ -104,7 +101,6 @@ cpdef enum:
    N1_suffix
    N1_pos
    N1_lemma
    N1_pos_type
    N2_sic
    N2_cluster
@ -113,7 +109,6 @@ cpdef enum:
    N2_suffix
    N2_pos
    N2_lemma
    N2_pos_type
    N_CONTEXT_FIELDS
@ -196,11 +191,6 @@ POS_TEMPLATES = (
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),
    (W_pos_type,),
    (N1_pos_type,),
    (N1_pos_type,),
    (P1_pos, W_pos_type, N1_pos_type),
 )
@ -339,4 +329,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.lemma
    context[7] = t.lex.pos_type
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -1,5 +1,5 @@
 from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
-from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
+from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .structs cimport LexemeC
 from .strings cimport StringStore
@ -11,6 +11,35 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
                  dict props) except *
 cdef class Lexeme:
    # Declaration of the Python-visible Lexeme extension type.  Its fields
    # mirror those of the LexemeC struct; Lexeme_cinit fills them in from a
    # LexemeC* plus a StringStore.
    #
    # Raw pointer taken from LexemeC.vec.  Declared without `readonly`, so it
    # is reachable from Cython code only, not from Python.
    # NOTE(review): only the pointer is stored by Lexeme_cinit, not the data
    # it points to -- confirm the owner outlives this object.
    cdef const float* vec
    # Bit flags; individual bits can be tested with check_flag().
    cdef readonly flags_t flags
    cdef readonly attr_t id
    cdef readonly attr_t length
    # Unicode forms of the string-valued attributes, looked up in the
    # StringStore when the object is built.
    cdef readonly unicode sic
    cdef readonly unicode norm1
    cdef readonly unicode norm2
    cdef readonly unicode shape
    cdef readonly unicode prefix
    cdef readonly unicode suffix
    # Integer values of the same attributes, exactly as stored in LexemeC.
    cdef readonly attr_t sic_id
    cdef readonly attr_t norm1_id
    cdef readonly attr_t norm2_id
    cdef readonly attr_t shape_id
    cdef readonly attr_t prefix_id
    cdef readonly attr_t suffix_id
    # Remaining numeric attributes copied straight from LexemeC.
    cdef readonly attr_t cluster
    cdef readonly float prob
    cdef readonly float sentiment
 cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings)
 cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Return whether bit number `flag_id` is set in `lexeme.flags`.
    #
    # The mask is built from a flags_t-typed 1 rather than the bare literal:
    # a bare `1 << flag_id` is evaluated as a plain C int, so any flag id at
    # or beyond the width of int would overflow the shift instead of
    # selecting the intended bit.
    # NOTE(review): assumes flags_t is at least as wide as int -- its typedef
    # is not visible here; confirm in typedefs.pxd.
    return lexeme.flags & ((<flags_t>1) << flag_id)
@ -22,8 +51,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return lex.id
    elif feat_name == SIC:
        return lex.sic
-    elif feat_name == DENSE:
+    elif feat_name == NORM1:
-        return lex.dense
+        return lex.norm1
    elif feat_name == NORM2:
        return lex.norm2
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
@ -34,7 +65,5 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
    elif feat_name == POS_TYPE:
        return lex.pos_type
    else:
        return 0
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -18,7 +18,6 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
    lex.sic = string_store[string]
    lex.cluster = props.get('cluster', 0)
    lex.pos_type = props.get('pos_type', 0)
    lex.prob = props.get('prob', 0)
    lex.prefix = string_store[string[:1]]
@ -29,4 +28,36 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
    return lex
 cdef class Lexeme:
    '''Python-level lexeme object.  The attributes are declared in lexeme.pxd
    and populated by Lexeme_cinit, which creates instances via __new__.
    '''
    def __init__(self):
        '''Intentionally does nothing; no attribute is set here.'''
 cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
    # Build a Lexeme from the LexemeC struct `c`.  Each string-valued
    # attribute is stored twice: as the integer held in the struct (`*_id`)
    # and as the value `strings` returns for that integer.
    #
    # __new__ is used so that Lexeme.__init__ is not invoked.
    cdef Lexeme view = Lexeme.__new__(Lexeme)
    # Plain numeric fields (and the vector pointer) are copied as-is.
    view.vec = c.vec
    view.flags = c.flags
    view.id = c.id
    view.length = c.length
    view.cluster = c.cluster
    view.prob = c.prob
    view.sentiment = c.sentiment
    # String-valued fields: integer id first, then the StringStore lookup.
    view.sic_id = c.sic
    view.sic = strings[c.sic]
    view.norm1_id = c.norm1
    view.norm1 = strings[c.norm1]
    view.norm2_id = c.norm2
    view.norm2 = strings[c.norm2]
    view.shape_id = c.shape
    view.shape = strings[c.shape]
    view.prefix_id = c.prefix
    view.prefix = strings[c.prefix]
    view.suffix_id = c.suffix
    view.suffix = strings[c.suffix]
    return view
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -9,15 +9,16 @@ cdef struct LexemeC:
    flags_t flags
    attr_t id
    attr_t length
    attr_t sic
-    attr_t dense
+    attr_t norm1
    attr_t norm2
    attr_t shape
    attr_t prefix
    attr_t suffix
    attr_t length
    attr_t cluster
    attr_t pos_type
    float prob
    float sentiment
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
 from .vocab cimport EMPTY_LEXEME
 from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
-from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
+from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA
 cimport cython
@ -39,8 +39,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return lex.id
    elif feat_name == SIC:
        return lex.sic
-    elif feat_name == DENSE:
+    elif feat_name == NORM1:
-        return lex.dense
+        return lex.norm1
    elif feat_name == NORM2:
        return lex.norm2
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
@ -51,8 +53,6 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
    elif feat_name == POS_TYPE:
        return lex.pos_type
    else:
        return 0
@ -175,26 +175,7 @@ cdef class Tokens:
 cdef Token cinit_token(const TokenC* c_tok):
    cdef const LexemeC* lex = c_tok.lex
    cdef Token py_tok = Token.__new__(Token)
    cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i")
    py_tok.vec = cyarr
    py_tok.flags = lex.flags
    py_tok.id = lex.id
    py_tok.sic = lex.sic
    py_tok.dense = lex.dense
    py_tok.shape = lex.shape
    py_tok.prefix = lex.prefix
    py_tok.suffix = lex.suffix
    py_tok.length = lex.length
    py_tok.cluster = lex.cluster
    py_tok.pos_type = lex.pos_type
    py_tok.prob = lex.prob
    py_tok.sentiment = lex.sentiment
    py_tok.morph = c_tok.morph
    py_tok.pos = c_tok.pos
    py_tok.fine_pos = c_tok.fine_pos
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@ -90,14 +90,14 @@ cpdef enum attr_id_t:
    ID
    SIC
-    DENSE
+    NORM1
    NORM2
    SHAPE
    PREFIX
    SUFFIX
    LENGTH
    CLUSTER
    POS_TYPE
    LEMMA
    POS
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -6,6 +6,7 @@ import codecs
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport init as lexeme_init
 from .lexeme cimport Lexeme_cinit
 from .strings cimport slice_unicode
 from .strings cimport hash_string
 from .orth cimport word_shape
@ -28,7 +29,6 @@ cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
    lex.sic = string_store[string]
    lex.cluster = props.get('cluster', 0)
    lex.pos_type = props.get('pos_type', 0)
    lex.prob = props.get('prob', 0)
    lex.prefix = string_store[string[:1]]
@ -90,12 +90,6 @@ cdef class Vocab:
        '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
        unseen unicode string is given, a new LexemeC is created and stored.
        This function relies on Cython's struct-to-dict conversion.  Python clients
        receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
        with int values.  Cython clients can instead receive a LexemeC struct value.
        More efficient Cython access is provided by Lexicon.get, which returns
        a LexemeC*.
        Args:
            id_or_string (int or unicode): The integer ID of a word, or its unicode
                string.  If an int >= Lexicon.size, IndexError is raised.
@ -103,19 +97,19 @@ cdef class Vocab:
                is raised.
        Returns:
-            lexeme (dict): A LexemeC struct instance, which Cython translates into
+            lexeme (Lexeme): An instance of the Lexeme Python class, with data
-                a dict if the operator is called from Python.
+                copied on instantiation.
        '''
        cdef UniStr string
        cdef const LexemeC* lexeme
        if type(id_or_string) == int:
            if id_or_string >= self.lexemes.size():
                raise IndexError
-            return {}
+            lexeme = self.lexemes.at(id_or_string)
-            #return self.lexemes.at(id_or_string)[0]
+        else:
-        cdef UniStr string
+            slice_unicode(&string, id_or_string, 0, len(id_or_string))
-        slice_unicode(&string, id_or_string, 0, len(id_or_string))
+            lexeme = self.get(self.mem, &string)
-        cdef const LexemeC* lexeme = self.get(self.mem, &string)
+        return Lexeme_cinit(lexeme, self.strings)
        return {}
        #return lexeme[0]
    def __setitem__(self, unicode uni_string, dict props):
        cdef UniStr s