* Temporary (work-in-progress) commit: refactoring to introduce a Lexeme PyObject, a Python class populated from a LexemeC struct.

2025-07-25 15:39:46 +03:00 · 2015-01-12 11:23:44 +11:00 · 2015-01-12 11:23:44 +11:00 · 46da3d74d2
commit 46da3d74d2
parent ce2edd6312
8 changed files with 91 additions and 65 deletions
--- a/spacy/en/attrs.pxd
+++ b/spacy/en/attrs.pxd
@ -3,9 +3,9 @@ from ..typedefs cimport FLAG8, FLAG9
 from ..typedefs cimport ID as _ID
 from ..typedefs cimport SIC as _SIC
 from ..typedefs cimport SHAPE as _SHAPE
-from ..typedefs cimport DENSE as _DENSE
+from ..typedefs cimport NORM1 as _NORM1
+from ..typedefs cimport NORM2 as _NORM2
 from ..typedefs cimport CLUSTER as _CLUSTER
-from ..typedefs cimport SHAPE as _SHAPE
 from ..typedefs cimport PREFIX as _PREFIX
 from ..typedefs cimport SUFFIX as _SUFFIX
 from ..typedefs cimport LEMMA as _LEMMA
@ -28,7 +28,8 @@ cpdef enum:
    ID = _ID
    SIC = _SIC
    SHAPE = _SHAPE
-    DENSE = _DENSE
+    NORM1 = _NORM1
+    NORM2 = _NORM2
    PREFIX = _PREFIX
    SUFFIX = _SUFFIX
    CLUSTER = _CLUSTER
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@ -77,7 +77,6 @@ cpdef enum:
    P2_suffix
    P2_pos
    P2_lemma
-    P2_pos_type

    P1_sic
    P1_cluster
@ -86,7 +85,6 @@ cpdef enum:
    P1_suffix
    P1_pos
    P1_lemma
-    P1_pos_type

    W_sic
    W_cluster
@ -95,7 +93,6 @@ cpdef enum:
    W_suffix
    W_pos
    W_lemma
-    W_pos_type

    N1_sic
    N1_cluster
@ -104,7 +101,6 @@ cpdef enum:
    N1_suffix
    N1_pos
    N1_lemma
-    N1_pos_type

    N2_sic
    N2_cluster
@ -113,7 +109,6 @@ cpdef enum:
    N2_suffix
    N2_pos
    N2_lemma
-    N2_pos_type

    N_CONTEXT_FIELDS

@ -196,11 +191,6 @@ POS_TEMPLATES = (
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),
-
-    (W_pos_type,),
-    (N1_pos_type,),
-    (N1_pos_type,),
-    (P1_pos, W_pos_type, N1_pos_type),
 )


@ -339,4 +329,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.lemma
-    context[7] = t.lex.pos_type
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -1,5 +1,5 @@
 from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
-from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
+from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .structs cimport LexemeC
 from .strings cimport StringStore

@ -11,6 +11,35 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
                  dict props) except *
 

+cdef class Lexeme:
+    cdef const float* vec
+
+    cdef readonly flags_t flags
+    cdef readonly attr_t id
+    cdef readonly attr_t length
+
+    cdef readonly unicode sic
+    cdef readonly unicode norm1
+    cdef readonly unicode norm2
+    cdef readonly unicode shape
+    cdef readonly unicode prefix
+    cdef readonly unicode suffix
+
+    cdef readonly attr_t sic_id
+    cdef readonly attr_t norm1_id
+    cdef readonly attr_t norm2_id
+    cdef readonly attr_t shape_id
+    cdef readonly attr_t prefix_id
+    cdef readonly attr_t suffix_id
+
+    cdef readonly attr_t cluster
+    cdef readonly float prob
+    cdef readonly float sentiment
+
+
+cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings)
+
+
 cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    return lexeme.flags & (1 << flag_id)

@ -22,8 +51,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return lex.id
    elif feat_name == SIC:
        return lex.sic
-    elif feat_name == DENSE:
-        return lex.dense
+    elif feat_name == NORM1:
+        return lex.norm1
+    elif feat_name == NORM2:
+        return lex.norm2
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
@ -34,7 +65,5 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
-    elif feat_name == POS_TYPE:
-        return lex.pos_type
    else:
        return 0
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -18,7 +18,6 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
    lex.sic = string_store[string]
    
    lex.cluster = props.get('cluster', 0)
-    lex.pos_type = props.get('pos_type', 0)
    lex.prob = props.get('prob', 0)

    lex.prefix = string_store[string[:1]]
@ -29,4 +28,36 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
    return lex


+cdef class Lexeme:
+    def __init__(self):
+        pass
+        

+cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
+    cdef Lexeme py = Lexeme.__new__(Lexeme)
+
+    py.vec = c.vec
+
+    py.flags = c.flags
+    py.id = c.id
+    py.length = c.length
+
+    py.sic = strings[c.sic]
+    py.norm1 = strings[c.norm1]
+    py.norm2 = strings[c.norm2]
+    py.shape = strings[c.shape]
+    py.prefix = strings[c.prefix]
+    py.suffix = strings[c.suffix]
+
+    py.sic_id = c.sic
+    py.norm1_id = c.norm1
+    py.norm2_id = c.norm2
+    py.shape_id = c.shape
+    py.prefix_id = c.prefix
+    py.suffix_id = c.suffix
+    
+    py.cluster = c.cluster
+
+    py.prob = c.prob
+    py.sentiment = c.sentiment
+    return py
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -9,15 +9,16 @@ cdef struct LexemeC:
    flags_t flags
   
    attr_t id
+    attr_t length
+
    attr_t sic
-    attr_t dense
+    attr_t norm1
+    attr_t norm2
    attr_t shape
    attr_t prefix
    attr_t suffix
 
-    attr_t length
    attr_t cluster
-    attr_t pos_type

    float prob
    float sentiment
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
 from .vocab cimport EMPTY_LEXEME
 from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
-from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
+from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA

 cimport cython
@ -39,8 +39,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return lex.id
    elif feat_name == SIC:
        return lex.sic
-    elif feat_name == DENSE:
-        return lex.dense
+    elif feat_name == NORM1:
+        return lex.norm1
+    elif feat_name == NORM2:
+        return lex.norm2
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
@ -51,8 +53,6 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
-    elif feat_name == POS_TYPE:
-        return lex.pos_type
    else:
        return 0

@ -175,26 +175,7 @@ cdef class Tokens:


 cdef Token cinit_token(const TokenC* c_tok):
-    cdef const LexemeC* lex = c_tok.lex
    cdef Token py_tok = Token.__new__(Token)
-
-    cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i")
-    py_tok.vec = cyarr
-
-    py_tok.flags = lex.flags
-    py_tok.id = lex.id
-    py_tok.sic = lex.sic
-    py_tok.dense = lex.dense
-    py_tok.shape = lex.shape
-    py_tok.prefix = lex.prefix
-    py_tok.suffix = lex.suffix
-    py_tok.length = lex.length
-    py_tok.cluster = lex.cluster
-    py_tok.pos_type = lex.pos_type
-
-    py_tok.prob = lex.prob
-    py_tok.sentiment = lex.sentiment
-
    py_tok.morph = c_tok.morph
    py_tok.pos = c_tok.pos
    py_tok.fine_pos = c_tok.fine_pos
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@ -90,14 +90,14 @@ cpdef enum attr_id_t:

    ID
    SIC
-    DENSE
+    NORM1
+    NORM2
    SHAPE
    PREFIX
    SUFFIX

    LENGTH
    CLUSTER
-    POS_TYPE
    LEMMA
    POS

--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -6,6 +6,7 @@ import codecs

 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport init as lexeme_init
+from .lexeme cimport Lexeme_cinit
 from .strings cimport slice_unicode
 from .strings cimport hash_string
 from .orth cimport word_shape
@ -28,7 +29,6 @@ cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
    lex.sic = string_store[string]
    
    lex.cluster = props.get('cluster', 0)
-    lex.pos_type = props.get('pos_type', 0)
    lex.prob = props.get('prob', 0)

    lex.prefix = string_store[string[:1]]
@ -90,12 +90,6 @@ cdef class Vocab:
        '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
        unseen unicode string is given, a new LexemeC is created and stored.

-        This function relies on Cython's struct-to-dict conversion.  Python clients
-        receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
-        with int values.  Cython clients can instead receive a LexemeC struct value.
-        More efficient Cython access is provided by Lexicon.get, which returns
-        a LexemeC*.
-
        Args:
            id_or_string (int or unicode): The integer ID of a word, or its unicode
                string.  If an int >= Lexicon.size, IndexError is raised.
@ -103,19 +97,19 @@ cdef class Vocab:
                is raised.

        Returns:
-            lexeme (dict): A LexemeC struct instance, which Cython translates into
-                a dict if the operator is called from Python.
+            lexeme (Lexeme): An instance of the Lexeme Python class, with data
+                copied on instantiation.
        '''
+        cdef UniStr string
+        cdef const LexemeC* lexeme
        if type(id_or_string) == int:
            if id_or_string >= self.lexemes.size():
                raise IndexError
-            return {}
-            #return self.lexemes.at(id_or_string)[0]
-        cdef UniStr string
-        slice_unicode(&string, id_or_string, 0, len(id_or_string))
-        cdef const LexemeC* lexeme = self.get(self.mem, &string)
-        return {}
-        #return lexeme[0]
+            lexeme = self.lexemes.at(id_or_string)
+        else:
+            slice_unicode(&string, id_or_string, 0, len(id_or_string))
+            lexeme = self.get(self.mem, &string)
+        return Lexeme_cinit(lexeme, self.strings)

    def __setitem__(self, unicode uni_string, dict props):
        cdef UniStr s