* Moving back to lexeme structs

2025-11-06 10:57:34 +03:00 · 2014-09-10 20:41:47 +02:00 · 2014-09-10 20:41:47 +02:00 · e567713429
commit e567713429
parent b488224c09
3 changed files with 31 additions and 25 deletions
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -105,7 +105,6 @@ cdef class Language:
            for i, substring in enumerate(substrings):
                lexemes.append(self.lexicon.lookup(substring))
            self.cache[string] = lexemes
-        
        cdef Lexeme lexeme
        for lexeme in lexemes:
            tokens.append(lexeme)
@@ -178,9 +177,11 @@ cdef class Lexicon:
        Returns:
            lexeme (Lexeme): A reference to a lexical type.
        """
+        cdef Lexeme lexeme
        assert len(string) != 0
        if string in self._dict:
-            return self._dict[string]
+            lexeme = self._dict[string]
+            return lexeme
        
        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
                                  self._flag_features)
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@@ -1,20 +1,11 @@
 from .typedefs cimport hash_t, utf8_t, flag_t, id_t
-
+from spacy.lexeme cimport LexemeC

 DEF MAX_FLAG = 64


 cdef class Lexeme:
-    # NB: the readonly keyword refers to _Python_ access. The attributes are
-    # writeable from Cython.
-    cpdef readonly size_t length
-    cpdef readonly double prob
-    cpdef readonly size_t cluster
-
-    cpdef readonly unicode string
-    cpdef readonly list views
-
-    cdef readonly flag_t flags
+    cdef LexemeC* _c

    cpdef bint check_flag(self, size_t flag_id) except *
    cpdef unicode string_view(self, size_t view_id)
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@@ -1,9 +1,12 @@
 # cython: profile=True
 # cython: embedsignature=True

-
 from libc.stdlib cimport calloc, free, realloc

+from spacy.lexeme cimport lexeme_free, lexeme_init
+from spacy.lexeme cimport lexeme_check_flag, lexeme_string_view
+
+
 cdef class Lexeme:
    """A lexical type --- a word, punctuation symbol, whitespace sequence, etc
    keyed by a case-sensitive unicode string. All tokens with the same string,
@@ -48,23 +51,34 @@ cdef class Lexeme:
    """
    def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
                  dict tag_stats, list string_features, list flag_features):
-        self.prob = prob
-        self.cluster = cluster
-        self.length = len(string)
-        self.string = string
-
-        self.views = []
+        views = []
        cdef unicode view
        for string_feature in string_features:
            view = string_feature(string, prob, cluster, case_stats, tag_stats)
-            self.views.append(view)
+            views.append(view)

+        flags = set()
        for i, flag_feature in enumerate(flag_features):
            if flag_feature(string, prob, case_stats, tag_stats):
-                self.flags |= (1 << i)
+                if (1 << i):
+                    flags.add(i)
+        self._c = lexeme_init(string, prob, cluster, views, flags)

    def __dealloc__(self):
-        pass
+        lexeme_free(self._c)
+
+    property string:
+        def __get__(self):
+            cdef bytes utf8_string = self._c.string
+            cdef unicode string = utf8_string.decode('utf8')
+            return string
+
+    property prob:
+        def __get__(self): return self._c.prob
+    property cluster:
+        def __get__(self): return self._c.cluster
+    property length:
+        def __get__(self): return self._c.length

    cpdef bint check_flag(self, size_t flag_id) except *:
        """Lexemes may store language-specific boolean features in a bit-field,
@@ -80,7 +94,7 @@ cdef class Lexeme:
        >>> lexeme.check_flag(EN.OFT_UPPER)
        True
        """
-        return self.flags & (1 << flag_id)
+        return lexeme_check_flag(self._c, flag_id)

    cpdef unicode string_view(self, size_t view_id):
        """Lexemes may store language-specific string-view features, obtained
@@ -100,4 +114,4 @@ cdef class Lexeme:
        >>> lexeme.string_view(EN.NON_SPARSE)
        u'Xxxx'
        """
-        return self.views[view_id]
+        return lexeme_string_view(self._c, view_id)