* Only store LexemeC structs in the vocabulary, transforming them into Lexeme objects for output. Tokens will move away from Lexeme objects soon.

2025-07-09 15:52:31 +03:00 · 2014-09-11 12:28:38 +02:00 · 2014-09-11 12:28:38 +02:00 · 5b1c651661
commit 5b1c651661
parent b5b31c6b6e
3 changed files with 43 additions and 32 deletions
--- a/fabfile.py
+++ b/fabfile.py
@ -1,14 +1,25 @@
 import json
 from fabric.api import local, run, lcd, cd, env
 def make():
    local('python setup.py build_ext --inplace')
 def clean():
    local('python setup.py clean --all')
 def docs():
-    with lcd('docs'):
+    local('sphinx-build -b html docs/ .')
-        local('sphinx-build -b html . ./_build')
+
 def test():
    local('py.test -x')
 def sbox():
    local('python sb_setup.py build_ext --inplace')
 def sbclean():
    local('python sb_setup.py clean --all')
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@ -15,6 +15,7 @@ from os import path
 from .util import read_lang_data
 from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, lexeme_init
 cdef class Language:
@ -76,9 +77,10 @@ cdef class Language:
        Returns:
            tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
        """
        assert string
        cdef size_t length = len(string)
        if length == 0:
            return []
        cdef size_t start = 0
        cdef size_t i = 0
        cdef Tokens tokens = self.tokens_class()
@ -162,10 +164,18 @@ cdef class Lexicon:
        self.size = 0
        cdef Lexeme word
        for string in words:
-            word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
+            prob = probs.get(string, 0.0)
-                          case_stats.get(string, {}), tag_stats.get(string, {}),
+            cluster = clusters.get(string, 0.0)
-                          self._string_features, self._flag_features)
+            cases = case_stats.get(string, {})
-            self._dict[string] = word
+            tags = tag_stats.get(string, {})
            views = [string_view(string, prob, cluster, cases, tags)
                     for string_view in self._string_features]
            flags = set()
            for i, flag_feature in enumerate(self._flag_features):
                if flag_feature(string, prob, cluster, cases, tags):
                    flags.add(i)
            lexeme = lexeme_init(string, prob, cluster, views, flags)
            self._dict[string] = <size_t>lexeme
            self.size += 1
    cpdef Lexeme lookup(self, unicode string):
@ -177,14 +187,19 @@ cdef class Lexicon:
        Returns:
            lexeme (Lexeme): A reference to a lexical type.
        """
-        cdef Lexeme lexeme
+        cdef LexemeC* lexeme
        assert len(string) != 0
        if string in self._dict:
-            lexeme = self._dict[string]
+            return Lexeme(self._dict[string])
            return lexeme
-        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
+        views = [string_view(string, 0.0, 0, {}, {})
-                                  self._flag_features)
+                 for string_view in self._string_features]
-        self._dict[string] = word
+        flags = set()
        for i, flag_feature in enumerate(self._flag_features):
            if flag_feature(string, 0.0, {}, {}):
                flags.add(i)
        lexeme = lexeme_init(string, 0, 0, views, flags)
        self._dict[string] = <size_t>lexeme
        self.size += 1
-        return word
+        return Lexeme(<size_t>lexeme)
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@ -49,23 +49,8 @@ cdef class Lexeme:
            while "dapple" is totally different. On the other hand, "scalable" receives
            the same cluster ID as "pineapple", which is not what we'd like.
    """
-    def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
+    def __cinit__(self, size_t lexeme_addr):
-                  dict tag_stats, list string_features, list flag_features):
+        self._c = <LexemeC*>lexeme_addr
        views = []
        cdef unicode view
        for string_feature in string_features:
            view = string_feature(string, prob, cluster, case_stats, tag_stats)
            views.append(view)
        flags = set()
        for i, flag_feature in enumerate(flag_features):
            if flag_feature(string, prob, case_stats, tag_stats):
                if (1 << i):
                    flags.add(i)
        self._c = lexeme_init(string, prob, cluster, views, flags)
    def __dealloc__(self):
        lexeme_free(self._c)
    property string:
        def __get__(self):