* Moving to Word objects in place of the Lexeme struct.

Matthew Honnibal 2014-08-22 17:28:23 +02:00
parent 47fbd0475a
commit 782806df08
4 changed files with 43 additions and 74 deletions
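
Note: the Word extension type that replaces the Lexeme struct lives in spacy.word, which this commit does not touch. Judging only from how it is used below (constructed as Word(string.encode('utf8'), string_views), with lex and prob attributes), a minimal sketch of the assumed interface could look like the following; every name except lex, prob and the constructor shape is a guess.

# Hypothetical sketch of the Word extension type this commit switches to.
# spacy/word.pxd and word.pyx are not part of this diff, so everything here
# except the lex/prob attributes and the Word(utf8_bytes, string_views)
# constructor shape is an assumption.
from libc.stdint cimport uint32_t

ctypedef uint32_t StringHash      # mirrors the typedef in spacy.pxd

cdef class Word:
    cdef readonly bytes string         # UTF-8 form passed in by new_lexeme
    cdef readonly StringHash lex       # hash of the original unicode string
    cdef readonly list string_views    # views produced by the view funcs
    cdef public float prob             # filled in later by load_dist_info

    def __init__(self, bytes string, list string_views):
        self.string = string
        self.string_views = string_views
        # must agree with hash(string) as computed in Language.lookup,
        # otherwise the vocab cache keyed on that hash would never hit
        self.lex = hash(string.decode('utf8'))
        self.prob = 0.0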

View File

@@ -1,23 +1,21 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport LexID
from spacy.lexeme cimport ClusterID
from spacy.spacy cimport Language
from spacy.word cimport Word
from spacy.tokens cimport Tokens
cimport cython
cdef class English(spacy.Language):
cdef int find_split(self, unicode word)
cdef int set_orth(self, unicode word, Lexeme* lex) except -1
cdef int set_orth(self, unicode word, Word lex) except -1
cdef English EN
cpdef LexID lookup(unicode word) except 0
cpdef Tokens tokenize(unicode string)
cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)
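
A side note on the dropped annotations: except 0 / except NULL clauses are only needed when a function returns a raw C type, because Cython cannot otherwise signal a pending exception back to the caller. Once lookup returns a Python object (Word), the clause can simply be dropped; set_orth keeps its except -1 because it still returns a C int. A standalone illustration, with made-up names:

# Standalone illustration (not from this commit) of why the except clause
# disappears from lookup() once it returns a Python object.
cdef int find_id(dict table, unicode key) except 0:
    # C return type: the return value cannot carry an exception,
    # so 0 is reserved as the "exception pending" sentinel.
    if key not in table:
        raise KeyError(key)
    return table[key]

cdef object find_word(dict table, unicode key):
    # Python object return type: a NULL object already means
    # "exception pending", so no except clause is needed.
    return table[key]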

View File

@@ -45,7 +45,6 @@ cimport spacy
from spacy.orthography.latin cimport *
from spacy.lexeme cimport *
from .orthography.latin import *
from .lexeme import *
@@ -96,7 +95,7 @@ cdef bint check_punct(unicode word, size_t i, size_t length):
EN = English('en')
cpdef Tokens tokenize(unicode string):
cpdef list tokenize(unicode string):
"""Tokenize a string.
The tokenization rules are defined in two places:
@@ -113,7 +112,7 @@ cpdef Tokens tokenize(unicode string):
return EN.tokenize(string)
cpdef LexID lookup(unicode string) except 0:
cpdef Word lookup(unicode string):
"""Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
Properties of the Lexeme are accessed by passing LexID to the accessor methods.
@@ -125,7 +124,7 @@ cpdef LexID lookup(unicode string) except 0:
Returns:
lexeme (LexID): A reference to a lexical type.
"""
return <LexID>EN.lookup(string)
return EN.lookup(string)
cpdef unicode unhash(StringHash hash_value):
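
Taken together, the spacy.en changes mean the module-level helpers now hand back ordinary Python objects: tokenize() a list of Word objects instead of a Tokens instance, and lookup() a Word instead of a LexID that had to be cast back to a pointer. A rough usage sketch (the spacy.en import path is an assumption, it is not shown in this diff):

# Rough usage sketch (Python 2 era); assumes the compiled extension is
# importable as spacy.en.
from spacy import en

words = en.tokenize(u"Gimme the fork.")  # now a plain Python list of Word objects
word = en.lookup(u"Gimme")               # returns the Word itself; no <LexID> cast needed
original = en.unhash(word.lex)           # bacov reverse index: hash -> original string
assert original == u"Gimme"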

View File

@@ -1,21 +1,9 @@
from libcpp.vector cimport vector
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from spacy.word cimport Word
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint32_t StringHash
from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
# Put these above import to avoid circular import problem
ctypedef char Bits8
ctypedef uint64_t Bits64
ctypedef int ClusterID
from spacy.lexeme cimport Lexeme
cdef class Language:
@@ -24,16 +12,16 @@ cdef class Language:
cdef dict vocab
cdef dict bacov
cpdef Tokens tokenize(self, unicode text)
cpdef list tokenize(self, unicode text)
cdef Lexeme* lookup(self, unicode string) except NULL
cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
cdef Word lookup(self, unicode string)
cdef list lookup_chunk(self, unicode chunk)
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
cdef Lexeme* new_lexeme(self, unicode lex) except NULL
cdef list new_chunk(self, unicode string, list substrings)
cdef Word new_lexeme(self, unicode lex)
cpdef unicode unhash(self, StringHash hashed)
cpdef list find_substrings(self, unicode chunk)
cdef int find_split(self, unicode word)
cdef int set_orth(self, unicode string, Lexeme* word)
cdef int set_orth(self, unicode string, Word word)
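
The two dicts declared here carry the caching scheme used throughout spacy.pyx: vocab maps the hash of a string to its Word, and bacov ("vocab" spelled backwards) is the reverse index from hash back to string that backs unhash(). A pure-Python outline, with a namedtuple standing in for the Word type:

# Pure-Python outline of the vocab/bacov caching declared above; a
# paraphrase for illustration, not the actual implementation.
from collections import namedtuple

Word = namedtuple('Word', ['string', 'lex'])   # stand-in for the extension type

class Language(object):
    def __init__(self):
        self.vocab = {}   # hash(string) -> Word
        self.bacov = {}   # hash         -> original unicode string

    def lookup(self, string):
        h = hash(string)
        if h not in self.vocab:
            self.vocab[h] = Word(string.encode('utf8'), h)
            self.bacov[h] = string
        return self.vocab[h]

    def unhash(self, h):
        return self.bacov[h]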

View File

@@ -14,9 +14,6 @@ from libc.stdlib cimport calloc, free
from libcpp.pair cimport pair
from cython.operator cimport dereference as deref
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport LexID
from . import util
from os import path
@@ -33,7 +30,7 @@ cdef class Language:
self.load_tokenization(util.read_tokenization(name))
self.load_dist_info(util.read_dist_info(name))
cpdef Tokens tokenize(self, unicode string):
cpdef list tokenize(self, unicode string):
"""Tokenize.
Split the string into tokens.
@@ -44,8 +41,8 @@ cdef class Language:
Returns:
tokens (Tokens): A Tokens object.
"""
cdef Lexeme** chunk
cdef Tokens tokens = Tokens(self)
cdef list chunk
cdef list tokens = []
cdef size_t length = len(string)
cdef size_t start = 0
cdef size_t i = 0
@@ -53,64 +50,50 @@ cdef class Language:
if _is_whitespace(c):
if start < i:
chunk = self.lookup_chunk(string[start:i])
_extend(tokens, chunk)
tokens.extend(chunk)
start = i + 1
i += 1
if start < i:
chunk = self.lookup_chunk(string[start:])
_extend(tokens, chunk)
tokens.extend(chunk)
return tokens
cdef Lexeme* lookup(self, unicode string) except NULL:
cdef Word lookup(self, unicode string):
assert len(string) != 0
cdef Lexeme* word
cdef LexID lex_id
cdef Word word
cdef StringHash h = hash(string)
if h in self.vocab:
lex_id = self.vocab[h]
word = <Lexeme*>lex_id
word = self.vocab[h]
else:
word = self.new_lexeme(string)
return word
cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
cdef list lookup_chunk(self, unicode string):
cdef StringHash h = hash(string)
cdef Lexeme** chunk
cdef list chunk
cdef size_t chunk_id
if h in self.chunks:
chunk_id = self.chunks[h]
chunk = <Lexeme**>chunk_id
chunk = self.chunks[h]
else:
chunk = self.new_chunk(string, self.find_substrings(string))
return chunk
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
cdef Lexeme** chunk = <Lexeme**>calloc(len(substrings) + 1, sizeof(Lexeme*))
cdef list new_chunk(self, unicode string, list substrings):
chunk = []
for i, substring in enumerate(substrings):
chunk[i] = self.lookup(substring)
chunk[i + 1] = NULL
chunk.append(self.lookup(substring))
cdef StringHash h = hash(string)
self.chunks[h] = <size_t>chunk
self.chunks[h] = chunk
return chunk
cdef Lexeme* new_lexeme(self, unicode string) except NULL:
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
cdef bytes byte_string = string.encode('utf8')
word.string = <char*>byte_string
word.length = len(byte_string)
word.lex = hash(string)
word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
cdef unicode view
cdef StringHash hashed
for i, view_func in enumerate(self.view_funcs):
view = view_func(string)
hashed = hash(view)
word.string_views[i] = hashed
self.bacov[hashed] = view
cdef Word new_lexeme(self, unicode string):
string_views = [view_func(string) for view_func in self.view_funcs]
word = Word(string.encode('utf8'), string_views)
self.bacov[word.lex] = string
self.vocab[word.lex] = <LexID>word
self.vocab[word.lex] = word
return word
"""
def add_view_funcs(self, list view_funcs):
self.view_funcs.extend(view_funcs)
cdef size_t nr_views = len(self.view_funcs)
@@ -132,6 +115,7 @@ cdef class Language:
hashed = hash(view)
word.string_views[i] = hashed
self.bacov[hashed] = view
"""
cpdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.'''
@@ -162,7 +146,7 @@ cdef class Language:
cdef int find_split(self, unicode word):
return len(word)
cdef int set_orth(self, unicode string, Lexeme* word):
cdef int set_orth(self, unicode string, Word word):
pass
def load_tokenization(self, token_rules):
@@ -190,7 +174,7 @@ cdef class Language:
'''
cdef unicode string
cdef dict word_dist
cdef Lexeme* w
cdef Word w
for string, word_dist in dist_info.items():
w = self.lookup(string)
w.prob = word_dist.prob
@@ -212,9 +196,9 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
return False
cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
cdef size_t i = 0
while chunk[i] != NULL:
tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
tokens.length += 1
i += 1
#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
# cdef size_t i = 0
# while chunk[i] != NULL:
# tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
# tokens.length += 1
# i += 1
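
With tokens now a plain Python list, the manual _extend helper above becomes dead code, hence the comment-out. The whole tokenization loop reduces to this pure-Python paraphrase (c.isspace() standing in for the narrower _is_whitespace helper):

# Pure-Python paraphrase of Language.tokenize above: scan for whitespace and
# expand every whitespace-delimited chunk into Word objects through the
# cached chunk table (lookup_chunk -> find_substrings -> lookup).
def tokenize(language, string):
    tokens = []
    start = 0
    for i, c in enumerate(string):
        if c.isspace():
            if start < i:
                tokens.extend(language.lookup_chunk(string[start:i]))
            start = i + 1
    if start < len(string):
        tokens.extend(language.lookup_chunk(string[start:]))
    return tokens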