* Moving to Word objects in place of the Lexeme struct.

Matthew Honnibal 2014-08-22 17:28:23 +02:00
parent 47fbd0475a
commit 782806df08
4 changed files with 43 additions and 74 deletions

View File

@@ -1,23 +1,21 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport LexID
-from spacy.lexeme cimport ClusterID
 from spacy.spacy cimport Language
+from spacy.word cimport Word
 from spacy.tokens cimport Tokens

 cimport cython

 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode word, Lexeme* lex) except -1
+    cdef int set_orth(self, unicode word, Word lex) except -1

 cdef English EN

-cpdef LexID lookup(unicode word) except 0
-cpdef Tokens tokenize(unicode string)
+cpdef Word lookup(unicode word)
+cpdef list tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)

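A note on the signatures above: the `except 0` and `except -1` clauses disappear wherever a function now returns a Python object. This is general Cython error-signalling convention, not something introduced by this commit: a function returning an object can report a raised exception through an internal NULL object pointer, while a function returning a plain C type still needs an explicit sentinel value. A minimal illustration (method bodies elided):

cdef class Language:
    cdef int set_orth(self, unicode string, Word word) except -1:
        # C return type: -1 is reserved as the error sentinel, so a raised
        # exception can propagate through a plain C int return.
        pass

    cdef Word lookup(self, unicode string):
        # Object return type: Cython signals errors with an internal NULL
        # object pointer, so no `except` clause is needed.
        pass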
View File

@@ -45,7 +45,6 @@ cimport spacy
 from spacy.orthography.latin cimport *
-from spacy.lexeme cimport *

 from .orthography.latin import *
 from .lexeme import *
@@ -96,7 +95,7 @@ cdef bint check_punct(unicode word, size_t i, size_t length):
 EN = English('en')

-cpdef Tokens tokenize(unicode string):
+cpdef list tokenize(unicode string):
     """Tokenize a string.

     The tokenization rules are defined in two places:
@@ -113,7 +112,7 @@ cpdef Tokens tokenize(unicode string):
     return EN.tokenize(string)

-cpdef LexID lookup(unicode string) except 0:
+cpdef Word lookup(unicode string):
     """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.

     Properties of the Lexeme are accessed by passing LexID to the accessor methods.
@@ -125,7 +124,7 @@ cpdef LexID lookup(unicode string) except 0:
     Returns:
         lexeme (LexID): A reference to a lexical type.
     """
-    return <LexID>EN.lookup(string)
+    return EN.lookup(string)

 cpdef unicode unhash(StringHash hash_value):

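A hypothetical session against the reworked module-level API (the import path and example strings are assumptions; only the signatures come from the diff above):

from spacy.en import tokenize, lookup, unhash  # assumed module path

tokens = tokenize(u'the dog barked')   # now a plain Python list of Word objects
word = lookup(u'dog')                  # a Word object rather than a LexID integer
assert unhash(word.lex) == u'dog'      # word.lex holds the StringHash of the text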
View File

@ -1,21 +1,9 @@
from libcpp.vector cimport vector
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from spacy.word cimport Word
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint32_t StringHash ctypedef uint32_t StringHash
from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
# Put these above import to avoid circular import problem
ctypedef char Bits8
ctypedef uint64_t Bits64
ctypedef int ClusterID
from spacy.lexeme cimport Lexeme
cdef class Language: cdef class Language:
@@ -24,16 +12,16 @@ cdef class Language:
     cdef dict vocab
     cdef dict bacov

-    cpdef Tokens tokenize(self, unicode text)
+    cpdef list tokenize(self, unicode text)

-    cdef Lexeme* lookup(self, unicode string) except NULL
-    cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
-    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
-    cdef Lexeme* new_lexeme(self, unicode lex) except NULL
+    cdef Word lookup(self, unicode string)
+    cdef list lookup_chunk(self, unicode chunk)
+    cdef list new_chunk(self, unicode string, list substrings)
+    cdef Word new_lexeme(self, unicode lex)

     cpdef unicode unhash(self, StringHash hashed)

     cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode string, Lexeme* word)
+    cdef int set_orth(self, unicode string, Word word)

View File

@@ -14,9 +14,6 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref

-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport LexID
-
 from . import util
 from os import path
@@ -33,7 +30,7 @@ cdef class Language:
         self.load_tokenization(util.read_tokenization(name))
         self.load_dist_info(util.read_dist_info(name))

-    cpdef Tokens tokenize(self, unicode string):
+    cpdef list tokenize(self, unicode string):
        """Tokenize.

        Split the string into tokens.
@@ -44,8 +41,8 @@ cdef class Language:
        Returns:
            tokens (Tokens): A Tokens object.
        """
-        cdef Lexeme** chunk
-        cdef Tokens tokens = Tokens(self)
+        cdef list chunk
+        cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
@@ -53,64 +50,50 @@ cdef class Language:
             if _is_whitespace(c):
                 if start < i:
                     chunk = self.lookup_chunk(string[start:i])
-                    _extend(tokens, chunk)
+                    tokens.extend(chunk)
                 start = i + 1
             i += 1
         if start < i:
             chunk = self.lookup_chunk(string[start:])
-            _extend(tokens, chunk)
+            tokens.extend(chunk)
         return tokens
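The rewritten loop is a plain whitespace splitter that resolves each chunk through the chunk cache and concatenates the results with list.extend, replacing the _extend helper that walked a NULL-terminated array. A pure-Python sketch of the same control flow, inferred from the diff:

def tokenize(self, string):
    tokens = []
    start = 0
    for i, c in enumerate(string):
        if c.isspace():                  # stands in for _is_whitespace()
            if start < i:                # a chunk ended just before this space
                tokens.extend(self.lookup_chunk(string[start:i]))
            start = i + 1                # resume after the space
    if start < len(string):              # trailing chunk with no final space
        tokens.extend(self.lookup_chunk(string[start:]))
    return tokens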
-    cdef Lexeme* lookup(self, unicode string) except NULL:
+    cdef Word lookup(self, unicode string):
         assert len(string) != 0
-        cdef Lexeme* word
-        cdef LexID lex_id
+        cdef Word word
         cdef StringHash h = hash(string)
         if h in self.vocab:
-            lex_id = self.vocab[h]
-            word = <Lexeme*>lex_id
+            word = self.vocab[h]
         else:
             word = self.new_lexeme(string)
         return word
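lookup is now straightforward memoization: vocab maps a string's hash to its interned Word object, where it previously stored a Lexeme* cast to an integer. In outline (pure Python, names from the diff):

def lookup(self, string):
    h = hash(string)                   # StringHash key
    if h in self.vocab:
        return self.vocab[h]           # cache hit: the interned Word
    return self.new_lexeme(string)     # miss: build and register a new Word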
-    cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
+    cdef list lookup_chunk(self, unicode string):
         cdef StringHash h = hash(string)
-        cdef Lexeme** chunk
+        cdef list chunk
         cdef size_t chunk_id
         if h in self.chunks:
-            chunk_id = self.chunks[h]
-            chunk = <Lexeme**>chunk_id
+            chunk = self.chunks[h]
         else:
             chunk = self.new_chunk(string, self.find_substrings(string))
         return chunk
-    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
-        cdef Lexeme** chunk = <Lexeme**>calloc(len(substrings) + 1, sizeof(Lexeme*))
+    cdef list new_chunk(self, unicode string, list substrings):
+        chunk = []
         for i, substring in enumerate(substrings):
-            chunk[i] = self.lookup(substring)
-        chunk[i + 1] = NULL
+            chunk.append(self.lookup(substring))
         cdef StringHash h = hash(string)
-        self.chunks[h] = <size_t>chunk
+        self.chunks[h] = chunk
         return chunk
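new_chunk previously calloc'd a NULL-terminated Lexeme** array and stashed its address in self.chunks as a size_t; with Word objects it becomes an ordinary list, which the dict keeps alive with no manual memory management. The same logic in outline:

def new_chunk(self, string, substrings):
    # one interned Word per substring; no sentinel NULL entry required
    chunk = [self.lookup(s) for s in substrings]
    self.chunks[hash(string)] = chunk    # cache under the whole chunk's hash
    return chunk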
-    cdef Lexeme* new_lexeme(self, unicode string) except NULL:
-        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-        cdef bytes byte_string = string.encode('utf8')
-        word.string = <char*>byte_string
-        word.length = len(byte_string)
-        word.lex = hash(string)
-        word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
-        cdef unicode view
-        cdef StringHash hashed
-        for i, view_func in enumerate(self.view_funcs):
-            view = view_func(string)
-            hashed = hash(view)
-            word.string_views[i] = hashed
-            self.bacov[hashed] = view
+    cdef Word new_lexeme(self, unicode string):
+        string_views = [view_func(string) for view_func in self.view_funcs]
+        word = Word(string.encode('utf8'), string_views)
         self.bacov[word.lex] = string
-        self.vocab[word.lex] = <LexID>word
+        self.vocab[word.lex] = word
         return word

+    """
     def add_view_funcs(self, list view_funcs):
         self.view_funcs.extend(view_funcs)
         cdef size_t nr_views = len(self.view_funcs)
@@ -132,6 +115,7 @@ cdef class Language:
             hashed = hash(view)
             word.string_views[i] = hashed
             self.bacov[hashed] = view
+    """
     cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
@@ -162,7 +146,7 @@ cdef class Language:
     cdef int find_split(self, unicode word):
         return len(word)

-    cdef int set_orth(self, unicode string, Lexeme* word):
+    cdef int set_orth(self, unicode string, Word word):
         pass

     def load_tokenization(self, token_rules):
@@ -190,7 +174,7 @@ cdef class Language:
         '''
         cdef unicode string
         cdef dict word_dist
-        cdef Lexeme* w
+        cdef Word w
         for string, word_dist in dist_info.items():
             w = self.lookup(string)
             w.prob = word_dist.prob
@@ -212,9 +196,9 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
     return False

-cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
-    cdef size_t i = 0
-    while chunk[i] != NULL:
-        tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
-        tokens.length += 1
-        i += 1
+#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
+#    cdef size_t i = 0
+#    while chunk[i] != NULL:
+#        tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
+#        tokens.length += 1
+#        i += 1
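The Word class itself (imported from spacy/word.pxd) is not part of this diff. Judging only from the constructor call in new_lexeme and the attributes the commit uses elsewhere (word.lex, word.prob), a rough Python sketch of its assumed shape might be:

class Word:
    # Assumed shape only; the real implementation is Cython and is not shown here.
    def __init__(self, bytes_string, string_views):
        self.string = bytes_string                    # UTF-8 encoded text
        self.length = len(bytes_string)
        # must match the hash(unicode) key that lookup() uses for self.vocab
        self.lex = hash(bytes_string.decode('utf8'))
        self.string_views = [hash(view) for view in string_views]
        self.prob = 0.0                               # filled in by load_dist_info()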