mirror of https://github.com/explosion/spaCy.git
* Slight cleaning of tokenizer code

This commit is contained in:
parent 59b41a9fd3
commit 71ee921055

@@ -41,6 +41,7 @@ cdef class Lexicon:
 cdef class Language:
     cdef Pool _mem
     cdef unicode name
+    cdef vector[size_t] counts
     cdef PreshMap cache
     cdef PreshMap specials
     cpdef readonly Lexicon lexicon

@@ -51,7 +52,6 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)

-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)

@@ -16,6 +16,7 @@ import re
 from .util import read_lang_data
 from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from spacy.lexeme cimport LexStr_orig
 from murmurhash.mrmr cimport hash64

 from cpython.ref cimport Py_INCREF

@@ -45,12 +46,20 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)
+        self.counts = vector[size_t]()

     property nr_types:
         def __get__(self):
             """Return the number of lexical types in the vocabulary"""
             return self.lexicon.size

+    property counts:
+        def __get__(self):
+            cdef size_t i
+            for i in range(self.lexicon.size):
+                count = self.counts[i] if i < self.counts.size() else 0
+                yield count, self.lexicon.lexemes[i].strings[<int>LexStr_orig].decode('utf8')
+
     cpdef Lexeme lookup(self, unicode string):
         """Retrieve (or create, if not found) a Lexeme for a string, and return it.
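
The new counts property walks the lexicon in id order, pairing each lexeme's observed frequency with its original string, and falls back to zero for ids the counter vector has not grown to cover yet. A plain-Python sketch of that bookkeeping (the class and method names here are illustrative, not spaCy's API):

class CountsSketch:
    """Illustrative stand-in for the Language.counts bookkeeping."""
    def __init__(self):
        self.counts = []    # counts[i] = frequency of lexeme id i, grown on demand
        self.strings = []   # strings[i] = original string of lexeme id i

    def bump(self, lex_id):
        # Mirrors the while-loop added to tokenize(): grow with zeros, then count.
        while lex_id >= len(self.counts):
            self.counts.append(0)
        self.counts[lex_id] += 1

    def items(self):
        # Mirrors the counts property: zero for lexemes never seen in running text.
        for i, s in enumerate(self.strings):
            yield (self.counts[i] if i < len(self.counts) else 0), s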

@@ -85,23 +94,23 @@ cdef class Language:
         cdef size_t start = 0
         cdef size_t i = 0
         cdef Py_UNICODE* chars = string
-        cdef Py_UNICODE c
         cdef String span
         for i in range(length):
-            c = chars[i]
-            if Py_UNICODE_ISSPACE(c) == 1:
+            if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    try:
-                        self._tokenize(tokens.v, &span)
-                    except MemoryError:
-                        print chars[start:i]
-                        raise
+                    self._tokenize(tokens.v, &span)
                 start = i + 1
         i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
             self._tokenize(tokens.v, &span)
+        cdef int id_
+        for i in range(tokens.v.size()):
+            id_ = tokens.id(i)
+            while id_ >= self.counts.size():
+                self.counts.push_back(0)
+            self.counts[id_] += 1
         return tokens

     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
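
The reworked tokenize() splits the input on whitespace, hands each non-empty span to _tokenize(), then walks the finished token vector to bump the per-id frequency. A plain-Python sketch of the same control flow; tokenize_span is a hypothetical stand-in for the real _tokenize and returns (id, string) pairs:

def tokenize_sketch(text, tokenize_span, counts):
    tokens = []
    start = 0
    for i, c in enumerate(text):
        if c.isspace():
            if start < i:
                tokens.extend(tokenize_span(text[start:i]))
            start = i + 1
    if start < len(text):   # flush the trailing span, as the post-loop i += 1 does
        tokens.extend(tokenize_span(text[start:]))
    for tok_id, _ in tokens:
        while tok_id >= len(counts):   # same grow-then-count step as in the hunk
            counts.append(0)
        counts[tok_id] += 1
    return tokens

For example, tokenize_sketch("hello  world", lambda s: [(abs(hash(s)) % 8, s)], []) yields two spans and skips the double space, just as the start < i guard does above.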

@@ -163,17 +172,6 @@ cdef class Language:
         self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
         self._save_cached(tokens_v, orig_key, orig_size)

-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
-        lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i = 0
-        if lexemes != NULL:
-            while lexemes[i] != NULL:
-                tokens.push_back(lexemes[i])
-                i += 1
-            string.n = 0
-            string.key = 0
-            string.chars = NULL
-
     cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
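
The deleted _check_cache copied a previously computed token sequence out of the cache (a NULL-terminated LexemeC* array keyed by the span's hash) and zeroed the span to signal a hit to the caller. A plain-Python sketch of that pattern, with None playing the role of the C NULL sentinel and a SimpleNamespace standing in for the String struct:

from types import SimpleNamespace

def check_cache_sketch(cache, tokens, span):
    cached = cache.get(span.key)    # cache: hash key -> cached token sequence
    if cached is not None:
        tokens.extend(cached)
        span.n = 0                  # zero the span so the caller knows it was consumed
        span.key = 0
        span.chars = None

toks = []
span = SimpleNamespace(key=42, chars="don't", n=5)
check_cache_sketch({42: ["do", "n't"]}, toks, span)   # hit: toks == ["do", "n't"]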

@@ -261,6 +259,7 @@ cdef class Lexicon:
         lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
         lexeme_unpack(lexeme, lexeme_dict)
         self._dict.set(string.key, lexeme)
+        self.lexemes.push_back(lexeme)
         self.size += 1

     cdef LexemeC* get(self, String* string) except NULL:

@@ -273,6 +272,7 @@ cdef class Lexicon:
         cdef unicode unicode_string = string.chars[:string.n]
         lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
         self._dict.set(string.key, lex)
+        self.lexemes.push_back(lex)
         self.size += 1
         return lex
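
Both Lexicon code paths now push each freshly created lexeme onto self.lexemes, so a lexeme is reachable both by hash key (via _dict) and by dense integer id (its position in the list) — the latter is what the new counts property iterates over. A plain-Python sketch of that dual-index pattern (names illustrative):

class LexiconSketch:
    def __init__(self):
        self._dict = {}      # hash key -> lexeme, for lookup by string
        self.lexemes = []    # dense id -> lexeme; id == position in this list
        self.size = 0

    def get(self, key, make_lexeme):
        lex = self._dict.get(key)
        if lex is None:
            lex = make_lexeme(self.size)   # the new lexeme records its own id
            self._dict[key] = lex
            self.lexemes.append(lex)       # the line this commit adds
            self.size += 1
        return lex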

@@ -21,7 +21,7 @@ cpdef enum LexFloats:


 cpdef enum LexStrs:
-    LexStr_key
+    LexStr_orig
     LexStr_casefix
     LexStr_shape
     LexStr_unsparse

@@ -70,6 +70,7 @@ cdef struct LexemeC:
     flag_t orth_flags
     flag_t dist_flags


 cpdef dict get_lexeme_dict(size_t i, unicode string)

 cdef char* intern_and_encode(unicode string, size_t* length) except NULL
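
For orientation, the LexemeC struct this diff keeps indexing stores parallel fixed-slot arrays addressed by the enums (LexInt_i, LexFloat_sentiment, LexStr_orig, ...) plus two packed flag words. A plain-Python sketch of that layout; any field name not visible in this diff is an assumption:

from dataclasses import dataclass, field

@dataclass
class LexemeSketch:
    ints: list = field(default_factory=list)      # slot LexInt_i holds the dense id
    floats: list = field(default_factory=list)    # e.g. slot LexFloat_sentiment
    strings: list = field(default_factory=list)   # slot LexStr_orig holds the raw string
    orth_flags: int = 0                           # packed orthographic feature bits
    dist_flags: int = 0                           # packed distributional feature bits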

@@ -19,8 +19,8 @@ cpdef dict get_lexeme_dict(size_t i, unicode string):
     floats[<int>LexFloat_sentiment] = 0

     strings = [None for _ in range(LexStr_N)]
-    strings[<int>LexStr_key] = string
-    strings[<int>LexStr_casefix] = strings[<int>LexStr_key]
+    strings[<int>LexStr_orig] = string
+    strings[<int>LexStr_casefix] = strings[<int>LexStr_orig]
     strings[<int>LexStr_shape] = orth.word_shape(string)
     strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
     strings[<int>LexStr_asciied] = orth.asciied(string)
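
get_lexeme_dict fills the per-lexeme string slots: the original string, a case-fixed variant, a word shape, and an ASCII-folded form. The shape transform itself is not shown in this diff; the sketch below is one common way such a function behaves (letters map to x/X, digits to d, long runs are capped), not necessarily orth.word_shape's exact output:

def word_shape_sketch(string, max_run=4):
    shape = []
    prev, run = "", 0
    for c in string:
        s = "x" if c.islower() else "X" if c.isupper() else "d" if c.isdigit() else c
        run = run + 1 if s == prev else 1
        prev = s
        if run <= max_run:      # cap long runs so rare word lengths share a shape
            shape.append(s)
    return "".join(shape)

# word_shape_sketch("Testing1234") == "Xxxxxdddd"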

@@ -42,9 +42,9 @@ def get_orth_flags(unicode string):
     flags |= orth.is_space(string) << LexOrth_space
     flags |= orth.is_title(string) << LexOrth_title
     flags |= orth.is_upper(string) << LexOrth_upper

     return flags


 def get_dist_flags(unicode string):
     return 0
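
get_orth_flags packs each boolean orthographic predicate into its own bit of a single integer, so one flag_t word carries many features. A plain-Python sketch of the packing and of how a bit would be tested; the bit positions here are illustrative, not the real enum values:

LexOrth_space, LexOrth_title, LexOrth_upper = 0, 1, 2   # assumed bit positions

def get_orth_flags_sketch(string):
    flags = 0
    flags |= string.isspace() << LexOrth_space   # bool is 0 or 1, shifted into place
    flags |= string.istitle() << LexOrth_title
    flags |= string.isupper() << LexOrth_upper
    return flags

def check_flag(flags, bit):
    return bool(flags & (1 << bit))

assert check_flag(get_orth_flags_sketch("NASA"), LexOrth_upper)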

@@ -5,6 +5,7 @@ from libcpp.vector cimport vector
 cdef class Tokens:
     cdef vector[LexemeC*] *v

+    cpdef int id(self, size_t i) except -1
     cpdef unicode string(self, size_t i)
     cpdef float prob(self, size_t i) except 1
     cpdef int cluster(self, size_t i) except *

@@ -40,8 +40,11 @@ cdef class Tokens:
     def append(self, Lexeme lexeme):
         self.v.push_back(lexeme._c)

+    cpdef int id(self, size_t i) except -1:
+        return self.v.at(i).ints[<int>LexInt_i]
+
     cpdef unicode string(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_key]
+        cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string
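
The new Tokens.id(i) reads back the dense id stored on the i-th lexeme, which is what lets tokenize() index self.counts directly. A plain-Python sketch of the structure, with dicts standing in for LexemeC pointers:

class TokensSketch:
    def __init__(self):
        self.v = []              # like vector[LexemeC*], with dicts for pointers

    def append(self, lexeme):
        self.v.append(lexeme)

    def id(self, i):
        return self.v[i]["i"]    # mirrors self.v.at(i).ints[<int>LexInt_i]

    def string(self, i):
        return self.v[i]["orig"] # mirrors strings[<int>LexStr_orig].decode('utf8')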

@@ -54,7 +54,7 @@ cdef class Lexeme:

     property string:
         def __get__(self):
-            cdef bytes utf8_string = self._c.strings[<int>LexStr_key]
+            cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
             cdef unicode string = utf8_string.decode('utf8')
             return string