Mirror of https://github.com/explosion/spaCy.git
Synced 2025-01-12 18:26:30 +03:00
commit 143e51ec73 (parent c396581a0b)

Refactor tokenization, splitting it into a clearer life-cycle.

The single _tokenize/_split_one loop on Language is split into separate stages,
each with its own method: _check_cache, _find_prefix, _find_suffix,
_attach_tokens and _save_cached. English now overrides _find_prefix in place of
_split_one, and the special-case tokenization rules gain entries for common
abbreviations (U.S., U.N., Ms., Mr., P.).
@@ -94,3 +94,8 @@ you'll you will
 you're you are
 you've you have
 10km 10 km
+U.S. U.S.
+U.N. U.N.
+Ms. Ms.
+Mr. Mr.
+P. P.
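The added rows map each abbreviation to itself, so its trailing period can stay attached instead of being peeled off as a suffix; the contraction rows above them expand one surface form into several tokens. A minimal pure-Python sketch of how such a rule table can short-circuit the general affix splitter (SPECIAL_CASES and tokenize_chunk are illustrative names, not spaCy's API):

    # Illustrative special-case table: surface form -> list of token strings,
    # mirroring the rules in the hunk above.
    SPECIAL_CASES = {
        "you're": ["you", "are"],
        "you've": ["you", "have"],
        "10km": ["10", "km"],
        "U.S.": ["U.S."],
        "U.N.": ["U.N."],
        "Ms.": ["Ms."],
        "Mr.": ["Mr."],
        "P.": ["P."],
    }

    def tokenize_chunk(chunk, split_affixes):
        """Consult the special-case table before falling back to affix splitting."""
        if chunk in SPECIAL_CASES:
            return list(SPECIAL_CASES[chunk])
        return split_affixes(chunk)

    # "U.S." keeps its final period instead of having it split off as a suffix.
    print(tokenize_chunk("U.S.", lambda s: [s.rstrip("."), "."]))  # ['U.S.']
    print(tokenize_chunk("you're", lambda s: [s]))                 # ['you', 'are']

In the Cython code these rules are loaded by _load_special_tokenization, and the cache lookup in _check_cache is what lets them win before any affix splitting happens.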
@@ -4,4 +4,4 @@ from spacy.tokens cimport Tokens
 
 
 cdef class English(Language):
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    pass
spacy/en.pyx (59 changed lines)
@@ -56,27 +56,48 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        if length == 1:
-            return 1
-        if characters[0] == "'" and (characters[1] == "s" or characters[1] == "S"):
-            return 2
-        cdef int i = 0
-        # Leading punctuation
-        if _check_punct(characters, 0, length):
-            return 1
-        # Contractions
-        elif length >= 3 and characters[length - 2] == "'":
-            c2 = characters[length-1]
-            if c2 == "s" or c2 == "S":
-                return length - 2
-        if length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not _check_punct(characters, i, length):
-                i += 1
-        return i
+    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef Py_UNICODE c0 = chars[0]
+        cdef Py_UNICODE c1 = chars[1]
+        if c0 == ",":
+            return 1
+        elif c0 == '"':
+            return 1
+        elif c0 == "(":
+            return 1
+        elif c0 == "[":
+            return 1
+        elif c0 == "{":
+            return 1
+        elif c0 == "*":
+            return 1
+        elif c0 == "<":
+            return 1
+        elif c0 == "$":
+            return 1
+        elif c0 == "£":
+            return 1
+        elif c0 == "€":
+            return 1
+        elif c0 == "\u201c":
+            return 1
+        elif c0 == "'":
+            if c1 == "s":
+                return 2
+            elif c1 == "S":
+                return 2
+            elif c1 == "'":
+                return 2
+            else:
+                return 1
+        elif c0 == "`":
+            if c1 == "`":
+                return 2
+            else:
+                return 1
+        else:
+            return 0
 
 abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
 cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
     cdef unicode char_i = characters[i]
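The new prefix finder only inspects the first one or two characters and reports how many to split off, with 0 meaning "no prefix". A pure-Python rendering of the same rules, written as a sketch rather than a port of the Cython method (find_prefix and ONE_CHAR_PREFIXES are names made up here; the sketch also guards the second-character read explicitly):

    # Single opening punctuation marks peel off one character; "'s", "''" and
    # "``" peel off two; anything else means no prefix.
    ONE_CHAR_PREFIXES = set(',"([{*<$£€\u201c')

    def find_prefix(chars):
        """Return how many leading characters to split off (0 = no prefix)."""
        if not chars:
            return 0
        c0 = chars[0]
        c1 = chars[1] if len(chars) > 1 else ""
        if c0 in ONE_CHAR_PREFIXES:
            return 1
        if c0 == "'":
            return 2 if c1 in ("s", "S", "'") else 1
        if c0 == "`":
            return 2 if c1 == "`" else 1
        return 0

    assert find_prefix('"Hello') == 1
    assert find_prefix("``well") == 2
    assert find_prefix("Hello") == 0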
@@ -42,5 +42,14 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
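These declarations spell out the new life-cycle for each whitespace-delimited chunk: check the cache, peel prefixes off the front, peel suffixes off the back, attach the resulting tokens, then cache the result. A minimal plain-Python skeleton of that flow, assuming stand-ins for the real types (LifeCycleTokenizer and its dict cache are illustrative, not spaCy's Language, PointerHash or Lexicon):

    class LifeCycleTokenizer:
        """Plain-Python skeleton of the life-cycle declared above (illustrative only)."""

        def __init__(self, find_prefix, find_suffix):
            self.cache = {}                  # chunk -> cached token list
            self._find_prefix = find_prefix  # callable: str -> prefix length (0 = none)
            self._find_suffix = find_suffix  # callable: str -> suffix length (0 = none)

        def _tokenize(self, tokens, chunk):
            orig_key, orig_size = chunk, len(tokens)
            if self._check_cache(tokens, chunk):
                return
            prefixes, suffixes = [], []
            n = self._find_prefix(chunk)          # peel prefixes off the front
            while chunk and n:
                prefixes.append(chunk[:n])
                chunk = chunk[n:]
                n = self._find_prefix(chunk)
            n = self._find_suffix(chunk)          # peel suffixes off the back
            while chunk and n:
                suffixes.append(chunk[-n:])
                chunk = chunk[:-n]
                n = self._find_suffix(chunk)
            self._attach_tokens(tokens, chunk, prefixes, suffixes)
            self._save_cached(tokens, orig_key, orig_size)

        def _check_cache(self, tokens, chunk):
            cached = self.cache.get(chunk)
            if cached is None:
                return False
            tokens.extend(cached)
            return True

        def _attach_tokens(self, tokens, chunk, prefixes, suffixes):
            tokens.extend(prefixes)
            if chunk:
                tokens.append(chunk)
            tokens.extend(reversed(suffixes))     # suffixes were collected outside-in

        def _save_cached(self, tokens, key, orig_size):
            self.cache[key] = tokens[orig_size:]  # cache only this chunk's tokens

    # Example with trivial affix rules: peel one punctuation character at a time.
    from string import punctuation

    tok = LifeCycleTokenizer(
        find_prefix=lambda s: 1 if s and s[0] in punctuation else 0,
        find_suffix=lambda s: 1 if s and s[-1] in punctuation else 0,
    )
    out = []
    tok._tokenize(out, '"Hello!"')
    print(out)  # ['"', 'Hello', '!', '"']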
spacy/lang.pyx (114 changed lines)
@@ -20,6 +20,9 @@ from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
 
+from cython.operator cimport preincrement as preinc
+from cython.operator cimport dereference as deref
+
 from spacy._hashing cimport PointerHash
 from spacy import orth
@@ -191,42 +194,77 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i
-        if lexemes != NULL:
-            i = 0
-            while lexemes[i] != NULL:
-                tokens_v.push_back(lexemes[i])
-                i += 1
-            return 0
-        cdef uint64_t key = string.key
-        cdef size_t first_token = tokens_v.size()
-        cdef int split
-        cdef int remaining = string.n
-        cdef String prefix
-        cdef LexemeC* lexeme
-        while remaining >= 1:
-            split = self._split_one(string.chars, string.n)
-            remaining -= split
-            string_slice_prefix(string, &prefix, split)
-            lexemes = <LexemeC**>self.specials.get(prefix.key)
-            if lexemes != NULL:
-                i = 0
-                while lexemes[i] != NULL:
-                    tokens_v.push_back(lexemes[i])
-                    i += 1
-            else:
-                lexeme = <LexemeC*>self.lexicon.get(&prefix)
-                tokens_v.push_back(lexeme)
-        lexemes = <LexemeC**>calloc((tokens_v.size() - first_token) + 1, sizeof(LexemeC*))
-        cdef size_t j
-        for i, j in enumerate(range(first_token, tokens_v.size())):
-            lexemes[i] = tokens_v[0][j]
-        lexemes[i+1] = NULL
-        self.cache.set(key, lexemes)
-
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        return length
+        self._check_cache(tokens_v, string)
+        if not string.n:
+            return 0
+        cdef uint64_t orig_key = string.key
+        cdef size_t orig_size = tokens_v.size()
+
+        cdef vector[LexemeC*] prefixes
+        cdef vector[LexemeC*] suffixes
+
+        cdef String affix
+        cdef int split = self._find_prefix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_prefix(string, &affix, split)
+            prefixes.push_back(self.lexicon.get(&affix))
+            split = self._find_prefix(string.chars, string.n)
+
+        split = self._find_suffix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_suffix(string, &affix, split)
+            suffixes.push_back(self.lexicon.get(&affix))
+            split = self._find_suffix(string.chars, string.n)
+
+        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
+        self._save_cached(tokens_v, orig_key, orig_size)
+
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
+        lexemes = <LexemeC**>self.cache.get(string.key)
+        cdef size_t i = 0
+        if lexemes != NULL:
+            while lexemes[i] != NULL:
+                tokens.push_back(lexemes[i])
+                i += 1
+            string.n = 0
+            string.key = 0
+            string.chars = NULL
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1:
+        cdef LexemeC* lexeme
+        for lexeme in prefixes[0]:
+            tokens.push_back(lexeme)
+        self._check_cache(tokens, string)
+        if string.n != 0:
+            tokens.push_back(self.lexicon.get(string))
+        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+        while it != suffixes.rend():
+            tokens.push_back(deref(it))
+            preinc(it)
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens,
+                          uint64_t key, size_t n) except -1:
+        pass
+
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
+        return 0
+
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
+        if length < 2:
+            return 0
+        cdef unicode string = characters[:length]
+        print repr(string)
+        if string.endswith("'s") or string.endswith("'S"):
+            return 2
+        elif string.endswith("..."):
+            return 3
+        elif not string[-1].isalnum():
+            return 1
+        else:
+            return 0
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
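The base-class _find_suffix works from the right-hand end and reports how many characters to peel off per pass: two for a contraction ending, three for an ellipsis, one for any other trailing non-alphanumeric character. A pure-Python rendering of the same rules, plus the peeling loop they are meant to drive (find_suffix and peel_suffixes here are sketch names, not the Cython implementation):

    def find_suffix(s):
        """Return how many trailing characters to split off (0 = no suffix)."""
        if len(s) < 2:
            return 0
        if s.endswith("'s") or s.endswith("'S"):
            return 2
        if s.endswith("..."):
            return 3
        if not s[-1].isalnum():
            return 1
        return 0

    def peel_suffixes(chunk):
        """Repeatedly slice suffixes off the end, returning (stem, suffixes)."""
        suffixes = []
        n = find_suffix(chunk)
        while chunk and n:
            suffixes.append(chunk[-n:])
            chunk = chunk[:-n]
            n = find_suffix(chunk)
        # Suffixes were collected outside-in; reverse to restore reading order.
        return chunk, list(reversed(suffixes))

    print(peel_suffixes("world's..."))  # ('world', ["'s", '...'])
    print(peel_suffixes("done!)"))      # ('done', ['!', ')'])

Collecting suffixes outside-in and reversing them at attach time is what the reverse iterator (rbegin/rend with deref/preinc) does over the C++ vector in _attach_tokens above.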
@@ -328,3 +366,11 @@ cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
     s.chars += n
     s.n -= n
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
+cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
+    string_from_slice(suffix, s.chars, s.n - n, s.n)
+    s.n -= n
+    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
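Both slice helpers shrink the working string in place and re-hash the remainder, so later cache lookups key on the shortened text: the prefix version advances the character pointer, the suffix version copies the tail out and drops the length. A small Python model of the same bookkeeping (WorkString is a toy stand-in for the String struct, and Python's built-in hash() stands in for hash64):

    class WorkString:
        """Toy model of the String struct and the two slice helpers above."""

        def __init__(self, text):
            self.chars = text
            self.n = len(text)
            self.key = hash(text)

        def slice_prefix(self, n):
            prefix = self.chars[:n]
            self.chars = self.chars[n:]            # advance past the prefix
            self.n -= n
            self.key = hash(self.chars)            # re-key on the remainder
            return prefix

        def slice_suffix(self, n):
            suffix = self.chars[self.n - n:]
            self.chars = self.chars[:self.n - n]   # drop the suffix
            self.n -= n
            self.key = hash(self.chars)            # re-key on the remainder
            return suffix

    s = WorkString('"word"')
    assert s.slice_prefix(1) == '"' and s.chars == 'word"'
    assert s.slice_suffix(1) == '"' and s.chars == 'word'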