Mirror of https://github.com/explosion/spaCy.git
Refactor tokenization, splitting it into a clearer life-cycle.
parent c396581a0b
commit 143e51ec73
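The refactor splits Language._tokenize into a pipeline of smaller stages: _check_cache (emit a previously tokenized chunk directly), _find_prefix/_find_suffix (peel affixes off either end of the remaining string), _attach_tokens (emit prefixes, then the stem, then the suffixes in reverse), and _save_cached (still a stub in this commit). A schematic pure-Python sketch of that life-cycle — illustrative only; `cache` as a dict and `lexicon` as a callable are stand-ins for the C structures in the diff:

    def tokenize_chunk(chunk, cache, lexicon, find_prefix, find_suffix):
        # _check_cache: a chunk seen before is emitted directly.
        if chunk in cache:
            return list(cache[chunk])
        prefixes, suffixes = [], []
        while chunk:
            n = find_prefix(chunk)
            if n < 1:
                break
            prefixes.append(lexicon(chunk[:n]))    # peel leading affixes
            chunk = chunk[n:]
        while chunk:
            n = find_suffix(chunk)
            if n < 1:
                break
            suffixes.append(lexicon(chunk[-n:]))   # peel trailing affixes, outside-in
            chunk = chunk[:-n]
        # _attach_tokens: prefixes first, then the stem, then suffixes in reverse.
        tokens = list(prefixes)
        if chunk:
            tokens.append(lexicon(chunk))
        tokens.extend(reversed(suffixes))
        # _save_cached would store the result under the original key;
        # in this commit it is still a `pass` stub.
        return tokens

In the committed Cython, _attach_tokens additionally re-runs _check_cache on the stem once the affixes are stripped, so an already-cached stem can still be emitted wholesale.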
@@ -94,3 +94,8 @@ you'll  you will
 you're  you are
 you've  you have
 10km    10 km
+U.S.    U.S.
+U.N.    U.N.
+Ms. Ms.
+Mr. Mr.
+P.  P.
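These rows extend the special-case table: the left column is a raw whitespace-delimited chunk, the right column the tokens it should expand to, so mapping U.S. to itself keeps the trailing period attached rather than letting the suffix pass split it off. A minimal sketch of a loader for this two-column format (hypothetical helper, not code from the commit):

    def load_special_rules(path):
        # Each non-empty line: "<chunk>  <token1 token2 ...>".
        rules = {}
        with open(path, encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    continue
                chunk, tokens = line.split(None, 1)
                rules[chunk] = tokens.split()   # "you'll" -> ["you", "will"]
        return rules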
@@ -4,4 +4,4 @@ from spacy.tokens cimport Tokens


 cdef class English(Language):
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    pass
spacy/en.pyx (57 lines changed)
@@ -56,26 +56,47 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        if length == 1:
-            return 1
-        if characters[0] == "'" and (characters[1] == "s" or characters[1] == "S"):
-            return 2
-        cdef int i = 0
-        # Leading punctuation
-        if _check_punct(characters, 0, length):
-            return 1
-        # Contractions
-        elif length >= 3 and characters[length - 2] == "'":
-            c2 = characters[length-1]
-            if c2 == "s" or c2 == "S":
-                return length - 2
-        if length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not _check_punct(characters, i, length):
-                i += 1
-        return i
+    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef Py_UNICODE c0 = chars[0]
+        cdef Py_UNICODE c1 = chars[1]
+        if c0 == ",":
+            return 1
+        elif c0 == '"':
+            return 1
+        elif c0 == "(":
+            return 1
+        elif c0 == "[":
+            return 1
+        elif c0 == "{":
+            return 1
+        elif c0 == "*":
+            return 1
+        elif c0 == "<":
+            return 1
+        elif c0 == "$":
+            return 1
+        elif c0 == "£":
+            return 1
+        elif c0 == "€":
+            return 1
+        elif c0 == "\u201c":
+            return 1
+        elif c0 == "'":
+            if c1 == "s":
+                return 2
+            elif c1 == "S":
+                return 2
+            elif c1 == "'":
+                return 2
+            else:
+                return 1
+        elif c0 == "`":
+            if c1 == "`":
+                return 2
+            else:
+                return 1
+        else:
+            return 0
 
 abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
 cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
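English._find_prefix encodes the leading-affix rules as an explicit decision table: it returns how many characters to split off the front, with 0 meaning no prefix. An illustrative pure-Python rendering of the same table (note that the committed Cython reads chars[1] unconditionally, so it implicitly relies on chunks of length >= 2 or on a terminated buffer):

    OPEN_CHARS = set(',"([{*<$£€\u201c')   # opening punctuation and currency marks

    def find_prefix(chars):
        if not chars:
            return 0
        c0 = chars[0]
        c1 = chars[1] if len(chars) > 1 else ''
        if c0 in OPEN_CHARS:
            return 1
        if c0 == "'":
            return 2 if c1 in ("s", "S", "'") else 1   # 's / 'S / '' contractions
        if c0 == "`":
            return 2 if c1 == "`" else 1               # `` opening quote
        return 0

For example, find_prefix('"Hello') returns 1, and find_prefix("'s") returns 2.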
@@ -42,5 +42,14 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
spacy/lang.pyx (114 lines changed)
@@ -20,6 +20,9 @@ from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
 
+from cython.operator cimport preincrement as preinc
+from cython.operator cimport dereference as deref
+
 
 from spacy._hashing cimport PointerHash
 from spacy import orth
@@ -191,42 +194,77 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i
-        if lexemes != NULL:
-            i = 0
-            while lexemes[i] != NULL:
-                tokens_v.push_back(lexemes[i])
-                i += 1
-        cdef uint64_t key = string.key
-        cdef size_t first_token = tokens_v.size()
-        cdef int split
-        cdef int remaining = string.n
-        cdef String prefix
-        cdef LexemeC* lexeme
-        while remaining >= 1:
-            split = self._split_one(string.chars, string.n)
-            remaining -= split
-            string_slice_prefix(string, &prefix, split)
-            lexemes = <LexemeC**>self.specials.get(prefix.key)
-            if lexemes != NULL:
-                i = 0
-                while lexemes[i] != NULL:
-                    tokens_v.push_back(lexemes[i])
-                    i += 1
-            else:
-                lexeme = <LexemeC*>self.lexicon.get(&prefix)
-                tokens_v.push_back(lexeme)
-        lexemes = <LexemeC**>calloc((tokens_v.size() - first_token) + 1, sizeof(LexemeC*))
-        cdef size_t j
-        for i, j in enumerate(range(first_token, tokens_v.size())):
-            lexemes[i] = tokens_v[0][j]
-        lexemes[i+1] = NULL
-        self.cache.set(key, lexemes)
-
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        return length
+        self._check_cache(tokens_v, string)
+        if not string.n:
+            return 0
+        cdef uint64_t orig_key = string.key
+        cdef size_t orig_size = tokens_v.size()
+
+        cdef vector[LexemeC*] prefixes
+        cdef vector[LexemeC*] suffixes
+
+        cdef String affix
+        cdef int split = self._find_prefix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_prefix(string, &affix, split)
+            prefixes.push_back(self.lexicon.get(&affix))
+            split = self._find_prefix(string.chars, string.n)
+
+        split = self._find_suffix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_suffix(string, &affix, split)
+            suffixes.push_back(self.lexicon.get(&affix))
+            split = self._find_suffix(string.chars, string.n)
+
+        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
+        self._save_cached(tokens_v, orig_key, orig_size)
+
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
+        lexemes = <LexemeC**>self.cache.get(string.key)
+        cdef size_t i = 0
+        if lexemes != NULL:
+            while lexemes[i] != NULL:
+                tokens.push_back(lexemes[i])
+                i += 1
+            string.n = 0
+            string.key = 0
+            string.chars = NULL
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1:
+        cdef LexemeC* lexeme
+        for lexeme in prefixes[0]:
+            tokens.push_back(lexeme)
+        self._check_cache(tokens, string)
+        if string.n != 0:
+            tokens.push_back(self.lexicon.get(string))
+        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+        while it != suffixes.rend():
+            tokens.push_back(deref(it))
+            preinc(it)
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens,
+                          uint64_t key, size_t n) except -1:
+        pass
+
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
+        return 0
+
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
+        if length < 2:
+            return 0
+        cdef unicode string = characters[:length]
+        print repr(string)
+        if string.endswith("'s") or string.endswith("'S"):
+            return 2
+        elif string.endswith("..."):
+            return 3
+        elif not string[-1].isalnum():
+            return 1
+        else:
+            return 0
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
@@ -328,3 +366,11 @@ cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
     s.chars += n
     s.n -= n
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
+cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
+    string_from_slice(suffix, s.chars, s.n - n, s.n)
+    s.n -= n
+    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
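_find_suffix peels trailing affixes from the outside in, which is why _attach_tokens walks the collected suffixes with a C++ reverse_iterator — the reason for the new preinc/deref cimports. A small check of that ordering, assuming one trailing punctuation character is split per pass:

    suffixes = []
    chunk = 'goodbye).'
    while chunk and not chunk[-1].isalnum():    # mimic the one-char suffix rule
        suffixes.append(chunk[-1])              # collected outside-in: '.', then ')'
        chunk = chunk[:-1]
    assert chunk == 'goodbye'
    assert suffixes == ['.', ')']
    assert list(reversed(suffixes)) == [')', '.']   # reading order restored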