Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-10 19:57:17 +03:00)
* Refactor tokenization, enable the cache, and ensure special cases are looked up correctly even when confusing punctuation surrounds the token.
Commit 0152831c89 (parent 143e51ec73)
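The behaviour being targeted, restated from the tests added in this commit (a usage sketch only; the token counts simply mirror the new test assertions):

    from spacy.en import EN

    EN.tokenize("can't")      # 2 tokens: the special-case split still applies
    EN.tokenize("(can't")     # 3 tokens: special-case found after the "(" prefix is peeled off
    EN.tokenize("can't)")     # 3 tokens: ... and after the ")" suffix is peeled off
    EN.tokenize("(can't)")    # 4 tokens: ... or both
    EN.tokenize("(U.S.)")     # 3 tokens: "U.S." survives as one token inside brackets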
spacy/en.pyx (62 lines changed)
@@ -56,67 +56,7 @@ cdef class English(Language):
    name (unicode): The two letter code used by Wikipedia for the language.
    lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
        cdef Py_UNICODE c0 = chars[0]
        cdef Py_UNICODE c1 = chars[1]
        if c0 == ",":
            return 1
        elif c0 == '"':
            return 1
        elif c0 == "(":
            return 1
        elif c0 == "[":
            return 1
        elif c0 == "{":
            return 1
        elif c0 == "*":
            return 1
        elif c0 == "<":
            return 1
        elif c0 == "$":
            return 1
        elif c0 == "£":
            return 1
        elif c0 == "€":
            return 1
        elif c0 == "\u201c":
            return 1
        elif c0 == "'":
            if c1 == "s":
                return 2
            elif c1 == "S":
                return 2
            elif c1 == "'":
                return 2
            else:
                return 1
        elif c0 == "`":
            if c1 == "`":
                return 2
            else:
                return 1
        else:
            return 0


abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
    cdef unicode char_i = characters[i]
    cdef unicode char_i1 = characters[i+1]
    # Don't count apostrophes as punct if the next char is a letter
    if characters[i] == "'" and i < (length - 1) and char_i1.isalpha():
        return i == 0
    if characters[i] == "-":
        return False
        #and i < (length - 1) and characters[i+1] == '-':
        #return False
    # Don't count commas as punct if the next char is a number
    if characters[i] == "," and i < (length - 1) and char_i1.isdigit():
        return False
    if characters[i] == "." and i < (length - 1):
        return False
    if characters[i] == "." and characters[:i] in abbreviations:
        return False
    return not char_i.isalnum()
    pass


EN = English('en', [], [])
@@ -25,7 +25,7 @@ cdef class Lexicon:
    cpdef readonly size_t size

    cpdef Lexeme lookup(self, unicode string)
    cdef LexemeC* get(self, String* s)
    cdef LexemeC* get(self, String* s) except NULL

    cdef PointerHash _dict
spacy/lang.pyx (194 lines changed)
@@ -185,7 +185,11 @@ cdef class Language:
            if Py_UNICODE_ISSPACE(c) == 1:
                if start < i:
                    string_from_slice(&span, chars, start, i)
                    self._tokenize(tokens.v, &span)
                    try:
                        self._tokenize(tokens.v, &span)
                    except MemoryError:
                        print chars[start:i]
                        raise
                start = i + 1
            i += 1
        if start < i:
@@ -194,28 +198,61 @@ cdef class Language:
        return tokens

    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
        self._check_cache(tokens_v, string)
        if not string.n:
        cdef size_t i
        lexemes = <LexemeC**>self.cache.get(string.key)
        if lexemes != NULL:
            i = 0
            while lexemes[i] != NULL:
                tokens.push_back(lexemes[i])
                i += 1
            return 0

        cdef uint64_t orig_key = string.key
        cdef size_t orig_size = tokens_v.size()

        cdef vector[LexemeC*] prefixes
        cdef vector[LexemeC*] suffixes

        cdef String affix
        cdef int split = self._find_prefix(string.chars, string.n)
        while string.n and split >= 1:
            string_slice_prefix(string, &affix, split)
            prefixes.push_back(self.lexicon.get(&affix))
            split = self._find_prefix(string.chars, string.n)
        cdef String prefix
        cdef String suffix
        cdef String minus_pre
        cdef String minus_suf
        cdef size_t last_size = 0
        while string.n != 0 and string.n != last_size:
            last_size = string.n
            pre_len = self._find_prefix(string.chars, string.n)
            if pre_len != 0:
                string_from_slice(&prefix, string.chars, 0, pre_len)
                string_from_slice(&minus_pre, string.chars, pre_len, string.n)
                # Check whether we've hit a special-case
                if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
                    string = &minus_pre
                    prefixes.push_back(self.lexicon.get(&prefix))
                    break
            suf_len = self._find_suffix(string.chars, string.n)
            if suf_len != 0:
                string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
                string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
                # Check whether we've hit a special-case
                if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
                    string = &minus_suf
                    suffixes.push_back(self.lexicon.get(&suffix))
                    break

            if pre_len and suf_len and (pre_len + suf_len) <= string.n:
                string_from_slice(string, string.chars, pre_len, string.n - suf_len)
                prefixes.push_back(self.lexicon.get(&prefix))
                suffixes.push_back(self.lexicon.get(&suffix))
            elif pre_len:
                string = &minus_pre
                prefixes.push_back(self.lexicon.get(&prefix))
            elif suf_len:
                string = &minus_suf
                suffixes.push_back(self.lexicon.get(&suffix))

            if self.specials.get(string.key):
                break

        split = self._find_suffix(string.chars, string.n)
        while string.n and split >= 1:
            string_slice_suffix(string, &affix, split)
            suffixes.push_back(self.lexicon.get(&affix))
            split = self._find_suffix(string.chars, string.n)

        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
        self._save_cached(tokens_v, orig_key, orig_size)
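In rough Python pseudocode, the affix-splitting loop above works like the following sketch (not the actual implementation; find_prefix, find_suffix, lexicon_get and specials stand in for the Cython helpers and tables):

    def split_affixes(string, specials, find_prefix, find_suffix, lexicon_get):
        # Peel punctuation off both ends until the remainder is empty, is a
        # known special-case, or no further affix can be found.
        prefixes = []
        suffixes = []
        while string:
            pre_len = find_prefix(string)
            if pre_len:
                remainder = string[pre_len:]
                if remainder in specials:
                    # Stop early: the remainder has a special-case tokenization.
                    prefixes.append(lexicon_get(string[:pre_len]))
                    string = remainder
                    break
            suf_len = find_suffix(string)
            if suf_len:
                remainder = string[:len(string) - suf_len]
                if remainder in specials:
                    suffixes.append(lexicon_get(string[len(string) - suf_len:]))
                    string = remainder
                    break
            if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                prefixes.append(lexicon_get(string[:pre_len]))
                suffixes.append(lexicon_get(string[len(string) - suf_len:]))
                string = string[pre_len:len(string) - suf_len]
            elif pre_len:
                prefixes.append(lexicon_get(string[:pre_len]))
                string = string[pre_len:]
            elif suf_len:
                suffixes.append(lexicon_get(string[len(string) - suf_len:]))
                string = string[:len(string) - suf_len]
            else:
                break  # no affix found; mirrors the string.n != last_size guard
            if string in specials:
                break
        return prefixes, string, suffixes

The caller then emits the prefixes, the remaining core (via the special-cases table or the lexicon), and the suffixes in reverse order of peeling, which is what _attach_tokens does below with the reverse iterator.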
@@ -230,16 +267,23 @@ cdef class Language:
        string.key = 0
        string.chars = NULL


    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
                            vector[LexemeC*] *prefixes,
                            vector[LexemeC*] *suffixes) except -1:
        cdef size_t i
        cdef LexemeC** lexemes
        cdef LexemeC* lexeme
        for lexeme in prefixes[0]:
        for lexeme in deref(prefixes):
            tokens.push_back(lexeme)
        self._check_cache(tokens, string)
        if string.n != 0:
            tokens.push_back(self.lexicon.get(string))
            lexemes = <LexemeC**>self.specials.get(string.key)
            if lexemes != NULL:
                i = 0
                while lexemes[i] != NULL:
                    tokens.push_back(lexemes[i])
                    i += 1
            else:
                tokens.push_back(self.lexicon.get(string))
        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
        while it != suffixes.rend():
            tokens.push_back(deref(it))
@@ -247,22 +291,100 @@ cdef class Language:

    cdef int _save_cached(self, vector[LexemeC*] *tokens,
                          uint64_t key, size_t n) except -1:
        pass

    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
        return 0

    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
        if length < 2:
            return 0
        cdef unicode string = characters[:length]
        print repr(string)
        if string.endswith("'s") or string.endswith("'S"):
            return 2
        elif string.endswith("..."):
            return 3
        elif not string[-1].isalnum():
        assert tokens.size() > n
        lexemes = <LexemeC**>calloc((tokens.size() - n) + 1, sizeof(LexemeC**))
        cdef size_t i, j
        for i, j in enumerate(range(n, tokens.size())):
            lexemes[i] = tokens.at(j)
        lexemes[i + 1] = NULL
        self.cache.set(key, lexemes)

    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
        cdef Py_UNICODE c0 = chars[0]
        cdef Py_UNICODE c1 = chars[1]
        if c0 == ",":
            return 1
        elif c0 == '"':
            return 1
        elif c0 == "(":
            return 1
        elif c0 == "[":
            return 1
        elif c0 == "{":
            return 1
        elif c0 == "*":
            return 1
        elif c0 == "<":
            return 1
        elif c0 == "$":
            return 1
        elif c0 == "£":
            return 1
        elif c0 == "€":
            return 1
        elif c0 == "\u201c":
            return 1
        elif c0 == "'":
            return 1
        elif c0 == "`":
            if c1 == "`":
                return 2
            else:
                return 1
        else:
            return 0

    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
        cdef Py_UNICODE c0 = chars[length - 1]
        cdef Py_UNICODE c1 = chars[length - 2] if length >= 2 else 0
        cdef Py_UNICODE c2 = chars[length - 3] if length >= 3 else 0

        if c0 == ",":
            return 1
        elif c0 == '"':
            return 1
        elif c0 == ')':
            return 1
        elif c0 == ']':
            return 1
        elif c0 == '}':
            return 1
        elif c0 == '*':
            return 1
        elif c0 == '!':
            return 1
        elif c0 == '?':
            return 1
        elif c0 == '%':
            return 1
        elif c0 == '$':
            return 1
        elif c0 == '>':
            return 1
        elif c0 == ':':
            return 1
        elif c0 == "'":
            return 1
        elif c0 == u'\u201d':
            return 1
        elif c0 == "s":
            if c1 == "'":
                return 2
            else:
                return 0
        elif c0 == "S":
            if c1 == "'":
                return 2
            else:
                return 0
        elif c0 == ".":
            if c1 == ".":
                if c2 == ".":
                    return 3
                else:
                    return 2
            else:
                return 1
        else:
            return 0
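For reference, a dict-based model of the caching scheme (a simplification only: the real code uses a PointerHash keyed by the 64-bit string hash and NULL-terminated LexemeC* arrays, and _check_cache is assumed here to report a hit rather than zeroing the string in place):

    class TokenCacheSketch:
        """Toy model of the tokenizer cache, not the actual implementation."""

        def __init__(self):
            self._cache = {}

        def check(self, key, tokens):
            # Roughly what _check_cache does: replay a previously saved split.
            cached = self._cache.get(key)
            if cached is None:
                return False
            tokens.extend(cached)
            return True

        def save(self, key, tokens, orig_size):
            # Roughly what _save_cached does: remember every lexeme appended
            # for this chunk, keyed by the chunk's original hash.
            assert len(tokens) > orig_size
            self._cache[key] = list(tokens[orig_size:])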
@@ -316,7 +438,7 @@ cdef class Lexicon:
        self._dict.set(string.key, lexeme)
        self.size += 1

    cdef LexemeC* get(self, String* string):
    cdef LexemeC* get(self, String* string) except NULL:
        cdef LexemeC* lexeme
        lexeme = <LexemeC*>self._dict.get(string.key)
        if lexeme != NULL:
@@ -372,5 +494,3 @@ cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
    string_from_slice(suffix, s.chars, s.n - n, s.n)
    s.n -= n
    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
tests/test_is_punct.py (new file, 16 lines)

@@ -0,0 +1,16 @@
from __future__ import unicode_literals


from spacy.orth import is_punct


def test_comma():
    assert is_punct(',', 0, {}, {}) == True


def test_space():
    assert is_punct(' ', 0, {}, {}) == False


def test_letter():
    assert is_punct('a', 0, {}, {}) == False
tests/test_only_punct.py (new file, 14 lines)

@@ -0,0 +1,14 @@
from __future__ import unicode_literals
import pytest

from spacy.en import EN

def test_only_pre1():
    assert len(EN.tokenize("(")) == 1


def test_only_pre2():
    assert len(EN.tokenize("((")) == 2

def test_only_suf2():
    assert len(EN.tokenize("''")) == 2
tests/test_special_affix.py (new file, 45 lines)

@@ -0,0 +1,45 @@
"""Test entries in the tokenization special-case interacting with prefix
and suffix punctuation."""
from __future__ import unicode_literals
import pytest

from spacy.en import EN

def test_no_special():
    assert len(EN.tokenize("(can)")) == 3

def test_no_punct():
    assert len(EN.tokenize("can't")) == 2

def test_prefix():
    assert len(EN.tokenize("(can't")) == 3


def test_suffix():
    assert len(EN.tokenize("can't)")) == 3


def test_wrap():
    assert len(EN.tokenize("(can't)")) == 4


def test_uneven_wrap():
    assert len(EN.tokenize("(can't?)")) == 5


def test_prefix_interact():
    assert len(EN.tokenize("U.S.")) == 1
    assert len(EN.tokenize("us.")) == 2
    assert len(EN.tokenize("(U.S.")) == 2


def test_suffix_interact():
    assert len(EN.tokenize("U.S.)")) == 2


def test_even_wrap_interact():
    assert len(EN.tokenize("(U.S.)")) == 3


def test_uneven_wrap_interact():
    assert len(EN.tokenize("(U.S.?)")) == 4