diff --git a/data/en/tokenization b/data/en/tokenization
index 09f8c1e62..1190cdf59 100644
--- a/data/en/tokenization
+++ b/data/en/tokenization
@@ -94,3 +94,8 @@ you'll you will
 you're you are
 you've you have
 10km 10 km
+U.S. U.S.
+U.N. U.N.
+Ms. Ms.
+Mr. Mr.
+P. P.
diff --git a/spacy/en.pxd b/spacy/en.pxd
index c3c605f1f..5160a1177 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -4,4 +4,4 @@ from spacy.tokens cimport Tokens
 
 
 cdef class English(Language):
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    pass
diff --git a/spacy/en.pyx b/spacy/en.pyx
index b346afda1..e3d2ec7a1 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -56,27 +56,48 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        if length == 1:
+    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef Py_UNICODE c0 = chars[0]
+        cdef Py_UNICODE c1 = chars[1]
+        if c0 == ",":
             return 1
-        if characters[0] == "'" and (characters[1] == "s" or characters[1] == "S"):
-            return 2
-        cdef int i = 0
-        # Leading punctuation
-        if _check_punct(characters, 0, length):
+        elif c0 == '"':
             return 1
-        # Contractions
-        elif length >= 3 and characters[length - 2] == "'":
-            c2 = characters[length-1]
-            if c2 == "s" or c2 == "S":
-                return length - 2
-        if length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not _check_punct(characters, i, length):
-                i += 1
-            return i
-
+        elif c0 == "(":
+            return 1
+        elif c0 == "[":
+            return 1
+        elif c0 == "{":
+            return 1
+        elif c0 == "*":
+            return 1
+        elif c0 == "<":
+            return 1
+        elif c0 == "$":
+            return 1
+        elif c0 == "£":
+            return 1
+        elif c0 == "€":
+            return 1
+        elif c0 == "\u201c":
+            return 1
+        elif c0 == "'":
+            if c1 == "s":
+                return 2
+            elif c1 == "S":
+                return 2
+            elif c1 == "'":
+                return 2
+            else:
+                return 1
+        elif c0 == "`":
+            if c1 == "`":
+                return 2
+            else:
+                return 1
+        else:
+            return 0
+abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
 
 cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
     cdef unicode char_i = characters[i]
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 93d813d1c..d154a77d3 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -42,5 +42,14 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
+
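
The new English._find_prefix hook above answers a single question: how many leading characters of a chunk should be split off as their own token. A plain-Python paraphrase may make the rule table easier to read; the helper name find_prefix, the ONE_CHAR_PREFIXES set, and the guard for one-character input are illustrative additions, not part of the patch:

    # Illustrative paraphrase of English._find_prefix from the diff above.
    # Returns how many leading characters form a prefix token, 0 if none.
    ONE_CHAR_PREFIXES = set(',"([{*<$£€\u201c')

    def find_prefix(chars):
        c0 = chars[0]
        c1 = chars[1] if len(chars) > 1 else ''
        if c0 in ONE_CHAR_PREFIXES:
            return 1
        if c0 == "'":
            # "'s", "'S" and "''" split off as two characters, a lone quote as one.
            return 2 if c1 in ("s", "S", "'") else 1
        if c0 == "`":
            return 2 if c1 == '`' else 1
        return 0

For example, find_prefix('"Hello') returns 1, so the opening quote becomes its own token and the tokenizer keeps scanning the remainder.
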
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index bd205b233..b8e5256d6 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -20,6 +20,9 @@ from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
 
+from cython.operator cimport preincrement as preinc
+from cython.operator cimport dereference as deref
+
 from spacy._hashing cimport PointerHash
 
 from spacy import orth
@@ -191,42 +194,77 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef LexemeC** lexemes = self.cache.get(string.key)
-        cdef size_t i
-        if lexemes != NULL:
-            i = 0
-            while lexemes[i] != NULL:
-                tokens_v.push_back(lexemes[i])
-                i += 1
+        self._check_cache(tokens_v, string)
+        if not string.n:
             return 0
-        cdef uint64_t key = string.key
-        cdef size_t first_token = tokens_v.size()
-        cdef int split
-        cdef int remaining = string.n
-        cdef String prefix
-        cdef LexemeC* lexeme
-        while remaining >= 1:
-            split = self._split_one(string.chars, string.n)
-            remaining -= split
-            string_slice_prefix(string, &prefix, split)
-            lexemes = self.specials.get(prefix.key)
-            if lexemes != NULL:
-                i = 0
-                while lexemes[i] != NULL:
-                    tokens_v.push_back(lexemes[i])
-                    i += 1
-            else:
-                lexeme = self.lexicon.get(&prefix)
-                tokens_v.push_back(lexeme)
-        lexemes = calloc((tokens_v.size() - first_token) + 1, sizeof(LexemeC*))
-        cdef size_t j
-        for i, j in enumerate(range(first_token, tokens_v.size())):
-            lexemes[i] = tokens_v[0][j]
-        lexemes[i+1] = NULL
-        self.cache.set(key, lexemes)
+        cdef uint64_t orig_key = string.key
+        cdef size_t orig_size = tokens_v.size()
 
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        return length
+        cdef vector[LexemeC*] prefixes
+        cdef vector[LexemeC*] suffixes
+
+        cdef String affix
+        cdef int split = self._find_prefix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_prefix(string, &affix, split)
+            prefixes.push_back(self.lexicon.get(&affix))
+            split = self._find_prefix(string.chars, string.n)
+
+        split = self._find_suffix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_suffix(string, &affix, split)
+            suffixes.push_back(self.lexicon.get(&affix))
+            split = self._find_suffix(string.chars, string.n)
+
+        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
+        self._save_cached(tokens_v, orig_key, orig_size)
+
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
+        lexemes = self.cache.get(string.key)
+        cdef size_t i = 0
+        if lexemes != NULL:
+            while lexemes[i] != NULL:
+                tokens.push_back(lexemes[i])
+                i += 1
+            string.n = 0
+            string.key = 0
+            string.chars = NULL
+
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1:
+        cdef LexemeC* lexeme
+        for lexeme in prefixes[0]:
+            tokens.push_back(lexeme)
+        self._check_cache(tokens, string)
+        if string.n != 0:
+            tokens.push_back(self.lexicon.get(string))
+        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+        while it != suffixes.rend():
+            tokens.push_back(deref(it))
+            preinc(it)
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens,
+                          uint64_t key, size_t n) except -1:
+        pass
+
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
+        return 0
+
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
+        if length < 2:
+            return 0
+        cdef unicode string = characters[:length]
+        print repr(string)
+        if string.endswith("'s") or string.endswith("'S"):
+            return 2
+        elif string.endswith("..."):
+            return 3
+        elif not string[-1].isalnum():
+            return 1
+        else:
+            return 0
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
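
The control flow introduced by the rewritten _tokenize, _check_cache and _attach_tokens above is easier to follow without the C-level plumbing. The sketch below is a plain-Python approximation under simplifying assumptions (no cache, no specials table, no String struct; find_prefix is the illustrative helper from the note above, and find_suffix/tokenize_chunk are likewise illustrative names): affixes are peeled off both ends, then emitted as prefixes, the remaining core, and the suffixes in reverse order of removal.

    def find_suffix(chars):
        # Mirrors Language._find_suffix in the hunk above: possessive 's / 'S,
        # an ellipsis, or a single trailing non-alphanumeric character.
        if len(chars) < 2:
            return 0
        if chars.endswith("'s") or chars.endswith("'S"):
            return 2
        if chars.endswith("..."):
            return 3
        if not chars[-1].isalnum():
            return 1
        return 0

    def tokenize_chunk(chunk, find_prefix, find_suffix):
        # Approximates _tokenize + _attach_tokens: strip prefixes from the front
        # and suffixes from the back, then emit prefixes, the remaining core,
        # and the stripped suffixes in reverse order.
        prefixes, suffixes = [], []
        while chunk:
            n = find_prefix(chunk)
            if not n:
                break
            prefixes.append(chunk[:n])
            chunk = chunk[n:]
        while chunk:
            n = find_suffix(chunk)
            if not n:
                break
            suffixes.append(chunk[-n:])
            chunk = chunk[:-n]
        core = [chunk] if chunk else []
        return prefixes + core + suffixes[::-1]
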
@@ -328,3 +366,11 @@ cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
     s.chars += n
     s.n -= n
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
+cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
+    string_from_slice(suffix, s.chars, s.n - n, s.n)
+    s.n -= n
+    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
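
The string_slice_suffix helper added here mirrors string_slice_prefix: it copies the last n characters into the affix slot, shortens the remaining string, and recomputes its hash key. Combining the two illustrative sketches above, the decomposition the patch is aiming for looks like this (expected values are mine, derived from the rules in the diff rather than from running the patch):

    # Expected splits under the prefix/suffix rules shown in the diff above.
    assert tokenize_chunk("($1.50)", find_prefix, find_suffix) == ["(", "$", "1.50", ")"]
    assert tokenize_chunk("Hello...", find_prefix, find_suffix) == ["Hello", "..."]
    assert tokenize_chunk("Anne's", find_prefix, find_suffix) == ["Anne", "'s"]

Abbreviations such as the U.S., U.N., Ms., Mr. and P. entries added to data/en/tokenization are presumably meant to be handled by the special-case rules loaded in _load_special_tokenization rather than by this affix splitting, which the sketch deliberately leaves out.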