* Refactor tokenization, splitting it into a clearer life-cycle.

This commit is contained in:
Matthew Honnibal 2014-09-16 13:16:02 +02:00
parent c396581a0b
commit 143e51ec73
5 changed files with 136 additions and 55 deletions

View File

@ -94,3 +94,8 @@ you'll you will
you're you are you're you are
you've you have you've you have
10km 10 km 10km 10 km
U.S. U.S.
U.N. U.N.
Ms. Ms.
Mr. Mr.
P. P.

View File

@ -4,4 +4,4 @@ from spacy.tokens cimport Tokens
cdef class English(Language): cdef class English(Language):
cdef int _split_one(self, Py_UNICODE* characters, size_t length) pass

View File

@ -56,26 +56,47 @@ cdef class English(Language):
name (unicode): The two letter code used by Wikipedia for the language. name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method. lexicon (Lexicon): The lexicon. Exposes the lookup method.
""" """
cdef int _split_one(self, Py_UNICODE* characters, size_t length): cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
if length == 1: cdef Py_UNICODE c0 = chars[0]
cdef Py_UNICODE c1 = chars[1]
if c0 == ",":
return 1 return 1
if characters[0] == "'" and (characters[1] == "s" or characters[1] == "S"): elif c0 == '"':
return 1
elif c0 == "(":
return 1
elif c0 == "[":
return 1
elif c0 == "{":
return 1
elif c0 == "*":
return 1
elif c0 == "<":
return 1
elif c0 == "$":
return 1
elif c0 == "£":
return 1
elif c0 == "":
return 1
elif c0 == "\u201c":
return 1
elif c0 == "'":
if c1 == "s":
return 2 return 2
cdef int i = 0 elif c1 == "S":
# Leading punctuation return 2
if _check_punct(characters, 0, length): elif c1 == "'":
return 2
else:
return 1 return 1
# Contractions elif c0 == "`":
elif length >= 3 and characters[length - 2] == "'": if c1 == "`":
c2 = characters[length-1] return 2
if c2 == "s" or c2 == "S": else:
return length - 2 return 1
if length >= 1: else:
# Split off all trailing punctuation characters return 0
i = 0
while i < length and not _check_punct(characters, i, length):
i += 1
return i
abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P']) abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length): cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):

View File

@ -42,5 +42,14 @@ cdef class Language:
cpdef Tokens tokenize(self, unicode text) cpdef Tokens tokenize(self, unicode text)
cpdef Lexeme lookup(self, unicode text) cpdef Lexeme lookup(self, unicode text)
cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1 cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
cdef int _split_one(self, Py_UNICODE* characters, size_t length) cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except -1
cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1

View File

@ -20,6 +20,9 @@ from spacy.lexeme cimport LexemeC, lexeme_init
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from cpython.ref cimport Py_INCREF from cpython.ref cimport Py_INCREF
from cython.operator cimport preincrement as preinc
from cython.operator cimport dereference as deref
from spacy._hashing cimport PointerHash from spacy._hashing cimport PointerHash
from spacy import orth from spacy import orth
@ -191,42 +194,77 @@ cdef class Language:
return tokens return tokens
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1: cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key) self._check_cache(tokens_v, string)
cdef size_t i if not string.n:
if lexemes != NULL:
i = 0
while lexemes[i] != NULL:
tokens_v.push_back(lexemes[i])
i += 1
return 0 return 0
cdef uint64_t key = string.key cdef uint64_t orig_key = string.key
cdef size_t first_token = tokens_v.size() cdef size_t orig_size = tokens_v.size()
cdef int split
cdef int remaining = string.n
cdef String prefix
cdef LexemeC* lexeme
while remaining >= 1:
split = self._split_one(string.chars, string.n)
remaining -= split
string_slice_prefix(string, &prefix, split)
lexemes = <LexemeC**>self.specials.get(prefix.key)
if lexemes != NULL:
i = 0
while lexemes[i] != NULL:
tokens_v.push_back(lexemes[i])
i += 1
else:
lexeme = <LexemeC*>self.lexicon.get(&prefix)
tokens_v.push_back(lexeme)
lexemes = <LexemeC**>calloc((tokens_v.size() - first_token) + 1, sizeof(LexemeC*))
cdef size_t j
for i, j in enumerate(range(first_token, tokens_v.size())):
lexemes[i] = tokens_v[0][j]
lexemes[i+1] = NULL
self.cache.set(key, lexemes)
cdef int _split_one(self, Py_UNICODE* characters, size_t length): cdef vector[LexemeC*] prefixes
return length cdef vector[LexemeC*] suffixes
cdef String affix
cdef int split = self._find_prefix(string.chars, string.n)
while string.n and split >= 1:
string_slice_prefix(string, &affix, split)
prefixes.push_back(self.lexicon.get(&affix))
split = self._find_prefix(string.chars, string.n)
split = self._find_suffix(string.chars, string.n)
while string.n and split >= 1:
string_slice_suffix(string, &affix, split)
suffixes.push_back(self.lexicon.get(&affix))
split = self._find_suffix(string.chars, string.n)
self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
self._save_cached(tokens_v, orig_key, orig_size)
cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
    """Append the cached tokenization of ``string`` to ``tokens``, if any.

    Looks up ``string.key`` in ``self.cache``. On a hit, every lexeme in the
    NULL-terminated cached array is pushed onto ``tokens`` and ``string`` is
    emptied (``n``, ``key`` and ``chars`` are zeroed) so the caller can tell
    that nothing is left to tokenize. On a miss, neither argument is touched.
    """
    cdef LexemeC** cached = <LexemeC**>self.cache.get(string.key)
    cdef size_t idx = 0
    if cached == NULL:
        return 0
    while cached[idx] != NULL:
        tokens.push_back(cached[idx])
        idx += 1
    # The whole string was served from the cache: mark it as consumed.
    string.n = 0
    string.key = 0
    string.chars = NULL
    return 0
cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
                        vector[LexemeC*] *prefixes,
                        vector[LexemeC*] *suffixes) except -1:
    """Assemble the final token sequence for one chunk onto ``tokens``.

    Order of output: the prefixes in the order they were collected, then the
    remaining middle ``string`` (taken from the cache when available,
    otherwise looked up in the lexicon), then the suffixes in reverse
    collection order so they come out left-to-right.
    """
    cdef LexemeC* prefix_lex
    cdef vector[LexemeC*].reverse_iterator suffix_it
    for prefix_lex in prefixes[0]:
        tokens.push_back(prefix_lex)
    # A cache hit pushes the cached tokens and empties ``string``.
    self._check_cache(tokens, string)
    if string.n:
        tokens.push_back(self.lexicon.get(string))
    # Suffixes were peeled off from the right, so walk them backwards.
    suffix_it = suffixes.rbegin()
    while suffix_it != suffixes.rend():
        tokens.push_back(deref(suffix_it))
        preinc(suffix_it)
    return 0
cdef int _save_cached(self, vector[LexemeC*] *tokens,
                      uint64_t key, size_t n) except -1:
    """Cache the tokens appended to ``tokens`` from index ``n`` onwards.

    The lexeme pointers ``tokens[n:]`` are copied into a freshly allocated,
    NULL-terminated array and stored in ``self.cache`` under ``key``. This is
    the layout ``_check_cache`` walks when it replays a cached tokenization.

    Args:
        tokens: Output vector holding the tokens produced so far.
        key: Hash of the original, unsplit string.
        n: Size of ``tokens`` before the string was tokenized.

    Raises:
        MemoryError: If the array cannot be allocated.
    """
    cdef size_t total = tokens.size()
    cdef size_t count
    cdef size_t i
    cdef LexemeC** lexemes
    if total < n:
        # Defensive: avoid unsigned underflow in the size computation below.
        return 0
    count = total - n
    # calloc zero-fills, so the extra slot already holds the NULL terminator.
    lexemes = <LexemeC**>calloc(count + 1, sizeof(LexemeC*))
    if lexemes == NULL:
        raise MemoryError()
    for i in range(count):
        lexemes[i] = tokens[0][n + i]
    # NOTE(review): the cache takes ownership of the array; nothing visible
    # here frees it — confirm that entries are meant to live as long as
    # the cache does.
    self.cache.set(key, lexemes)
    return 0
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
    """Return the length of the prefix to split off ``characters``.

    The base implementation never finds a prefix and always returns 0, which
    ends the prefix-stripping loop in ``_tokenize`` immediately. Language
    subclasses override this to recognise their own leading punctuation.
    """
    return 0
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
    """Return how many trailing characters should be split off as a suffix.

    Returns:
        2 if the string ends in "'s" or "'S",
        3 if it ends in "...",
        1 if its last character is not alphanumeric,
        0 otherwise, and always 0 for strings shorter than two characters
        (a lone character is never split further).
    """
    if length < 2:
        return 0
    cdef unicode string = characters[:length]
    # The leftover debugging print of every candidate string was removed:
    # it wrote to stdout on each call from the tokenization loop.
    if string.endswith("'s") or string.endswith("'S"):
        return 2
    elif string.endswith("..."):
        return 3
    elif not string[-1].isalnum():
        return 1
    else:
        return 0
def _load_special_tokenization(self, token_rules): def _load_special_tokenization(self, token_rules):
'''Load special-case tokenization rules. '''Load special-case tokenization rules.
@ -328,3 +366,11 @@ cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
s.chars += n s.chars += n
s.n -= n s.n -= n
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
    # Split the last ``n`` characters of ``s`` off into ``suffix``.
    # Afterwards ``s`` keeps the same start pointer, is ``n`` characters
    # shorter, and has its key re-hashed over the shortened span.
    cdef size_t remaining = s.n - n
    string_from_slice(suffix, s.chars, remaining, s.n)
    s.n = remaining
    s.key = hash64(s.chars, remaining * sizeof(Py_UNICODE), 0)