* Refactor tokenization, splitting it into a clearer life-cycle.

Matthew Honnibal 2014-09-16 13:16:02 +02:00
parent c396581a0b
commit 143e51ec73
5 changed files with 136 additions and 55 deletions
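The refactor splits _tokenize into named phases: check the cache, peel prefixes off the front, peel suffixes off the back, attach the collected tokens, and save the result back to the cache. Read as a pure-Python sketch (the helper names and the dict cache below are stand-ins for the Cython methods and the hash64-keyed table, not the project's API), the life-cycle looks roughly like this:

    def tokenize_chunk(chunk, find_prefix, find_suffix, lookup, cache):
        # 1. _check_cache: a chunk seen before is answered from the cache.
        if chunk in cache:
            return list(cache[chunk])
        prefixes, suffixes = [], []
        remaining = chunk
        # 2. Peel leading affixes until find_prefix reports nothing to split.
        n = find_prefix(remaining)
        while remaining and n:
            prefixes.append(lookup(remaining[:n]))
            remaining = remaining[n:]
            n = find_prefix(remaining)
        # 3. Peel trailing affixes the same way with find_suffix.
        n = find_suffix(remaining)
        while remaining and n:
            suffixes.append(lookup(remaining[-n:]))
            remaining = remaining[:-n]
            n = find_suffix(remaining)
        # 4. _attach_tokens: prefixes, then the stem, then suffixes in reading order.
        tokens = prefixes
        if remaining:
            tokens.append(lookup(remaining))
        tokens.extend(reversed(suffixes))
        # 5. _save_cached: remember the finished sequence for next time.
        cache[chunk] = tuple(tokens)
        return tokens

    print(tokenize_chunk('("Hi")',
                         lambda s: 1 if s[:1] in '("' else 0,
                         lambda s: 1 if s[-1:] in '")' else 0,
                         str, {}))   # ['(', '"', 'Hi', '"', ')']

The Cython _attach_tokens additionally re-checks the cache for the stripped stem before falling back to a lexicon lookup; the sketch skips that step.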

View File

@@ -94,3 +94,8 @@ you'll you will
 you're you are
 you've you have
 10km 10 km
+U.S. U.S.
+U.N. U.N.
+Ms. Ms.
+Mr. Mr.
+P. P.
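The new rows map each abbreviation to itself, so the special-case table emits "U.S." as a single token rather than letting the suffix pass strip the trailing period. A hedged sketch of loading such whitespace-separated rules into a lookup table (this loader is illustrative, not the project's _load_special_tokenization):

    def load_special_rules(lines):
        # Each rule is a surface form followed by the tokens it should become.
        rules = {}
        for line in lines:
            line = line.strip()
            if not line:
                continue
            chunk, *tokens = line.split()
            rules[chunk] = tokens
        return rules

    rules = load_special_rules(["you're you are", "10km 10 km", "U.S. U.S."])
    assert rules["you're"] == ["you", "are"]
    assert rules["U.S."] == ["U.S."]   # abbreviation survives as one token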

View File

@@ -4,4 +4,4 @@ from spacy.tokens cimport Tokens
 
 cdef class English(Language):
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    pass

View File

@@ -56,27 +56,48 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        if length == 1:
+    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef Py_UNICODE c0 = chars[0]
+        cdef Py_UNICODE c1 = chars[1]
+        if c0 == ",":
             return 1
-        if characters[0] == "'" and (characters[1] == "s" or characters[1] == "S"):
-            return 2
-        cdef int i = 0
-        # Leading punctuation
-        if _check_punct(characters, 0, length):
+        elif c0 == '"':
             return 1
-        # Contractions
-        elif length >= 3 and characters[length - 2] == "'":
-            c2 = characters[length-1]
-            if c2 == "s" or c2 == "S":
-                return length - 2
-        if length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not _check_punct(characters, i, length):
-                i += 1
-            return i
+        elif c0 == "(":
+            return 1
+        elif c0 == "[":
+            return 1
+        elif c0 == "{":
+            return 1
+        elif c0 == "*":
+            return 1
+        elif c0 == "<":
+            return 1
+        elif c0 == "$":
+            return 1
+        elif c0 == "£":
+            return 1
+        elif c0 == "":
+            return 1
+        elif c0 == "\u201c":
+            return 1
+        elif c0 == "'":
+            if c1 == "s":
+                return 2
+            elif c1 == "S":
+                return 2
+            elif c1 == "'":
+                return 2
+            else:
+                return 1
+        elif c0 == "`":
+            if c1 == "`":
+                return 2
+            else:
+                return 1
+        else:
+            return 0
 
 abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
 cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
     cdef unicode char_i = characters[i]
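_find_prefix answers a single question: how many leading characters form the next prefix token (0 means stop), and the caller keeps invoking it on the shrinking remainder. A rough Python mirror of the rules above, with a worked example (the character set copies the diff; the mis-encoded character on the empty-string branch is omitted, so treat this as a sketch rather than the exact rule set):

    PREFIX_CHARS = set(',"([{*<$£\u201c')

    def find_prefix(chunk):
        # How many leading characters to split off as a prefix token.
        if not chunk:
            return 0
        c0, c1 = chunk[0], chunk[1] if len(chunk) > 1 else ""
        if c0 in PREFIX_CHARS:
            return 1
        if c0 == "'":
            return 2 if c1 in ("s", "S", "'") else 1
        if c0 == "`":
            return 2 if c1 == "`" else 1
        return 0

    chunk = '("Hello'
    n = find_prefix(chunk)
    while n:
        print(chunk[:n])        # prints '(' then '"'
        chunk = chunk[n:]
        n = find_prefix(chunk)
    print(chunk)                # Hello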

View File

@@ -42,5 +42,14 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1
+    cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
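Taken together, the declarations spell out the life-cycle: _check_cache can satisfy a whole chunk from earlier input, and _save_cached is meant to store the finished token pointers under the chunk's original hash key (it is still a stub in the .pyx below). A rough sketch of that contract, with a dict and Python's hash() standing in for the pointer table and hash64:

    class TokenCache:
        def __init__(self):
            self._table = {}

        def check(self, chunk, out_tokens):
            # On a hit, append the cached tokens and report True; the Cython
            # version also zeroes the String so later stages skip the chunk.
            cached = self._table.get(hash(chunk))
            if cached is None:
                return False
            out_tokens.extend(cached)
            return True

        def save(self, chunk, tokens):
            self._table[hash(chunk)] = tuple(tokens)

    cache = TokenCache()
    tokens = []
    if not cache.check("U.S.", tokens):
        tokens = ["U.S."]        # ...run the full prefix/suffix/attach pipeline...
        cache.save("U.S.", tokens)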

View File

@@ -20,6 +20,9 @@ from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
+from cython.operator cimport preincrement as preinc
+from cython.operator cimport dereference as deref
+from spacy._hashing cimport PointerHash
 
 from spacy import orth
@@ -191,42 +194,77 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i
-        if lexemes != NULL:
-            i = 0
-            while lexemes[i] != NULL:
-                tokens_v.push_back(lexemes[i])
-                i += 1
+        self._check_cache(tokens_v, string)
+        if not string.n:
             return 0
-        cdef uint64_t key = string.key
-        cdef size_t first_token = tokens_v.size()
-        cdef int split
-        cdef int remaining = string.n
-        cdef String prefix
-        cdef LexemeC* lexeme
-        while remaining >= 1:
-            split = self._split_one(string.chars, string.n)
-            remaining -= split
-            string_slice_prefix(string, &prefix, split)
-            lexemes = <LexemeC**>self.specials.get(prefix.key)
-            if lexemes != NULL:
-                i = 0
-                while lexemes[i] != NULL:
-                    tokens_v.push_back(lexemes[i])
-                    i += 1
-            else:
-                lexeme = <LexemeC*>self.lexicon.get(&prefix)
-                tokens_v.push_back(lexeme)
-        lexemes = <LexemeC**>calloc((tokens_v.size() - first_token) + 1, sizeof(LexemeC*))
-        cdef size_t j
-        for i, j in enumerate(range(first_token, tokens_v.size())):
-            lexemes[i] = tokens_v[0][j]
-        lexemes[i+1] = NULL
-        self.cache.set(key, lexemes)
+        cdef uint64_t orig_key = string.key
+        cdef size_t orig_size = tokens_v.size()
 
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        return length
+        cdef vector[LexemeC*] prefixes
+        cdef vector[LexemeC*] suffixes
+        cdef String affix
+        cdef int split = self._find_prefix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_prefix(string, &affix, split)
+            prefixes.push_back(self.lexicon.get(&affix))
+            split = self._find_prefix(string.chars, string.n)
+        split = self._find_suffix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_suffix(string, &affix, split)
+            suffixes.push_back(self.lexicon.get(&affix))
+            split = self._find_suffix(string.chars, string.n)
+        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
+        self._save_cached(tokens_v, orig_key, orig_size)
+
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
+        lexemes = <LexemeC**>self.cache.get(string.key)
+        cdef size_t i = 0
+        if lexemes != NULL:
+            while lexemes[i] != NULL:
+                tokens.push_back(lexemes[i])
+                i += 1
+            string.n = 0
+            string.key = 0
+            string.chars = NULL
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1:
+        cdef LexemeC* lexeme
+        for lexeme in prefixes[0]:
+            tokens.push_back(lexeme)
+        self._check_cache(tokens, string)
+        if string.n != 0:
+            tokens.push_back(self.lexicon.get(string))
+        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+        while it != suffixes.rend():
+            tokens.push_back(deref(it))
+            preinc(it)
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens,
+                          uint64_t key, size_t n) except -1:
+        pass
+
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
+        return 0
+
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
+        if length < 2:
+            return 0
+        cdef unicode string = characters[:length]
+        print repr(string)
+        if string.endswith("'s") or string.endswith("'S"):
+            return 2
+        elif string.endswith("..."):
+            return 3
+        elif not string[-1].isalnum():
+            return 1
+        else:
+            return 0
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
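_find_suffix looks only at the right edge: a possessive 's/'S, an ellipsis, or any single non-alphanumeric character. Because _tokenize peels suffixes off outermost-first, _attach_tokens walks the collected suffixes with a reverse iterator so they come out in reading order. A small Python illustration of that ordering (same three suffix rules as above):

    def find_suffix(chunk):
        # Length of the trailing piece to split off next, or 0 to stop.
        if len(chunk) < 2:
            return 0
        if chunk.endswith("'s") or chunk.endswith("'S"):
            return 2
        if chunk.endswith("..."):
            return 3
        if not chunk[-1].isalnum():
            return 1
        return 0

    chunk = "cool)..."
    suffixes = []
    n = find_suffix(chunk)
    while chunk and n:
        suffixes.append(chunk[-n:])   # collected outside-in: '...', then ')'
        chunk = chunk[:-n]
        n = find_suffix(chunk)
    print([chunk] + list(reversed(suffixes)))   # ['cool', ')', '...']

This pass alone would also turn "U.S." into "U.S" plus ".", which is exactly what the special-case entries added in the first file are there to pre-empt.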
@@ -328,3 +366,11 @@ cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
     s.chars += n
     s.n -= n
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
+cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
+    string_from_slice(suffix, s.chars, s.n - n, s.n)
+    s.n -= n
+    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
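string_slice_prefix and string_slice_suffix narrow the same String struct in place: they copy the affix out, then either advance the start pointer or shrink the length, and rehash whatever remains so the key always matches the current remainder. A Python analogue, with a tiny class mimicking the chars/key fields and hash() standing in for hash64:

    class StringView:
        def __init__(self, chars):
            self.chars = chars          # remaining text (pointer + length in Cython)
            self.key = hash(chars)

    def slice_prefix(s, n):
        prefix = s.chars[:n]
        s.chars = s.chars[n:]           # advance past the prefix
        s.key = hash(s.chars)           # rehash the remainder
        return prefix

    def slice_suffix(s, n):
        suffix = s.chars[len(s.chars) - n:]
        s.chars = s.chars[:len(s.chars) - n]   # drop the suffix from the end
        s.key = hash(s.chars)
        return suffix

    s = StringView("(spaCy)")
    print(slice_prefix(s, 1), s.chars)   # ( spaCy)
    print(slice_suffix(s, 1), s.chars)   # ) spaCy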