Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25)
* Refactor tokenization, splitting it into a clearer life-cycle.
Commit 143e51ec73 (parent c396581a0b)
@@ -94,3 +94,8 @@ you'll you will
 you're you are
 you've you have
 10km 10 km
+U.S. U.S.
+U.N. U.N.
+Ms. Ms.
+Mr. Mr.
+P. P.
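Entries like "U.S. U.S." map a chunk to itself, pinning the abbreviation as a single token; together with the other new rules this appears to replace the hard-coded abbreviations set removed from spacy/en.pyx below. As a rough pure-Python sketch, a whitespace-separated rules file of this shape could be parsed like so (the load_rules helper is an illustration, not the commit's actual loader):

    # Each line maps a chunk to its token expansion, e.g.
    # "you're you are" -> {"you're": ["you", "are"]}.
    def load_rules(path):
        rules = {}
        with open(path, encoding='utf8') as f:
            for line in f:
                pieces = line.split()
                if pieces:
                    rules[pieces[0]] = pieces[1:]
        return rules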
@@ -4,4 +4,4 @@ from spacy.tokens cimport Tokens
 
 
 cdef class English(Language):
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    pass
spacy/en.pyx (59 lines changed)
@@ -56,27 +56,48 @@ cdef class English(Language):
     name (unicode): The two letter code used by Wikipedia for the language.
     lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        if length == 1:
-            return 1
-        if characters[0] == "'" and (characters[1] == "s" or characters[1] == "S"):
-            return 2
-        cdef int i = 0
-        # Leading punctuation
-        if _check_punct(characters, 0, length):
-            return 1
-        # Contractions
-        elif length >= 3 and characters[length - 2] == "'":
-            c2 = characters[length-1]
-            if c2 == "s" or c2 == "S":
-                return length - 2
-        if length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not _check_punct(characters, i, length):
-                i += 1
-        return i
-
-abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
-cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
-    cdef unicode char_i = characters[i]
+    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef Py_UNICODE c0 = chars[0]
+        cdef Py_UNICODE c1 = chars[1]
+        if c0 == ",":
+            return 1
+        elif c0 == '"':
+            return 1
+        elif c0 == "(":
+            return 1
+        elif c0 == "[":
+            return 1
+        elif c0 == "{":
+            return 1
+        elif c0 == "*":
+            return 1
+        elif c0 == "<":
+            return 1
+        elif c0 == "$":
+            return 1
+        elif c0 == "£":
+            return 1
+        elif c0 == "€":
+            return 1
+        elif c0 == "\u201c":
+            return 1
+        elif c0 == "'":
+            if c1 == "s":
+                return 2
+            elif c1 == "S":
+                return 2
+            elif c1 == "'":
+                return 2
+            else:
+                return 1
+        elif c0 == "`":
+            if c1 == "`":
+                return 2
+            else:
+                return 1
+        else:
+            return 0
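The elif chain maps each opening punctuation mark to the number of characters to break off the front of a chunk. A pure-Python sketch of the same decision, with the one-character cases collapsed into a set (this set-based rewrite is an illustration, not the committed code):

    # Characters split off as a one-character prefix token.
    PREFIX_CHARS = set(',"([{*<$£€\u201c')

    def find_prefix(chunk):
        if not chunk:
            return 0
        c0 = chunk[0]
        c1 = chunk[1] if len(chunk) > 1 else ''
        if c0 in PREFIX_CHARS:
            return 1
        if c0 == "'":
            # "'s", "'S" and a doubled apostrophe split as two characters.
            return 2 if c1 in ("s", "S", "'") else 1
        if c0 == '`':
            return 2 if c1 == '`' else 1
        return 0

Note that the committed Cython version reads chars[1] before checking the length, so it relies on the chunk being at least two characters long; the sketch guards that case explicitly.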
@@ -42,5 +42,14 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
+    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
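These declarations spell out the new tokenization life-cycle: check the cache, peel prefixes off the front and suffixes off the back, attach the resulting tokens, then cache the result. A minimal pure-Python sketch of that flow, using a dict-backed cache and plain strings in place of the PointerHash and String structs (tokenize_chunk is a hypothetical name; find_prefix is sketched above and find_suffix after the lang.pyx hunk below):

    def tokenize_chunk(chunk, lexicon, cache, find_prefix, find_suffix):
        # 1. A cached chunk short-circuits the whole pipeline.
        if chunk in cache:
            return list(cache[chunk])
        key = chunk
        prefixes, suffixes = [], []
        # 2. Peel prefixes off the front...
        n = find_prefix(chunk)
        while chunk and n:
            prefixes.append(chunk[:n])
            chunk = chunk[n:]
            n = find_prefix(chunk)
        # ...and suffixes off the back (collected outside-in).
        n = find_suffix(chunk)
        while chunk and n:
            suffixes.append(chunk[-n:])
            chunk = chunk[:-n]
            n = find_suffix(chunk)
        # 3. Attach: prefixes, then the stem, then the suffixes
        #    reversed back into reading order.
        tokens = prefixes + ([chunk] if chunk else []) + suffixes[::-1]
        tokens = [lexicon.get(t, t) for t in tokens]
        # 4. Cache under the original key for next time.
        cache[key] = tokens
        return tokens

With the sketched affix rules, tokenize_chunk("(can't)", {}, {}, find_prefix, find_suffix) yields ["(", "can't", ")"]; the real pipeline would additionally expand "can't" through the special-case table.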
spacy/lang.pyx (114 lines changed)
@@ -20,6 +20,9 @@ from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
 
+from cython.operator cimport preincrement as preinc
+from cython.operator cimport dereference as deref
+
 from spacy._hashing cimport PointerHash
 from spacy import orth
@@ -191,42 +194,77 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i
-        if lexemes != NULL:
-            i = 0
-            while lexemes[i] != NULL:
-                tokens_v.push_back(lexemes[i])
-                i += 1
-            return 0
-        cdef uint64_t key = string.key
-        cdef size_t first_token = tokens_v.size()
-        cdef int split
-        cdef int remaining = string.n
-        cdef String prefix
-        cdef LexemeC* lexeme
-        while remaining >= 1:
-            split = self._split_one(string.chars, string.n)
-            remaining -= split
-            string_slice_prefix(string, &prefix, split)
-            lexemes = <LexemeC**>self.specials.get(prefix.key)
-            if lexemes != NULL:
-                i = 0
-                while lexemes[i] != NULL:
-                    tokens_v.push_back(lexemes[i])
-                    i += 1
-            else:
-                lexeme = <LexemeC*>self.lexicon.get(&prefix)
-                tokens_v.push_back(lexeme)
-        lexemes = <LexemeC**>calloc((tokens_v.size() - first_token) + 1, sizeof(LexemeC*))
-        cdef size_t j
-        for i, j in enumerate(range(first_token, tokens_v.size())):
-            lexemes[i] = tokens_v[0][j]
-        lexemes[i+1] = NULL
-        self.cache.set(key, lexemes)
-
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        return length
+        self._check_cache(tokens_v, string)
+        if not string.n:
+            return 0
+        cdef uint64_t orig_key = string.key
+        cdef size_t orig_size = tokens_v.size()
+
+        cdef vector[LexemeC*] prefixes
+        cdef vector[LexemeC*] suffixes
+
+        cdef String affix
+        cdef int split = self._find_prefix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_prefix(string, &affix, split)
+            prefixes.push_back(self.lexicon.get(&affix))
+            split = self._find_prefix(string.chars, string.n)
+
+        split = self._find_suffix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_suffix(string, &affix, split)
+            suffixes.push_back(self.lexicon.get(&affix))
+            split = self._find_suffix(string.chars, string.n)
+
+        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
+        self._save_cached(tokens_v, orig_key, orig_size)
+
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
+        lexemes = <LexemeC**>self.cache.get(string.key)
+        cdef size_t i = 0
+        if lexemes != NULL:
+            while lexemes[i] != NULL:
+                tokens.push_back(lexemes[i])
+                i += 1
+            string.n = 0
+            string.key = 0
+            string.chars = NULL
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1:
+        cdef LexemeC* lexeme
+        for lexeme in prefixes[0]:
+            tokens.push_back(lexeme)
+        self._check_cache(tokens, string)
+        if string.n != 0:
+            tokens.push_back(self.lexicon.get(string))
+        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+        while it != suffixes.rend():
+            tokens.push_back(deref(it))
+            preinc(it)
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens,
+                          uint64_t key, size_t n) except -1:
+        pass
+
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
+        return 0
+
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
+        if length < 2:
+            return 0
+        cdef unicode string = characters[:length]
+        print repr(string)
+        if string.endswith("'s") or string.endswith("'S"):
+            return 2
+        elif string.endswith("..."):
+            return 3
+        elif not string[-1].isalnum():
+            return 1
+        else:
+            return 0
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
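_find_suffix mirrors _find_prefix at the other end of the chunk: contraction endings first, then ellipses, then any single trailing non-alphanumeric character (the print there looks like leftover debug output). The same rules as a pure-Python sketch (illustrative only):

    def find_suffix(chunk):
        # Number of trailing characters to split off as a suffix token.
        if len(chunk) < 2:
            return 0
        if chunk.endswith("'s") or chunk.endswith("'S"):
            return 2
        if chunk.endswith("..."):
            return 3
        if not chunk[-1].isalnum():
            return 1
        return 0

Because suffixes are peeled outside-in (for "cost)." the order is "." then ")"), _attach_tokens walks the suffixes vector with a reverse_iterator to restore reading order; that is what the new preinc/deref imports are for.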
@@ -328,3 +366,11 @@ cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
     s.chars += n
     s.n -= n
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
+cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
+    string_from_slice(suffix, s.chars, s.n - n, s.n)
+    s.n -= n
+    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
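string_slice_prefix advances the chunk past its first n characters and rehashes the remainder; the new string_slice_suffix mirrors it from the right. The equivalent operations on plain Python strings (hypothetical helper names; the real versions also refresh the struct's hash64 key in place):

    def slice_prefix(chunk, n):
        # Split off the first n characters, return (prefix, rest).
        return chunk[:n], chunk[n:]

    def slice_suffix(chunk, n):
        # Split off the last n characters, return (suffix, rest).
        return chunk[-n:], chunk[:-n]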