Mirror of https://github.com/explosion/spaCy.git
Synced 2025-01-12 18:26:30 +03:00
commit 143e51ec73 (parent c396581a0b)

Refactor tokenization, splitting it into a clearer life-cycle.

The single _tokenize/_split_one loop on Language is split into separate stages,
each with its own method: _check_cache, _find_prefix, _find_suffix,
_attach_tokens and _save_cached. English now overrides _find_prefix in place of
_split_one, and the special-case tokenization rules gain entries for common
abbreviations (U.S., U.N., Ms., Mr., P.).
@@ -94,3 +94,8 @@ you'll you will
 you're you are
 you've you have
 10km 10 km
+U.S. U.S.
+U.N. U.N.
+Ms. Ms.
+Mr. Mr.
+P. P.
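The added rows map each abbreviation to itself, so its trailing period can stay attached instead of being peeled off as a suffix; the contraction rows above them expand one surface form into several tokens. A minimal pure-Python sketch of how such a rule table can short-circuit the general affix splitter (SPECIAL_CASES and tokenize_chunk are illustrative names, not spaCy's API):

    # Illustrative special-case table: surface form -> list of token strings,
    # mirroring the rules in the hunk above.
    SPECIAL_CASES = {
        "you're": ["you", "are"],
        "you've": ["you", "have"],
        "10km": ["10", "km"],
        "U.S.": ["U.S."],
        "U.N.": ["U.N."],
        "Ms.": ["Ms."],
        "Mr.": ["Mr."],
        "P.": ["P."],
    }

    def tokenize_chunk(chunk, split_affixes):
        """Consult the special-case table before falling back to affix splitting."""
        if chunk in SPECIAL_CASES:
            return list(SPECIAL_CASES[chunk])
        return split_affixes(chunk)

    # "U.S." keeps its final period instead of having it split off as a suffix.
    print(tokenize_chunk("U.S.", lambda s: [s.rstrip("."), "."]))  # ['U.S.']
    print(tokenize_chunk("you're", lambda s: [s]))                 # ['you', 'are']

In the Cython code these rules are loaded by _load_special_tokenization, and the cache lookup in _check_cache is what lets them win before any affix splitting happens.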
@@ -4,4 +4,4 @@ from spacy.tokens cimport Tokens
 
 
 cdef class English(Language):
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    pass
spacy/en.pyx (59 changed lines)
@@ -56,27 +56,48 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        if length == 1:
-            return 1
-        if characters[0] == "'" and (characters[1] == "s" or characters[1] == "S"):
-            return 2
-        cdef int i = 0
-        # Leading punctuation
-        if _check_punct(characters, 0, length):
-            return 1
-        # Contractions
-        elif length >= 3 and characters[length - 2] == "'":
-            c2 = characters[length-1]
-            if c2 == "s" or c2 == "S":
-                return length - 2
-        if length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not _check_punct(characters, i, length):
-                i += 1
-        return i
+    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef Py_UNICODE c0 = chars[0]
+        cdef Py_UNICODE c1 = chars[1]
+        if c0 == ",":
+            return 1
+        elif c0 == '"':
+            return 1
+        elif c0 == "(":
+            return 1
+        elif c0 == "[":
+            return 1
+        elif c0 == "{":
+            return 1
+        elif c0 == "*":
+            return 1
+        elif c0 == "<":
+            return 1
+        elif c0 == "$":
+            return 1
+        elif c0 == "£":
+            return 1
+        elif c0 == "€":
+            return 1
+        elif c0 == "\u201c":
+            return 1
+        elif c0 == "'":
+            if c1 == "s":
+                return 2
+            elif c1 == "S":
+                return 2
+            elif c1 == "'":
+                return 2
+            else:
+                return 1
+        elif c0 == "`":
+            if c1 == "`":
+                return 2
+            else:
+                return 1
+        else:
+            return 0
 
 abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
 cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
     cdef unicode char_i = characters[i]
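The new prefix finder only inspects the first one or two characters and reports how many to split off, with 0 meaning "no prefix". A pure-Python rendering of the same rules, written as a sketch rather than a port of the Cython method (find_prefix and ONE_CHAR_PREFIXES are names made up here; the sketch also guards the second-character read explicitly):

    # Single opening punctuation marks peel off one character; "'s", "''" and
    # "``" peel off two; anything else means no prefix.
    ONE_CHAR_PREFIXES = set(',"([{*<$£€\u201c')

    def find_prefix(chars):
        """Return how many leading characters to split off (0 = no prefix)."""
        if not chars:
            return 0
        c0 = chars[0]
        c1 = chars[1] if len(chars) > 1 else ""
        if c0 in ONE_CHAR_PREFIXES:
            return 1
        if c0 == "'":
            return 2 if c1 in ("s", "S", "'") else 1
        if c0 == "`":
            return 2 if c1 == "`" else 1
        return 0

    assert find_prefix('"Hello') == 1
    assert find_prefix("``well") == 2
    assert find_prefix("Hello") == 0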
@@ -42,5 +42,14 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
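These declarations spell out the new life-cycle for each whitespace-delimited chunk: check the cache, peel prefixes off the front, peel suffixes off the back, attach the resulting tokens, then cache the result. A minimal plain-Python skeleton of that flow, assuming stand-ins for the real types (LifeCycleTokenizer and its dict cache are illustrative, not spaCy's Language, PointerHash or Lexicon):

    class LifeCycleTokenizer:
        """Plain-Python skeleton of the life-cycle declared above (illustrative only)."""

        def __init__(self, find_prefix, find_suffix):
            self.cache = {}                  # chunk -> cached token list
            self._find_prefix = find_prefix  # callable: str -> prefix length (0 = none)
            self._find_suffix = find_suffix  # callable: str -> suffix length (0 = none)

        def _tokenize(self, tokens, chunk):
            orig_key, orig_size = chunk, len(tokens)
            if self._check_cache(tokens, chunk):
                return
            prefixes, suffixes = [], []
            n = self._find_prefix(chunk)          # peel prefixes off the front
            while chunk and n:
                prefixes.append(chunk[:n])
                chunk = chunk[n:]
                n = self._find_prefix(chunk)
            n = self._find_suffix(chunk)          # peel suffixes off the back
            while chunk and n:
                suffixes.append(chunk[-n:])
                chunk = chunk[:-n]
                n = self._find_suffix(chunk)
            self._attach_tokens(tokens, chunk, prefixes, suffixes)
            self._save_cached(tokens, orig_key, orig_size)

        def _check_cache(self, tokens, chunk):
            cached = self.cache.get(chunk)
            if cached is None:
                return False
            tokens.extend(cached)
            return True

        def _attach_tokens(self, tokens, chunk, prefixes, suffixes):
            tokens.extend(prefixes)
            if chunk:
                tokens.append(chunk)
            tokens.extend(reversed(suffixes))     # suffixes were collected outside-in

        def _save_cached(self, tokens, key, orig_size):
            self.cache[key] = tokens[orig_size:]  # cache only this chunk's tokens

    # Example with trivial affix rules: peel one punctuation character at a time.
    from string import punctuation

    tok = LifeCycleTokenizer(
        find_prefix=lambda s: 1 if s and s[0] in punctuation else 0,
        find_suffix=lambda s: 1 if s and s[-1] in punctuation else 0,
    )
    out = []
    tok._tokenize(out, '"Hello!"')
    print(out)  # ['"', 'Hello', '!', '"']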
spacy/lang.pyx (114 changed lines)
@@ -20,6 +20,9 @@ from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
 
+from cython.operator cimport preincrement as preinc
+from cython.operator cimport dereference as deref
+
 from spacy._hashing cimport PointerHash
 from spacy import orth
@@ -191,42 +194,77 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i
-        if lexemes != NULL:
-            i = 0
-            while lexemes[i] != NULL:
-                tokens_v.push_back(lexemes[i])
-                i += 1
-            return 0
-        cdef uint64_t key = string.key
-        cdef size_t first_token = tokens_v.size()
-        cdef int split
-        cdef int remaining = string.n
-        cdef String prefix
-        cdef LexemeC* lexeme
-        while remaining >= 1:
-            split = self._split_one(string.chars, string.n)
-            remaining -= split
-            string_slice_prefix(string, &prefix, split)
-            lexemes = <LexemeC**>self.specials.get(prefix.key)
-            if lexemes != NULL:
-                i = 0
-                while lexemes[i] != NULL:
-                    tokens_v.push_back(lexemes[i])
-                    i += 1
-            else:
-                lexeme = <LexemeC*>self.lexicon.get(&prefix)
-                tokens_v.push_back(lexeme)
-        lexemes = <LexemeC**>calloc((tokens_v.size() - first_token) + 1, sizeof(LexemeC*))
-        cdef size_t j
-        for i, j in enumerate(range(first_token, tokens_v.size())):
-            lexemes[i] = tokens_v[0][j]
-        lexemes[i+1] = NULL
-        self.cache.set(key, lexemes)
-
-    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
-        return length
+        self._check_cache(tokens_v, string)
+        if not string.n:
+            return 0
+        cdef uint64_t orig_key = string.key
+        cdef size_t orig_size = tokens_v.size()
+
+        cdef vector[LexemeC*] prefixes
+        cdef vector[LexemeC*] suffixes
+
+        cdef String affix
+        cdef int split = self._find_prefix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_prefix(string, &affix, split)
+            prefixes.push_back(self.lexicon.get(&affix))
+            split = self._find_prefix(string.chars, string.n)
+
+        split = self._find_suffix(string.chars, string.n)
+        while string.n and split >= 1:
+            string_slice_suffix(string, &affix, split)
+            suffixes.push_back(self.lexicon.get(&affix))
+            split = self._find_suffix(string.chars, string.n)
+
+        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
+        self._save_cached(tokens_v, orig_key, orig_size)
+
+    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
+        lexemes = <LexemeC**>self.cache.get(string.key)
+        cdef size_t i = 0
+        if lexemes != NULL:
+            while lexemes[i] != NULL:
+                tokens.push_back(lexemes[i])
+                i += 1
+            string.n = 0
+            string.key = 0
+            string.chars = NULL
+
+    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes) except -1:
+        cdef LexemeC* lexeme
+        for lexeme in prefixes[0]:
+            tokens.push_back(lexeme)
+        self._check_cache(tokens, string)
+        if string.n != 0:
+            tokens.push_back(self.lexicon.get(string))
+        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+        while it != suffixes.rend():
+            tokens.push_back(deref(it))
+            preinc(it)
+
+    cdef int _save_cached(self, vector[LexemeC*] *tokens,
+                          uint64_t key, size_t n) except -1:
+        pass
+
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
+        return 0
+
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
+        if length < 2:
+            return 0
+        cdef unicode string = characters[:length]
+        print repr(string)
+        if string.endswith("'s") or string.endswith("'S"):
+            return 2
+        elif string.endswith("..."):
+            return 3
+        elif not string[-1].isalnum():
+            return 1
+        else:
+            return 0
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
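The base-class _find_suffix works from the right-hand end and reports how many characters to peel off per pass: two for a contraction ending, three for an ellipsis, one for any other trailing non-alphanumeric character. A pure-Python rendering of the same rules, plus the peeling loop they are meant to drive (find_suffix and peel_suffixes here are sketch names, not the Cython implementation):

    def find_suffix(s):
        """Return how many trailing characters to split off (0 = no suffix)."""
        if len(s) < 2:
            return 0
        if s.endswith("'s") or s.endswith("'S"):
            return 2
        if s.endswith("..."):
            return 3
        if not s[-1].isalnum():
            return 1
        return 0

    def peel_suffixes(chunk):
        """Repeatedly slice suffixes off the end, returning (stem, suffixes)."""
        suffixes = []
        n = find_suffix(chunk)
        while chunk and n:
            suffixes.append(chunk[-n:])
            chunk = chunk[:-n]
            n = find_suffix(chunk)
        # Suffixes were collected outside-in; reverse to restore reading order.
        return chunk, list(reversed(suffixes))

    print(peel_suffixes("world's..."))  # ('world', ["'s", '...'])
    print(peel_suffixes("done!)"))      # ('done', ['!', ')'])

Collecting suffixes outside-in and reversing them at attach time is what the reverse iterator (rbegin/rend with deref/preinc) does over the C++ vector in _attach_tokens above.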
@@ -328,3 +366,11 @@ cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
     s.chars += n
     s.n -= n
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
+cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
+    string_from_slice(suffix, s.chars, s.n - n, s.n)
+    s.n -= n
+    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
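Both slice helpers shrink the working string in place and re-hash the remainder, so later cache lookups key on the shortened text: the prefix version advances the character pointer, the suffix version copies the tail out and drops the length. A small Python model of the same bookkeeping (WorkString is a toy stand-in for the String struct, and Python's built-in hash() stands in for hash64):

    class WorkString:
        """Toy model of the String struct and the two slice helpers above."""

        def __init__(self, text):
            self.chars = text
            self.n = len(text)
            self.key = hash(text)

        def slice_prefix(self, n):
            prefix = self.chars[:n]
            self.chars = self.chars[n:]            # advance past the prefix
            self.n -= n
            self.key = hash(self.chars)            # re-key on the remainder
            return prefix

        def slice_suffix(self, n):
            suffix = self.chars[self.n - n:]
            self.chars = self.chars[:self.n - n]   # drop the suffix
            self.n -= n
            self.key = hash(self.chars)            # re-key on the remainder
            return suffix

    s = WorkString('"word"')
    assert s.slice_prefix(1) == '"' and s.chars == 'word"'
    assert s.slice_suffix(1) == '"' and s.chars == 'word'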