mirror of https://github.com/explosion/spaCy.git
* Slight cleaning of tokenizer code

This commit is contained in:
parent 59b41a9fd3
commit 71ee921055

@@ -41,6 +41,7 @@ cdef class Lexicon:
 cdef class Language:
     cdef Pool _mem
     cdef unicode name
+    cdef vector[size_t] counts
     cdef PreshMap cache
     cdef PreshMap specials
     cpdef readonly Lexicon lexicon

@@ -51,7 +52,6 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)

-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)

@@ -16,6 +16,7 @@ import re
 from .util import read_lang_data
 from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from spacy.lexeme cimport LexStr_orig
 from murmurhash.mrmr cimport hash64

 from cpython.ref cimport Py_INCREF

@@ -45,12 +46,20 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)
+        self.counts = vector[size_t]()

     property nr_types:
         def __get__(self):
             """Return the number of lexical types in the vocabulary"""
             return self.lexicon.size

+    property counts:
+        def __get__(self):
+            cdef size_t i
+            for i in range(self.lexicon.size):
+                count = self.counts[i] if i < self.counts.size() else 0
+                yield count, self.lexicon.lexemes[i].strings[<int>LexStr_orig].decode('utf8')
+
     cpdef Lexeme lookup(self, unicode string):
         """Retrieve (or create, if not found) a Lexeme for a string, and return it.
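
The new counts property walks the lexicon in id order, pairing each lexeme's observed frequency with its original string, and falls back to zero for ids the counter vector has not grown to cover yet. A plain-Python sketch of that bookkeeping (the class and method names here are illustrative, not spaCy's API):

class CountsSketch:
    """Illustrative stand-in for the Language.counts bookkeeping."""
    def __init__(self):
        self.counts = []    # counts[i] = frequency of lexeme id i, grown on demand
        self.strings = []   # strings[i] = original string of lexeme id i

    def bump(self, lex_id):
        # Mirrors the while-loop added to tokenize(): grow with zeros, then count.
        while lex_id >= len(self.counts):
            self.counts.append(0)
        self.counts[lex_id] += 1

    def items(self):
        # Mirrors the counts property: zero for lexemes never seen in running text.
        for i, s in enumerate(self.strings):
            yield (self.counts[i] if i < len(self.counts) else 0), s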

@@ -85,23 +94,23 @@ cdef class Language:
         cdef size_t start = 0
         cdef size_t i = 0
         cdef Py_UNICODE* chars = string
-        cdef Py_UNICODE c
         cdef String span
         for i in range(length):
-            c = chars[i]
-            if Py_UNICODE_ISSPACE(c) == 1:
+            if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    try:
-                        self._tokenize(tokens.v, &span)
-                    except MemoryError:
-                        print chars[start:i]
-                        raise
+                    self._tokenize(tokens.v, &span)
                 start = i + 1
         i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
             self._tokenize(tokens.v, &span)
+        cdef int id_
+        for i in range(tokens.v.size()):
+            id_ = tokens.id(i)
+            while id_ >= self.counts.size():
+                self.counts.push_back(0)
+            self.counts[id_] += 1
         return tokens

     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
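
The reworked tokenize() splits the input on whitespace, hands each non-empty span to _tokenize(), then walks the finished token vector to bump the per-id frequency. A plain-Python sketch of the same control flow; tokenize_span is a hypothetical stand-in for the real _tokenize and returns (id, string) pairs:

def tokenize_sketch(text, tokenize_span, counts):
    tokens = []
    start = 0
    for i, c in enumerate(text):
        if c.isspace():
            if start < i:
                tokens.extend(tokenize_span(text[start:i]))
            start = i + 1
    if start < len(text):   # flush the trailing span, as the post-loop i += 1 does
        tokens.extend(tokenize_span(text[start:]))
    for tok_id, _ in tokens:
        while tok_id >= len(counts):   # same grow-then-count step as in the hunk
            counts.append(0)
        counts[tok_id] += 1
    return tokens

For example, tokenize_sketch("hello  world", lambda s: [(abs(hash(s)) % 8, s)], []) yields two spans and skips the double space, just as the start < i guard does above.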

@@ -163,17 +172,6 @@ cdef class Language:
         self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
         self._save_cached(tokens_v, orig_key, orig_size)

-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
-        lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i = 0
-        if lexemes != NULL:
-            while lexemes[i] != NULL:
-                tokens.push_back(lexemes[i])
-                i += 1
-            string.n = 0
-            string.key = 0
-            string.chars = NULL
-
     cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
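
The deleted _check_cache copied a previously computed token sequence out of the cache (a NULL-terminated LexemeC* array keyed by the span's hash) and zeroed the span to signal a hit to the caller. A plain-Python sketch of that pattern, with None playing the role of the C NULL sentinel and a SimpleNamespace standing in for the String struct:

from types import SimpleNamespace

def check_cache_sketch(cache, tokens, span):
    cached = cache.get(span.key)    # cache: hash key -> cached token sequence
    if cached is not None:
        tokens.extend(cached)
        span.n = 0                  # zero the span so the caller knows it was consumed
        span.key = 0
        span.chars = None

toks = []
span = SimpleNamespace(key=42, chars="don't", n=5)
check_cache_sketch({42: ["do", "n't"]}, toks, span)   # hit: toks == ["do", "n't"]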

@@ -261,6 +259,7 @@ cdef class Lexicon:
         lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
         lexeme_unpack(lexeme, lexeme_dict)
         self._dict.set(string.key, lexeme)
+        self.lexemes.push_back(lexeme)
         self.size += 1

     cdef LexemeC* get(self, String* string) except NULL:

@@ -273,6 +272,7 @@ cdef class Lexicon:
         cdef unicode unicode_string = string.chars[:string.n]
         lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
         self._dict.set(string.key, lex)
+        self.lexemes.push_back(lex)
         self.size += 1
         return lex
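
Both Lexicon code paths now push each freshly created lexeme onto self.lexemes, so a lexeme is reachable both by hash key (via _dict) and by dense integer id (its position in the list) — the latter is what the new counts property iterates over. A plain-Python sketch of that dual-index pattern (names illustrative):

class LexiconSketch:
    def __init__(self):
        self._dict = {}      # hash key -> lexeme, for lookup by string
        self.lexemes = []    # dense id -> lexeme; id == position in this list
        self.size = 0

    def get(self, key, make_lexeme):
        lex = self._dict.get(key)
        if lex is None:
            lex = make_lexeme(self.size)   # the new lexeme records its own id
            self._dict[key] = lex
            self.lexemes.append(lex)       # the line this commit adds
            self.size += 1
        return lex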

@@ -21,7 +21,7 @@ cpdef enum LexFloats:


 cpdef enum LexStrs:
-    LexStr_key
+    LexStr_orig
     LexStr_casefix
     LexStr_shape
     LexStr_unsparse

@@ -70,6 +70,7 @@ cdef struct LexemeC:
     flag_t orth_flags
     flag_t dist_flags


 cpdef dict get_lexeme_dict(size_t i, unicode string)

 cdef char* intern_and_encode(unicode string, size_t* length) except NULL
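
For orientation, the LexemeC struct this diff keeps indexing stores parallel fixed-slot arrays addressed by the enums (LexInt_i, LexFloat_sentiment, LexStr_orig, ...) plus two packed flag words. A plain-Python sketch of that layout; any field name not visible in this diff is an assumption:

from dataclasses import dataclass, field

@dataclass
class LexemeSketch:
    ints: list = field(default_factory=list)      # slot LexInt_i holds the dense id
    floats: list = field(default_factory=list)    # e.g. slot LexFloat_sentiment
    strings: list = field(default_factory=list)   # slot LexStr_orig holds the raw string
    orth_flags: int = 0                           # packed orthographic feature bits
    dist_flags: int = 0                           # packed distributional feature bits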

@@ -19,8 +19,8 @@ cpdef dict get_lexeme_dict(size_t i, unicode string):
     floats[<int>LexFloat_sentiment] = 0

     strings = [None for _ in range(LexStr_N)]
-    strings[<int>LexStr_key] = string
-    strings[<int>LexStr_casefix] = strings[<int>LexStr_key]
+    strings[<int>LexStr_orig] = string
+    strings[<int>LexStr_casefix] = strings[<int>LexStr_orig]
     strings[<int>LexStr_shape] = orth.word_shape(string)
     strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
     strings[<int>LexStr_asciied] = orth.asciied(string)
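
get_lexeme_dict fills the per-lexeme string slots: the original string, a case-fixed variant, a word shape, and an ASCII-folded form. The shape transform itself is not shown in this diff; the sketch below is one common way such a function behaves (letters map to x/X, digits to d, long runs are capped), not necessarily orth.word_shape's exact output:

def word_shape_sketch(string, max_run=4):
    shape = []
    prev, run = "", 0
    for c in string:
        s = "x" if c.islower() else "X" if c.isupper() else "d" if c.isdigit() else c
        run = run + 1 if s == prev else 1
        prev = s
        if run <= max_run:      # cap long runs so rare word lengths share a shape
            shape.append(s)
    return "".join(shape)

# word_shape_sketch("Testing1234") == "Xxxxxdddd"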

@@ -42,9 +42,9 @@ def get_orth_flags(unicode string):
     flags |= orth.is_space(string) << LexOrth_space
     flags |= orth.is_title(string) << LexOrth_title
     flags |= orth.is_upper(string) << LexOrth_upper

     return flags


 def get_dist_flags(unicode string):
     return 0
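
get_orth_flags packs each boolean orthographic predicate into its own bit of a single integer, so one flag_t word carries many features. A plain-Python sketch of the packing and of how a bit would be tested; the bit positions here are illustrative, not the real enum values:

LexOrth_space, LexOrth_title, LexOrth_upper = 0, 1, 2   # assumed bit positions

def get_orth_flags_sketch(string):
    flags = 0
    flags |= string.isspace() << LexOrth_space   # bool is 0 or 1, shifted into place
    flags |= string.istitle() << LexOrth_title
    flags |= string.isupper() << LexOrth_upper
    return flags

def check_flag(flags, bit):
    return bool(flags & (1 << bit))

assert check_flag(get_orth_flags_sketch("NASA"), LexOrth_upper)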

@@ -5,6 +5,7 @@ from libcpp.vector cimport vector
 cdef class Tokens:
     cdef vector[LexemeC*] *v

+    cpdef int id(self, size_t i) except -1
     cpdef unicode string(self, size_t i)
     cpdef float prob(self, size_t i) except 1
     cpdef int cluster(self, size_t i) except *

@@ -40,8 +40,11 @@ cdef class Tokens:
     def append(self, Lexeme lexeme):
         self.v.push_back(lexeme._c)

+    cpdef int id(self, size_t i) except -1:
+        return self.v.at(i).ints[<int>LexInt_i]
+
     cpdef unicode string(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_key]
+        cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string
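
The new Tokens.id(i) reads back the dense id stored on the i-th lexeme, which is what lets tokenize() index self.counts directly. A plain-Python sketch of the structure, with dicts standing in for LexemeC pointers:

class TokensSketch:
    def __init__(self):
        self.v = []              # like vector[LexemeC*], with dicts for pointers

    def append(self, lexeme):
        self.v.append(lexeme)

    def id(self, i):
        return self.v[i]["i"]    # mirrors self.v.at(i).ints[<int>LexInt_i]

    def string(self, i):
        return self.v[i]["orig"] # mirrors strings[<int>LexStr_orig].decode('utf8')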

@@ -54,7 +54,7 @@ cdef class Lexeme:

     property string:
         def __get__(self):
-            cdef bytes utf8_string = self._c.strings[<int>LexStr_key]
+            cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
             cdef unicode string = utf8_string.decode('utf8')
             return string