mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
* Slight cleaning of tokenizer code
This commit is contained in:
parent
59b41a9fd3
commit
71ee921055
|
@ -41,6 +41,7 @@ cdef class Lexicon:
|
|||
cdef class Language:
|
||||
cdef Pool _mem
|
||||
cdef unicode name
|
||||
cdef vector[size_t] counts
|
||||
cdef PreshMap cache
|
||||
cdef PreshMap specials
|
||||
cpdef readonly Lexicon lexicon
|
||||
|
@ -51,7 +52,6 @@ cdef class Language:
|
|||
cpdef Tokens tokenize(self, unicode text)
|
||||
cpdef Lexeme lookup(self, unicode text)
|
||||
|
||||
cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
|
||||
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
|
||||
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
|
||||
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
|
||||
|
|
|
@ -16,6 +16,7 @@ import re
|
|||
from .util import read_lang_data
|
||||
from spacy.tokens import Tokens
|
||||
from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
|
||||
from spacy.lexeme cimport LexStr_orig
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from cpython.ref cimport Py_INCREF
|
||||
|
@ -45,12 +46,20 @@ cdef class Language:
|
|||
self.suffix_re = re.compile(suffix)
|
||||
self.lexicon = Lexicon(lexemes)
|
||||
self._load_special_tokenization(rules)
|
||||
self.counts = vector[size_t]()
|
||||
|
||||
property nr_types:
|
||||
def __get__(self):
|
||||
"""Return the number of lexical types in the vocabulary"""
|
||||
return self.lexicon.size
|
||||
|
||||
property counts:
|
||||
def __get__(self):
|
||||
cdef size_t i
|
||||
for i in range(self.lexicon.size):
|
||||
count = self.counts[i] if i < self.counts.size() else 0
|
||||
yield count, self.lexicon.lexemes[i].strings[<int>LexStr_orig].decode('utf8')
|
||||
|
||||
cpdef Lexeme lookup(self, unicode string):
|
||||
"""Retrieve (or create, if not found) a Lexeme for a string, and return it.
|
||||
|
||||
|
@ -85,23 +94,23 @@ cdef class Language:
|
|||
cdef size_t start = 0
|
||||
cdef size_t i = 0
|
||||
cdef Py_UNICODE* chars = string
|
||||
cdef Py_UNICODE c
|
||||
cdef String span
|
||||
for i in range(length):
|
||||
c = chars[i]
|
||||
if Py_UNICODE_ISSPACE(c) == 1:
|
||||
if Py_UNICODE_ISSPACE(chars[i]) == 1:
|
||||
if start < i:
|
||||
string_from_slice(&span, chars, start, i)
|
||||
try:
|
||||
self._tokenize(tokens.v, &span)
|
||||
except MemoryError:
|
||||
print chars[start:i]
|
||||
raise
|
||||
start = i + 1
|
||||
i += 1
|
||||
if start < i:
|
||||
string_from_slice(&span, chars, start, i)
|
||||
self._tokenize(tokens.v, &span)
|
||||
cdef int id_
|
||||
for i in range(tokens.v.size()):
|
||||
id_ = tokens.id(i)
|
||||
while id_ >= self.counts.size():
|
||||
self.counts.push_back(0)
|
||||
self.counts[id_] += 1
|
||||
return tokens
|
||||
|
||||
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
|
||||
|
@ -163,17 +172,6 @@ cdef class Language:
|
|||
self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
|
||||
self._save_cached(tokens_v, orig_key, orig_size)
|
||||
|
||||
cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
|
||||
lexemes = <LexemeC**>self.cache.get(string.key)
|
||||
cdef size_t i = 0
|
||||
if lexemes != NULL:
|
||||
while lexemes[i] != NULL:
|
||||
tokens.push_back(lexemes[i])
|
||||
i += 1
|
||||
string.n = 0
|
||||
string.key = 0
|
||||
string.chars = NULL
|
||||
|
||||
cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
|
||||
vector[LexemeC*] *prefixes,
|
||||
vector[LexemeC*] *suffixes) except -1:
|
||||
|
@ -261,6 +259,7 @@ cdef class Lexicon:
|
|||
lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
|
||||
lexeme_unpack(lexeme, lexeme_dict)
|
||||
self._dict.set(string.key, lexeme)
|
||||
self.lexemes.push_back(lexeme)
|
||||
self.size += 1
|
||||
|
||||
cdef LexemeC* get(self, String* string) except NULL:
|
||||
|
@ -273,6 +272,7 @@ cdef class Lexicon:
|
|||
cdef unicode unicode_string = string.chars[:string.n]
|
||||
lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
|
||||
self._dict.set(string.key, lex)
|
||||
self.lexemes.push_back(lex)
|
||||
self.size += 1
|
||||
return lex
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ cpdef enum LexFloats:
|
|||
|
||||
|
||||
cpdef enum LexStrs:
|
||||
LexStr_key
|
||||
LexStr_orig
|
||||
LexStr_casefix
|
||||
LexStr_shape
|
||||
LexStr_unsparse
|
||||
|
@ -70,6 +70,7 @@ cdef struct LexemeC:
|
|||
flag_t orth_flags
|
||||
flag_t dist_flags
|
||||
|
||||
|
||||
cpdef dict get_lexeme_dict(size_t i, unicode string)
|
||||
|
||||
cdef char* intern_and_encode(unicode string, size_t* length) except NULL
|
||||
|
|
|
@ -19,8 +19,8 @@ cpdef dict get_lexeme_dict(size_t i, unicode string):
|
|||
floats[<int>LexFloat_sentiment] = 0
|
||||
|
||||
strings = [None for _ in range(LexStr_N)]
|
||||
strings[<int>LexStr_key] = string
|
||||
strings[<int>LexStr_casefix] = strings[<int>LexStr_key]
|
||||
strings[<int>LexStr_orig] = string
|
||||
strings[<int>LexStr_casefix] = strings[<int>LexStr_orig]
|
||||
strings[<int>LexStr_shape] = orth.word_shape(string)
|
||||
strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
|
||||
strings[<int>LexStr_asciied] = orth.asciied(string)
|
||||
|
@ -42,9 +42,9 @@ def get_orth_flags(unicode string):
|
|||
flags |= orth.is_space(string) << LexOrth_space
|
||||
flags |= orth.is_title(string) << LexOrth_title
|
||||
flags |= orth.is_upper(string) << LexOrth_upper
|
||||
|
||||
return flags
|
||||
|
||||
|
||||
def get_dist_flags(unicode string):
|
||||
return 0
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ from libcpp.vector cimport vector
|
|||
cdef class Tokens:
|
||||
cdef vector[LexemeC*] *v
|
||||
|
||||
cpdef int id(self, size_t i) except -1
|
||||
cpdef unicode string(self, size_t i)
|
||||
cpdef float prob(self, size_t i) except 1
|
||||
cpdef int cluster(self, size_t i) except *
|
||||
|
|
|
@ -40,8 +40,11 @@ cdef class Tokens:
|
|||
def append(self, Lexeme lexeme):
|
||||
self.v.push_back(lexeme._c)
|
||||
|
||||
cpdef int id(self, size_t i) except -1:
|
||||
return self.v.at(i).ints[<int>LexInt_i]
|
||||
|
||||
cpdef unicode string(self, size_t i):
|
||||
cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_key]
|
||||
cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_orig]
|
||||
cdef unicode string = utf8_string.decode('utf8')
|
||||
return string
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ cdef class Lexeme:
|
|||
|
||||
property string:
|
||||
def __get__(self):
|
||||
cdef bytes utf8_string = self._c.strings[<int>LexStr_key]
|
||||
cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
|
||||
cdef unicode string = utf8_string.decode('utf8')
|
||||
return string
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user