mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Introduce a TokenC struct, to handle token indices, pos tags and sense tags
This commit is contained in:
parent
187372c7f3
commit
1c9253701d
|
@ -6,7 +6,7 @@ from preshed.maps cimport PreshMap
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from .tokens cimport Tokens
|
||||
from .tokens cimport Tokens, TokenC
|
||||
from .lexeme cimport Lexeme
|
||||
from .utf8string cimport StringStore, UniStr
|
||||
|
||||
|
@ -45,5 +45,5 @@ cdef class Language:
|
|||
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1
|
||||
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ from preshed.maps cimport PreshMap
|
|||
from .lexeme cimport Lexeme
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport init as lexeme_init
|
||||
from .lexeme cimport check_flag, IS_ALPHA
|
||||
from .lexeme cimport check_flag
|
||||
|
||||
from .utf8string cimport slice_unicode
|
||||
|
||||
|
@ -114,7 +114,7 @@ cdef class Language:
|
|||
orig_size = tokens.length
|
||||
self._split_affixes(span, &prefixes, &suffixes)
|
||||
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
|
||||
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
|
||||
|
||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
|
||||
vector[const Lexeme*] *suffixes) except NULL:
|
||||
|
@ -189,14 +189,14 @@ cdef class Language:
|
|||
idx = tokens.push_back(idx, deref(it))
|
||||
preinc(it)
|
||||
|
||||
cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1:
|
||||
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
|
||||
cdef int i
|
||||
for i in range(n):
|
||||
if tokens[i].id == 1:
|
||||
if tokens[i].lex.id == 1:
|
||||
return 0
|
||||
lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
|
||||
for i in range(n):
|
||||
lexemes[i] = tokens[i]
|
||||
lexemes[i] = tokens[i].lex
|
||||
lexemes[i + 1] = NULL
|
||||
self._cache.set(key, lexemes)
|
||||
|
||||
|
@ -255,7 +255,9 @@ cdef class Lexicon:
|
|||
self.set_flags = set_flags
|
||||
|
||||
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
|
||||
'''Retrieve a pointer to a Lexeme from the lexicon.'''
|
||||
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||
cdef Lexeme* lex
|
||||
lex = <Lexeme*>self._map.get(string.key)
|
||||
if lex != NULL:
|
||||
|
|
|
@ -9,18 +9,22 @@ from .typedefs cimport flags_t
|
|||
from .utf8string cimport StringStore
|
||||
|
||||
|
||||
cdef struct TokenC:
|
||||
const Lexeme* lex
|
||||
int idx
|
||||
int pos
|
||||
int sense
|
||||
|
||||
|
||||
cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0)
|
||||
|
||||
|
||||
cdef class Tokens:
|
||||
cdef Pool mem
|
||||
cdef StringStore _string_store
|
||||
|
||||
cdef const Lexeme** _lex_ptr
|
||||
cdef int* _idx_ptr
|
||||
cdef int* _pos_ptr
|
||||
cdef int* _ner_ptr
|
||||
cdef const Lexeme** lex
|
||||
cdef int* idx
|
||||
cdef int* pos
|
||||
cdef int* ner
|
||||
cdef TokenC* _data
|
||||
cdef TokenC* data
|
||||
|
||||
cdef int length
|
||||
cdef int max_length
|
||||
|
|
|
@ -40,28 +40,18 @@ cdef class Tokens:
|
|||
# Guarantee self.data[i-x], for any i >= 0 and x < padding, is in bounds
|
||||
# However, we need to remember the true starting places, so that we can
|
||||
# realloc.
|
||||
self._lex_ptr = <const Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
|
||||
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
||||
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
||||
self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
||||
self.lex = self._lex_ptr
|
||||
self.idx = self._idx_ptr
|
||||
self.pos = self._pos_ptr
|
||||
self.ner = self._ner_ptr
|
||||
self._data = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
||||
cdef int i
|
||||
for i in range(size + (PADDING*2)):
|
||||
self.lex[i] = &EMPTY_LEXEME
|
||||
self.lex += PADDING
|
||||
self.idx += PADDING
|
||||
self.pos += PADDING
|
||||
self.ner += PADDING
|
||||
self._data[i] = EMPTY_TOKEN
|
||||
self.data = self._data + PADDING
|
||||
self.max_length = size
|
||||
self.length = 0
|
||||
|
||||
def __getitem__(self, i):
|
||||
bounds_check(i, self.length, PADDING)
|
||||
return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
|
||||
self.lex[i][0])
|
||||
return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
|
||||
self.data[i].sense, self.data[i].lex[0])
|
||||
|
||||
def __iter__(self):
|
||||
for i in range(self.length):
|
||||
|
@ -73,10 +63,11 @@ cdef class Tokens:
|
|||
cdef int push_back(self, int idx, const Lexeme* lexeme) except -1:
|
||||
if self.length == self.max_length:
|
||||
self._realloc(self.length * 2)
|
||||
self.lex[self.length] = lexeme
|
||||
self.idx[self.length] = idx
|
||||
self.pos[self.length] = 0
|
||||
self.ner[self.length] = 0
|
||||
cdef TokenC* t = &self.data[self.length]
|
||||
t.lex = lexeme
|
||||
t.idx = idx
|
||||
t.pos = 0
|
||||
t.sense = 0
|
||||
self.length += 1
|
||||
return idx + lexeme.length
|
||||
|
||||
|
@ -108,7 +99,7 @@ cdef class Tokens:
|
|||
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
|
||||
for i in range(self.length):
|
||||
for j, feature in enumerate(attr_ids):
|
||||
output[i, j] = get_attr(self.lex[i], feature)
|
||||
output[i, j] = get_attr(self.data[i].lex, feature)
|
||||
return output
|
||||
|
||||
def count_by(self, attr_id_t attr_id):
|
||||
|
@ -118,23 +109,18 @@ cdef class Tokens:
|
|||
|
||||
cdef PreshCounter counts = PreshCounter(2 ** 8)
|
||||
for i in range(self.length):
|
||||
attr = get_attr(self.lex[i], attr_id)
|
||||
attr = get_attr(self.data[i].lex, attr_id)
|
||||
counts.inc(attr, 1)
|
||||
return dict(counts)
|
||||
|
||||
def _realloc(self, new_size):
|
||||
self.max_length = new_size
|
||||
n = new_size + (PADDING * 2)
|
||||
self._lex_ptr = <const Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
|
||||
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
|
||||
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
|
||||
self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
|
||||
self.lex = self._lex_ptr + PADDING
|
||||
self.idx = self._idx_ptr + PADDING
|
||||
self.pos = self._pos_ptr + PADDING
|
||||
self.ner = self._ner_ptr + PADDING
|
||||
self._data = <TokenC*>self.mem.realloc(self._data, n * sizeof(TokenC))
|
||||
self.data = self._data + PADDING
|
||||
cdef int i
|
||||
for i in range(self.length, self.max_length + PADDING):
|
||||
self.lex[i] = &EMPTY_LEXEME
|
||||
self.data[i] = EMPTY_TOKEN
|
||||
|
||||
|
||||
@cython.freelist(64)
|
||||
|
|
Loading…
Reference in New Issue
Block a user