mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-29 11:26:28 +03:00
59c763eec1
* `strings`: More robust type checking of keys/IDs, coerce `int`-like types to `hash_t` * Preserve existing public API behaviour * Fix return type * Replace `bool` with `bint`, rename to `_try_coerce_to_hash`, replace `id` with `hash` * Avoid unnecessary re-encoding and re-calculation of strings and hashes respectively * Rename variables named `hash` Add comment on early return
30 lines
727 B
Cython
30 lines
727 B
Cython
from libc.stdint cimport int64_t
|
|
from libcpp.vector cimport vector
|
|
from libcpp.set cimport set
|
|
from cymem.cymem cimport Pool
|
|
from preshed.maps cimport PreshMap
|
|
from murmurhash.mrmr cimport hash64
|
|
|
|
from .typedefs cimport attr_t, hash_t
|
|
|
|
|
|
# Hash a Python `str` to a `hash_t`.
# Declared `cpdef`, so it is callable from both Python and Cython code.
# `except 0` reserves 0 as the error sentinel: a return value of 0 tells
# Cython callers that an exception was raised inside the function.
cpdef hash_t hash_string(str string) except 0
|
|
# C-level hash over a raw byte buffer given as pointer + explicit length.
# `nogil` means it may be called without holding the GIL; no exception value
# is declared, so errors cannot be propagated to the caller from here.
# NOTE(review): presumably `utf8_string` holds UTF-8 encoded text and the
# result matches `hash_string` for the same text -- confirm in strings.pyx.
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
|
|
|
|
# Convert a stored `Utf8Str` (passed by const pointer) into a Python `str`.
# NOTE(review): the decoding rules (inline vs. pointer storage, length
# encoding) live in the implementation file and are not visible here.
cdef str decode_Utf8Str(const Utf8Str* string)
|
|
|
|
|
|
# Storage cell for one string: the two members share the same memory, so a
# value is either the 8-byte inline array `s` or the pointer `p`.
# NOTE(review): presumably short strings are kept inline in `s` and longer
# ones are referenced through `p` -- confirm the exact rule in strings.pyx.
ctypedef union Utf8Str:
|
|
# Inline buffer of 8 unsigned bytes.
unsigned char[8] s
|
|
# Pointer to an unsigned byte buffer held elsewhere.
unsigned char* p
|
|
|
|
|
|
# C-level layout and method table of the `StringStore` extension type.
# Other Cython modules cimport this declaration to access the fields and
# `cdef` methods below directly.
cdef class StringStore:
|
|
# cymem `Pool` instance.
# NOTE(review): presumably owns the memory of the stored `Utf8Str` values.
cdef Pool mem
|
|
|
|
# Vector of `hash_t` values.
# NOTE(review): presumably the hashes of all stored strings -- confirm.
cdef vector[hash_t] keys
|
|
# preshed `PreshMap`; `public` exposes this attribute to Python code as well.
# NOTE(review): presumably maps a string hash to its `Utf8Str` -- confirm.
cdef public PreshMap _map
|
|
|
|
# Takes a Python `str` and returns a const pointer to a `Utf8Str`.
cdef const Utf8Str* intern_unicode(self, str py_string)
|
|
# Byte-buffer variant taking pointer + length and a pointer to a hash value.
# NOTE(review): `precalculated_hash` presumably lets callers pass an already
# computed hash to avoid re-hashing; whether NULL is accepted is not visible.
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
|