mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Add get_string_id helper to spacy.strings
This commit is contained in:
parent
cc1ea03004
commit
16fd8dce1d
|
@ -9,6 +9,7 @@ from libc.stdint cimport uint32_t
|
|||
from murmurhash.mrmr cimport hash64, hash32
|
||||
import srsly
|
||||
|
||||
from .compat import basestring_
|
||||
from .symbols import IDS as SYMBOLS_BY_STR
|
||||
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||
from .typedefs cimport hash_t
|
||||
|
@ -16,6 +17,24 @@ from .errors import Errors
|
|||
from . import util
|
||||
|
||||
|
||||
def get_string_id(key):
    """Resolve a key to its string ID, respecting the reserved symbols.

    A key that is not a string is taken to be an ID already and is handed
    back unchanged. A string that appears in the reserved symbols table maps
    to its symbol ID, the empty string maps to 0, and every other string is
    encoded as UTF-8 and passed to `hash_utf8`.

    Convenience is favoured over speed here, so avoid calling this function
    in tight loops.
    """
    # Anything that isn't a string is assumed to be an ID already.
    if not isinstance(key, basestring_):
        return key
    # Reserved symbols are looked up before any hashing is attempted.
    if key in SYMBOLS_BY_STR:
        return SYMBOLS_BY_STR[key]
    # The empty string is pinned to ID 0 rather than being hashed.
    if not key:
        return 0
    encoded = key.encode('utf8')
    return hash_utf8(encoded, len(encoded))
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
    """Hash a unicode string by encoding it as UTF-8 and calling `hash_utf8`.

    The `except 0` clause in the signature declares 0 as the error return
    value, which lets exceptions raised here propagate to callers.
    """
    encoded = string.encode('utf8')
    return hash_utf8(encoded, len(encoded))
|
||||
|
|
Loading…
Reference in New Issue
Block a user