Mirror of https://github.com/explosion/spaCy.git
Update Cython string types (#9143)
* Replace all basestring references with unicode

  `basestring` was a compatibility type introduced by Cython to make dealing with UTF-8 strings in Python 2 easier. In Python 3 it is equivalent to the unicode (or str) type. I replaced all references to basestring with unicode, since that was used elsewhere, but we could also just replace them with str, which should also be equivalent. All tests pass locally.

* Replace all references to unicode type with str

  Since we only support Python 3, this is simpler.

* Remove all references to unicode type

  This removes all references to the unicode type across the codebase and replaces them with `str`, which makes it more drastic than the prior commits. To make this work, the `unicode_literals` import had to be removed, and one explicit unicode literal also had to be removed (it is unclear why this is necessary in Cython with language level 3, but without doing it there were errors about implicit conversion). Where `unicode` is used as a type in comments, it was also edited to `str`. Additionally, `coding: utf8` headers were removed from a few files.
This commit is contained in:
parent c5de9b463a
commit 0f01f46e02
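
As a rough illustration of the mechanical rewrite this commit applies (a hypothetical sketch, not taken from the diff below; the module and function names here are made up), a Cython 3 module before and after the change might look like this:

# sketch.pyx -- hypothetical example for illustration, not part of this commit
# cython: language_level=3

def normalize_key(strings, key):
    # Before: isinstance(key, basestring); an intermediate step used
    # isinstance(key, unicode). With Python 3 only, plain str covers both.
    if isinstance(key, str):
        return strings.add(key)
    return key

cdef str first_token(str text):
    # Declarations written as `cdef unicode ...` / `unicode text` become `str`,
    # and literals no longer need the u'' prefix (e.g. hash_string('subtok')).
    cdef str span = text.strip()      # was: cdef unicode span
    return span.split()[0] if span else span
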
@@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
-            if strings_map is not None and isinstance(value, basestring):
+            if strings_map is not None and isinstance(value, str):
                 if hasattr(strings_map, 'add'):
                     value = strings_map.add(value)
                 else:
spacy/kb.pyx (20 changed lines)
@@ -122,7 +122,7 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]

-    def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
+    def add_entity(self, str entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
         Return the hash of the entity ID/name at the end.
@@ -182,15 +182,15 @@ cdef class KnowledgeBase:

             i += 1

-    def contains_entity(self, unicode entity):
+    def contains_entity(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings.add(entity)
         return entity_hash in self._entry_index

-    def contains_alias(self, unicode alias):
+    def contains_alias(self, str alias):
         cdef hash_t alias_hash = self.vocab.strings.add(alias)
         return alias_hash in self._alias_index

-    def add_alias(self, unicode alias, entities, probabilities):
+    def add_alias(self, str alias, entities, probabilities):
         """
         For a given alias, add its potential entities and prior probabilies to the KB.
         Return the alias_hash at the end
@@ -236,7 +236,7 @@ cdef class KnowledgeBase:
             raise RuntimeError(Errors.E891.format(alias=alias))
         return alias_hash

-    def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
+    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
         """
         For an alias already existing in the KB, extend its potential entities with one more.
         Throw a warning if either the alias or the entity is unknown,
@@ -283,7 +283,7 @@ cdef class KnowledgeBase:
         alias_entry.probs = probs
         self._aliases_table[alias_index] = alias_entry

-    def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
+    def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
         """
         Return candidate entities for an alias. Each candidate defines the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
@@ -304,7 +304,7 @@ cdef class KnowledgeBase:
                 for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                 if entry_index != 0]

-    def get_vector(self, unicode entity):
+    def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]

         # Return an empty list if this entity is unknown in this KB
@@ -314,7 +314,7 @@ cdef class KnowledgeBase:

         return self._vectors_table[self._entries[entry_index].vector_index]

-    def get_prior_prob(self, unicode entity, unicode alias):
+    def get_prior_prob(self, str entity, str alias):
         """ Return the prior probability of a given alias being linked to a given entity,
         or return 0.0 when this combination is not known in the knowledge base"""
         cdef hash_t alias_hash = self.vocab.strings[alias]
@@ -582,7 +582,7 @@ cdef class Writer:
     def __init__(self, path):
         assert isinstance(path, Path)
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'wb')
         if not self._fp:
             raise IOError(Errors.E146.format(path=path))
@@ -624,7 +624,7 @@ cdef class Writer:
 cdef class Reader:
     def __init__(self, path):
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'rb')
         if not self._fp:
             PyErr_SetFromErrno(IOError)
@@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
         Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.

-        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        univ_pos (str / int): The token's universal part-of-speech tag.
         morphology (dict): The token's morphological features following the
             Universal Dependencies scheme.
         """
@@ -284,7 +284,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.lower]

-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.lower = self.vocab.strings.add(x)

     property norm_:
@@ -294,7 +294,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.norm]

-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.norm = self.vocab.strings.add(x)

     property shape_:
@@ -304,7 +304,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.shape]

-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.shape = self.vocab.strings.add(x)

     property prefix_:
@@ -314,7 +314,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.prefix]

-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.prefix = self.vocab.strings.add(x)

     property suffix_:
@@ -324,7 +324,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.suffix]

-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.suffix = self.vocab.strings.add(x)

     property lang_:
@@ -332,7 +332,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.lang]

-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.lang = self.vocab.strings.add(x)

     property flags:
@@ -151,9 +151,9 @@ cdef class DependencyMatcher:
         Creates a token key to be used by the matcher
         """
         return self._normalize_key(
-            unicode(key) + DELIMITER +
-            unicode(pattern_idx) + DELIMITER +
-            unicode(token_idx)
+            str(key) + DELIMITER +
+            str(pattern_idx) + DELIMITER +
+            str(token_idx)
         )

     def add(self, key, patterns, *, on_match=None):
@@ -438,7 +438,7 @@ cdef class DependencyMatcher:
         return candidate_children

     def _normalize_key(self, key):
-        if isinstance(key, basestring):
+        if isinstance(key, str):
             return self.vocab.strings.add(key)
         else:
             return key
@@ -317,7 +317,7 @@ cdef class Matcher:
         return final_matches

     def _normalize_key(self, key):
-        if isinstance(key, basestring):
+        if isinstance(key, str):
             return self.vocab.strings.add(key)
         else:
             return key
@@ -365,7 +365,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     for i, token in enumerate(doclike):
         for name, index in extensions.items():
             value = token._.get(name)
-            if isinstance(value, basestring):
+            if isinstance(value, str):
                 value = token.vocab.strings[value]
             extra_attr_values[i * nr_extra_attr + index] = value
     # Main loop
@@ -791,7 +791,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
 def _get_attr_values(spec, string_store):
     attr_values = []
     for attr, value in spec.items():
-        if isinstance(attr, basestring):
+        if isinstance(attr, str):
             attr = attr.upper()
             if attr == '_':
                 continue
@@ -802,7 +802,7 @@ def _get_attr_values(spec, string_store):
             if attr == "IS_SENT_START":
                 attr = "SENT_START"
             attr = IDS.get(attr)
-        if isinstance(value, basestring):
+        if isinstance(value, str):
             value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
@@ -943,7 +943,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
     seen_predicates = {pred.key: pred.i for pred in extra_predicates}
     output = []
     for attr, value in spec.items():
-        if isinstance(attr, basestring):
+        if isinstance(attr, str):
             if attr == "_":
                 output.extend(
                     _get_extension_extra_predicates(
@@ -1000,7 +1000,7 @@ def _get_operators(spec):
               "?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
     # Fix casing
     spec = {key.upper(): values for key, values in spec.items()
-            if isinstance(key, basestring)}
+            if isinstance(key, str)}
     if "OP" not in spec:
         return (ONE,)
     elif spec["OP"] in lookup:
@@ -1018,7 +1018,7 @@ def _get_extensions(spec, string_store, name2index):
         if isinstance(value, dict):
             # Handle predicates (e.g. "IN", in the extra_predicates, not here.
             continue
-        if isinstance(value, basestring):
+        if isinstance(value, str):
             value = string_store.add(value)
         if name not in name2index:
             name2index[name] = len(name2index)
@@ -17,7 +17,7 @@ from ...errors import Errors
 from thinc.extra.search cimport Beam

 cdef weight_t MIN_SCORE = -90000
-cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
+cdef attr_t SUBTOK_LABEL = hash_string('subtok')

 DEF NON_MONOTONIC = True

@@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
 from .typedefs cimport attr_t, hash_t


-cpdef hash_t hash_string(unicode string) except 0
+cpdef hash_t hash_string(str string) except 0
 cdef hash_t hash_utf8(char* utf8_string, int length) nogil

-cdef unicode decode_Utf8Str(const Utf8Str* string)
+cdef str decode_Utf8Str(const Utf8Str* string)


 ctypedef union Utf8Str:
@@ -25,5 +25,5 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map

-    cdef const Utf8Str* intern_unicode(self, unicode py_string)
+    cdef const Utf8Str* intern_unicode(self, str py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
@@ -33,7 +33,7 @@ def get_string_id(key):
         return hash_utf8(chars, len(chars))


-cpdef hash_t hash_string(unicode string) except 0:
+cpdef hash_t hash_string(str string) except 0:
     chars = string.encode("utf8")
     return hash_utf8(chars, len(chars))

@@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
     return hash32(utf8_string, length, 1)


-cdef unicode decode_Utf8Str(const Utf8Str* string):
+cdef str decode_Utf8Str(const Utf8Str* string):
     cdef int i, length
     if string.s[0] < sizeof(string.s) and string.s[0] != 0:
         return string.s[1:string.s[0]+1].decode("utf8")
@@ -107,17 +107,17 @@ cdef class StringStore:
     def __getitem__(self, object string_or_id):
         """Retrieve a string from a given hash, or vice versa.

-        string_or_id (bytes, unicode or uint64): The value to encode.
+        string_or_id (bytes, str or uint64): The value to encode.
         Returns (str / uint64): The value to be retrieved.
         """
-        if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
+        if isinstance(string_or_id, str) and len(string_or_id) == 0:
             return 0
         elif string_or_id == 0:
             return ""
         elif string_or_id in SYMBOLS_BY_STR:
             return SYMBOLS_BY_STR[string_or_id]
         cdef hash_t key
-        if isinstance(string_or_id, unicode):
+        if isinstance(string_or_id, str):
             key = hash_string(string_or_id)
             return key
         elif isinstance(string_or_id, bytes):
@@ -135,14 +135,14 @@ cdef class StringStore:

     def as_int(self, key):
         """If key is an int, return it; otherwise, get the int value."""
-        if not isinstance(key, basestring):
+        if not isinstance(key, str):
             return key
         else:
             return self[key]

     def as_string(self, key):
         """If key is a string, return it; otherwise, get the string value."""
-        if isinstance(key, basestring):
+        if isinstance(key, str):
             return key
         else:
             return self[key]
@@ -153,7 +153,7 @@ cdef class StringStore:
         string (str): The string to add.
         RETURNS (uint64): The string's hash value.
         """
-        if isinstance(string, unicode):
+        if isinstance(string, str):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             key = hash_string(string)
@@ -189,7 +189,7 @@ cdef class StringStore:
             return True
         elif string in SYMBOLS_BY_STR:
             return True
-        elif isinstance(string, unicode):
+        elif isinstance(string, str):
             key = hash_string(string)
         else:
             string = string.encode("utf8")
@@ -269,7 +269,7 @@ cdef class StringStore:
         for string in strings:
             self.add(string)

-    cdef const Utf8Str* intern_unicode(self, unicode py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
         return self._intern_utf8(byte_string, len(byte_string))
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest


@@ -26,7 +26,7 @@ cdef class Tokenizer:
     cdef int _property_init_count # TODO: unused, remove in v3.1
     cdef int _property_init_max # TODO: unused, remove in v3.1

-    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
+    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
     cdef void _filter_special_spans(self, vector[SpanC] &original,
                                     vector[SpanC] &filtered, int doc_len) nogil
@@ -37,13 +37,13 @@ cdef class Tokenizer:
     cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
                                      int* has_special,
                                      bint with_special_cases) except -1
-    cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
+    cdef int _tokenize(self, Doc tokens, str span, hash_t key,
                        int* has_special, bint with_special_cases) except -1
-    cdef unicode _split_affixes(self, Pool mem, unicode string,
+    cdef str _split_affixes(self, Pool mem, str string,
                                 vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes, int* has_special,
                                 bint with_special_cases)
-    cdef int _attach_tokens(self, Doc tokens, unicode string,
+    cdef int _attach_tokens(self, Doc tokens, str string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes, int* has_special,
                             bint with_special_cases) except -1
@@ -1,6 +1,4 @@
 # cython: embedsignature=True, profile=True, binding=True
-from __future__ import unicode_literals
-
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from libc.string cimport memcpy, memset
@@ -132,7 +130,7 @@ cdef class Tokenizer:
                 self.url_match)
         return (self.__class__, args, None, None)

-    def __call__(self, unicode string):
+    def __call__(self, str string):
         """Tokenize a string.

         string (str): The string to tokenize.
@@ -145,7 +143,7 @@ cdef class Tokenizer:
         return doc

     @cython.boundscheck(False)
-    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
+    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases):
         """Tokenize according to affix and token_match settings.

         string (str): The string to tokenize.
@@ -161,7 +159,7 @@ cdef class Tokenizer:
         cdef int start = 0
         cdef int has_special = 0
         cdef bint in_ws = string[0].isspace()
-        cdef unicode span
+        cdef str span
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
@@ -373,7 +371,7 @@ cdef class Tokenizer:
                 return False
         return True

-    cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
+    cdef int _tokenize(self, Doc tokens, str span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
@@ -385,16 +383,16 @@ cdef class Tokenizer:
             self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                               tokens.length - orig_size)

-    cdef unicode _split_affixes(self, Pool mem, unicode string,
+    cdef str _split_affixes(self, Pool mem, str string,
                                 vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes,
                                 int* has_special,
                                 bint with_special_cases):
         cdef size_t i
-        cdef unicode prefix
-        cdef unicode suffix
-        cdef unicode minus_pre
-        cdef unicode minus_suf
+        cdef str prefix
+        cdef str suffix
+        cdef str minus_pre
+        cdef str minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
             if self.token_match and self.token_match(string):
@@ -430,7 +428,7 @@ cdef class Tokenizer:
                 suffixes.push_back(self.vocab.get(mem, suffix))
         return string

-    cdef int _attach_tokens(self, Doc tokens, unicode string,
+    cdef int _attach_tokens(self, Doc tokens, str string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
@@ -440,7 +438,7 @@ cdef class Tokenizer:
         cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
-        cdef unicode span
+        cdef str span
         cdef int i
         if prefixes.size():
             for i in range(prefixes.size()):
@@ -513,7 +511,7 @@ cdef class Tokenizer:
         cached.data.lexemes = <const LexemeC* const*>lexemes
         self._cache.set(key, cached)

-    def find_infix(self, unicode string):
+    def find_infix(self, str string):
         """Find internal split points of the string, such as hyphens.

         string (str): The string to segment.
@@ -527,7 +525,7 @@ cdef class Tokenizer:
             return 0
         return list(self.infix_finditer(string))

-    def find_prefix(self, unicode string):
+    def find_prefix(self, str string):
         """Find the length of a prefix that should be segmented from the
         string, or None if no prefix rules match.

@@ -541,7 +539,7 @@ cdef class Tokenizer:
         match = self.prefix_search(string)
         return (match.end() - match.start()) if match is not None else 0

-    def find_suffix(self, unicode string):
+    def find_suffix(self, str string):
         """Find the length of a suffix that should be segmented from the
         string, or None if no suffix rules match.

@@ -579,7 +577,7 @@ cdef class Tokenizer:
             if attr not in (ORTH, NORM):
                 raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))

-    def add_special_case(self, unicode string, substrings):
+    def add_special_case(self, str string, substrings):
         """Add a special-case tokenization rule.

         string (str): The string to specially tokenize.
@@ -36,7 +36,7 @@ class DocBin:
         "spans": List[Dict[str, bytes]], # SpanGroups data for each doc
         "spaces": bytes, # Serialized numpy boolean array with spaces data
         "lengths": bytes, # Serialized numpy int32 array with the doc lengths
-        "strings": List[unicode] # List of unique strings in the token data
+        "strings": List[str] # List of unique strings in the token data
         "version": str, # DocBin version number
     }

@@ -260,7 +260,7 @@ cdef class Doc:
             raise ValueError(Errors.E027)
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
-            if isinstance(word, unicode):
+            if isinstance(word, str):
                 lexeme = self.vocab.get(self.mem, word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
@@ -1362,7 +1362,7 @@ cdef class Doc:
         self.has_unknown_spaces = msg["has_unknown_spaces"]
         start = 0
         cdef const LexemeC* lex
-        cdef unicode orth_
+        cdef str orth_
         text = msg["text"]
         attrs = msg["array_body"]
         for i in range(attrs.shape[0]):
@@ -1423,7 +1423,7 @@ cdef class Doc:
         attributes are inherited from the syntactic root of the span.
         RETURNS (Token): The first newly merged token.
         """
-        cdef unicode tag, lemma, ent_type
+        cdef str tag, lemma, ent_type
         attr_len = len(attributes)
         span_len = len(spans)
         if not attr_len == span_len:
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
 cimport numpy as np
 from libc.math cimport sqrt

@@ -745,7 +743,7 @@ cdef class Span:
         def __get__(self):
             return self.root.ent_id_

-        def __set__(self, unicode key):
+        def __set__(self, str key):
             raise NotImplementedError(Errors.E200.format(attr="ent_id_"))

     @property
@@ -766,7 +764,7 @@ cdef class Span:
         def __get__(self):
             return self.doc.vocab.strings[self.label]

-        def __set__(self, unicode label_):
+        def __set__(self, str label_):
             self.label = self.doc.vocab.strings.add(label_)

     property kb_id_:
@@ -774,7 +772,7 @@ cdef class Span:
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]

-        def __set__(self, unicode kb_id_):
+        def __set__(self, str kb_id_):
             self.kb_id = self.doc.vocab.strings.add(kb_id_)


@@ -267,7 +267,7 @@ cdef class Token:
         """RETURNS (str): The text content of the span (with trailing
         whitespace).
         """
-        cdef unicode orth = self.vocab.strings[self.c.lex.orth]
+        cdef str orth = self.vocab.strings[self.c.lex.orth]
         if self.c.spacy:
             return orth + " "
         else:
@@ -820,7 +820,7 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.norm]

-        def __set__(self, unicode norm_):
+        def __set__(self, str norm_):
             self.c.norm = self.vocab.strings.add(norm_)

     @property
@@ -858,7 +858,7 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]

-        def __set__(self, unicode lemma_):
+        def __set__(self, str lemma_):
             self.c.lemma = self.vocab.strings.add(lemma_)

     property pos_:
@@ -890,7 +890,7 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.dep]

-        def __set__(self, unicode label):
+        def __set__(self, str label):
             self.c.dep = self.vocab.strings.add(label)

     @property
@@ -36,12 +36,12 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg

-    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* get(self, Pool mem, str string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL

-    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
-    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL

     cdef PreshMap _by_orth
@@ -60,7 +60,7 @@ cdef class Vocab:
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
-        vectors_name (unicode): Optional name to identify the vectors table.
+        vectors_name (str): Optional name to identify the vectors table.
         get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]):
             A function that yields base noun phrases used for Doc.noun_chunks.
         """
@@ -105,7 +105,7 @@ cdef class Vocab:
         See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
         `Token.check_flag`.

-        flag_getter (callable): A function `f(unicode) -> bool`, to get the
+        flag_getter (callable): A function `f(str) -> bool`, to get the
             flag value.
         flag_id (int): An integer between 1 and 63 (inclusive), specifying
             the bit at which the flag will be stored. If -1, the lowest
@@ -128,7 +128,7 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id

-    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
+    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -162,7 +162,7 @@ cdef class Vocab:
         else:
             return self._new_lexeme(mem, self.strings[orth])

-    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
         # I think this heuristic is bad, and the Vocab should always
         # own the lexemes. It avoids weird bugs this way, as it's how the thing
         # was originally supposed to work. The best solution to the growing
@@ -184,7 +184,7 @@ cdef class Vocab:
         if self.lex_attr_getters is not None:
             for attr, func in self.lex_attr_getters.items():
                 value = func(string)
-                if isinstance(value, unicode):
+                if isinstance(value, str):
                     value = self.strings.add(value)
                 if value is not None:
                     Lexeme.set_struct_attr(lex, attr, value)
@@ -201,7 +201,7 @@ cdef class Vocab:
     def __contains__(self, key):
         """Check whether the string or int key has an entry in the vocabulary.

-        string (unicode): The ID string.
+        string (str): The ID string.
         RETURNS (bool) Whether the string has an entry in the vocabulary.

         DOCS: https://spacy.io/api/vocab#contains
@@ -209,7 +209,7 @@ cdef class Vocab:
         cdef hash_t int_key
         if isinstance(key, bytes):
             int_key = self.strings[key.decode("utf8")]
-        elif isinstance(key, unicode):
+        elif isinstance(key, str):
             int_key = self.strings[key]
         else:
             int_key = key
@@ -234,7 +234,7 @@ cdef class Vocab:
         previously unseen unicode string is given, a new lexeme is created and
         stored.

-        id_or_string (int or unicode): The integer ID of a word, or its unicode
+        id_or_string (int or str): The integer ID of a word, or its unicode
             string. If `int >= Lexicon.size`, `IndexError` is raised. If
             `id_or_string` is neither an int nor a unicode string, `ValueError`
             is raised.
@@ -247,7 +247,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#getitem
         """
         cdef attr_t orth
-        if isinstance(id_or_string, unicode):
+        if isinstance(id_or_string, str):
             orth = self.strings.add(id_or_string)
         else:
             orth = id_or_string
|
||||||
If `minn` is defined, then the resulting vector uses Fasttext's
|
If `minn` is defined, then the resulting vector uses Fasttext's
|
||||||
subword features by average over ngrams of `orth`.
|
subword features by average over ngrams of `orth`.
|
||||||
|
|
||||||
orth (int / unicode): The hash value of a word, or its unicode string.
|
orth (int / str): The hash value of a word, or its unicode string.
|
||||||
minn (int): Minimum n-gram length used for Fasttext's ngram computation.
|
minn (int): Minimum n-gram length used for Fasttext's ngram computation.
|
||||||
Defaults to the length of `orth`.
|
Defaults to the length of `orth`.
|
||||||
maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
|
maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
|
||||||
|
@ -401,7 +401,7 @@ cdef class Vocab:
|
||||||
"""Set a vector for a word in the vocabulary. Words can be referenced
|
"""Set a vector for a word in the vocabulary. Words can be referenced
|
||||||
by string or int ID.
|
by string or int ID.
|
||||||
|
|
||||||
orth (int / unicode): The word.
|
orth (int / str): The word.
|
||||||
vector (numpy.ndarray or cupy.nadarry[ndim=1, dtype='float32']): The vector to set.
|
vector (numpy.ndarray or cupy.nadarry[ndim=1, dtype='float32']): The vector to set.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab#set_vector
|
DOCS: https://spacy.io/api/vocab#set_vector
|
||||||
|
@ -423,7 +423,7 @@ cdef class Vocab:
|
||||||
"""Check whether a word has a vector. Returns False if no vectors have
|
"""Check whether a word has a vector. Returns False if no vectors have
|
||||||
been loaded. Words can be looked up by string or int ID.
|
been loaded. Words can be looked up by string or int ID.
|
||||||
|
|
||||||
orth (int / unicode): The word.
|
orth (int / str): The word.
|
||||||
RETURNS (bool): Whether the word has a vector.
|
RETURNS (bool): Whether the word has a vector.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab#has_vector
|
DOCS: https://spacy.io/api/vocab#has_vector
|
||||||
|
@ -448,7 +448,7 @@ cdef class Vocab:
|
||||||
def to_disk(self, path, *, exclude=tuple()):
|
def to_disk(self, path, *, exclude=tuple()):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory, which will be created if
|
path (str or Path): A path to a directory, which will be created if
|
||||||
it doesn't exist.
|
it doesn't exist.
|
||||||
exclude (list): String names of serialization fields to exclude.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
|
|
||||||
|
@ -469,7 +469,7 @@ cdef class Vocab:
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
returns it.
|
returns it.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory.
|
path (str or Path): A path to a directory.
|
||||||
exclude (list): String names of serialization fields to exclude.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
RETURNS (Vocab): The modified `Vocab` object.
|
RETURNS (Vocab): The modified `Vocab` object.
|
||||||
|
|
||||||
|
|