Update Cython string types (#9143)

* Replace all basestring references with unicode

`basestring` was a compatibility type introduced by Cython to make
dealing with utf-8 strings in Python 2 easier. In Python 3 it is
equivalent to the unicode (or str) type.

I replaced all references to basestring with unicode, since that was
used elsewhere, but we could also just replace them with str, which
should also be equivalent.

All tests pass locally.
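
As a minimal sketch of the kind of check being updated (this shows the net
effect across the commits in this PR; the helper name intern_if_string is
made up for illustration, and strings_map stands in for spaCy's StringStore):

    def intern_if_string(value, strings_map):
        # Previously: isinstance(value, basestring), which under Cython on
        # Python 2 matched both byte strings and unicode strings.
        if isinstance(value, str):
            value = strings_map.add(value)  # intern the str and get its hash
        return value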

* Replace all references to unicode type with str

Since we only support Python 3, this is simpler.

* Remove all references to unicode type

This removes all references to the unicode type across the codebase and
replaces them with `str`, which makes it more drastic than the prior
commits. To make this work, the `unicode_literals` import had to be
removed, and one explicit unicode literal also had to be removed (it is
unclear why this is necessary in Cython with language level 3, but
without it there were errors about implicit conversion).

Where `unicode` was used as a type in comments, it was also changed to
`str`.
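
For illustration, the same pattern as it appears in the hunks below,
assuming the modules are compiled with Cython language level 3 (where a
plain string literal is already unicode):

    # before
    cpdef hash_t hash_string(unicode string) except 0
    cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')

    # after
    cpdef hash_t hash_string(str string) except 0
    cdef attr_t SUBTOK_LABEL = hash_string('subtok')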

Additionally, `coding: utf8` headers were removed from a few files.
Paul O'Leary McCann 2021-09-14 00:02:17 +09:00 committed by GitHub
parent c5de9b463a
commit 0f01f46e02
18 changed files with 90 additions and 97 deletions


@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
if strings_map is not None and isinstance(value, basestring):
if strings_map is not None and isinstance(value, str):
if hasattr(strings_map, 'add'):
value = strings_map.add(value)
else:


@ -122,7 +122,7 @@ cdef class KnowledgeBase:
def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index]
def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
def add_entity(self, str entity, float freq, vector[float] entity_vector):
"""
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
Return the hash of the entity ID/name at the end.
@ -182,15 +182,15 @@ cdef class KnowledgeBase:
i += 1
def contains_entity(self, unicode entity):
def contains_entity(self, str entity):
cdef hash_t entity_hash = self.vocab.strings.add(entity)
return entity_hash in self._entry_index
def contains_alias(self, unicode alias):
def contains_alias(self, str alias):
cdef hash_t alias_hash = self.vocab.strings.add(alias)
return alias_hash in self._alias_index
def add_alias(self, unicode alias, entities, probabilities):
def add_alias(self, str alias, entities, probabilities):
"""
For a given alias, add its potential entities and prior probabilies to the KB.
Return the alias_hash at the end
@ -236,7 +236,7 @@ cdef class KnowledgeBase:
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash
def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
"""
For an alias already existing in the KB, extend its potential entities with one more.
Throw a warning if either the alias or the entity is unknown,
@ -283,7 +283,7 @@ cdef class KnowledgeBase:
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
@ -304,7 +304,7 @@ cdef class KnowledgeBase:
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0]
def get_vector(self, unicode entity):
def get_vector(self, str entity):
cdef hash_t entity_hash = self.vocab.strings[entity]
# Return an empty list if this entity is unknown in this KB
@ -314,7 +314,7 @@ cdef class KnowledgeBase:
return self._vectors_table[self._entries[entry_index].vector_index]
def get_prior_prob(self, unicode entity, unicode alias):
def get_prior_prob(self, str entity, str alias):
""" Return the prior probability of a given alias being linked to a given entity,
or return 0.0 when this combination is not known in the knowledge base"""
cdef hash_t alias_hash = self.vocab.strings[alias]
@ -582,7 +582,7 @@ cdef class Writer:
def __init__(self, path):
assert isinstance(path, Path)
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'wb')
if not self._fp:
raise IOError(Errors.E146.format(path=path))
@ -624,7 +624,7 @@ cdef class Writer:
cdef class Reader:
def __init__(self, path):
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'rb')
if not self._fp:
PyErr_SetFromErrno(IOError)
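
For context, the KnowledgeBase methods in the hunks above now take plain
str arguments; a minimal usage sketch (the entity ID "Q42" and alias
"Douglas" are invented for illustration, and spaCy v3's KnowledgeBase
constructor is assumed):

    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])   # str entity
    kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])  # str alias
    assert kb.contains_entity("Q42") and kb.contains_alias("Douglas")
    candidates = kb.get_alias_candidates("Douglas")  # list of Candidate objects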


@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""


@ -284,7 +284,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.lower]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.lower = self.vocab.strings.add(x)
property norm_:
@ -294,7 +294,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.norm]
def __set__(self, unicode x):
def __set__(self, str x):
self.norm = self.vocab.strings.add(x)
property shape_:
@ -304,7 +304,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.shape]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.shape = self.vocab.strings.add(x)
property prefix_:
@ -314,7 +314,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
@ -324,7 +324,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.suffix = self.vocab.strings.add(x)
property lang_:
@ -332,7 +332,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.lang]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.lang = self.vocab.strings.add(x)
property flags:


@ -151,9 +151,9 @@ cdef class DependencyMatcher:
Creates a token key to be used by the matcher
"""
return self._normalize_key(
unicode(key) + DELIMITER +
unicode(pattern_idx) + DELIMITER +
unicode(token_idx)
str(key) + DELIMITER +
str(pattern_idx) + DELIMITER +
str(token_idx)
)
def add(self, key, patterns, *, on_match=None):
@ -438,7 +438,7 @@ cdef class DependencyMatcher:
return candidate_children
def _normalize_key(self, key):
if isinstance(key, basestring):
if isinstance(key, str):
return self.vocab.strings.add(key)
else:
return key


@ -317,7 +317,7 @@ cdef class Matcher:
return final_matches
def _normalize_key(self, key):
if isinstance(key, basestring):
if isinstance(key, str):
return self.vocab.strings.add(key)
else:
return key
@ -365,7 +365,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
for i, token in enumerate(doclike):
for name, index in extensions.items():
value = token._.get(name)
if isinstance(value, basestring):
if isinstance(value, str):
value = token.vocab.strings[value]
extra_attr_values[i * nr_extra_attr + index] = value
# Main loop
@ -791,7 +791,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
def _get_attr_values(spec, string_store):
attr_values = []
for attr, value in spec.items():
if isinstance(attr, basestring):
if isinstance(attr, str):
attr = attr.upper()
if attr == '_':
continue
@ -802,7 +802,7 @@ def _get_attr_values(spec, string_store):
if attr == "IS_SENT_START":
attr = "SENT_START"
attr = IDS.get(attr)
if isinstance(value, basestring):
if isinstance(value, str):
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)
@ -943,7 +943,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = []
for attr, value in spec.items():
if isinstance(attr, basestring):
if isinstance(attr, str):
if attr == "_":
output.extend(
_get_extension_extra_predicates(
@ -1000,7 +1000,7 @@ def _get_operators(spec):
"?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
# Fix casing
spec = {key.upper(): values for key, values in spec.items()
if isinstance(key, basestring)}
if isinstance(key, str)}
if "OP" not in spec:
return (ONE,)
elif spec["OP"] in lookup:
@ -1018,7 +1018,7 @@ def _get_extensions(spec, string_store, name2index):
if isinstance(value, dict):
# Handle predicates (e.g. "IN", in the extra_predicates, not here.
continue
if isinstance(value, basestring):
if isinstance(value, str):
value = string_store.add(value)
if name not in name2index:
name2index[name] = len(name2index)
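
The _normalize_key change above is what lets Matcher.add keep accepting
plain str keys; a minimal usage sketch (the match key "HELLO_WORLD" and
the pattern are invented for illustration):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
    doc = nlp("Hello world!")
    for match_id, start, end in matcher(doc):
        # match_id is the hash of the str key "HELLO_WORLD"
        print(nlp.vocab.strings[match_id], doc[start:end].text)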


@ -17,7 +17,7 @@ from ...errors import Errors
from thinc.extra.search cimport Beam
cdef weight_t MIN_SCORE = -90000
cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
cdef attr_t SUBTOK_LABEL = hash_string('subtok')
DEF NON_MONOTONIC = True


@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(unicode string) except 0
cpdef hash_t hash_string(str string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef unicode decode_Utf8Str(const Utf8Str* string)
cdef str decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str:
@ -25,5 +25,5 @@ cdef class StringStore:
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, unicode py_string)
cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)


@ -33,7 +33,7 @@ def get_string_id(key):
return hash_utf8(chars, len(chars))
cpdef hash_t hash_string(unicode string) except 0:
cpdef hash_t hash_string(str string) except 0:
chars = string.encode("utf8")
return hash_utf8(chars, len(chars))
@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
cdef unicode decode_Utf8Str(const Utf8Str* string):
cdef str decode_Utf8Str(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode("utf8")
@ -107,17 +107,17 @@ cdef class StringStore:
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, unicode or uint64): The value to encode.
string_or_id (bytes, str or uint64): The value to encode.
Returns (str / uint64): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
if isinstance(string_or_id, str) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return ""
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key
if isinstance(string_or_id, unicode):
if isinstance(string_or_id, str):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
@ -135,14 +135,14 @@ cdef class StringStore:
def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value."""
if not isinstance(key, basestring):
if not isinstance(key, str):
return key
else:
return self[key]
def as_string(self, key):
"""If key is a string, return it; otherwise, get the string value."""
if isinstance(key, basestring):
if isinstance(key, str):
return key
else:
return self[key]
@ -153,7 +153,7 @@ cdef class StringStore:
string (str): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, unicode):
if isinstance(string, str):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
@ -189,7 +189,7 @@ cdef class StringStore:
return True
elif string in SYMBOLS_BY_STR:
return True
elif isinstance(string, unicode):
elif isinstance(string, str):
key = hash_string(string)
else:
string = string.encode("utf8")
@ -269,7 +269,7 @@ cdef class StringStore:
for string in strings:
self.add(string)
cdef const Utf8Str* intern_unicode(self, unicode py_string):
cdef const Utf8Str* intern_unicode(self, str py_string):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string))
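
For context, a minimal usage sketch of the str <-> hash mapping these
StringStore methods implement (the example string is invented for
illustration):

    from spacy.strings import StringStore

    stringstore = StringStore(["apple"])
    apple_hash = stringstore.add("apple")      # uint64 hash of the str "apple"
    assert stringstore[apple_hash] == "apple"  # hash -> str
    assert stringstore["apple"] == apple_hash  # str -> hash
    assert "apple" in stringstore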


@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import pytest


@ -26,7 +26,7 @@ cdef class Tokenizer:
cdef int _property_init_count # TODO: unused, remove in v3.1
cdef int _property_init_max # TODO: unused, remove in v3.1
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1
cdef void _filter_special_spans(self, vector[SpanC] &original,
vector[SpanC] &filtered, int doc_len) nogil
@ -37,13 +37,13 @@ cdef class Tokenizer:
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
int* has_special,
bint with_special_cases) except -1
cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
cdef int _tokenize(self, Doc tokens, str span, hash_t key,
int* has_special, bint with_special_cases) except -1
cdef unicode _split_affixes(self, Pool mem, unicode string,
cdef str _split_affixes(self, Pool mem, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases)
cdef int _attach_tokens(self, Doc tokens, unicode string,
cdef int _attach_tokens(self, Doc tokens, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases) except -1


@ -1,6 +1,4 @@
# cython: embedsignature=True, profile=True, binding=True
from __future__ import unicode_literals
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from libc.string cimport memcpy, memset
@ -132,7 +130,7 @@ cdef class Tokenizer:
self.url_match)
return (self.__class__, args, None, None)
def __call__(self, unicode string):
def __call__(self, str string):
"""Tokenize a string.
string (str): The string to tokenize.
@ -145,7 +143,7 @@ cdef class Tokenizer:
return doc
@cython.boundscheck(False)
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases):
"""Tokenize according to affix and token_match settings.
string (str): The string to tokenize.
@ -161,7 +159,7 @@ cdef class Tokenizer:
cdef int start = 0
cdef int has_special = 0
cdef bint in_ws = string[0].isspace()
cdef unicode span
cdef str span
# The task here is much like string.split, but not quite
# We find spans of whitespace and non-space characters, and ignore
# spans that are exactly ' '. So, our sequences will all be separated
@ -373,7 +371,7 @@ cdef class Tokenizer:
return False
return True
cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
cdef int _tokenize(self, Doc tokens, str span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef int orig_size
@ -385,16 +383,16 @@ cdef class Tokenizer:
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
cdef unicode _split_affixes(self, Pool mem, unicode string,
cdef str _split_affixes(self, Pool mem, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases):
cdef size_t i
cdef unicode prefix
cdef unicode suffix
cdef unicode minus_pre
cdef unicode minus_suf
cdef str prefix
cdef str suffix
cdef str minus_pre
cdef str minus_suf
cdef size_t last_size = 0
while string and len(string) != last_size:
if self.token_match and self.token_match(string):
@ -430,7 +428,7 @@ cdef class Tokenizer:
suffixes.push_back(self.vocab.get(mem, suffix))
return string
cdef int _attach_tokens(self, Doc tokens, unicode string,
cdef int _attach_tokens(self, Doc tokens, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
@ -440,7 +438,7 @@ cdef class Tokenizer:
cdef int split, end
cdef const LexemeC* const* lexemes
cdef const LexemeC* lexeme
cdef unicode span
cdef str span
cdef int i
if prefixes.size():
for i in range(prefixes.size()):
@ -513,7 +511,7 @@ cdef class Tokenizer:
cached.data.lexemes = <const LexemeC* const*>lexemes
self._cache.set(key, cached)
def find_infix(self, unicode string):
def find_infix(self, str string):
"""Find internal split points of the string, such as hyphens.
string (str): The string to segment.
@ -527,7 +525,7 @@ cdef class Tokenizer:
return 0
return list(self.infix_finditer(string))
def find_prefix(self, unicode string):
def find_prefix(self, str string):
"""Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match.
@ -541,7 +539,7 @@ cdef class Tokenizer:
match = self.prefix_search(string)
return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, unicode string):
def find_suffix(self, str string):
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.
@ -579,7 +577,7 @@ cdef class Tokenizer:
if attr not in (ORTH, NORM):
raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))
def add_special_case(self, unicode string, substrings):
def add_special_case(self, str string, substrings):
"""Add a special-case tokenization rule.
string (str): The string to specially tokenize.
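
A minimal usage sketch of add_special_case taking a plain str, mirroring
the example from the spaCy documentation:

    import spacy
    from spacy.symbols import ORTH

    nlp = spacy.blank("en")
    nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
    assert [t.text for t in nlp("gimme that")] == ["gim", "me", "that"]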


@ -36,7 +36,7 @@ class DocBin:
"spans": List[Dict[str, bytes]], # SpanGroups data for each doc
"spaces": bytes, # Serialized numpy boolean array with spaces data
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
"strings": List[unicode] # List of unique strings in the token data
"strings": List[str] # List of unique strings in the token data
"version": str, # DocBin version number
}


@ -260,7 +260,7 @@ cdef class Doc:
raise ValueError(Errors.E027)
cdef const LexemeC* lexeme
for word, has_space in zip(words, spaces):
if isinstance(word, unicode):
if isinstance(word, str):
lexeme = self.vocab.get(self.mem, word)
elif isinstance(word, bytes):
raise ValueError(Errors.E028.format(value=word))
@ -1362,7 +1362,7 @@ cdef class Doc:
self.has_unknown_spaces = msg["has_unknown_spaces"]
start = 0
cdef const LexemeC* lex
cdef unicode orth_
cdef str orth_
text = msg["text"]
attrs = msg["array_body"]
for i in range(attrs.shape[0]):
@ -1423,7 +1423,7 @@ cdef class Doc:
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
cdef unicode tag, lemma, ent_type
cdef str tag, lemma, ent_type
attr_len = len(attributes)
span_len = len(spans)
if not attr_len == span_len:


@ -1,5 +1,3 @@
from __future__ import unicode_literals
cimport numpy as np
from libc.math cimport sqrt
@ -745,7 +743,7 @@ cdef class Span:
def __get__(self):
return self.root.ent_id_
def __set__(self, unicode key):
def __set__(self, str key):
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
@property
@ -766,7 +764,7 @@ cdef class Span:
def __get__(self):
return self.doc.vocab.strings[self.label]
def __set__(self, unicode label_):
def __set__(self, str label_):
self.label = self.doc.vocab.strings.add(label_)
property kb_id_:
@ -774,7 +772,7 @@ cdef class Span:
def __get__(self):
return self.doc.vocab.strings[self.kb_id]
def __set__(self, unicode kb_id_):
def __set__(self, str kb_id_):
self.kb_id = self.doc.vocab.strings.add(kb_id_)


@ -267,7 +267,7 @@ cdef class Token:
"""RETURNS (str): The text content of the span (with trailing
whitespace).
"""
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
cdef str orth = self.vocab.strings[self.c.lex.orth]
if self.c.spacy:
return orth + " "
else:
@ -820,7 +820,7 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.norm]
def __set__(self, unicode norm_):
def __set__(self, str norm_):
self.c.norm = self.vocab.strings.add(norm_)
@property
@ -858,7 +858,7 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_):
def __set__(self, str lemma_):
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
@ -890,7 +890,7 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.dep]
def __set__(self, unicode label):
def __set__(self, str label):
self.c.dep = self.vocab.strings.add(label)
@property


@ -36,12 +36,12 @@ cdef class Vocab:
cdef public object lex_attr_getters
cdef public object cfg
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get(self, Pool mem, str string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef PreshMap _by_orth


@ -60,7 +60,7 @@ cdef class Vocab:
vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries.
oov_prob (float): Default OOV probability.
vectors_name (unicode): Optional name to identify the vectors table.
vectors_name (str): Optional name to identify the vectors table.
get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]):
A function that yields base noun phrases used for Doc.noun_chunks.
"""
@ -105,7 +105,7 @@ cdef class Vocab:
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
flag_getter (callable): A function `f(unicode) -> bool`, to get the
flag_getter (callable): A function `f(str) -> bool`, to get the
flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
@ -128,7 +128,7 @@ cdef class Vocab:
self.lex_attr_getters[flag_id] = flag_getter
return flag_id
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@ -162,7 +162,7 @@ cdef class Vocab:
else:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
# I think this heuristic is bad, and the Vocab should always
# own the lexemes. It avoids weird bugs this way, as it's how the thing
# was originally supposed to work. The best solution to the growing
@ -184,7 +184,7 @@ cdef class Vocab:
if self.lex_attr_getters is not None:
for attr, func in self.lex_attr_getters.items():
value = func(string)
if isinstance(value, unicode):
if isinstance(value, str):
value = self.strings.add(value)
if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
@ -201,7 +201,7 @@ cdef class Vocab:
def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary.
string (unicode): The ID string.
string (str): The ID string.
RETURNS (bool) Whether the string has an entry in the vocabulary.
DOCS: https://spacy.io/api/vocab#contains
@ -209,7 +209,7 @@ cdef class Vocab:
cdef hash_t int_key
if isinstance(key, bytes):
int_key = self.strings[key.decode("utf8")]
elif isinstance(key, unicode):
elif isinstance(key, str):
int_key = self.strings[key]
else:
int_key = key
@ -234,7 +234,7 @@ cdef class Vocab:
previously unseen unicode string is given, a new lexeme is created and
stored.
id_or_string (int or unicode): The integer ID of a word, or its unicode
id_or_string (int or str): The integer ID of a word, or its unicode
string. If `int >= Lexicon.size`, `IndexError` is raised. If
`id_or_string` is neither an int nor a unicode string, `ValueError`
is raised.
@ -247,7 +247,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#getitem
"""
cdef attr_t orth
if isinstance(id_or_string, unicode):
if isinstance(id_or_string, str):
orth = self.strings.add(id_or_string)
else:
orth = id_or_string
@ -348,7 +348,7 @@ cdef class Vocab:
If `minn` is defined, then the resulting vector uses Fasttext's
subword features by average over ngrams of `orth`.
orth (int / unicode): The hash value of a word, or its unicode string.
orth (int / str): The hash value of a word, or its unicode string.
minn (int): Minimum n-gram length used for Fasttext's ngram computation.
Defaults to the length of `orth`.
maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
@ -401,7 +401,7 @@ cdef class Vocab:
"""Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
orth (int / unicode): The word.
orth (int / str): The word.
vector (numpy.ndarray or cupy.nadarry[ndim=1, dtype='float32']): The vector to set.
DOCS: https://spacy.io/api/vocab#set_vector
@ -423,7 +423,7 @@ cdef class Vocab:
"""Check whether a word has a vector. Returns False if no vectors have
been loaded. Words can be looked up by string or int ID.
orth (int / unicode): The word.
orth (int / str): The word.
RETURNS (bool): Whether the word has a vector.
DOCS: https://spacy.io/api/vocab#has_vector
@ -448,7 +448,7 @@ cdef class Vocab:
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
path (str or Path): A path to a directory, which will be created if
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
@ -469,7 +469,7 @@ cdef class Vocab:
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory.
path (str or Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.