Update Cython string types (#9143)

* Replace all basestring references with unicode

  `basestring` was a compatibility type introduced by Cython to make dealing with utf-8 strings in Python 2 easier. In Python 3 it is equivalent to the unicode (or str) type. I replaced all references to basestring with unicode, since that was used elsewhere, but we could also just replace them with str, which should also be equivalent. All tests pass locally.

* Replace all references to unicode type with str

  Since we only support Python 3, this is simpler.

* Remove all references to unicode type

  This removes all references to the unicode type across the codebase and replaces them with `str`, which makes it more drastic than the prior commits. In order to make this work, importing `unicode_literals` had to be removed, and one explicit unicode literal also had to be removed (it is unclear why this is necessary in Cython with language level 3, but without doing it there were errors about implicit conversion). When `unicode` is used as a type in comments, it was also edited to be `str`. Additionally, `coding: utf8` headers were removed from a few files.
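To illustrate the pattern of the rewrite (this sketch is not part of the commit; the names are made up): under Cython's language_level=3, the `unicode` identifier and the old `basestring` compatibility type both collapse into Python's `str`, so typed signatures and isinstance checks can be swapped one-for-one.

    # cython: language_level=3
    # Hypothetical sketch of the change applied throughout the diff below.

    cpdef str normalize(str text):
        # Previously spelled: cpdef unicode normalize(unicode text)
        return text.lower()

    def intern(store, key):
        # Previously: isinstance(key, basestring). basestring no longer
        # exists in Python 3, so str is the only text type left to check.
        if isinstance(key, str):
            return store.add(key)
        return key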
This commit is contained in:
parent c5de9b463a
commit 0f01f46e02
@@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
-            if strings_map is not None and isinstance(value, basestring):
+            if strings_map is not None and isinstance(value, str):
                 if hasattr(strings_map, 'add'):
                     value = strings_map.add(value)
                 else:

spacy/kb.pyx (20 lines changed)
@@ -122,7 +122,7 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]
 
-    def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
+    def add_entity(self, str entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
         Return the hash of the entity ID/name at the end.

@@ -182,15 +182,15 @@ cdef class KnowledgeBase:
             i += 1
 
-    def contains_entity(self, unicode entity):
+    def contains_entity(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings.add(entity)
         return entity_hash in self._entry_index
 
-    def contains_alias(self, unicode alias):
+    def contains_alias(self, str alias):
         cdef hash_t alias_hash = self.vocab.strings.add(alias)
         return alias_hash in self._alias_index
 
-    def add_alias(self, unicode alias, entities, probabilities):
+    def add_alias(self, str alias, entities, probabilities):
         """
         For a given alias, add its potential entities and prior probabilies to the KB.
         Return the alias_hash at the end

@@ -236,7 +236,7 @@ cdef class KnowledgeBase:
             raise RuntimeError(Errors.E891.format(alias=alias))
         return alias_hash
 
-    def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
+    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
         """
         For an alias already existing in the KB, extend its potential entities with one more.
         Throw a warning if either the alias or the entity is unknown,

@@ -283,7 +283,7 @@ cdef class KnowledgeBase:
         alias_entry.probs = probs
         self._aliases_table[alias_index] = alias_entry
 
-    def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
+    def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
         """
         Return candidate entities for an alias. Each candidate defines the entity, the original alias,
         and the prior probability of that alias resolving to that entity.

@@ -304,7 +304,7 @@ cdef class KnowledgeBase:
                 for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                 if entry_index != 0]
 
-    def get_vector(self, unicode entity):
+    def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]
 
         # Return an empty list if this entity is unknown in this KB

@@ -314,7 +314,7 @@ cdef class KnowledgeBase:
 
         return self._vectors_table[self._entries[entry_index].vector_index]
 
-    def get_prior_prob(self, unicode entity, unicode alias):
+    def get_prior_prob(self, str entity, str alias):
         """ Return the prior probability of a given alias being linked to a given entity,
         or return 0.0 when this combination is not known in the knowledge base"""
         cdef hash_t alias_hash = self.vocab.strings[alias]

@@ -582,7 +582,7 @@ cdef class Writer:
     def __init__(self, path):
         assert isinstance(path, Path)
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'wb')
         if not self._fp:
             raise IOError(Errors.E146.format(path=path))

@@ -624,7 +624,7 @@ cdef class Writer:
 cdef class Reader:
     def __init__(self, path):
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'rb')
         if not self._fp:
             PyErr_SetFromErrno(IOError)

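For context, a hedged usage sketch of the KnowledgeBase methods touched above, which now declare plain str parameters. The import path and the entity_vector_length constructor argument are assumptions for illustration, not part of this diff.

    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    vocab = Vocab()
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)   # assumed constructor
    kb.add_entity(entity="Q42", freq=12.0, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])
    assert kb.contains_entity("Q42") and kb.contains_alias("Douglas")
    print(kb.get_vector("Q42"), kb.get_prior_prob("Q42", "Douglas"))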
@@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
         Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.
 
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        univ_pos (str / int): The token's universal part-of-speech tag.
         morphology (dict): The token's morphological features following the
             Universal Dependencies scheme.
         """

@@ -284,7 +284,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.lower]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.lower = self.vocab.strings.add(x)
 
     property norm_:

@@ -294,7 +294,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.norm]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.norm = self.vocab.strings.add(x)
 
     property shape_:

@@ -304,7 +304,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.shape]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.shape = self.vocab.strings.add(x)
 
     property prefix_:

@@ -314,7 +314,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.prefix]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.prefix = self.vocab.strings.add(x)
 
     property suffix_:

@@ -324,7 +324,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.suffix]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.suffix = self.vocab.strings.add(x)
 
     property lang_:

@@ -332,7 +332,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.lang]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.lang = self.vocab.strings.add(x)
 
     property flags:

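A small, hedged sketch of how the Lexeme string properties above are used: each __set__ now takes a plain str and writes it through vocab.strings.add(). The blank pipeline is an assumption for illustration.

    import spacy

    nlp = spacy.blank("en")          # assumed setup, not part of the diff
    lex = nlp.vocab["Apple"]
    lex.norm_ = "apple"              # __set__ accepts str and interns it
    print(lex.lower_, lex.shape_, lex.norm_)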
@@ -151,9 +151,9 @@ cdef class DependencyMatcher:
         Creates a token key to be used by the matcher
         """
         return self._normalize_key(
-            unicode(key) + DELIMITER +
-            unicode(pattern_idx) + DELIMITER +
-            unicode(token_idx)
+            str(key) + DELIMITER +
+            str(pattern_idx) + DELIMITER +
+            str(token_idx)
         )
 
     def add(self, key, patterns, *, on_match=None):

@@ -438,7 +438,7 @@ cdef class DependencyMatcher:
         return candidate_children
 
     def _normalize_key(self, key):
-        if isinstance(key, basestring):
+        if isinstance(key, str):
             return self.vocab.strings.add(key)
         else:
             return key

@@ -317,7 +317,7 @@ cdef class Matcher:
         return final_matches
 
     def _normalize_key(self, key):
-        if isinstance(key, basestring):
+        if isinstance(key, str):
             return self.vocab.strings.add(key)
         else:
             return key

@@ -365,7 +365,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     for i, token in enumerate(doclike):
         for name, index in extensions.items():
             value = token._.get(name)
-            if isinstance(value, basestring):
+            if isinstance(value, str):
                 value = token.vocab.strings[value]
             extra_attr_values[i * nr_extra_attr + index] = value
     # Main loop

@@ -791,7 +791,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
 def _get_attr_values(spec, string_store):
     attr_values = []
     for attr, value in spec.items():
-        if isinstance(attr, basestring):
+        if isinstance(attr, str):
             attr = attr.upper()
             if attr == '_':
                 continue

@@ -802,7 +802,7 @@ def _get_attr_values(spec, string_store):
             if attr == "IS_SENT_START":
                 attr = "SENT_START"
             attr = IDS.get(attr)
-        if isinstance(value, basestring):
+        if isinstance(value, str):
             value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)

@@ -943,7 +943,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
     seen_predicates = {pred.key: pred.i for pred in extra_predicates}
     output = []
     for attr, value in spec.items():
-        if isinstance(attr, basestring):
+        if isinstance(attr, str):
             if attr == "_":
                 output.extend(
                     _get_extension_extra_predicates(

@@ -1000,7 +1000,7 @@ def _get_operators(spec):
               "?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
     # Fix casing
     spec = {key.upper(): values for key, values in spec.items()
-            if isinstance(key, basestring)}
+            if isinstance(key, str)}
     if "OP" not in spec:
         return (ONE,)
     elif spec["OP"] in lookup:

@@ -1018,7 +1018,7 @@ def _get_extensions(spec, string_store, name2index):
         if isinstance(value, dict):
             # Handle predicates (e.g. "IN", in the extra_predicates, not here.
             continue
        if isinstance(value, basestring):
-        if isinstance(value, basestring):
+        if isinstance(value, str):
             value = string_store.add(value)
         if name not in name2index:
             name2index[name] = len(name2index)

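A hedged usage sketch of the Matcher paths above: both the rule key and string-valued attributes are plain str and are interned through vocab.strings, as in _normalize_key and _get_attr_values. The blank pipeline is assumed for illustration.

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")                      # assumed setup
    matcher = Matcher(nlp.vocab)
    pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
    matcher.add("HELLO_WORLD", [pattern])        # str key -> hash via _normalize_key
    doc = nlp("Hello, world!")
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], doc[start:end].text)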
@@ -17,7 +17,7 @@ from ...errors import Errors
 from thinc.extra.search cimport Beam
 
 cdef weight_t MIN_SCORE = -90000
-cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
+cdef attr_t SUBTOK_LABEL = hash_string('subtok')
 
 DEF NON_MONOTONIC = True
 

@@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
 from .typedefs cimport attr_t, hash_t
 
 
-cpdef hash_t hash_string(unicode string) except 0
+cpdef hash_t hash_string(str string) except 0
 cdef hash_t hash_utf8(char* utf8_string, int length) nogil
 
-cdef unicode decode_Utf8Str(const Utf8Str* string)
+cdef str decode_Utf8Str(const Utf8Str* string)
 
 
 ctypedef union Utf8Str:

@@ -25,5 +25,5 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, unicode py_string)
+    cdef const Utf8Str* intern_unicode(self, str py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)

@@ -33,7 +33,7 @@ def get_string_id(key):
         return hash_utf8(chars, len(chars))
 
 
-cpdef hash_t hash_string(unicode string) except 0:
+cpdef hash_t hash_string(str string) except 0:
     chars = string.encode("utf8")
     return hash_utf8(chars, len(chars))
 

@@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
     return hash32(utf8_string, length, 1)
 
 
-cdef unicode decode_Utf8Str(const Utf8Str* string):
+cdef str decode_Utf8Str(const Utf8Str* string):
     cdef int i, length
     if string.s[0] < sizeof(string.s) and string.s[0] != 0:
         return string.s[1:string.s[0]+1].decode("utf8")

@@ -107,17 +107,17 @@ cdef class StringStore:
     def __getitem__(self, object string_or_id):
         """Retrieve a string from a given hash, or vice versa.
 
-        string_or_id (bytes, unicode or uint64): The value to encode.
+        string_or_id (bytes, str or uint64): The value to encode.
         Returns (str / uint64): The value to be retrieved.
         """
-        if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
+        if isinstance(string_or_id, str) and len(string_or_id) == 0:
             return 0
         elif string_or_id == 0:
             return ""
         elif string_or_id in SYMBOLS_BY_STR:
             return SYMBOLS_BY_STR[string_or_id]
         cdef hash_t key
-        if isinstance(string_or_id, unicode):
+        if isinstance(string_or_id, str):
             key = hash_string(string_or_id)
             return key
         elif isinstance(string_or_id, bytes):

@@ -135,14 +135,14 @@ cdef class StringStore:
 
     def as_int(self, key):
         """If key is an int, return it; otherwise, get the int value."""
-        if not isinstance(key, basestring):
+        if not isinstance(key, str):
             return key
         else:
             return self[key]
 
     def as_string(self, key):
         """If key is a string, return it; otherwise, get the string value."""
-        if isinstance(key, basestring):
+        if isinstance(key, str):
             return key
         else:
             return self[key]

@@ -153,7 +153,7 @@ cdef class StringStore:
         string (str): The string to add.
         RETURNS (uint64): The string's hash value.
         """
-        if isinstance(string, unicode):
+        if isinstance(string, str):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             key = hash_string(string)

@@ -189,7 +189,7 @@ cdef class StringStore:
             return True
         elif string in SYMBOLS_BY_STR:
             return True
-        elif isinstance(string, unicode):
+        elif isinstance(string, str):
             key = hash_string(string)
         else:
             string = string.encode("utf8")

@@ -269,7 +269,7 @@ cdef class StringStore:
         for string in strings:
             self.add(string)
 
-    cdef const Utf8Str* intern_unicode(self, unicode py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
         return self._intern_utf8(byte_string, len(byte_string))

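A brief sketch of the StringStore behaviour described in the docstrings above ("retrieve a string from a given hash, or vice versa"), with str as the only accepted text type; illustrative only, not part of the diff.

    from spacy.strings import StringStore

    stringstore = StringStore()
    coffee_hash = stringstore.add("coffee")      # str in, uint64 hash out
    assert stringstore[coffee_hash] == "coffee"  # hash in, str out
    assert stringstore["coffee"] == coffee_hash
    assert stringstore.as_int("coffee") == coffee_hash
    assert stringstore.as_string(coffee_hash) == "coffee"
    assert "coffee" in stringstore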
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 
 

@@ -26,7 +26,7 @@ cdef class Tokenizer:
    cdef int _property_init_count  # TODO: unused, remove in v3.1
    cdef int _property_init_max  # TODO: unused, remove in v3.1
 
-    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
+    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
     cdef void _filter_special_spans(self, vector[SpanC] &original,
                                     vector[SpanC] &filtered, int doc_len) nogil

@@ -37,13 +37,13 @@ cdef class Tokenizer:
     cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
                                      int* has_special,
                                      bint with_special_cases) except -1
-    cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
+    cdef int _tokenize(self, Doc tokens, str span, hash_t key,
                        int* has_special, bint with_special_cases) except -1
-    cdef unicode _split_affixes(self, Pool mem, unicode string,
+    cdef str _split_affixes(self, Pool mem, str string,
                                 vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes, int* has_special,
                                 bint with_special_cases)
-    cdef int _attach_tokens(self, Doc tokens, unicode string,
+    cdef int _attach_tokens(self, Doc tokens, str string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes, int* has_special,
                             bint with_special_cases) except -1

@@ -1,6 +1,4 @@
 # cython: embedsignature=True, profile=True, binding=True
-from __future__ import unicode_literals
-
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from libc.string cimport memcpy, memset

@@ -132,7 +130,7 @@ cdef class Tokenizer:
                 self.url_match)
         return (self.__class__, args, None, None)
 
-    def __call__(self, unicode string):
+    def __call__(self, str string):
         """Tokenize a string.
 
         string (str): The string to tokenize.

@@ -145,7 +143,7 @@ cdef class Tokenizer:
         return doc
 
     @cython.boundscheck(False)
-    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
+    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases):
         """Tokenize according to affix and token_match settings.
 
         string (str): The string to tokenize.

@@ -161,7 +159,7 @@ cdef class Tokenizer:
         cdef int start = 0
         cdef int has_special = 0
         cdef bint in_ws = string[0].isspace()
-        cdef unicode span
+        cdef str span
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated

@@ -373,7 +371,7 @@ cdef class Tokenizer:
                 return False
         return True
 
-    cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
+    cdef int _tokenize(self, Doc tokens, str span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size

@@ -385,16 +383,16 @@ cdef class Tokenizer:
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)
 
-    cdef unicode _split_affixes(self, Pool mem, unicode string,
+    cdef str _split_affixes(self, Pool mem, str string,
                                 vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes,
                                 int* has_special,
                                 bint with_special_cases):
         cdef size_t i
-        cdef unicode prefix
-        cdef unicode suffix
-        cdef unicode minus_pre
-        cdef unicode minus_suf
+        cdef str prefix
+        cdef str suffix
+        cdef str minus_pre
+        cdef str minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
             if self.token_match and self.token_match(string):

@@ -430,7 +428,7 @@ cdef class Tokenizer:
                     suffixes.push_back(self.vocab.get(mem, suffix))
         return string
 
-    cdef int _attach_tokens(self, Doc tokens, unicode string,
+    cdef int _attach_tokens(self, Doc tokens, str string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes,
                             int* has_special,

@@ -440,7 +438,7 @@ cdef class Tokenizer:
         cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
-        cdef unicode span
+        cdef str span
         cdef int i
         if prefixes.size():
             for i in range(prefixes.size()):

@@ -513,7 +511,7 @@ cdef class Tokenizer:
         cached.data.lexemes = <const LexemeC* const*>lexemes
         self._cache.set(key, cached)
 
-    def find_infix(self, unicode string):
+    def find_infix(self, str string):
         """Find internal split points of the string, such as hyphens.
 
         string (str): The string to segment.

@@ -527,7 +525,7 @@ cdef class Tokenizer:
             return 0
         return list(self.infix_finditer(string))
 
-    def find_prefix(self, unicode string):
+    def find_prefix(self, str string):
         """Find the length of a prefix that should be segmented from the
         string, or None if no prefix rules match.
 

@@ -541,7 +539,7 @@ cdef class Tokenizer:
         match = self.prefix_search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def find_suffix(self, unicode string):
+    def find_suffix(self, str string):
         """Find the length of a suffix that should be segmented from the
         string, or None if no suffix rules match.
 

@@ -579,7 +577,7 @@ cdef class Tokenizer:
         if attr not in (ORTH, NORM):
             raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))
 
-    def add_special_case(self, unicode string, substrings):
+    def add_special_case(self, str string, substrings):
         """Add a special-case tokenization rule.
 
         string (str): The string to specially tokenize.

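A hedged sketch of the public Tokenizer methods whose signatures changed above; __call__ and add_special_case both take str. The blank English pipeline and the sample special case are assumptions for illustration.

    import spacy
    from spacy.symbols import ORTH

    nlp = spacy.blank("en")                                            # assumed setup
    nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
    doc = nlp.tokenizer("gimme that")                                  # __call__(str)
    print([t.text for t in doc])                                       # ['gim', 'me', 'that']
    print(nlp.tokenizer.find_prefix('"hello'))                         # prefix length, or 0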
@@ -36,7 +36,7 @@ class DocBin:
         "spans": List[Dict[str, bytes]], # SpanGroups data for each doc
         "spaces": bytes, # Serialized numpy boolean array with spaces data
         "lengths": bytes, # Serialized numpy int32 array with the doc lengths
-        "strings": List[unicode] # List of unique strings in the token data
+        "strings": List[str] # List of unique strings in the token data
         "version": str, # DocBin version number
     }
 

@@ -260,7 +260,7 @@ cdef class Doc:
             raise ValueError(Errors.E027)
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
-            if isinstance(word, unicode):
+            if isinstance(word, str):
                 lexeme = self.vocab.get(self.mem, word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))

@@ -1362,7 +1362,7 @@ cdef class Doc:
         self.has_unknown_spaces = msg["has_unknown_spaces"]
         start = 0
         cdef const LexemeC* lex
-        cdef unicode orth_
+        cdef str orth_
         text = msg["text"]
         attrs = msg["array_body"]
         for i in range(attrs.shape[0]):

@@ -1423,7 +1423,7 @@ cdef class Doc:
         attributes are inherited from the syntactic root of the span.
         RETURNS (Token): The first newly merged token.
         """
-        cdef unicode tag, lemma, ent_type
+        cdef str tag, lemma, ent_type
         attr_len = len(attributes)
         span_len = len(spans)
         if not attr_len == span_len:

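A minimal sketch of Doc construction matching the words/spaces loop above: each word must be a str (bytes raises E028). Illustrative only.

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["Hello", "world", "!"], spaces=[True, False, False])
    print(doc.text)   # "Hello world!"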
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
 cimport numpy as np
 from libc.math cimport sqrt
 

@@ -745,7 +743,7 @@ cdef class Span:
         def __get__(self):
             return self.root.ent_id_
 
-        def __set__(self, unicode key):
+        def __set__(self, str key):
             raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
 
     @property

@@ -766,7 +764,7 @@ cdef class Span:
         def __get__(self):
             return self.doc.vocab.strings[self.label]
 
-        def __set__(self, unicode label_):
+        def __set__(self, str label_):
             self.label = self.doc.vocab.strings.add(label_)
 
     property kb_id_:

@@ -774,7 +772,7 @@ cdef class Span:
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
 
-        def __set__(self, unicode kb_id_):
+        def __set__(self, str kb_id_):
             self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
 

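A short, hedged sketch of the Span setters above: label_ and kb_id_ accept plain str (interned via vocab.strings.add), while assigning ent_id_ raises NotImplementedError. The pipeline setup is assumed.

    import spacy

    nlp = spacy.blank("en")              # assumed setup
    doc = nlp("San Francisco is foggy")
    span = doc[0:2]
    span.label_ = "GPE"                  # str label, interned on assignment
    span.kb_id_ = "Q62"
    print(span.label_, span.kb_id_)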
@@ -267,7 +267,7 @@ cdef class Token:
         """RETURNS (str): The text content of the span (with trailing
             whitespace).
         """
-        cdef unicode orth = self.vocab.strings[self.c.lex.orth]
+        cdef str orth = self.vocab.strings[self.c.lex.orth]
         if self.c.spacy:
             return orth + " "
         else:

@@ -820,7 +820,7 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.norm]
 
-        def __set__(self, unicode norm_):
+        def __set__(self, str norm_):
             self.c.norm = self.vocab.strings.add(norm_)
 
     @property

@@ -858,7 +858,7 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
 
-        def __set__(self, unicode lemma_):
+        def __set__(self, str lemma_):
             self.c.lemma = self.vocab.strings.add(lemma_)
 
     property pos_:

@@ -890,7 +890,7 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.dep]
 
-        def __set__(self, unicode label):
+        def __set__(self, str label):
             self.c.dep = self.vocab.strings.add(label)
 
     @property

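Similarly, a hedged sketch of the Token setters above; each now declares a str parameter and writes the interned hash into the underlying struct. The pipeline setup is assumed.

    import spacy

    nlp = spacy.blank("en")     # assumed setup
    token = nlp("books")[0]
    token.norm_ = "book"        # each __set__ above takes str
    token.lemma_ = "book"
    token.dep_ = "ROOT"
    print(token.norm_, token.lemma_, token.dep_)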
@@ -36,12 +36,12 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg
 
-    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* get(self, Pool mem, str string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
-    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
-    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth

@@ -60,7 +60,7 @@ cdef class Vocab:
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
-        vectors_name (unicode): Optional name to identify the vectors table.
+        vectors_name (str): Optional name to identify the vectors table.
         get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]):
             A function that yields base noun phrases used for Doc.noun_chunks.
         """

@@ -105,7 +105,7 @@ cdef class Vocab:
         See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
         `Token.check_flag`.
 
-        flag_getter (callable): A function `f(unicode) -> bool`, to get the
+        flag_getter (callable): A function `f(str) -> bool`, to get the
             flag value.
         flag_id (int): An integer between 1 and 63 (inclusive), specifying
             the bit at which the flag will be stored. If -1, the lowest

@@ -128,7 +128,7 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id
 
-    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
+    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.

@@ -162,7 +162,7 @@ cdef class Vocab:
         else:
             return self._new_lexeme(mem, self.strings[orth])
 
-    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
         # I think this heuristic is bad, and the Vocab should always
         # own the lexemes. It avoids weird bugs this way, as it's how the thing
         # was originally supposed to work. The best solution to the growing

@@ -184,7 +184,7 @@ cdef class Vocab:
         if self.lex_attr_getters is not None:
             for attr, func in self.lex_attr_getters.items():
                 value = func(string)
-                if isinstance(value, unicode):
+                if isinstance(value, str):
                     value = self.strings.add(value)
                 if value is not None:
                     Lexeme.set_struct_attr(lex, attr, value)

@@ -201,7 +201,7 @@ cdef class Vocab:
     def __contains__(self, key):
         """Check whether the string or int key has an entry in the vocabulary.
 
-        string (unicode): The ID string.
+        string (str): The ID string.
         RETURNS (bool) Whether the string has an entry in the vocabulary.
 
         DOCS: https://spacy.io/api/vocab#contains

@@ -209,7 +209,7 @@ cdef class Vocab:
         cdef hash_t int_key
         if isinstance(key, bytes):
             int_key = self.strings[key.decode("utf8")]
-        elif isinstance(key, unicode):
+        elif isinstance(key, str):
             int_key = self.strings[key]
         else:
             int_key = key

@@ -234,7 +234,7 @@ cdef class Vocab:
         previously unseen unicode string is given, a new lexeme is created and
         stored.
 
-        id_or_string (int or unicode): The integer ID of a word, or its unicode
+        id_or_string (int or str): The integer ID of a word, or its unicode
             string. If `int >= Lexicon.size`, `IndexError` is raised. If
             `id_or_string` is neither an int nor a unicode string, `ValueError`
             is raised.

@@ -247,7 +247,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#getitem
         """
         cdef attr_t orth
-        if isinstance(id_or_string, unicode):
+        if isinstance(id_or_string, str):
             orth = self.strings.add(id_or_string)
         else:
             orth = id_or_string

@@ -348,7 +348,7 @@ cdef class Vocab:
         If `minn` is defined, then the resulting vector uses Fasttext's
         subword features by average over ngrams of `orth`.
 
-        orth (int / unicode): The hash value of a word, or its unicode string.
+        orth (int / str): The hash value of a word, or its unicode string.
         minn (int): Minimum n-gram length used for Fasttext's ngram computation.
             Defaults to the length of `orth`.
         maxn (int): Maximum n-gram length used for Fasttext's ngram computation.

@@ -401,7 +401,7 @@ cdef class Vocab:
         """Set a vector for a word in the vocabulary. Words can be referenced
         by string or int ID.
 
-        orth (int / unicode): The word.
+        orth (int / str): The word.
         vector (numpy.ndarray or cupy.nadarry[ndim=1, dtype='float32']): The vector to set.
 
         DOCS: https://spacy.io/api/vocab#set_vector

@@ -423,7 +423,7 @@ cdef class Vocab:
         """Check whether a word has a vector. Returns False if no vectors have
         been loaded. Words can be looked up by string or int ID.
 
-        orth (int / unicode): The word.
+        orth (int / str): The word.
         RETURNS (bool): Whether the word has a vector.
 
         DOCS: https://spacy.io/api/vocab#has_vector

@@ -448,7 +448,7 @@ cdef class Vocab:
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
 
-        path (unicode or Path): A path to a directory, which will be created if
+        path (str or Path): A path to a directory, which will be created if
             it doesn't exist.
         exclude (list): String names of serialization fields to exclude.
 

@@ -469,7 +469,7 @@ cdef class Vocab:
         """Loads state from a directory. Modifies the object in place and
         returns it.
 
-        path (unicode or Path): A path to a directory.
+        path (str or Path): A path to a directory.
         exclude (list): String names of serialization fields to exclude.
         RETURNS (Vocab): The modified `Vocab` object.
 

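Finally, a hedged sketch of the Vocab entry points documented above: lookups and vector accessors accept either a str or an integer hash. The numpy usage, and the assumption that set_vector can grow an empty vectors table, are illustrative and not part of this diff.

    import numpy
    from spacy.vocab import Vocab

    vocab = Vocab()
    lexeme = vocab["coffee"]                  # a str key creates/returns a Lexeme
    assert "coffee" in vocab                  # __contains__ accepts str, bytes or int
    vocab.set_vector("coffee", numpy.asarray([1.0, 2.0, 3.0], dtype="float32"))
    print(vocab.has_vector("coffee"), vocab.get_vector("coffee"))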