diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 9122de17b..640fb2f3c 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
-            if strings_map is not None and isinstance(value, basestring):
+            if strings_map is not None and isinstance(value, str):
                 if hasattr(strings_map, 'add'):
                     value = strings_map.add(value)
                 else:
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index d8514b54c..421a8241a 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -122,7 +122,7 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]
 
-    def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
+    def add_entity(self, str entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
         Return the hash of the entity ID/name at the end.
@@ -182,15 +182,15 @@ cdef class KnowledgeBase:
 
             i += 1
 
-    def contains_entity(self, unicode entity):
+    def contains_entity(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings.add(entity)
         return entity_hash in self._entry_index
 
-    def contains_alias(self, unicode alias):
+    def contains_alias(self, str alias):
         cdef hash_t alias_hash = self.vocab.strings.add(alias)
         return alias_hash in self._alias_index
 
-    def add_alias(self, unicode alias, entities, probabilities):
+    def add_alias(self, str alias, entities, probabilities):
         """
         For a given alias, add its potential entities and prior probabilies to the KB.
         Return the alias_hash at the end
@@ -236,7 +236,7 @@ cdef class KnowledgeBase:
             raise RuntimeError(Errors.E891.format(alias=alias))
         return alias_hash
 
-    def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
+    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
         """
         For an alias already existing in the KB, extend its potential entities with one more.
         Throw a warning if either the alias or the entity is unknown,
@@ -283,7 +283,7 @@ cdef class KnowledgeBase:
         alias_entry.probs = probs
         self._aliases_table[alias_index] = alias_entry
 
-    def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
+    def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
         """
         Return candidate entities for an alias. Each candidate defines the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
@@ -304,7 +304,7 @@ cdef class KnowledgeBase:
                 for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                 if entry_index != 0]
 
-    def get_vector(self, unicode entity):
+    def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]
 
         # Return an empty list if this entity is unknown in this KB
@@ -314,7 +314,7 @@ cdef class KnowledgeBase:
 
         return self._vectors_table[self._entries[entry_index].vector_index]
 
-    def get_prior_prob(self, unicode entity, unicode alias):
+    def get_prior_prob(self, str entity, str alias):
         """ Return the prior probability of a given alias being linked to a given entity,
         or return 0.0 when this combination is not known in the knowledge base"""
         cdef hash_t alias_hash = self.vocab.strings[alias]
@@ -582,7 +582,7 @@ cdef class Writer:
     def __init__(self, path):
         assert isinstance(path, Path)
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(bytes_loc, 'wb')
         if not self._fp:
             raise IOError(Errors.E146.format(path=path))
@@ -624,7 +624,7 @@ cdef class Writer:
 cdef class Reader:
     def __init__(self, path):
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(bytes_loc, 'rb')
         if not self._fp:
             PyErr_SetFromErrno(IOError)
diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py
index 2cb0f9a53..c88b69bcc 100644
--- a/spacy/lang/en/lemmatizer.py
+++ b/spacy/lang/en/lemmatizer.py
@@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
         Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.
 
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        univ_pos (str / int): The token's universal part-of-speech tag.
         morphology (dict): The token's morphological features following the
             Universal Dependencies scheme.
         """
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 3564b6e42..792e405dd 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -284,7 +284,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.lower]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.lower = self.vocab.strings.add(x)
 
     property norm_:
@@ -294,7 +294,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.norm]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.norm = self.vocab.strings.add(x)
 
     property shape_:
@@ -304,7 +304,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.shape]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.shape = self.vocab.strings.add(x)
 
     property prefix_:
@@ -314,7 +314,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.prefix]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.prefix = self.vocab.strings.add(x)
 
     property suffix_:
@@ -324,7 +324,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.suffix]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.suffix = self.vocab.strings.add(x)
 
     property lang_:
@@ -332,7 +332,7 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.lang]
 
-        def __set__(self, unicode x):
+        def __set__(self, str x):
             self.c.lang = self.vocab.strings.add(x)
 
     property flags:
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 9e0842d59..9593634d7 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -151,9 +151,9 @@ cdef class DependencyMatcher:
         Creates a token key to be used by the matcher
         """
         return self._normalize_key(
-            unicode(key) + DELIMITER +
-            unicode(pattern_idx) + DELIMITER +
-            unicode(token_idx)
+            str(key) + DELIMITER +
+            str(pattern_idx) + DELIMITER +
+            str(token_idx)
         )
 
     def add(self, key, patterns, *, on_match=None):
@@ -438,7 +438,7 @@ cdef class DependencyMatcher:
         return candidate_children
 
     def _normalize_key(self, key):
-        if isinstance(key, basestring):
+        if isinstance(key, str):
            return self.vocab.strings.add(key)
        else:
            return key
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 555766f62..6a23d1f4b 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -317,7 +317,7 @@ cdef class Matcher:
         return final_matches
 
     def _normalize_key(self, key):
-        if isinstance(key, basestring):
+        if isinstance(key, str):
             return self.vocab.strings.add(key)
         else:
             return key
@@ -365,7 +365,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     for i, token in enumerate(doclike):
         for name, index in extensions.items():
             value = token._.get(name)
-            if isinstance(value, basestring):
+            if isinstance(value, str):
                 value = token.vocab.strings[value]
             extra_attr_values[i * nr_extra_attr + index] = value
     # Main loop
@@ -791,7 +791,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
 def _get_attr_values(spec, string_store):
     attr_values = []
     for attr, value in spec.items():
-        if isinstance(attr, basestring):
+        if isinstance(attr, str):
             attr = attr.upper()
             if attr == '_':
                 continue
@@ -802,7 +802,7 @@ def _get_attr_values(spec, string_store):
             if attr == "IS_SENT_START":
                 attr = "SENT_START"
             attr = IDS.get(attr)
-        if isinstance(value, basestring):
+        if isinstance(value, str):
             value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
@@ -943,7 +943,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
     seen_predicates = {pred.key: pred.i for pred in extra_predicates}
     output = []
     for attr, value in spec.items():
-        if isinstance(attr, basestring):
+        if isinstance(attr, str):
             if attr == "_":
                 output.extend(
                     _get_extension_extra_predicates(
@@ -1000,7 +1000,7 @@ def _get_operators(spec):
               "?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
     # Fix casing
     spec = {key.upper(): values for key, values in spec.items()
-            if isinstance(key, basestring)}
+            if isinstance(key, str)}
     if "OP" not in spec:
         return (ONE,)
     elif spec["OP"] in lookup:
@@ -1018,7 +1018,7 @@ def _get_extensions(spec, string_store, name2index):
         if isinstance(value, dict):
             # Handle predicates (e.g. "IN", in the extra_predicates, not here.
             continue
-        if isinstance(value, basestring):
+        if isinstance(value, str):
             value = string_store.add(value)
         if name not in name2index:
             name2index[name] = len(name2index)
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 9ca702f9b..f34975858 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -17,7 +17,7 @@ from ...errors import Errors
 from thinc.extra.search cimport Beam
 
 
 cdef weight_t MIN_SCORE = -90000
-cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
+cdef attr_t SUBTOK_LABEL = hash_string('subtok')
 
 DEF NON_MONOTONIC = True
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 07768d347..370180135 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
 from .typedefs cimport attr_t, hash_t
 
 
-cpdef hash_t hash_string(unicode string) except 0
+cpdef hash_t hash_string(str string) except 0
 cdef hash_t hash_utf8(char* utf8_string, int length) nogil
 
-cdef unicode decode_Utf8Str(const Utf8Str* string)
+cdef str decode_Utf8Str(const Utf8Str* string)
 
 
 ctypedef union Utf8Str:
@@ -25,5 +25,5 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, unicode py_string)
+    cdef const Utf8Str* intern_unicode(self, str py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 4a20cb8af..39fc441e9 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -33,7 +33,7 @@ def get_string_id(key):
         return hash_utf8(chars, len(chars))
 
 
-cpdef hash_t hash_string(unicode string) except 0:
+cpdef hash_t hash_string(str string) except 0:
     chars = string.encode("utf8")
     return hash_utf8(chars, len(chars))
 
@@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
     return hash32(utf8_string, length, 1)
 
 
-cdef unicode decode_Utf8Str(const Utf8Str* string):
+cdef str decode_Utf8Str(const Utf8Str* string):
     cdef int i, length
     if string.s[0] < sizeof(string.s) and string.s[0] != 0:
         return string.s[1:string.s[0]+1].decode("utf8")
@@ -107,17 +107,17 @@ cdef class StringStore:
     def __getitem__(self, object string_or_id):
         """Retrieve a string from a given hash, or vice versa.
 
-        string_or_id (bytes, unicode or uint64): The value to encode.
+        string_or_id (bytes, str or uint64): The value to encode.
         Returns (str / uint64): The value to be retrieved.
         """
-        if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
+        if isinstance(string_or_id, str) and len(string_or_id) == 0:
             return 0
         elif string_or_id == 0:
             return ""
         elif string_or_id in SYMBOLS_BY_STR:
             return SYMBOLS_BY_STR[string_or_id]
         cdef hash_t key
-        if isinstance(string_or_id, unicode):
+        if isinstance(string_or_id, str):
             key = hash_string(string_or_id)
             return key
         elif isinstance(string_or_id, bytes):
@@ -135,14 +135,14 @@ cdef class StringStore:
 
     def as_int(self, key):
         """If key is an int, return it; otherwise, get the int value."""
-        if not isinstance(key, basestring):
+        if not isinstance(key, str):
             return key
         else:
             return self[key]
 
     def as_string(self, key):
         """If key is a string, return it; otherwise, get the string value."""
-        if isinstance(key, basestring):
+        if isinstance(key, str):
             return key
         else:
             return self[key]
@@ -153,7 +153,7 @@ cdef class StringStore:
         string (str): The string to add.
         RETURNS (uint64): The string's hash value.
         """
-        if isinstance(string, unicode):
+        if isinstance(string, str):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             key = hash_string(string)
@@ -189,7 +189,7 @@ cdef class StringStore:
             return True
         elif string in SYMBOLS_BY_STR:
             return True
-        elif isinstance(string, unicode):
+        elif isinstance(string, str):
             key = hash_string(string)
         else:
             string = string.encode("utf8")
@@ -269,7 +269,7 @@ cdef class StringStore:
         for string in strings:
             self.add(string)
 
-    cdef const Utf8Str* intern_unicode(self, unicode py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
         return self._intern_utf8(byte_string, len(byte_string))
diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py
index 91a048764..5cf6eb1a6 100644
--- a/spacy/tests/lang/ky/test_tokenizer.py
+++ b/spacy/tests/lang/ky/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 719e8e6f5..44f6ee522 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -26,7 +26,7 @@ cdef class Tokenizer:
     cdef int _property_init_count # TODO: unused, remove in v3.1
     cdef int _property_init_max # TODO: unused, remove in v3.1
 
-    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
+    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
     cdef void _filter_special_spans(self, vector[SpanC] &original,
                                     vector[SpanC] &filtered, int doc_len) nogil
@@ -37,13 +37,13 @@ cdef class Tokenizer:
     cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
                                      int* has_special,
                                      bint with_special_cases) except -1
-    cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
+    cdef int _tokenize(self, Doc tokens, str span, hash_t key,
                        int* has_special, bint with_special_cases) except -1
-    cdef unicode _split_affixes(self, Pool mem, unicode string,
+    cdef str _split_affixes(self, Pool mem, str string,
                                 vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes, int* has_special,
                                 bint with_special_cases)
-    cdef int _attach_tokens(self, Doc tokens, unicode string,
+    cdef int _attach_tokens(self, Doc tokens, str string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes, int* has_special,
                             bint with_special_cases) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 5a89e5a17..c0c8520c7 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,6 +1,4 @@
 # cython: embedsignature=True, profile=True, binding=True
-from __future__ import unicode_literals
-
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from libc.string cimport memcpy, memset
@@ -132,7 +130,7 @@ cdef class Tokenizer:
                      self.url_match)
         return (self.__class__, args, None, None)
 
-    def __call__(self, unicode string):
+    def __call__(self, str string):
         """Tokenize a string.
 
         string (str): The string to tokenize.
@@ -145,7 +143,7 @@ cdef class Tokenizer:
         return doc
 
     @cython.boundscheck(False)
-    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
+    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases):
         """Tokenize according to affix and token_match settings.
 
         string (str): The string to tokenize.
@@ -161,7 +159,7 @@ cdef class Tokenizer:
         cdef int start = 0
         cdef int has_special = 0
         cdef bint in_ws = string[0].isspace()
-        cdef unicode span
+        cdef str span
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
@@ -373,7 +371,7 @@ cdef class Tokenizer:
                 return False
         return True
 
-    cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
+    cdef int _tokenize(self, Doc tokens, str span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
@@ -385,16 +383,16 @@ cdef class Tokenizer:
             self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                               tokens.length - orig_size)
 
-    cdef unicode _split_affixes(self, Pool mem, unicode string,
+    cdef str _split_affixes(self, Pool mem, str string,
                                 vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes,
                                 int* has_special,
                                 bint with_special_cases):
         cdef size_t i
-        cdef unicode prefix
-        cdef unicode suffix
-        cdef unicode minus_pre
-        cdef unicode minus_suf
+        cdef str prefix
+        cdef str suffix
+        cdef str minus_pre
+        cdef str minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
             if self.token_match and self.token_match(string):
@@ -430,7 +428,7 @@ cdef class Tokenizer:
                     suffixes.push_back(self.vocab.get(mem, suffix))
         return string
 
-    cdef int _attach_tokens(self, Doc tokens, unicode string,
+    cdef int _attach_tokens(self, Doc tokens, str string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
@@ -440,7 +438,7 @@ cdef class Tokenizer:
         cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
-        cdef unicode span
+        cdef str span
         cdef int i
         if prefixes.size():
             for i in range(prefixes.size()):
@@ -513,7 +511,7 @@ cdef class Tokenizer:
             cached.data.lexemes = lexemes
             self._cache.set(key, cached)
 
-    def find_infix(self, unicode string):
+    def find_infix(self, str string):
         """Find internal split points of the string, such as hyphens.
 
         string (str): The string to segment.
@@ -527,7 +525,7 @@ cdef class Tokenizer:
             return 0
         return list(self.infix_finditer(string))
 
-    def find_prefix(self, unicode string):
+    def find_prefix(self, str string):
         """Find the length of a prefix that should be segmented from the
         string, or None if no prefix rules match.
 
@@ -541,7 +539,7 @@ cdef class Tokenizer:
         match = self.prefix_search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def find_suffix(self, unicode string):
+    def find_suffix(self, str string):
         """Find the length of a suffix that should be segmented from the
         string, or None if no suffix rules match.
 
@@ -579,7 +577,7 @@ cdef class Tokenizer:
                 if attr not in (ORTH, NORM):
                     raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))
 
-    def add_special_case(self, unicode string, substrings):
+    def add_special_case(self, str string, substrings):
         """Add a special-case tokenization rule.
 
         string (str): The string to specially tokenize.
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 868eb3eab..2ce329375 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -36,7 +36,7 @@ class DocBin:
         "spans": List[Dict[str, bytes]], # SpanGroups data for each doc
         "spaces": bytes, # Serialized numpy boolean array with spaces data
         "lengths": bytes, # Serialized numpy int32 array with the doc lengths
-        "strings": List[unicode] # List of unique strings in the token data
+        "strings": List[str] # List of unique strings in the token data
         "version": str, # DocBin version number
     }
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index cd2bd6f6c..c4ddd4163 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -260,7 +260,7 @@ cdef class Doc:
             raise ValueError(Errors.E027)
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
-            if isinstance(word, unicode):
+            if isinstance(word, str):
                 lexeme = self.vocab.get(self.mem, word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
@@ -1362,7 +1362,7 @@ cdef class Doc:
         self.has_unknown_spaces = msg["has_unknown_spaces"]
         start = 0
         cdef const LexemeC* lex
-        cdef unicode orth_
+        cdef str orth_
         text = msg["text"]
         attrs = msg["array_body"]
         for i in range(attrs.shape[0]):
@@ -1423,7 +1423,7 @@ cdef class Doc:
             attributes are inherited from the syntactic root of the span.
         RETURNS (Token): The first newly merged token.
         """
-        cdef unicode tag, lemma, ent_type
+        cdef str tag, lemma, ent_type
         attr_len = len(attributes)
         span_len = len(spans)
         if not attr_len == span_len:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 48c6053c1..5807ff2d2 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
 cimport numpy as np
 from libc.math cimport sqrt
 
@@ -745,7 +743,7 @@ cdef class Span:
         def __get__(self):
             return self.root.ent_id_
 
-        def __set__(self, unicode key):
+        def __set__(self, str key):
             raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
 
     @property
@@ -766,7 +764,7 @@ cdef class Span:
         def __get__(self):
             return self.doc.vocab.strings[self.label]
 
-        def __set__(self, unicode label_):
+        def __set__(self, str label_):
             self.label = self.doc.vocab.strings.add(label_)
 
     property kb_id_:
@@ -774,7 +772,7 @@ cdef class Span:
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
 
-        def __set__(self, unicode kb_id_):
+        def __set__(self, str kb_id_):
             self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 3fcfda691..8877cf9d0 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -267,7 +267,7 @@ cdef class Token:
         """RETURNS (str): The text content of the span (with trailing
             whitespace).
         """
-        cdef unicode orth = self.vocab.strings[self.c.lex.orth]
+        cdef str orth = self.vocab.strings[self.c.lex.orth]
         if self.c.spacy:
             return orth + " "
         else:
@@ -820,7 +820,7 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.norm]
 
-        def __set__(self, unicode norm_):
+        def __set__(self, str norm_):
            self.c.norm = self.vocab.strings.add(norm_)
 
     @property
@@ -858,7 +858,7 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
 
-        def __set__(self, unicode lemma_):
+        def __set__(self, str lemma_):
             self.c.lemma = self.vocab.strings.add(lemma_)
 
     property pos_:
@@ -890,7 +890,7 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.dep]
 
-        def __set__(self, unicode label):
+        def __set__(self, str label):
             self.c.dep = self.vocab.strings.add(label)
 
     @property
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 9067476f7..9b556247b 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -36,12 +36,12 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg
 
-    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* get(self, Pool mem, str string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
-    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
-    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 13dd675af..552898a98 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -60,7 +60,7 @@ cdef class Vocab:
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
-        vectors_name (unicode): Optional name to identify the vectors table.
+        vectors_name (str): Optional name to identify the vectors table.
         get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]):
             A function that yields base noun phrases used for Doc.noun_chunks.
         """
@@ -105,7 +105,7 @@ cdef class Vocab:
         See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
         `Token.check_flag`.
 
-        flag_getter (callable): A function `f(unicode) -> bool`, to get the
+        flag_getter (callable): A function `f(str) -> bool`, to get the
             flag value.
         flag_id (int): An integer between 1 and 63 (inclusive), specifying
             the bit at which the flag will be stored. If -1, the lowest
@@ -128,7 +128,7 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id
 
-    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
+    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -162,7 +162,7 @@ cdef class Vocab:
         else:
             return self._new_lexeme(mem, self.strings[orth])
 
-    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
         # I think this heuristic is bad, and the Vocab should always
         # own the lexemes. It avoids weird bugs this way, as it's how the thing
         # was originally supposed to work. The best solution to the growing
@@ -184,7 +184,7 @@ cdef class Vocab:
         if self.lex_attr_getters is not None:
             for attr, func in self.lex_attr_getters.items():
                 value = func(string)
-                if isinstance(value, unicode):
+                if isinstance(value, str):
                     value = self.strings.add(value)
                 if value is not None:
                     Lexeme.set_struct_attr(lex, attr, value)
@@ -201,7 +201,7 @@ cdef class Vocab:
     def __contains__(self, key):
         """Check whether the string or int key has an entry in the vocabulary.
 
-        string (unicode): The ID string.
+        string (str): The ID string.
         RETURNS (bool) Whether the string has an entry in the vocabulary.
 
         DOCS: https://spacy.io/api/vocab#contains
@@ -209,7 +209,7 @@ cdef class Vocab:
         cdef hash_t int_key
         if isinstance(key, bytes):
             int_key = self.strings[key.decode("utf8")]
-        elif isinstance(key, unicode):
+        elif isinstance(key, str):
             int_key = self.strings[key]
         else:
             int_key = key
@@ -234,7 +234,7 @@ cdef class Vocab:
         previously unseen unicode string is given, a new lexeme is created and
         stored.
 
-        id_or_string (int or unicode): The integer ID of a word, or its unicode
+        id_or_string (int or str): The integer ID of a word, or its unicode
             string. If `int >= Lexicon.size`, `IndexError` is raised. If
             `id_or_string` is neither an int nor a unicode string, `ValueError`
             is raised.
@@ -247,7 +247,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#getitem
         """
         cdef attr_t orth
-        if isinstance(id_or_string, unicode):
+        if isinstance(id_or_string, str):
             orth = self.strings.add(id_or_string)
         else:
             orth = id_or_string
@@ -348,7 +348,7 @@ cdef class Vocab:
         If `minn` is defined, then the resulting vector uses Fasttext's
         subword features by average over ngrams of `orth`.
 
-        orth (int / unicode): The hash value of a word, or its unicode string.
+        orth (int / str): The hash value of a word, or its unicode string.
         minn (int): Minimum n-gram length used for Fasttext's ngram computation.
             Defaults to the length of `orth`.
         maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
@@ -401,7 +401,7 @@ cdef class Vocab:
         """Set a vector for a word in the vocabulary. Words can be referenced
         by string or int ID.
 
-        orth (int / unicode): The word.
+        orth (int / str): The word.
         vector (numpy.ndarray or cupy.nadarry[ndim=1, dtype='float32']): The vector to set.
 
         DOCS: https://spacy.io/api/vocab#set_vector
@@ -423,7 +423,7 @@ cdef class Vocab:
         """Check whether a word has a vector. Returns False if no vectors have
         been loaded. Words can be looked up by string or int ID.
 
-        orth (int / unicode): The word.
+        orth (int / str): The word.
         RETURNS (bool): Whether the word has a vector.
 
         DOCS: https://spacy.io/api/vocab#has_vector
@@ -448,7 +448,7 @@ cdef class Vocab:
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
 
-        path (unicode or Path): A path to a directory, which will be created if
+        path (str or Path): A path to a directory, which will be created if
            it doesn't exist.
         exclude (list): String names of serialization fields to exclude.
 
@@ -469,7 +469,7 @@ cdef class Vocab:
         """Loads state from a directory. Modifies the object in place and
         returns it.
 
-        path (unicode or Path): A path to a directory.
+        path (str or Path): A path to a directory.
         exclude (list): String names of serialization fields to exclude.
         RETURNS (Vocab): The modified `Vocab` object.
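
Reviewer note (illustrative only, not part of the patch): every hunk above applies the same Python 3 cleanup, replacing the Python 2 names `unicode`/`basestring` with `str` in Cython signatures, `isinstance()` checks, and docstrings. A minimal sketch of the behaviour these call sites rely on, assuming an installed spaCy:

    from spacy.strings import StringStore

    store = StringStore()
    key = store.add("coffee")         # a plain str is interned and hashed to a uint64 key
    assert isinstance("coffee", str)  # the check that replaces basestring/unicode
    assert store[key] == "coffee"     # the hash resolves back to the same str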