From 1a5be637150cfa10253456fba277801c711118a1 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Mon, 22 Aug 2022 15:52:24 +0200
Subject: [PATCH] Cleanup Cython structs (#11337)

* cleanup Tokenizer fields

* remove unused object from vocab

* remove IS_OOV_DEPRECATED

* add back in as FLAG13

* FLAG 18 instead

* import fix

* fix clumpsy fingers

* revert symbol changes in favor of #11352

* bint instead of bool
---
 spacy/tokenizer.pxd | 6 +-----
 spacy/tokenizer.pyx | 9 ++++-----
 spacy/vocab.pxd     | 1 -
 spacy/vocab.pyi     | 1 -
 spacy/vocab.pyx     | 7 ++-----
 5 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index e6a072053..86e62ddbf 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -23,11 +23,7 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef PhraseMatcher _special_matcher
-    # TODO convert to bool in v4
-    cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef bint _faster_heuristics

     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 972633a2f..49ce6171a 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -8,7 +8,6 @@ from preshed.maps cimport PreshMap
 cimport cython

 import re
-import warnings

 from .tokens.doc cimport Doc
 from .strings cimport hash_string
@@ -16,9 +15,9 @@ from .lexeme cimport EMPTY_LEXEME

 from .attrs import intify_attrs
 from .symbols import ORTH, NORM
-from .errors import Errors, Warnings
+from .errors import Errors
 from . import util
-from .util import registry, get_words_and_spaces
+from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
@@ -128,10 +127,10 @@ cdef class Tokenizer:

     property faster_heuristics:
         def __get__(self):
-            return bool(self._faster_heuristics)
+            return self._faster_heuristics

         def __set__(self, faster_heuristics):
-            self._faster_heuristics = bool(faster_heuristics)
+            self._faster_heuristics = faster_heuristics
             self._reload_special_cases()

     def __reduce__(self):
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 9c951b2b7..815de0765 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -32,7 +32,6 @@ cdef class Vocab:
     cdef public object writing_system
     cdef public object get_noun_chunks
     cdef readonly int length
-    cdef public object _unused_object  # TODO remove in v4, see #9150
     cdef public object lex_attr_getters
     cdef public object cfg

diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index 4cc359c47..41964703b 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -72,7 +72,6 @@ def unpickle_vocab(
     sstore: StringStore,
     vectors: Any,
     morphology: Any,
-    _unused_object: Any,
     lex_attr_getters: Any,
     lookups: Any,
     get_noun_chunks: Any,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index af7d97933..d780dec0d 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -558,21 +558,18 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     vectors = vocab.vectors
     morph = vocab.morphology
-    _unused_object = vocab._unused_object
     lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
     lookups = vocab.lookups
     get_noun_chunks = vocab.get_noun_chunks
     return (unpickle_vocab,
-            (sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
+            (sstore, vectors, morph, lex_attr_getters, lookups, get_noun_chunks))


-def unpickle_vocab(sstore, vectors, morphology, _unused_object,
-                   lex_attr_getters, lookups, get_noun_chunks):
+def unpickle_vocab(sstore, vectors, morphology, lex_attr_getters, lookups, get_noun_chunks):
     cdef Vocab vocab = Vocab()
     vocab.vectors = vectors
     vocab.strings = sstore
     vocab.morphology = morphology
-    vocab._unused_object = _unused_object
     vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
     vocab.lookups = lookups
     vocab.get_noun_chunks = get_noun_chunks
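
Note on the int -> bint change in tokenizer.pxd ("bint instead of bool"): Cython's bint is stored as a C int, but it coerces to and from a Python bool at the Python boundary, which is why the explicit bool() casts in the faster_heuristics property become redundant. A minimal sketch of that behavior, using a hypothetical Demo extension type (not spaCy code):

    # demo.pyx -- compile with cythonize("demo.pyx"); illustrative only
    cdef class Demo:
        cdef bint flag                  # C-level storage, a single machine int

        def __init__(self, flag):
            # any truthy/falsy Python value coerces to 1 or 0 on assignment
            self.flag = flag

        property flag_value:
            def __get__(self):
                # bint converts back to a Python bool automatically
                return self.flag

With cdef int, the getter would surface 0/1 as Python ints, so callers needed bool(); with cdef bint, Demo(1).flag_value is already True.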
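The vocab.pyx hunk keeps pickle_vocab and unpickle_vocab in lockstep: the pickler returns a (callable, argument tuple) pair in the copyreg reduce style, so dropping _unused_object from the tuple requires dropping the matching parameter from the unpickler's signature. A pure-Python sketch of the same round trip, with a hypothetical Thing class standing in for Vocab (not spaCy code):

    import copyreg
    import pickle

    class Thing:
        def __init__(self, name, value):
            self.name = name
            self.value = value

    def pickle_thing(thing):
        # mirror pickle_vocab: return the unpickler plus its argument tuple
        return (unpickle_thing, (thing.name, thing.value))

    def unpickle_thing(name, value):
        # mirror unpickle_vocab: rebuild the object from the tuple;
        # the parameter list must match the tuple's arity exactly
        return Thing(name, value)

    copyreg.pickle(Thing, pickle_thing)

    t = pickle.loads(pickle.dumps(Thing("faster_heuristics", True)))
    assert (t.name, t.value) == ("faster_heuristics", True)

Because the tuple and the signature change together here, new pickles stay self-consistent; pickles written by versions that still emitted the extra placeholder argument would no longer match the new signature.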