From 412dbb1f3885ccc3e26db67a2799cbc8a9e7f54f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 6 Jul 2020 13:06:25 +0200 Subject: [PATCH] Remove dead and/or deprecated code (#5710) * Remove dead and/or deprecated code * Remove n_threads Co-authored-by: Matthew Honnibal --- spacy/__init__.py | 3 - spacy/cli/evaluate.py | 5 +- spacy/cli/pretrain.py | 14 +- spacy/cli/train.py | 7 +- spacy/errors.py | 64 --------- spacy/language.py | 64 ++------- spacy/lemmatizer.py | 5 +- spacy/matcher/matcher.pyx | 6 +- spacy/matcher/phrasematcher.pyx | 9 +- spacy/pipeline/morphologizer.pyx | 12 +- spacy/pipeline/pipes.pyx | 52 +++---- spacy/pipeline/tok2vec.py | 3 +- spacy/syntax/nn_parser.pyx | 14 +- spacy/syntax/transition_system.pyx | 8 +- spacy/tests/doc/test_span.py | 2 - spacy/tests/parser/test_add_label.py | 4 +- spacy/tests/pipeline/test_textcat.py | 6 +- spacy/tests/regression/test_issue3001-3500.py | 26 ---- spacy/tests/regression/test_issue3611.py | 3 +- spacy/tests/regression/test_issue4030.py | 3 +- spacy/tests/regression/test_issue4348.py | 3 +- spacy/tests/serialize/test_serialize_doc.py | 4 - .../serialize/test_serialize_language.py | 4 - .../serialize/test_serialize_pipeline.py | 4 - spacy/tests/test_gold.py | 8 +- spacy/tokenizer.pxd | 2 - spacy/tokenizer.pyx | 14 +- spacy/tokens/doc.pyx | 6 - spacy/tokens/span.pyx | 15 -- spacy/tokens/token.pyx | 5 - spacy/util.py | 131 ++---------------- spacy/vocab.pyx | 12 +- 32 files changed, 88 insertions(+), 430 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index b525a5ba5..b788b11ca 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -25,9 +25,6 @@ config = registry def load(name, **overrides): - depr_path = overrides.get("path") - if depr_path not in (True, False, None): - warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning) return util.load_model(name, **overrides) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index a9ddfe9be..a5d4a3661 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -4,6 +4,7 @@ from wasabi import Printer from pathlib import Path import re import srsly +from thinc.api import require_gpu, fix_random_seed from ..gold import Corpus from ..tokens import Doc @@ -52,9 +53,9 @@ def evaluate( silent: bool = True, ) -> Scorer: msg = Printer(no_print=silent, pretty=not silent) - util.fix_random_seed() + fix_random_seed() if gpu_id >= 0: - util.use_gpu(gpu_id) + require_gpu(gpu_id) util.set_env_log(False) data_path = util.ensure_path(data_path) output_path = util.ensure_path(output) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 5b021aabc..58e82028b 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -5,8 +5,8 @@ import time import re from collections import Counter from pathlib import Path -from thinc.api import use_pytorch_for_gpu_memory -from thinc.api import set_dropout_rate, to_categorical +from thinc.api import use_pytorch_for_gpu_memory, require_gpu +from thinc.api import set_dropout_rate, to_categorical, fix_random_seed from thinc.api import CosineDistance, L2Distance from wasabi import msg import srsly @@ -36,7 +36,7 @@ def pretrain_cli( Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate language-modelling objective. Two objective types are available, vector-based and character-based. 
- + In the vector-based objective, we load word vectors that have been trained using a word2vec-style distributional similarity algorithm, and train a component like a CNN, BiLSTM, etc to predict vectors which match the @@ -76,13 +76,13 @@ def pretrain( if use_gpu >= 0: msg.info("Using GPU") - util.use_gpu(use_gpu) + require_gpu(use_gpu) else: msg.info("Using CPU") msg.info(f"Loading config from: {config_path}") config = util.load_config(config_path, create_objects=False) - util.fix_random_seed(config["pretraining"]["seed"]) + fix_random_seed(config["pretraining"]["seed"]) if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]: use_pytorch_for_gpu_memory() @@ -231,12 +231,12 @@ def make_docs(nlp, batch, min_length, max_length): def create_objective(config): """Create the objective for pretraining. - + We'd like to replace this with a registry function but it's tricky because we're also making a model choice based on this. For now we hard-code support for two types (characters, vectors). For characters you can specify n_characters, for vectors you can specify the loss. - + Bleh. """ objective_type = config["type"] diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b974247bd..9e12eb0f4 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,6 +1,5 @@ from typing import Optional, Dict, List, Union, Sequence from timeit import default_timer as timer - import srsly import tqdm from pydantic import BaseModel, FilePath @@ -8,7 +7,7 @@ from pathlib import Path from wasabi import msg import thinc import thinc.schedules -from thinc.api import Model, use_pytorch_for_gpu_memory +from thinc.api import Model, use_pytorch_for_gpu_memory, require_gpu, fix_random_seed import random from ._app import app, Arg, Opt @@ -156,7 +155,7 @@ def train_cli( if use_gpu >= 0: msg.info("Using GPU: {use_gpu}") - util.use_gpu(use_gpu) + require_gpu(use_gpu) else: msg.info("Using CPU") @@ -183,7 +182,7 @@ def train( msg.info(f"Loading config from: {config_path}") # Read the config first without creating objects, to get to the original nlp_config config = util.load_config(config_path, create_objects=False) - util.fix_random_seed(config["training"]["seed"]) + fix_random_seed(config["training"]["seed"]) if config["training"].get("use_pytorch_for_gpu_memory"): # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() diff --git a/spacy/errors.py b/spacy/errors.py index 4e73aee6f..31533e7e2 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -16,16 +16,6 @@ def add_codes(err_cls): @add_codes class Warnings(object): - W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. " - "You can now call spacy.load with the path as its first argument, " - "and the model's meta.json will be used to determine the language " - "to load. For example:\nnlp = spacy.load('{path}')") - W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object " - "instead and pass in the strings as the `words` keyword argument, " - "for example:\nfrom spacy.tokens import Doc\n" - "doc = Doc(nlp.vocab, words=[...])") - W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use " - "the keyword arguments, for example tag=, lemma= or ent_type=.") W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing " "using ftfy.fix_text if necessary.") W005 = ("Doc object not parsed. This means displaCy won't be able to " @@ -45,12 +35,6 @@ class Warnings(object): "use context-sensitive tensors. 
You can always add your own word " "vectors, or use one of the larger models instead if available.") W008 = ("Evaluating {obj}.similarity based on empty vectors.") - W009 = ("Custom factory '{name}' provided by entry points of another " - "package overwrites built-in factory.") - W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length " - "limit anymore, so the max_length argument is now deprecated. " - "If you did not specify this parameter, make sure you call the " - "constructor with named arguments instead of positional ones.") W011 = ("It looks like you're calling displacy.serve from within a " "Jupyter notebook or a similar environment. This likely means " "you're already running a local web server, so there's no need to " @@ -64,23 +48,9 @@ class Warnings(object): "components are applied. To only create tokenized Doc objects, " "try using `nlp.make_doc(text)` or process all texts as a stream " "using `list(nlp.tokenizer.pipe(all_texts))`.") - W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more " - "efficient and less error-prone Doc.retokenize context manager " - "instead.") - W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization " - "methods is and should be replaced with `exclude`. This makes it " - "consistent with the other serializable objects.") - W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from " - "being serialized or deserialized is deprecated. Please use the " - "`exclude` argument instead. For example: exclude=['{arg}'].") - W016 = ("The keyword argument `n_threads` is now deprecated. As of v2.2.2, " - "the argument `n_process` controls parallel inference via " - "multiprocessing.") W017 = ("Alias '{alias}' already exists in the Knowledge Base.") W018 = ("Entity '{entity}' already exists in the Knowledge Base - " "ignoring the duplicate entry.") - W019 = ("Changing vectors name from {old} to {new}, to avoid clash with " - "previously loaded vectors. See Issue #3853.") W020 = ("Unnamed vectors. This won't allow multiple vectors models to be " "loaded. (Shape: {shape})") W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " @@ -91,8 +61,6 @@ class Warnings(object): "or the language you're using doesn't have lemmatization data, " "you can ignore this warning. If this is surprising, make sure you " "have the spacy-lookups-data package installed.") - W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " - "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " @@ -101,28 +69,11 @@ class Warnings(object): W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") - W028 = ("Doc.from_array was called with a vector of type '{type}', " - "but is expecting one of type 'uint64' instead. This may result " - "in problems with the vocab further on in the pipeline.") - W029 = ("Unable to align tokens with entities from character offsets. " - "Discarding entity annotation for the text: {text}.") W030 = ("Some entities could not be aligned in the text \"{text}\" with " "entities \"{entities}\". Use " "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" " to check the alignment. 
Misaligned entities ('-') will be " "ignored during training.") - W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and " - "is incompatible with the current spaCy version ({current}). This " - "may lead to unexpected results or runtime errors. To resolve " - "this, download a newer compatible model or retrain your custom " - "model with the current spaCy version. For more details and " - "available updates, run: python -m spacy validate") - W032 = ("Unable to determine model compatibility for model '{model}' " - "({model_version}) with the current spaCy version ({current}). " - "This may lead to unexpected results or runtime errors. To resolve " - "this, download a newer compatible model or retrain your custom " - "model with the current spaCy version. For more details and " - "available updates, run: python -m spacy validate") W033 = ("Training a new {model} using a model with no lexeme normalization " "table. This may degrade the performance of the model to some " "degree. If this is intentional or the language you're using " @@ -236,9 +187,6 @@ class Errors(object): "the HEAD attribute would potentially override the sentence " "boundaries set by SENT_START.") E033 = ("Cannot load into non-empty Doc of length {length}.") - E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected " - "either 3 arguments (deprecated), or 0 (use keyword arguments).\n" - "Arguments supplied:\n{args}\nKeyword arguments:{kwargs}") E035 = ("Error creating span with start {start} and end {end} for Doc of " "length {length}.") E036 = ("Error calculating span: Can't find a token starting at character " @@ -347,14 +295,9 @@ class Errors(object): E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A " "token can only be part of one entity, so make sure the entities " "you're setting don't overlap.") - E105 = ("The Doc.print_tree() method is now deprecated. Please use " - "Doc.to_json() instead or write your own function.") E106 = ("Can't find doc._.{attr} attribute specified in the underscore " "settings: {opts}") E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}") - E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated " - "in favor of the pipe name `sentencizer`, which does the same " - "thing. For example, use `nlp.create_pipeline('sentencizer')`") E109 = ("Component '{name}' could not be run. Did you forget to " "call begin_training()?") E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}") @@ -394,10 +337,6 @@ class Errors(object): E125 = ("Unexpected value: {value}") E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. " "This is likely a bug in spaCy, so feel free to open an issue.") - E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword " - "arguments to exclude fields from being serialized or deserialized " - "is now deprecated. Please use the `exclude` argument instead. " - "For example: exclude=['{arg}'].") E129 = ("Cannot write the label of an existing Span object because a Span " "is a read-only view of the underlying Token objects stored in the " "Doc. Instead, create a new Span object and specify the `label` " @@ -489,9 +428,6 @@ class Errors(object): E172 = ("The Lemmatizer.load classmethod is deprecated. To create a " "Lemmatizer, initialize the class directly. See the docs for " "details: https://spacy.io/api/lemmatizer") - E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of " - "Lookups containing the lemmatization tables. 
See the docs for " - "details: https://spacy.io/api/lemmatizer#init") E175 = ("Can't remove rule for unknown match pattern ID: {key}") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") E177 = ("Ill-formed IOB input detected: {tag}") diff --git a/spacy/language.py b/spacy/language.py index dbc213574..da45c058c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -8,7 +8,7 @@ from copy import copy, deepcopy from pathlib import Path import warnings -from thinc.api import get_current_ops, Config +from thinc.api import get_current_ops, Config, require_gpu import srsly import multiprocessing as mp from itertools import chain, cycle @@ -233,32 +233,6 @@ class Language(object): def config(self): return self._config - # Conveniences to access pipeline components - # Shouldn't be used anymore! - @property - def tagger(self): - return self.get_pipe("tagger") - - @property - def parser(self): - return self.get_pipe("parser") - - @property - def entity(self): - return self.get_pipe("ner") - - @property - def linker(self): - return self.get_pipe("entity_linker") - - @property - def senter(self): - return self.get_pipe("senter") - - @property - def matcher(self): - return self.get_pipe("matcher") - @property def pipe_names(self): """Get names of available pipeline components. @@ -314,10 +288,7 @@ class Language(object): DOCS: https://spacy.io/api/language#create_pipe """ if name not in self.factories: - if name == "sbd": - raise KeyError(Errors.E108.format(name=name)) - else: - raise KeyError(Errors.E002.format(name=name)) + raise KeyError(Errors.E002.format(name=name)) factory = self.factories[name] # transform the model's config to an actual Model @@ -661,7 +632,7 @@ class Language(object): _ = self.vocab[word] # noqa: F841 if cfg.get("device", -1) >= 0: - util.use_gpu(cfg["device"]) + require_gpu(cfg["device"]) if self.vocab.vectors.data.shape[1] >= 1: ops = get_current_ops() self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) @@ -691,7 +662,7 @@ class Language(object): on, and call nlp.rehearse() with a batch of Example objects. """ if cfg.get("device", -1) >= 0: - util.use_gpu(cfg["device"]) + require_gpu(cfg["device"]) ops = get_current_ops() if self.vocab.vectors.data.shape[1] >= 1: self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) @@ -782,7 +753,6 @@ class Language(object): self, texts, as_tuples=False, - n_threads=-1, batch_size=1000, disable=[], cleanup=False, @@ -807,8 +777,6 @@ class Language(object): DOCS: https://spacy.io/api/language#pipe """ - if n_threads != -1: - warnings.warn(Warnings.W016, DeprecationWarning) if n_process == -1: n_process = mp.cpu_count() if as_tuples: @@ -935,7 +903,7 @@ class Language(object): if hasattr(proc2, "model"): proc1.find_listeners(proc2.model) - def to_disk(self, path, exclude=tuple(), disable=None): + def to_disk(self, path, exclude=tuple()): """Save the current state to a directory. If a model is loaded, this will include the model. @@ -945,9 +913,6 @@ class Language(object): DOCS: https://spacy.io/api/language#to_disk """ - if disable is not None: - warnings.warn(Warnings.W014, DeprecationWarning) - exclude = disable path = util.ensure_path(path) serializers = {} serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( @@ -966,7 +931,7 @@ class Language(object): serializers["vocab"] = lambda p: self.vocab.to_disk(p) util.to_disk(path, serializers, exclude) - def from_disk(self, path, exclude=tuple(), disable=None): + def from_disk(self, path, exclude=tuple()): """Loads state from a directory. 
Modifies the object in place and returns it. If the saved `Language` object contains a model, the model will be loaded. @@ -991,9 +956,6 @@ class Language(object): self.vocab.from_disk(path) _fix_pretrained_vectors_name(self) - if disable is not None: - warnings.warn(Warnings.W014, DeprecationWarning) - exclude = disable path = util.ensure_path(path) deserializers = {} @@ -1020,7 +982,7 @@ class Language(object): self._link_components() return self - def to_bytes(self, exclude=tuple(), disable=None, **kwargs): + def to_bytes(self, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): Names of components or serialization fields to exclude. @@ -1028,9 +990,6 @@ class Language(object): DOCS: https://spacy.io/api/language#to_bytes """ - if disable is not None: - warnings.warn(Warnings.W014, DeprecationWarning) - exclude = disable serializers = {} serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) @@ -1042,10 +1001,9 @@ class Language(object): if not hasattr(proc, "to_bytes"): continue serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"]) - exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs): + def from_bytes(self, bytes_data, exclude=tuple()): """Load state from a binary string. bytes_data (bytes): The data to load from. @@ -1066,9 +1024,6 @@ class Language(object): self.vocab.from_bytes(b) _fix_pretrained_vectors_name(self) - if disable is not None: - warnings.warn(Warnings.W014, DeprecationWarning) - exclude = disable deserializers = {} deserializers["config.cfg"] = lambda b: self.config.from_bytes(b) deserializers["meta.json"] = deserialize_meta @@ -1084,7 +1039,6 @@ class Language(object): deserializers[name] = lambda b, proc=proc: proc.from_bytes( b, exclude=["vocab"] ) - exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) util.from_bytes(bytes_data, deserializers, exclude) self._link_components() return self @@ -1206,7 +1160,7 @@ class DisabledPipes(list): def _pipe(examples, proc, kwargs): # We added some args for pipe that __call__ doesn't expect. kwargs = dict(kwargs) - for arg in ["n_threads", "batch_size"]: + for arg in ["batch_size"]: if arg in kwargs: kwargs.pop(arg) for eg in examples: diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 7d6bfbc12..c108c975a 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,5 +1,4 @@ from .errors import Errors -from .lookups import Lookups from .parts_of_speech import NAMES as UPOS_NAMES @@ -15,15 +14,13 @@ class Lemmatizer(object): def load(cls, *args, **kwargs): raise NotImplementedError(Errors.E172) - def __init__(self, lookups, *args, **kwargs): + def __init__(self, lookups): """Initialize a Lemmatizer. lookups (Lookups): The lookups object containing the (optional) tables "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". RETURNS (Lemmatizer): The newly constructed object. 
""" - if args or kwargs or not isinstance(lookups, Lookups): - raise ValueError(Errors.E173) self.lookups = lookups def __call__(self, string, univ_pos, morphology=None): diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 158730e60..673cb3298 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -174,8 +174,7 @@ cdef class Matcher: return default return (self._callbacks[key], self._patterns[key]) - def pipe(self, docs, batch_size=1000, n_threads=-1, return_matches=False, - as_tuples=False): + def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False): """Match a stream of documents, yielding them in turn. docs (iterable): A stream of documents. @@ -188,9 +187,6 @@ cdef class Matcher: be a sequence of ((doc, matches), context) tuples. YIELDS (Doc): Documents, in order. """ - if n_threads != -1: - warnings.warn(Warnings.W016, DeprecationWarning) - if as_tuples: for doc, context in docs: matches = self(doc) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 1bb06c0a3..a2141dc02 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -26,7 +26,7 @@ cdef class PhraseMatcher: Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com) """ - def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): + def __init__(self, Vocab vocab, attr="ORTH", validate=False): """Initialize the PhraseMatcher. vocab (Vocab): The shared vocabulary. @@ -36,8 +36,6 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#init """ - if max_length != 0: - warnings.warn(Warnings.W010, DeprecationWarning) self.vocab = vocab self._callbacks = {} self._docs = {} @@ -287,8 +285,7 @@ cdef class PhraseMatcher: current_node = self.c_map idx += 1 - def pipe(self, stream, batch_size=1000, n_threads=-1, return_matches=False, - as_tuples=False): + def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False): """Match a stream of documents, yielding them in turn. docs (iterable): A stream of documents. 
@@ -303,8 +300,6 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#pipe """ - if n_threads != -1: - warnings.warn(Warnings.W016, DeprecationWarning) if as_tuples: for doc, context in stream: matches = self(doc) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 8ded3890f..f792d57b0 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -120,15 +120,14 @@ class Morphologizer(Tagger): d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, exclude=tuple()): serialize = {} serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, exclude=tuple()): def load_model(b): try: self.model.from_bytes(b) @@ -140,20 +139,18 @@ class Morphologizer(Tagger): "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: load_model(b), } - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple(), **kwargs): + def to_disk(self, path, exclude=tuple()): serialize = { "vocab": lambda p: self.vocab.to_disk(p), "model": lambda p: p.open("wb").write(self.model.to_bytes()), "cfg": lambda p: srsly.write_json(p, self.cfg), } - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple(), **kwargs): + def from_disk(self, path, exclude=tuple()): def load_model(p): with p.open("rb") as file_: try: @@ -166,6 +163,5 @@ class Morphologizer(Tagger): "cfg": lambda p: self.cfg.update(_load_cfg(p)), "model": load_model, } - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index ed700b09a..61cf155a2 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -66,7 +66,7 @@ class Pipe(object): self.set_annotations([doc], predictions) return doc - def pipe(self, stream, batch_size=128, n_threads=-1): + def pipe(self, stream, batch_size=128): """Apply the pipe to a stream of documents. Both __call__ and pipe should delegate to the `predict()` @@ -151,7 +151,7 @@ class Pipe(object): with self.model.use_params(params): yield - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (list): String names of serialization fields to exclude. 
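On the serialization changes in pipes.pyx and language.py: with **kwargs and util.get_serialization_exclude gone, an exclude list of field names is the only supported way to skip fields, which is what the removed W014/W015/E128 messages had been pointing to. A hedged sketch of the intended usage follows; the path and the excluded field names are made up for illustration.

    import spacy

    nlp = spacy.blank("en")
    # Keyword-style calls such as nlp.to_bytes(vocab=False) or
    # nlp.to_disk(path, disable=[...]) now fail with a plain TypeError.
    data = nlp.to_bytes(exclude=["tokenizer"])
    nlp2 = spacy.blank("en").from_bytes(data, exclude=["vocab"])
    nlp.to_disk("/tmp/blank_en_model", exclude=["meta"])
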
@@ -162,10 +162,9 @@ class Pipe(object): serialize["model"] = self.model.to_bytes if hasattr(self, "vocab"): serialize["vocab"] = self.vocab.to_bytes - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, exclude=tuple()): """Load the pipe from a bytestring.""" def load_model(b): @@ -179,20 +178,18 @@ class Pipe(object): deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) deserialize["model"] = load_model - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple(), **kwargs): + def to_disk(self, path, exclude=tuple()): """Serialize the pipe to disk.""" serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p) - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple(), **kwargs): + def from_disk(self, path, exclude=tuple()): """Load the pipe from disk.""" def load_model(p): @@ -205,7 +202,6 @@ class Pipe(object): deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["model"] = load_model - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -232,7 +228,7 @@ class Tagger(Pipe): self.set_annotations([doc], tags) return doc - def pipe(self, stream, batch_size=128, n_threads=-1): + def pipe(self, stream, batch_size=128): for docs in util.minibatch(stream, size=batch_size): tag_ids = self.predict(docs) self.set_annotations(docs, tag_ids) @@ -421,17 +417,16 @@ class Tagger(Pipe): with self.model.use_params(params): yield - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, exclude=tuple()): serialize = {} serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, exclude=tuple()): def load_model(b): try: self.model.from_bytes(b) @@ -451,11 +446,10 @@ class Tagger(Pipe): "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: load_model(b), } - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple(), **kwargs): + def to_disk(self, path, exclude=tuple()): tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) serialize = { "vocab": lambda p: self.vocab.to_disk(p), @@ -463,10 +457,9 @@ class Tagger(Pipe): "model": lambda p: self.model.to_disk(p), "cfg": lambda p: srsly.write_json(p, self.cfg), } - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple(), **kwargs): + def from_disk(self, path, exclude=tuple()): def load_model(p): with 
p.open("rb") as file_: try: @@ -487,7 +480,6 @@ class Tagger(Pipe): "tag_map": load_tag_map, "model": load_model, } - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -566,15 +558,14 @@ class SentenceRecognizer(Tagger): def add_label(self, label, values=None): raise NotImplementedError - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, exclude=tuple()): serialize = {} serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, exclude=tuple()): def load_model(b): try: self.model.from_bytes(b) @@ -586,20 +577,18 @@ class SentenceRecognizer(Tagger): "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: load_model(b), } - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple(), **kwargs): + def to_disk(self, path, exclude=tuple()): serialize = { "vocab": lambda p: self.vocab.to_disk(p), "model": lambda p: p.open("wb").write(self.model.to_bytes()), "cfg": lambda p: srsly.write_json(p, self.cfg), } - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple(), **kwargs): + def from_disk(self, path, exclude=tuple()): def load_model(p): with p.open("rb") as file_: try: @@ -612,7 +601,6 @@ class SentenceRecognizer(Tagger): "cfg": lambda p: self.cfg.update(_load_cfg(p)), "model": load_model, } - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -825,7 +813,7 @@ class TextCategorizer(Pipe): def labels(self, value): self.cfg["labels"] = tuple(value) - def pipe(self, stream, batch_size=128, n_threads=-1): + def pipe(self, stream, batch_size=128): for docs in util.minibatch(stream, size=batch_size): scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) @@ -1198,7 +1186,7 @@ class EntityLinker(Pipe): self.set_annotations([doc], kb_ids, tensors=tensors) return doc - def pipe(self, stream, batch_size=128, n_threads=-1): + def pipe(self, stream, batch_size=128): for docs in util.minibatch(stream, size=batch_size): kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) @@ -1309,17 +1297,16 @@ class EntityLinker(Pipe): for token in ent: token.ent_kb_id_ = kb_id - def to_disk(self, path, exclude=tuple(), **kwargs): + def to_disk(self, path, exclude=tuple()): serialize = {} self.cfg["entity_width"] = self.kb.entity_vector_length serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) serialize["model"] = lambda p: self.model.to_disk(p) - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple(), **kwargs): + def from_disk(self, path, exclude=tuple()): def load_model(p): try: self.model.from_bytes(p.open("rb").read()) @@ -1335,7 +1322,6 @@ class EntityLinker(Pipe): deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["kb"] = load_kb 
deserialize["model"] = load_model - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -1411,7 +1397,7 @@ class Sentencizer(Pipe): doc[start].is_sent_start = True return doc - def pipe(self, stream, batch_size=128, n_threads=-1): + def pipe(self, stream, batch_size=128): for docs in util.minibatch(stream, size=batch_size): predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 047cf5caa..a06513a73 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -51,11 +51,10 @@ class Tok2Vec(Pipe): self.set_annotations([doc], tokvecses) return doc - def pipe(self, stream, batch_size=128, n_threads=-1): + def pipe(self, stream, batch_size=128): """Process `Doc` objects as a stream. stream (iterator): A sequence of `Doc` objects to process. batch_size (int): Number of `Doc` objects to group. - n_threads (int): Number of threads. YIELDS (iterator): A sequence of `Doc` objects, in order of input. """ for docs in minibatch(stream, batch_size): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 743b4ca1d..0295241c6 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -157,7 +157,7 @@ cdef class Parser: self.set_annotations([doc], states, tensors=None) return doc - def pipe(self, docs, int batch_size=256, int n_threads=-1): + def pipe(self, docs, int batch_size=256): """Process a stream of documents. stream: The sequence of documents to process. @@ -461,24 +461,22 @@ cdef class Parser: link_vectors_to_models(self.vocab) return sgd - def to_disk(self, path, exclude=tuple(), **kwargs): + def to_disk(self, path, exclude=tuple()): serializers = { 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), 'vocab': lambda p: self.vocab.to_disk(p), 'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]), 'cfg': lambda p: srsly.write_json(p, self.cfg) } - exclude = util.get_serialization_exclude(serializers, exclude, kwargs) util.to_disk(path, serializers, exclude) - def from_disk(self, path, exclude=tuple(), **kwargs): + def from_disk(self, path, exclude=tuple()): deserializers = { 'vocab': lambda p: self.vocab.from_disk(p), 'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]), 'cfg': lambda p: self.cfg.update(srsly.read_json(p)), 'model': lambda p: None, } - exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) util.from_disk(path, deserializers, exclude) if 'model' not in exclude: path = util.ensure_path(path) @@ -491,24 +489,22 @@ cdef class Parser: raise ValueError(Errors.E149) return self - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, exclude=tuple()): serializers = { "model": lambda: (self.model.to_bytes()), "vocab": lambda: self.vocab.to_bytes(), "moves": lambda: self.moves.to_bytes(exclude=["strings"]), "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) } - exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, exclude=tuple()): deserializers = { "vocab": lambda b: self.vocab.from_bytes(b), "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: None, } - exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) 
msg = util.from_bytes(bytes_data, deserializers, exclude) if 'model' not in exclude: if 'model' in msg: diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index d88ab26bd..17166dcf5 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -60,7 +60,7 @@ cdef class TransitionSystem: states.append(state) offset += len(doc) return states - + def get_oracle_sequence(self, Example example, _debug=False): states, golds, _ = self.init_gold_batch([example]) if not states: @@ -227,22 +227,20 @@ cdef class TransitionSystem: self.from_bytes(byte_data, **kwargs) return self - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, exclude=tuple()): transitions = [] serializers = { 'moves': lambda: srsly.json_dumps(self.labels), 'strings': lambda: self.strings.to_bytes() } - exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, exclude=tuple()): labels = {} deserializers = { 'moves': lambda b: labels.update(srsly.json_loads(b)), 'strings': lambda b: self.strings.from_bytes(b) } - exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) self.initialize_actions(labels) return self diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 43c699d21..91b0ec922 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -66,8 +66,6 @@ def test_spans_string_fn(doc): span = doc[0:4] assert len(span) == 4 assert span.text == "This is a sentence" - assert span.upper_ == "THIS IS A SENTENCE" - assert span.lower_ == "this is a sentence" def test_spans_root2(en_tokenizer): diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index b5fa933cd..a9ccda995 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,13 +1,11 @@ import pytest -from thinc.api import Adam +from thinc.api import Adam, fix_random_seed from spacy.attrs import NORM from spacy.vocab import Vocab - from spacy.gold import Example from spacy.pipeline.defaults import default_parser, default_ner from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer -from spacy.util import fix_random_seed @pytest.fixture diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 214163a97..07d73eb6e 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,18 +1,18 @@ import pytest import random import numpy.random - +from thinc.api import fix_random_seed from spacy import util from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer from spacy.tokens import Doc -from spacy.util import fix_random_seed +from spacy.pipeline.defaults import default_tok2vec from ..util import make_tempdir -from spacy.pipeline.defaults import default_tok2vec from ...gold import Example + TRAIN_DATA = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index a37707379..1aceba68f 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -9,7 +9,6 @@ from spacy.vocab 
import Vocab from spacy.attrs import ENT_IOB, ENT_TYPE from spacy.compat import pickle from spacy import displacy -from spacy.util import decaying import numpy from spacy.vectors import Vectors @@ -216,21 +215,6 @@ def test_issue3345(): assert ner.moves.is_valid(state, "B-GPE") -def test_issue3410(): - texts = ["Hello world", "This is a test"] - nlp = English() - matcher = Matcher(nlp.vocab) - phrasematcher = PhraseMatcher(nlp.vocab) - with pytest.deprecated_call(): - docs = list(nlp.pipe(texts, n_threads=4)) - with pytest.deprecated_call(): - docs = list(nlp.tokenizer.pipe(texts, n_threads=4)) - with pytest.deprecated_call(): - list(matcher.pipe(docs, n_threads=4)) - with pytest.deprecated_call(): - list(phrasematcher.pipe(docs, n_threads=4)) - - def test_issue3412(): data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") vectors = Vectors(data=data, keys=["A", "B", "C"]) @@ -240,16 +224,6 @@ def test_issue3412(): assert best_rows[0] == 2 -def test_issue3447(): - sizes = decaying(10.0, 1.0, 0.5) - size = next(sizes) - assert size == 10.0 - size = next(sizes) - assert size == 10.0 - 0.5 - size = next(sizes) - assert size == 10.0 - 0.5 - 0.5 - - @pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot") def test_issue3449(): nlp = English() diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py index 67bc88466..ef189c446 100644 --- a/spacy/tests/regression/test_issue3611.py +++ b/spacy/tests/regression/test_issue3611.py @@ -1,6 +1,7 @@ import spacy +from spacy.util import minibatch +from thinc.api import compounding from spacy.gold import Example -from spacy.util import minibatch, compounding def test_issue3611(): diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py index 12a320c71..e40565501 100644 --- a/spacy/tests/regression/test_issue4030.py +++ b/spacy/tests/regression/test_issue4030.py @@ -1,6 +1,7 @@ import spacy +from spacy.util import minibatch +from thinc.api import compounding from spacy.gold import Example -from spacy.util import minibatch, compounding def test_issue4030(): diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py index d7a12d054..06b03df24 100644 --- a/spacy/tests/regression/test_issue4348.py +++ b/spacy/tests/regression/test_issue4348.py @@ -1,6 +1,7 @@ from spacy.gold import Example from spacy.lang.en import English -from spacy.util import minibatch, compounding +from spacy.util import minibatch +from thinc.api import compounding import pytest diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 85c21f7f9..a547b51bc 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -52,10 +52,6 @@ def test_serialize_doc_exclude(en_vocab): assert not new_doc.user_data new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"])) assert not new_doc.user_data - with pytest.raises(ValueError): - doc.to_bytes(user_data=False) - with pytest.raises(ValueError): - Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False) def test_serialize_doc_bin(): diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index 0e3b7c59f..05529f9d1 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -62,7 +62,3 @@ def test_serialize_language_exclude(meta_data): assert not new_nlp.meta["name"] 
== name new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"])) assert not new_nlp.meta["name"] == name - with pytest.raises(ValueError): - nlp.to_bytes(meta=False) - with pytest.raises(ValueError): - Language().from_bytes(nlp.to_bytes(), meta=False) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index abb5ccb27..0f6a8853c 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -127,10 +127,6 @@ def test_serialize_pipe_exclude(en_vocab, Parser): parser.to_bytes(exclude=["cfg"]), exclude=["vocab"] ) assert "foo" not in new_parser.cfg - with pytest.raises(ValueError): - parser.to_bytes(cfg=False, exclude=["vocab"]) - with pytest.raises(ValueError): - get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]), cfg=False) def test_serialize_sentencerecognizer(en_vocab): diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 0ed4d50d5..cd354ff92 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -5,9 +5,9 @@ from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example from spacy.gold.converters import json2docs from spacy.lang.en import English -from spacy.syntax.nonproj import is_nonproj_tree from spacy.tokens import Doc, DocBin -from spacy.util import get_words_and_spaces, compounding, minibatch +from spacy.util import get_words_and_spaces, minibatch +from thinc.api import compounding import pytest import srsly @@ -511,9 +511,7 @@ def test_make_orth_variants(doc): # due to randomness, test only that this runs with no errors for now train_example = next(goldcorpus.train_dataset(nlp)) - variant_example = make_orth_variants_example( - nlp, train_example, orth_variant_level=0.2 - ) + make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) @pytest.mark.parametrize( diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 20508ead7..828f4550b 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -26,8 +26,6 @@ cdef class Tokenizer: cdef int _property_init_count cdef int _property_init_max - cpdef Doc tokens_from_list(self, list strings) - cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases) cdef int _apply_special_cases(self, Doc doc) except -1 cdef void _filter_special_spans(self, vector[SpanC] &original, diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 2359fd5af..203488609 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -140,10 +140,6 @@ cdef class Tokenizer: self.url_match) return (self.__class__, args, None, None) - cpdef Doc tokens_from_list(self, list strings): - warnings.warn(Warnings.W002, DeprecationWarning) - return Doc(self.vocab, words=strings) - def __call__(self, unicode string): """Tokenize a string. @@ -218,7 +214,7 @@ cdef class Tokenizer: doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws return doc - def pipe(self, texts, batch_size=1000, n_threads=-1): + def pipe(self, texts, batch_size=1000): """Tokenize a stream of texts. texts: A sequence of unicode texts. @@ -228,8 +224,6 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#pipe """ - if n_threads != -1: - warnings.warn(Warnings.W016, DeprecationWarning) for text in texts: yield self(text) @@ -746,7 +740,7 @@ cdef class Tokenizer: self.from_bytes(bytes_data, **kwargs) return self - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, exclude=tuple()): """Serialize the current state to a binary string. 
exclude (list): String names of serialization fields to exclude. @@ -763,10 +757,9 @@ cdef class Tokenizer: "url_match": lambda: _get_regex_pattern(self.url_match), "exceptions": lambda: dict(sorted(self._rules.items())) } - exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, exclude=tuple()): """Load state from a binary string. bytes_data (bytes): The data to load from. @@ -785,7 +778,6 @@ cdef class Tokenizer: "url_match": lambda b: data.setdefault("url_match", b), "exceptions": lambda b: data.setdefault("rules", b) } - exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) if "prefix_search" in data and isinstance(data["prefix_search"], str): self.prefix_search = re.compile(data["prefix_search"]).search diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 22bcd02fc..ca9230d98 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1081,9 +1081,6 @@ cdef class Doc: "cats": lambda: self.cats, "has_unknown_spaces": lambda: self.has_unknown_spaces } - for key in kwargs: - if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): - raise ValueError(Errors.E128.format(arg=key)) if "user_data" not in exclude and self.user_data: user_data_keys, user_data_values = list(zip(*self.user_data.items())) if "user_data_keys" not in exclude: @@ -1114,9 +1111,6 @@ cdef class Doc: "user_data_values": lambda b: None, "has_unknown_spaces": lambda b: None } - for key in kwargs: - if key in deserializers or key in ("user_data",): - raise ValueError(Errors.E128.format(arg=key)) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. In values we just have to hope diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 902d46f5a..203308749 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -686,21 +686,6 @@ cdef class Span: """RETURNS (str): The span's lemma.""" return " ".join([t.lemma_ for t in self]).strip() - @property - def upper_(self): - """Deprecated. Use `Span.text.upper()` instead.""" - return "".join([t.text_with_ws.upper() for t in self]).strip() - - @property - def lower_(self): - """Deprecated. Use `Span.text.lower()` instead.""" - return "".join([t.text_with_ws.lower() for t in self]).strip() - - @property - def string(self): - """Deprecated: Use `Span.text_with_ws` instead.""" - return "".join([t.text_with_ws for t in self]) - property label_: """RETURNS (str): The span's label.""" def __get__(self): diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index f85a17d69..551b7a663 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -237,11 +237,6 @@ cdef class Token: index into tables, e.g. 
for word vectors.""" return self.c.lex.id - @property - def string(self): - """Deprecated: Use Token.text_with_ws instead.""" - return self.text_with_ws - @property def text(self): """RETURNS (str): The original verbatim text of the token.""" diff --git a/spacy/util.py b/spacy/util.py index 7c29bed8e..4a17b7f24 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -4,9 +4,8 @@ import importlib import importlib.util import re from pathlib import Path -import random import thinc -from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config +from thinc.api import NumpyOps, get_current_ops, Adam, Config import functools import itertools import numpy.random @@ -34,6 +33,13 @@ try: # Python 3.8 except ImportError: import importlib_metadata +# These are functions that were previously (v2.x) available from spacy.util +# and have since moved to Thinc. We're importing them here so people's code +# doesn't break, but they should always be imported from Thinc from now on, +# not from spacy.util. +from thinc.api import fix_random_seed, compounding, decaying # noqa: F401 + + from .symbols import ORTH from .compat import cupy, CudaStream, is_windows from .errors import Errors, Warnings @@ -595,15 +601,8 @@ def compile_prefix_regex(entries): entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES. RETURNS (regex object): The regex object. to be used for Tokenizer.prefix_search. """ - if "(" in entries: - # Handle deprecated data - expression = "|".join( - ["^" + re.escape(piece) for piece in entries if piece.strip()] - ) - return re.compile(expression) - else: - expression = "|".join(["^" + piece for piece in entries if piece.strip()]) - return re.compile(expression) + expression = "|".join(["^" + piece for piece in entries if piece.strip()]) + return re.compile(expression) def compile_suffix_regex(entries): @@ -723,59 +722,6 @@ def minibatch(items, size=8): yield list(batch) -def compounding(start, stop, compound): - """Yield an infinite series of compounding values. Each time the - generator is called, a value is produced by multiplying the previous - value by the compound rate. - - EXAMPLE: - >>> sizes = compounding(1., 10., 1.5) - >>> assert next(sizes) == 1. - >>> assert next(sizes) == 1 * 1.5 - >>> assert next(sizes) == 1.5 * 1.5 - """ - - def clip(value): - return max(value, stop) if (start > stop) else min(value, stop) - - curr = float(start) - while True: - yield clip(curr) - curr *= compound - - -def stepping(start, stop, steps): - """Yield an infinite series of values that step from a start value to a - final value over some number of steps. Each step is (stop-start)/steps. - - After the final value is reached, the generator continues yielding that - value. - - EXAMPLE: - >>> sizes = stepping(1., 200., 100) - >>> assert next(sizes) == 1. - >>> assert next(sizes) == 1 * (200.-1.) / 100 - >>> assert next(sizes) == 1 + (200.-1.) / 100 + (200.-1.) / 100 - """ - - def clip(value): - return max(value, stop) if (start > stop) else min(value, stop) - - curr = float(start) - while True: - yield clip(curr) - curr += (stop - start) / steps - - -def decaying(start, stop, decay): - """Yield an infinite series of linearly decaying values.""" - - curr = float(start) - while True: - yield max(curr, stop) - curr -= decay - - def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): """Create minibatches of roughly a given number of words. 
If any examples are longer than the specified batch length, they will appear in a batch by @@ -854,35 +800,6 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): yield batch -def itershuffle(iterable, bufsize=1000): - """Shuffle an iterator. This works by holding `bufsize` items back - and yielding them sometime later. Obviously, this is not unbiased – - but should be good enough for batching. Larger bufsize means less bias. - From https://gist.github.com/andres-erbsen/1307752 - - iterable (iterable): Iterator to shuffle. - bufsize (int): Items to hold back. - YIELDS (iterable): The shuffled iterator. - """ - iterable = iter(iterable) - buf = [] - try: - while True: - for i in range(random.randint(1, bufsize - len(buf))): - buf.append(next(iterable)) - random.shuffle(buf) - for i in range(random.randint(1, bufsize)): - if buf: - yield buf.pop() - else: - break - except StopIteration: - random.shuffle(buf) - while buf: - yield buf.pop() - raise StopIteration - - def filter_spans(spans): """Filter a sequence of spans and remove duplicates or overlaps. Useful for creating named entities (where one token can only be part of one entity) or @@ -989,34 +906,6 @@ def escape_html(text): return text -def use_gpu(gpu_id): - return require_gpu(gpu_id) - - -def fix_random_seed(seed=0): - random.seed(seed) - numpy.random.seed(seed) - if cupy is not None: - cupy.random.seed(seed) - - -def get_serialization_exclude(serializers, exclude, kwargs): - """Helper function to validate serialization args and manage transition from - keyword arguments (pre v2.1) to exclude argument. - """ - exclude = list(exclude) - # Split to support file names like meta.json - options = [name.split(".")[0] for name in serializers] - for key, value in kwargs.items(): - if key in ("vocab",) and value is False: - warnings.warn(Warnings.W015.format(arg=key), DeprecationWarning) - exclude.append(key) - elif key.split(".")[0] in options: - raise ValueError(Errors.E128.format(arg=key)) - # TODO: user warning? - return exclude - - def get_words_and_spaces(words, text): if "".join("".join(words).split()) != "".join(text.split()): raise ValueError(Errors.E194.format(text=text, words=words)) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d70f62dd3..58c1388fc 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -426,7 +426,7 @@ cdef class Vocab: orth = self.strings.add(orth) return orth in self.vectors - def to_disk(self, path, exclude=tuple(), **kwargs): + def to_disk(self, path, exclude=tuple()): """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if @@ -439,7 +439,6 @@ cdef class Vocab: if not path.exists(): path.mkdir() setters = ["strings", "vectors"] - exclude = util.get_serialization_exclude(setters, exclude, kwargs) if "strings" not in exclude: self.strings.to_disk(path / "strings.json") if "vectors" not in "exclude" and self.vectors is not None: @@ -449,7 +448,7 @@ cdef class Vocab: if "lookups_extra" not in "exclude" and self.lookups_extra is not None: self.lookups_extra.to_disk(path, filename="lookups_extra.bin") - def from_disk(self, path, exclude=tuple(), **kwargs): + def from_disk(self, path, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. 
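Tying together the spacy.util changes above: fix_random_seed, compounding and decaying are now re-exported from thinc.api (spacy.util keeps the imports only so existing code doesn't break), the old use_gpu helper was a thin wrapper around Thinc's require_gpu, and Vocab below follows the same exclude-only serialization pattern as the other components. A small sketch of the updated imports in user training code, with arbitrary example values:

    from thinc.api import compounding, fix_random_seed, require_gpu
    from spacy.util import minibatch

    fix_random_seed(0)                           # previously spacy.util.fix_random_seed(0)
    batch_sizes = compounding(4.0, 32.0, 1.001)  # previously spacy.util.compounding(...)
    # require_gpu(0)                             # previously spacy.util.use_gpu(0); needs a CUDA device
    for batch in minibatch(range(100), size=batch_sizes):
        pass  # stand-in for an actual nlp.update(...) step
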
@@ -461,7 +460,6 @@ cdef class Vocab: """ path = util.ensure_path(path) getters = ["strings", "vectors"] - exclude = util.get_serialization_exclude(getters, exclude, kwargs) if "strings" not in exclude: self.strings.from_disk(path / "strings.json") # TODO: add exclude? if "vectors" not in exclude: @@ -481,7 +479,7 @@ cdef class Vocab: self._by_orth = PreshMap() return self - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): String names of serialization fields to exclude. @@ -501,10 +499,9 @@ cdef class Vocab: "lookups": lambda: self.lookups.to_bytes(), "lookups_extra": lambda: self.lookups_extra.to_bytes() } - exclude = util.get_serialization_exclude(getters, exclude, kwargs) return util.to_bytes(getters, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, exclude=tuple()): """Load state from a binary string. bytes_data (bytes): The data to load from. @@ -526,7 +523,6 @@ cdef class Vocab: "lookups": lambda b: self.lookups.from_bytes(b), "lookups_extra": lambda b: self.lookups_extra.from_bytes(b) } - exclude = util.get_serialization_exclude(setters, exclude, kwargs) util.from_bytes(bytes_data, setters, exclude) if "lexeme_norm" in self.lookups: self.lex_attr_getters[NORM] = util.add_lookups(