Remove dead and/or deprecated code (#5710)

* Remove dead and/or deprecated code

* Remove n_threads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
Ines Montani 2020-07-06 13:06:25 +02:00 committed by GitHub
parent fcbf899b08
commit 412dbb1f38
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
32 changed files with 88 additions and 430 deletions

View File

@ -25,9 +25,6 @@ config = registry
def load(name, **overrides): def load(name, **overrides):
depr_path = overrides.get("path")
if depr_path not in (True, False, None):
warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning)
return util.load_model(name, **overrides) return util.load_model(name, **overrides)

View File

@ -4,6 +4,7 @@ from wasabi import Printer
from pathlib import Path from pathlib import Path
import re import re
import srsly import srsly
from thinc.api import require_gpu, fix_random_seed
from ..gold import Corpus from ..gold import Corpus
from ..tokens import Doc from ..tokens import Doc
@ -52,9 +53,9 @@ def evaluate(
silent: bool = True, silent: bool = True,
) -> Scorer: ) -> Scorer:
msg = Printer(no_print=silent, pretty=not silent) msg = Printer(no_print=silent, pretty=not silent)
util.fix_random_seed() fix_random_seed()
if gpu_id >= 0: if gpu_id >= 0:
util.use_gpu(gpu_id) require_gpu(gpu_id)
util.set_env_log(False) util.set_env_log(False)
data_path = util.ensure_path(data_path) data_path = util.ensure_path(data_path)
output_path = util.ensure_path(output) output_path = util.ensure_path(output)

View File

@ -5,8 +5,8 @@ import time
import re import re
from collections import Counter from collections import Counter
from pathlib import Path from pathlib import Path
from thinc.api import use_pytorch_for_gpu_memory from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import set_dropout_rate, to_categorical from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance from thinc.api import CosineDistance, L2Distance
from wasabi import msg from wasabi import msg
import srsly import srsly
@ -36,7 +36,7 @@ def pretrain_cli(
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Two objective types using an approximate language-modelling objective. Two objective types
are available, vector-based and character-based. are available, vector-based and character-based.
In the vector-based objective, we load word vectors that have been trained In the vector-based objective, we load word vectors that have been trained
using a word2vec-style distributional similarity algorithm, and train a using a word2vec-style distributional similarity algorithm, and train a
component like a CNN, BiLSTM, etc to predict vectors which match the component like a CNN, BiLSTM, etc to predict vectors which match the
@ -76,13 +76,13 @@ def pretrain(
if use_gpu >= 0: if use_gpu >= 0:
msg.info("Using GPU") msg.info("Using GPU")
util.use_gpu(use_gpu) require_gpu(use_gpu)
else: else:
msg.info("Using CPU") msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}") msg.info(f"Loading config from: {config_path}")
config = util.load_config(config_path, create_objects=False) config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["pretraining"]["seed"]) fix_random_seed(config["pretraining"]["seed"])
if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]: if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
use_pytorch_for_gpu_memory() use_pytorch_for_gpu_memory()
@ -231,12 +231,12 @@ def make_docs(nlp, batch, min_length, max_length):
def create_objective(config): def create_objective(config):
"""Create the objective for pretraining. """Create the objective for pretraining.
We'd like to replace this with a registry function but it's tricky because We'd like to replace this with a registry function but it's tricky because
we're also making a model choice based on this. For now we hard-code support we're also making a model choice based on this. For now we hard-code support
for two types (characters, vectors). For characters you can specify for two types (characters, vectors). For characters you can specify
n_characters, for vectors you can specify the loss. n_characters, for vectors you can specify the loss.
Bleh. Bleh.
""" """
objective_type = config["type"] objective_type = config["type"]

View File

@ -1,6 +1,5 @@
from typing import Optional, Dict, List, Union, Sequence from typing import Optional, Dict, List, Union, Sequence
from timeit import default_timer as timer from timeit import default_timer as timer
import srsly import srsly
import tqdm import tqdm
from pydantic import BaseModel, FilePath from pydantic import BaseModel, FilePath
@ -8,7 +7,7 @@ from pathlib import Path
from wasabi import msg from wasabi import msg
import thinc import thinc
import thinc.schedules import thinc.schedules
from thinc.api import Model, use_pytorch_for_gpu_memory from thinc.api import Model, use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
import random import random
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
@ -156,7 +155,7 @@ def train_cli(
if use_gpu >= 0: if use_gpu >= 0:
msg.info("Using GPU: {use_gpu}") msg.info("Using GPU: {use_gpu}")
util.use_gpu(use_gpu) require_gpu(use_gpu)
else: else:
msg.info("Using CPU") msg.info("Using CPU")
@ -183,7 +182,7 @@ def train(
msg.info(f"Loading config from: {config_path}") msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config # Read the config first without creating objects, to get to the original nlp_config
config = util.load_config(config_path, create_objects=False) config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["training"]["seed"]) fix_random_seed(config["training"]["seed"])
if config["training"].get("use_pytorch_for_gpu_memory"): if config["training"].get("use_pytorch_for_gpu_memory"):
# It feels kind of weird to not have a default for this. # It feels kind of weird to not have a default for this.
use_pytorch_for_gpu_memory() use_pytorch_for_gpu_memory()

View File

@ -16,16 +16,6 @@ def add_codes(err_cls):
@add_codes @add_codes
class Warnings(object): class Warnings(object):
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
"You can now call spacy.load with the path as its first argument, "
"and the model's meta.json will be used to determine the language "
"to load. For example:\nnlp = spacy.load('{path}')")
W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object "
"instead and pass in the strings as the `words` keyword argument, "
"for example:\nfrom spacy.tokens import Doc\n"
"doc = Doc(nlp.vocab, words=[...])")
W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use "
"the keyword arguments, for example tag=, lemma= or ent_type=.")
W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing " W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
"using ftfy.fix_text if necessary.") "using ftfy.fix_text if necessary.")
W005 = ("Doc object not parsed. This means displaCy won't be able to " W005 = ("Doc object not parsed. This means displaCy won't be able to "
@ -45,12 +35,6 @@ class Warnings(object):
"use context-sensitive tensors. You can always add your own word " "use context-sensitive tensors. You can always add your own word "
"vectors, or use one of the larger models instead if available.") "vectors, or use one of the larger models instead if available.")
W008 = ("Evaluating {obj}.similarity based on empty vectors.") W008 = ("Evaluating {obj}.similarity based on empty vectors.")
W009 = ("Custom factory '{name}' provided by entry points of another "
"package overwrites built-in factory.")
W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length "
"limit anymore, so the max_length argument is now deprecated. "
"If you did not specify this parameter, make sure you call the "
"constructor with named arguments instead of positional ones.")
W011 = ("It looks like you're calling displacy.serve from within a " W011 = ("It looks like you're calling displacy.serve from within a "
"Jupyter notebook or a similar environment. This likely means " "Jupyter notebook or a similar environment. This likely means "
"you're already running a local web server, so there's no need to " "you're already running a local web server, so there's no need to "
@ -64,23 +48,9 @@ class Warnings(object):
"components are applied. To only create tokenized Doc objects, " "components are applied. To only create tokenized Doc objects, "
"try using `nlp.make_doc(text)` or process all texts as a stream " "try using `nlp.make_doc(text)` or process all texts as a stream "
"using `list(nlp.tokenizer.pipe(all_texts))`.") "using `list(nlp.tokenizer.pipe(all_texts))`.")
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
"efficient and less error-prone Doc.retokenize context manager "
"instead.")
W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
"methods is and should be replaced with `exclude`. This makes it "
"consistent with the other serializable objects.")
W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
"being serialized or deserialized is deprecated. Please use the "
"`exclude` argument instead. For example: exclude=['{arg}'].")
W016 = ("The keyword argument `n_threads` is now deprecated. As of v2.2.2, "
"the argument `n_process` controls parallel inference via "
"multiprocessing.")
W017 = ("Alias '{alias}' already exists in the Knowledge Base.") W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
W018 = ("Entity '{entity}' already exists in the Knowledge Base - " W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
"ignoring the duplicate entry.") "ignoring the duplicate entry.")
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
"previously loaded vectors. See Issue #3853.")
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be " W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
"loaded. (Shape: {shape})") "loaded. (Shape: {shape})")
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
@ -91,8 +61,6 @@ class Warnings(object):
"or the language you're using doesn't have lemmatization data, " "or the language you're using doesn't have lemmatization data, "
"you can ignore this warning. If this is surprising, make sure you " "you can ignore this warning. If this is surprising, make sure you "
"have the spacy-lookups-data package installed.") "have the spacy-lookups-data package installed.")
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
"'n_process' will be set to 1.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.") "the Knowledge Base.")
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
@ -101,28 +69,11 @@ class Warnings(object):
W027 = ("Found a large training file of {size} bytes. Note that it may " W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple " "be more efficient to split your training data into multiple "
"smaller JSON files instead.") "smaller JSON files instead.")
W028 = ("Doc.from_array was called with a vector of type '{type}', "
"but is expecting one of type 'uint64' instead. This may result "
"in problems with the vocab further on in the pipeline.")
W029 = ("Unable to align tokens with entities from character offsets. "
"Discarding entity annotation for the text: {text}.")
W030 = ("Some entities could not be aligned in the text \"{text}\" with " W030 = ("Some entities could not be aligned in the text \"{text}\" with "
"entities \"{entities}\". Use " "entities \"{entities}\". Use "
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
" to check the alignment. Misaligned entities ('-') will be " " to check the alignment. Misaligned entities ('-') will be "
"ignored during training.") "ignored during training.")
W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and "
"is incompatible with the current spaCy version ({current}). This "
"may lead to unexpected results or runtime errors. To resolve "
"this, download a newer compatible model or retrain your custom "
"model with the current spaCy version. For more details and "
"available updates, run: python -m spacy validate")
W032 = ("Unable to determine model compatibility for model '{model}' "
"({model_version}) with the current spaCy version ({current}). "
"This may lead to unexpected results or runtime errors. To resolve "
"this, download a newer compatible model or retrain your custom "
"model with the current spaCy version. For more details and "
"available updates, run: python -m spacy validate")
W033 = ("Training a new {model} using a model with no lexeme normalization " W033 = ("Training a new {model} using a model with no lexeme normalization "
"table. This may degrade the performance of the model to some " "table. This may degrade the performance of the model to some "
"degree. If this is intentional or the language you're using " "degree. If this is intentional or the language you're using "
@ -236,9 +187,6 @@ class Errors(object):
"the HEAD attribute would potentially override the sentence " "the HEAD attribute would potentially override the sentence "
"boundaries set by SENT_START.") "boundaries set by SENT_START.")
E033 = ("Cannot load into non-empty Doc of length {length}.") E033 = ("Cannot load into non-empty Doc of length {length}.")
E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected "
"either 3 arguments (deprecated), or 0 (use keyword arguments).\n"
"Arguments supplied:\n{args}\nKeyword arguments:{kwargs}")
E035 = ("Error creating span with start {start} and end {end} for Doc of " E035 = ("Error creating span with start {start} and end {end} for Doc of "
"length {length}.") "length {length}.")
E036 = ("Error calculating span: Can't find a token starting at character " E036 = ("Error calculating span: Can't find a token starting at character "
@ -347,14 +295,9 @@ class Errors(object):
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A " E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
"token can only be part of one entity, so make sure the entities " "token can only be part of one entity, so make sure the entities "
"you're setting don't overlap.") "you're setting don't overlap.")
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
"Doc.to_json() instead or write your own function.")
E106 = ("Can't find doc._.{attr} attribute specified in the underscore " E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
"settings: {opts}") "settings: {opts}")
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}") E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
"in favor of the pipe name `sentencizer`, which does the same "
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
E109 = ("Component '{name}' could not be run. Did you forget to " E109 = ("Component '{name}' could not be run. Did you forget to "
"call begin_training()?") "call begin_training()?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}") E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
@ -394,10 +337,6 @@ class Errors(object):
E125 = ("Unexpected value: {value}") E125 = ("Unexpected value: {value}")
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. " E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
"This is likely a bug in spaCy, so feel free to open an issue.") "This is likely a bug in spaCy, so feel free to open an issue.")
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
"arguments to exclude fields from being serialized or deserialized "
"is now deprecated. Please use the `exclude` argument instead. "
"For example: exclude=['{arg}'].")
E129 = ("Cannot write the label of an existing Span object because a Span " E129 = ("Cannot write the label of an existing Span object because a Span "
"is a read-only view of the underlying Token objects stored in the " "is a read-only view of the underlying Token objects stored in the "
"Doc. Instead, create a new Span object and specify the `label` " "Doc. Instead, create a new Span object and specify the `label` "
@ -489,9 +428,6 @@ class Errors(object):
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a " E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
"Lemmatizer, initialize the class directly. See the docs for " "Lemmatizer, initialize the class directly. See the docs for "
"details: https://spacy.io/api/lemmatizer") "details: https://spacy.io/api/lemmatizer")
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
"Lookups containing the lemmatization tables. See the docs for "
"details: https://spacy.io/api/lemmatizer#init")
E175 = ("Can't remove rule for unknown match pattern ID: {key}") E175 = ("Can't remove rule for unknown match pattern ID: {key}")
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
E177 = ("Ill-formed IOB input detected: {tag}") E177 = ("Ill-formed IOB input detected: {tag}")

View File

@ -8,7 +8,7 @@ from copy import copy, deepcopy
from pathlib import Path from pathlib import Path
import warnings import warnings
from thinc.api import get_current_ops, Config from thinc.api import get_current_ops, Config, require_gpu
import srsly import srsly
import multiprocessing as mp import multiprocessing as mp
from itertools import chain, cycle from itertools import chain, cycle
@ -233,32 +233,6 @@ class Language(object):
def config(self): def config(self):
return self._config return self._config
# Conveniences to access pipeline components
# Shouldn't be used anymore!
@property
def tagger(self):
return self.get_pipe("tagger")
@property
def parser(self):
return self.get_pipe("parser")
@property
def entity(self):
return self.get_pipe("ner")
@property
def linker(self):
return self.get_pipe("entity_linker")
@property
def senter(self):
return self.get_pipe("senter")
@property
def matcher(self):
return self.get_pipe("matcher")
@property @property
def pipe_names(self): def pipe_names(self):
"""Get names of available pipeline components. """Get names of available pipeline components.
@ -314,10 +288,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#create_pipe DOCS: https://spacy.io/api/language#create_pipe
""" """
if name not in self.factories: if name not in self.factories:
if name == "sbd": raise KeyError(Errors.E002.format(name=name))
raise KeyError(Errors.E108.format(name=name))
else:
raise KeyError(Errors.E002.format(name=name))
factory = self.factories[name] factory = self.factories[name]
# transform the model's config to an actual Model # transform the model's config to an actual Model
@ -661,7 +632,7 @@ class Language(object):
_ = self.vocab[word] # noqa: F841 _ = self.vocab[word] # noqa: F841
if cfg.get("device", -1) >= 0: if cfg.get("device", -1) >= 0:
util.use_gpu(cfg["device"]) require_gpu(cfg["device"])
if self.vocab.vectors.data.shape[1] >= 1: if self.vocab.vectors.data.shape[1] >= 1:
ops = get_current_ops() ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
@ -691,7 +662,7 @@ class Language(object):
on, and call nlp.rehearse() with a batch of Example objects. on, and call nlp.rehearse() with a batch of Example objects.
""" """
if cfg.get("device", -1) >= 0: if cfg.get("device", -1) >= 0:
util.use_gpu(cfg["device"]) require_gpu(cfg["device"])
ops = get_current_ops() ops = get_current_ops()
if self.vocab.vectors.data.shape[1] >= 1: if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
@ -782,7 +753,6 @@ class Language(object):
self, self,
texts, texts,
as_tuples=False, as_tuples=False,
n_threads=-1,
batch_size=1000, batch_size=1000,
disable=[], disable=[],
cleanup=False, cleanup=False,
@ -807,8 +777,6 @@ class Language(object):
DOCS: https://spacy.io/api/language#pipe DOCS: https://spacy.io/api/language#pipe
""" """
if n_threads != -1:
warnings.warn(Warnings.W016, DeprecationWarning)
if n_process == -1: if n_process == -1:
n_process = mp.cpu_count() n_process = mp.cpu_count()
if as_tuples: if as_tuples:
@ -935,7 +903,7 @@ class Language(object):
if hasattr(proc2, "model"): if hasattr(proc2, "model"):
proc1.find_listeners(proc2.model) proc1.find_listeners(proc2.model)
def to_disk(self, path, exclude=tuple(), disable=None): def to_disk(self, path, exclude=tuple()):
"""Save the current state to a directory. If a model is loaded, this """Save the current state to a directory. If a model is loaded, this
will include the model. will include the model.
@ -945,9 +913,6 @@ class Language(object):
DOCS: https://spacy.io/api/language#to_disk DOCS: https://spacy.io/api/language#to_disk
""" """
if disable is not None:
warnings.warn(Warnings.W014, DeprecationWarning)
exclude = disable
path = util.ensure_path(path) path = util.ensure_path(path)
serializers = {} serializers = {}
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
@ -966,7 +931,7 @@ class Language(object):
serializers["vocab"] = lambda p: self.vocab.to_disk(p) serializers["vocab"] = lambda p: self.vocab.to_disk(p)
util.to_disk(path, serializers, exclude) util.to_disk(path, serializers, exclude)
def from_disk(self, path, exclude=tuple(), disable=None): def from_disk(self, path, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the returns it. If the saved `Language` object contains a model, the
model will be loaded. model will be loaded.
@ -991,9 +956,6 @@ class Language(object):
self.vocab.from_disk(path) self.vocab.from_disk(path)
_fix_pretrained_vectors_name(self) _fix_pretrained_vectors_name(self)
if disable is not None:
warnings.warn(Warnings.W014, DeprecationWarning)
exclude = disable
path = util.ensure_path(path) path = util.ensure_path(path)
deserializers = {} deserializers = {}
@ -1020,7 +982,7 @@ class Language(object):
self._link_components() self._link_components()
return self return self
def to_bytes(self, exclude=tuple(), disable=None, **kwargs): def to_bytes(self, exclude=tuple()):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
exclude (list): Names of components or serialization fields to exclude. exclude (list): Names of components or serialization fields to exclude.
@ -1028,9 +990,6 @@ class Language(object):
DOCS: https://spacy.io/api/language#to_bytes DOCS: https://spacy.io/api/language#to_bytes
""" """
if disable is not None:
warnings.warn(Warnings.W014, DeprecationWarning)
exclude = disable
serializers = {} serializers = {}
serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["vocab"] = lambda: self.vocab.to_bytes()
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
@ -1042,10 +1001,9 @@ class Language(object):
if not hasattr(proc, "to_bytes"): if not hasattr(proc, "to_bytes"):
continue continue
serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"]) serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs): def from_bytes(self, bytes_data, exclude=tuple()):
"""Load state from a binary string. """Load state from a binary string.
bytes_data (bytes): The data to load from. bytes_data (bytes): The data to load from.
@ -1066,9 +1024,6 @@ class Language(object):
self.vocab.from_bytes(b) self.vocab.from_bytes(b)
_fix_pretrained_vectors_name(self) _fix_pretrained_vectors_name(self)
if disable is not None:
warnings.warn(Warnings.W014, DeprecationWarning)
exclude = disable
deserializers = {} deserializers = {}
deserializers["config.cfg"] = lambda b: self.config.from_bytes(b) deserializers["config.cfg"] = lambda b: self.config.from_bytes(b)
deserializers["meta.json"] = deserialize_meta deserializers["meta.json"] = deserialize_meta
@ -1084,7 +1039,6 @@ class Language(object):
deserializers[name] = lambda b, proc=proc: proc.from_bytes( deserializers[name] = lambda b, proc=proc: proc.from_bytes(
b, exclude=["vocab"] b, exclude=["vocab"]
) )
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_bytes(bytes_data, deserializers, exclude) util.from_bytes(bytes_data, deserializers, exclude)
self._link_components() self._link_components()
return self return self
@ -1206,7 +1160,7 @@ class DisabledPipes(list):
def _pipe(examples, proc, kwargs): def _pipe(examples, proc, kwargs):
# We added some args for pipe that __call__ doesn't expect. # We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs) kwargs = dict(kwargs)
for arg in ["n_threads", "batch_size"]: for arg in ["batch_size"]:
if arg in kwargs: if arg in kwargs:
kwargs.pop(arg) kwargs.pop(arg)
for eg in examples: for eg in examples:

View File

@ -1,5 +1,4 @@
from .errors import Errors from .errors import Errors
from .lookups import Lookups
from .parts_of_speech import NAMES as UPOS_NAMES from .parts_of_speech import NAMES as UPOS_NAMES
@ -15,15 +14,13 @@ class Lemmatizer(object):
def load(cls, *args, **kwargs): def load(cls, *args, **kwargs):
raise NotImplementedError(Errors.E172) raise NotImplementedError(Errors.E172)
def __init__(self, lookups, *args, **kwargs): def __init__(self, lookups):
"""Initialize a Lemmatizer. """Initialize a Lemmatizer.
lookups (Lookups): The lookups object containing the (optional) tables lookups (Lookups): The lookups object containing the (optional) tables
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
RETURNS (Lemmatizer): The newly constructed object. RETURNS (Lemmatizer): The newly constructed object.
""" """
if args or kwargs or not isinstance(lookups, Lookups):
raise ValueError(Errors.E173)
self.lookups = lookups self.lookups = lookups
def __call__(self, string, univ_pos, morphology=None): def __call__(self, string, univ_pos, morphology=None):

View File

@ -174,8 +174,7 @@ cdef class Matcher:
return default return default
return (self._callbacks[key], self._patterns[key]) return (self._callbacks[key], self._patterns[key])
def pipe(self, docs, batch_size=1000, n_threads=-1, return_matches=False, def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
as_tuples=False):
"""Match a stream of documents, yielding them in turn. """Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents. docs (iterable): A stream of documents.
@ -188,9 +187,6 @@ cdef class Matcher:
be a sequence of ((doc, matches), context) tuples. be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order. YIELDS (Doc): Documents, in order.
""" """
if n_threads != -1:
warnings.warn(Warnings.W016, DeprecationWarning)
if as_tuples: if as_tuples:
for doc, context in docs: for doc, context in docs:
matches = self(doc) matches = self(doc)

View File

@ -26,7 +26,7 @@ cdef class PhraseMatcher:
Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com) Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com)
""" """
def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): def __init__(self, Vocab vocab, attr="ORTH", validate=False):
"""Initialize the PhraseMatcher. """Initialize the PhraseMatcher.
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
@ -36,8 +36,6 @@ cdef class PhraseMatcher:
DOCS: https://spacy.io/api/phrasematcher#init DOCS: https://spacy.io/api/phrasematcher#init
""" """
if max_length != 0:
warnings.warn(Warnings.W010, DeprecationWarning)
self.vocab = vocab self.vocab = vocab
self._callbacks = {} self._callbacks = {}
self._docs = {} self._docs = {}
@ -287,8 +285,7 @@ cdef class PhraseMatcher:
current_node = self.c_map current_node = self.c_map
idx += 1 idx += 1
def pipe(self, stream, batch_size=1000, n_threads=-1, return_matches=False, def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
as_tuples=False):
"""Match a stream of documents, yielding them in turn. """Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents. docs (iterable): A stream of documents.
@ -303,8 +300,6 @@ cdef class PhraseMatcher:
DOCS: https://spacy.io/api/phrasematcher#pipe DOCS: https://spacy.io/api/phrasematcher#pipe
""" """
if n_threads != -1:
warnings.warn(Warnings.W016, DeprecationWarning)
if as_tuples: if as_tuples:
for doc, context in stream: for doc, context in stream:
matches = self(doc) matches = self(doc)

View File

@ -120,15 +120,14 @@ class Morphologizer(Tagger):
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores return float(loss), d_scores
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, exclude=tuple()):
serialize = {} serialize = {}
serialize["model"] = self.model.to_bytes serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes serialize["vocab"] = self.vocab.to_bytes
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def from_bytes(self, bytes_data, exclude=tuple()):
def load_model(b): def load_model(b):
try: try:
self.model.from_bytes(b) self.model.from_bytes(b)
@ -140,20 +139,18 @@ class Morphologizer(Tagger):
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
"model": lambda b: load_model(b), "model": lambda b: load_model(b),
} }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
return self return self
def to_disk(self, path, exclude=tuple(), **kwargs): def to_disk(self, path, exclude=tuple()):
serialize = { serialize = {
"vocab": lambda p: self.vocab.to_disk(p), "vocab": lambda p: self.vocab.to_disk(p),
"model": lambda p: p.open("wb").write(self.model.to_bytes()), "model": lambda p: p.open("wb").write(self.model.to_bytes()),
"cfg": lambda p: srsly.write_json(p, self.cfg), "cfg": lambda p: srsly.write_json(p, self.cfg),
} }
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude=tuple(), **kwargs): def from_disk(self, path, exclude=tuple()):
def load_model(p): def load_model(p):
with p.open("rb") as file_: with p.open("rb") as file_:
try: try:
@ -166,6 +163,5 @@ class Morphologizer(Tagger):
"cfg": lambda p: self.cfg.update(_load_cfg(p)), "cfg": lambda p: self.cfg.update(_load_cfg(p)),
"model": load_model, "model": load_model,
} }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self

View File

@ -66,7 +66,7 @@ class Pipe(object):
self.set_annotations([doc], predictions) self.set_annotations([doc], predictions)
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128):
"""Apply the pipe to a stream of documents. """Apply the pipe to a stream of documents.
Both __call__ and pipe should delegate to the `predict()` Both __call__ and pipe should delegate to the `predict()`
@ -151,7 +151,7 @@ class Pipe(object):
with self.model.use_params(params): with self.model.use_params(params):
yield yield
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, exclude=tuple()):
"""Serialize the pipe to a bytestring. """Serialize the pipe to a bytestring.
exclude (list): String names of serialization fields to exclude. exclude (list): String names of serialization fields to exclude.
@ -162,10 +162,9 @@ class Pipe(object):
serialize["model"] = self.model.to_bytes serialize["model"] = self.model.to_bytes
if hasattr(self, "vocab"): if hasattr(self, "vocab"):
serialize["vocab"] = self.vocab.to_bytes serialize["vocab"] = self.vocab.to_bytes
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def from_bytes(self, bytes_data, exclude=tuple()):
"""Load the pipe from a bytestring.""" """Load the pipe from a bytestring."""
def load_model(b): def load_model(b):
@ -179,20 +178,18 @@ class Pipe(object):
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
deserialize["model"] = load_model deserialize["model"] = load_model
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
return self return self
def to_disk(self, path, exclude=tuple(), **kwargs): def to_disk(self, path, exclude=tuple()):
"""Serialize the pipe to disk.""" """Serialize the pipe to disk."""
serialize = {} serialize = {}
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["model"] = lambda p: self.model.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude=tuple(), **kwargs): def from_disk(self, path, exclude=tuple()):
"""Load the pipe from disk.""" """Load the pipe from disk."""
def load_model(p): def load_model(p):
@ -205,7 +202,6 @@ class Pipe(object):
deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
deserialize["model"] = load_model deserialize["model"] = load_model
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self
@ -232,7 +228,7 @@ class Tagger(Pipe):
self.set_annotations([doc], tags) self.set_annotations([doc], tags)
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128):
for docs in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
tag_ids = self.predict(docs) tag_ids = self.predict(docs)
self.set_annotations(docs, tag_ids) self.set_annotations(docs, tag_ids)
@ -421,17 +417,16 @@ class Tagger(Pipe):
with self.model.use_params(params): with self.model.use_params(params):
yield yield
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, exclude=tuple()):
serialize = {} serialize = {}
serialize["model"] = self.model.to_bytes serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes serialize["vocab"] = self.vocab.to_bytes
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def from_bytes(self, bytes_data, exclude=tuple()):
def load_model(b): def load_model(b):
try: try:
self.model.from_bytes(b) self.model.from_bytes(b)
@ -451,11 +446,10 @@ class Tagger(Pipe):
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
"model": lambda b: load_model(b), "model": lambda b: load_model(b),
} }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
return self return self
def to_disk(self, path, exclude=tuple(), **kwargs): def to_disk(self, path, exclude=tuple()):
tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
serialize = { serialize = {
"vocab": lambda p: self.vocab.to_disk(p), "vocab": lambda p: self.vocab.to_disk(p),
@ -463,10 +457,9 @@ class Tagger(Pipe):
"model": lambda p: self.model.to_disk(p), "model": lambda p: self.model.to_disk(p),
"cfg": lambda p: srsly.write_json(p, self.cfg), "cfg": lambda p: srsly.write_json(p, self.cfg),
} }
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude=tuple(), **kwargs): def from_disk(self, path, exclude=tuple()):
def load_model(p): def load_model(p):
with p.open("rb") as file_: with p.open("rb") as file_:
try: try:
@ -487,7 +480,6 @@ class Tagger(Pipe):
"tag_map": load_tag_map, "tag_map": load_tag_map,
"model": load_model, "model": load_model,
} }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self
@ -566,15 +558,14 @@ class SentenceRecognizer(Tagger):
def add_label(self, label, values=None): def add_label(self, label, values=None):
raise NotImplementedError raise NotImplementedError
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, exclude=tuple()):
serialize = {} serialize = {}
serialize["model"] = self.model.to_bytes serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes serialize["vocab"] = self.vocab.to_bytes
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def from_bytes(self, bytes_data, exclude=tuple()):
def load_model(b): def load_model(b):
try: try:
self.model.from_bytes(b) self.model.from_bytes(b)
@ -586,20 +577,18 @@ class SentenceRecognizer(Tagger):
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
"model": lambda b: load_model(b), "model": lambda b: load_model(b),
} }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
return self return self
def to_disk(self, path, exclude=tuple(), **kwargs): def to_disk(self, path, exclude=tuple()):
serialize = { serialize = {
"vocab": lambda p: self.vocab.to_disk(p), "vocab": lambda p: self.vocab.to_disk(p),
"model": lambda p: p.open("wb").write(self.model.to_bytes()), "model": lambda p: p.open("wb").write(self.model.to_bytes()),
"cfg": lambda p: srsly.write_json(p, self.cfg), "cfg": lambda p: srsly.write_json(p, self.cfg),
} }
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude=tuple(), **kwargs): def from_disk(self, path, exclude=tuple()):
def load_model(p): def load_model(p):
with p.open("rb") as file_: with p.open("rb") as file_:
try: try:
@ -612,7 +601,6 @@ class SentenceRecognizer(Tagger):
"cfg": lambda p: self.cfg.update(_load_cfg(p)), "cfg": lambda p: self.cfg.update(_load_cfg(p)),
"model": load_model, "model": load_model,
} }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self
@ -825,7 +813,7 @@ class TextCategorizer(Pipe):
def labels(self, value): def labels(self, value):
self.cfg["labels"] = tuple(value) self.cfg["labels"] = tuple(value)
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128):
for docs in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
scores, tensors = self.predict(docs) scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors) self.set_annotations(docs, scores, tensors=tensors)
@ -1198,7 +1186,7 @@ class EntityLinker(Pipe):
self.set_annotations([doc], kb_ids, tensors=tensors) self.set_annotations([doc], kb_ids, tensors=tensors)
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128):
for docs in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
kb_ids, tensors = self.predict(docs) kb_ids, tensors = self.predict(docs)
self.set_annotations(docs, kb_ids, tensors=tensors) self.set_annotations(docs, kb_ids, tensors=tensors)
@ -1309,17 +1297,16 @@ class EntityLinker(Pipe):
for token in ent: for token in ent:
token.ent_kb_id_ = kb_id token.ent_kb_id_ = kb_id
def to_disk(self, path, exclude=tuple(), **kwargs): def to_disk(self, path, exclude=tuple()):
serialize = {} serialize = {}
self.cfg["entity_width"] = self.kb.entity_vector_length self.cfg["entity_width"] = self.kb.entity_vector_length
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["kb"] = lambda p: self.kb.dump(p) serialize["kb"] = lambda p: self.kb.dump(p)
serialize["model"] = lambda p: self.model.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude=tuple(), **kwargs): def from_disk(self, path, exclude=tuple()):
def load_model(p): def load_model(p):
try: try:
self.model.from_bytes(p.open("rb").read()) self.model.from_bytes(p.open("rb").read())
@ -1335,7 +1322,6 @@ class EntityLinker(Pipe):
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
deserialize["kb"] = load_kb deserialize["kb"] = load_kb
deserialize["model"] = load_model deserialize["model"] = load_model
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self
@ -1411,7 +1397,7 @@ class Sentencizer(Pipe):
doc[start].is_sent_start = True doc[start].is_sent_start = True
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128):
for docs in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
predictions = self.predict(docs) predictions = self.predict(docs)
if isinstance(predictions, tuple) and len(tuple) == 2: if isinstance(predictions, tuple) and len(tuple) == 2:

View File

@ -51,11 +51,10 @@ class Tok2Vec(Pipe):
self.set_annotations([doc], tokvecses) self.set_annotations([doc], tokvecses)
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128):
"""Process `Doc` objects as a stream. """Process `Doc` objects as a stream.
stream (iterator): A sequence of `Doc` objects to process. stream (iterator): A sequence of `Doc` objects to process.
batch_size (int): Number of `Doc` objects to group. batch_size (int): Number of `Doc` objects to group.
n_threads (int): Number of threads.
YIELDS (iterator): A sequence of `Doc` objects, in order of input. YIELDS (iterator): A sequence of `Doc` objects, in order of input.
""" """
for docs in minibatch(stream, batch_size): for docs in minibatch(stream, batch_size):

View File

@ -157,7 +157,7 @@ cdef class Parser:
self.set_annotations([doc], states, tensors=None) self.set_annotations([doc], states, tensors=None)
return doc return doc
def pipe(self, docs, int batch_size=256, int n_threads=-1): def pipe(self, docs, int batch_size=256):
"""Process a stream of documents. """Process a stream of documents.
stream: The sequence of documents to process. stream: The sequence of documents to process.
@ -461,24 +461,22 @@ cdef class Parser:
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
return sgd return sgd
def to_disk(self, path, exclude=tuple(), **kwargs): def to_disk(self, path, exclude=tuple()):
serializers = { serializers = {
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
'vocab': lambda p: self.vocab.to_disk(p), 'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]), 'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
'cfg': lambda p: srsly.write_json(p, self.cfg) 'cfg': lambda p: srsly.write_json(p, self.cfg)
} }
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
util.to_disk(path, serializers, exclude) util.to_disk(path, serializers, exclude)
def from_disk(self, path, exclude=tuple(), **kwargs): def from_disk(self, path, exclude=tuple()):
deserializers = { deserializers = {
'vocab': lambda p: self.vocab.from_disk(p), 'vocab': lambda p: self.vocab.from_disk(p),
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]), 'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
'cfg': lambda p: self.cfg.update(srsly.read_json(p)), 'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
'model': lambda p: None, 'model': lambda p: None,
} }
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_disk(path, deserializers, exclude) util.from_disk(path, deserializers, exclude)
if 'model' not in exclude: if 'model' not in exclude:
path = util.ensure_path(path) path = util.ensure_path(path)
@ -491,24 +489,22 @@ cdef class Parser:
raise ValueError(Errors.E149) raise ValueError(Errors.E149)
return self return self
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, exclude=tuple()):
serializers = { serializers = {
"model": lambda: (self.model.to_bytes()), "model": lambda: (self.model.to_bytes()),
"vocab": lambda: self.vocab.to_bytes(), "vocab": lambda: self.vocab.to_bytes(),
"moves": lambda: self.moves.to_bytes(exclude=["strings"]), "moves": lambda: self.moves.to_bytes(exclude=["strings"]),
"cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
} }
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def from_bytes(self, bytes_data, exclude=tuple()):
deserializers = { deserializers = {
"vocab": lambda b: self.vocab.from_bytes(b), "vocab": lambda b: self.vocab.from_bytes(b),
"moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
"model": lambda b: None, "model": lambda b: None,
} }
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude: if 'model' not in exclude:
if 'model' in msg: if 'model' in msg:

View File

@ -60,7 +60,7 @@ cdef class TransitionSystem:
states.append(state) states.append(state)
offset += len(doc) offset += len(doc)
return states return states
def get_oracle_sequence(self, Example example, _debug=False): def get_oracle_sequence(self, Example example, _debug=False):
states, golds, _ = self.init_gold_batch([example]) states, golds, _ = self.init_gold_batch([example])
if not states: if not states:
@ -227,22 +227,20 @@ cdef class TransitionSystem:
self.from_bytes(byte_data, **kwargs) self.from_bytes(byte_data, **kwargs)
return self return self
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, exclude=tuple()):
transitions = [] transitions = []
serializers = { serializers = {
'moves': lambda: srsly.json_dumps(self.labels), 'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes() 'strings': lambda: self.strings.to_bytes()
} }
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def from_bytes(self, bytes_data, exclude=tuple()):
labels = {} labels = {}
deserializers = { deserializers = {
'moves': lambda b: labels.update(srsly.json_loads(b)), 'moves': lambda b: labels.update(srsly.json_loads(b)),
'strings': lambda b: self.strings.from_bytes(b) 'strings': lambda b: self.strings.from_bytes(b)
} }
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
self.initialize_actions(labels) self.initialize_actions(labels)
return self return self

View File

@ -66,8 +66,6 @@ def test_spans_string_fn(doc):
span = doc[0:4] span = doc[0:4]
assert len(span) == 4 assert len(span) == 4
assert span.text == "This is a sentence" assert span.text == "This is a sentence"
assert span.upper_ == "THIS IS A SENTENCE"
assert span.lower_ == "this is a sentence"
def test_spans_root2(en_tokenizer): def test_spans_root2(en_tokenizer):

View File

@ -1,13 +1,11 @@
import pytest import pytest
from thinc.api import Adam from thinc.api import Adam, fix_random_seed
from spacy.attrs import NORM from spacy.attrs import NORM
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.gold import Example from spacy.gold import Example
from spacy.pipeline.defaults import default_parser, default_ner from spacy.pipeline.defaults import default_parser, default_ner
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.pipeline import DependencyParser, EntityRecognizer
from spacy.util import fix_random_seed
@pytest.fixture @pytest.fixture

View File

@ -1,18 +1,18 @@
import pytest import pytest
import random import random
import numpy.random import numpy.random
from thinc.api import fix_random_seed
from spacy import util from spacy import util
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.pipeline import TextCategorizer from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.util import fix_random_seed from spacy.pipeline.defaults import default_tok2vec
from ..util import make_tempdir from ..util import make_tempdir
from spacy.pipeline.defaults import default_tok2vec
from ...gold import Example from ...gold import Example
TRAIN_DATA = [ TRAIN_DATA = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),

View File

@ -9,7 +9,6 @@ from spacy.vocab import Vocab
from spacy.attrs import ENT_IOB, ENT_TYPE from spacy.attrs import ENT_IOB, ENT_TYPE
from spacy.compat import pickle from spacy.compat import pickle
from spacy import displacy from spacy import displacy
from spacy.util import decaying
import numpy import numpy
from spacy.vectors import Vectors from spacy.vectors import Vectors
@ -216,21 +215,6 @@ def test_issue3345():
assert ner.moves.is_valid(state, "B-GPE") assert ner.moves.is_valid(state, "B-GPE")
def test_issue3410():
texts = ["Hello world", "This is a test"]
nlp = English()
matcher = Matcher(nlp.vocab)
phrasematcher = PhraseMatcher(nlp.vocab)
with pytest.deprecated_call():
docs = list(nlp.pipe(texts, n_threads=4))
with pytest.deprecated_call():
docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
with pytest.deprecated_call():
list(matcher.pipe(docs, n_threads=4))
with pytest.deprecated_call():
list(phrasematcher.pipe(docs, n_threads=4))
def test_issue3412(): def test_issue3412():
data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
vectors = Vectors(data=data, keys=["A", "B", "C"]) vectors = Vectors(data=data, keys=["A", "B", "C"])
@ -240,16 +224,6 @@ def test_issue3412():
assert best_rows[0] == 2 assert best_rows[0] == 2
def test_issue3447():
sizes = decaying(10.0, 1.0, 0.5)
size = next(sizes)
assert size == 10.0
size = next(sizes)
assert size == 10.0 - 0.5
size = next(sizes)
assert size == 10.0 - 0.5 - 0.5
@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot") @pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
def test_issue3449(): def test_issue3449():
nlp = English() nlp = English()

View File

@ -1,6 +1,7 @@
import spacy import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example from spacy.gold import Example
from spacy.util import minibatch, compounding
def test_issue3611(): def test_issue3611():

View File

@ -1,6 +1,7 @@
import spacy import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example from spacy.gold import Example
from spacy.util import minibatch, compounding
def test_issue4030(): def test_issue4030():

View File

@ -1,6 +1,7 @@
from spacy.gold import Example from spacy.gold import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.util import minibatch, compounding from spacy.util import minibatch
from thinc.api import compounding
import pytest import pytest

View File

@ -52,10 +52,6 @@ def test_serialize_doc_exclude(en_vocab):
assert not new_doc.user_data assert not new_doc.user_data
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"])) new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
assert not new_doc.user_data assert not new_doc.user_data
with pytest.raises(ValueError):
doc.to_bytes(user_data=False)
with pytest.raises(ValueError):
Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)
def test_serialize_doc_bin(): def test_serialize_doc_bin():

View File

@ -62,7 +62,3 @@ def test_serialize_language_exclude(meta_data):
assert not new_nlp.meta["name"] == name assert not new_nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"])) new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
assert not new_nlp.meta["name"] == name assert not new_nlp.meta["name"] == name
with pytest.raises(ValueError):
nlp.to_bytes(meta=False)
with pytest.raises(ValueError):
Language().from_bytes(nlp.to_bytes(), meta=False)

View File

@ -127,10 +127,6 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
parser.to_bytes(exclude=["cfg"]), exclude=["vocab"] parser.to_bytes(exclude=["cfg"]), exclude=["vocab"]
) )
assert "foo" not in new_parser.cfg assert "foo" not in new_parser.cfg
with pytest.raises(ValueError):
parser.to_bytes(cfg=False, exclude=["vocab"])
with pytest.raises(ValueError):
get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]), cfg=False)
def test_serialize_sentencerecognizer(en_vocab): def test_serialize_sentencerecognizer(en_vocab):

View File

@ -5,9 +5,9 @@ from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example from spacy.gold.example import Example
from spacy.gold.converters import json2docs from spacy.gold.converters import json2docs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.syntax.nonproj import is_nonproj_tree
from spacy.tokens import Doc, DocBin from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, compounding, minibatch from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding
import pytest import pytest
import srsly import srsly
@ -511,9 +511,7 @@ def test_make_orth_variants(doc):
# due to randomness, test only that this runs with no errors for now # due to randomness, test only that this runs with no errors for now
train_example = next(goldcorpus.train_dataset(nlp)) train_example = next(goldcorpus.train_dataset(nlp))
variant_example = make_orth_variants_example( make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
nlp, train_example, orth_variant_level=0.2
)
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -26,8 +26,6 @@ cdef class Tokenizer:
cdef int _property_init_count cdef int _property_init_count
cdef int _property_init_max cdef int _property_init_max
cpdef Doc tokens_from_list(self, list strings)
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases) cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1 cdef int _apply_special_cases(self, Doc doc) except -1
cdef void _filter_special_spans(self, vector[SpanC] &original, cdef void _filter_special_spans(self, vector[SpanC] &original,

View File

@ -140,10 +140,6 @@ cdef class Tokenizer:
self.url_match) self.url_match)
return (self.__class__, args, None, None) return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
warnings.warn(Warnings.W002, DeprecationWarning)
return Doc(self.vocab, words=strings)
def __call__(self, unicode string): def __call__(self, unicode string):
"""Tokenize a string. """Tokenize a string.
@ -218,7 +214,7 @@ cdef class Tokenizer:
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
return doc return doc
def pipe(self, texts, batch_size=1000, n_threads=-1): def pipe(self, texts, batch_size=1000):
"""Tokenize a stream of texts. """Tokenize a stream of texts.
texts: A sequence of unicode texts. texts: A sequence of unicode texts.
@ -228,8 +224,6 @@ cdef class Tokenizer:
DOCS: https://spacy.io/api/tokenizer#pipe DOCS: https://spacy.io/api/tokenizer#pipe
""" """
if n_threads != -1:
warnings.warn(Warnings.W016, DeprecationWarning)
for text in texts: for text in texts:
yield self(text) yield self(text)
@ -746,7 +740,7 @@ cdef class Tokenizer:
self.from_bytes(bytes_data, **kwargs) self.from_bytes(bytes_data, **kwargs)
return self return self
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, exclude=tuple()):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
exclude (list): String names of serialization fields to exclude. exclude (list): String names of serialization fields to exclude.
@ -763,10 +757,9 @@ cdef class Tokenizer:
"url_match": lambda: _get_regex_pattern(self.url_match), "url_match": lambda: _get_regex_pattern(self.url_match),
"exceptions": lambda: dict(sorted(self._rules.items())) "exceptions": lambda: dict(sorted(self._rules.items()))
} }
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def from_bytes(self, bytes_data, exclude=tuple()):
"""Load state from a binary string. """Load state from a binary string.
bytes_data (bytes): The data to load from. bytes_data (bytes): The data to load from.
@ -785,7 +778,6 @@ cdef class Tokenizer:
"url_match": lambda b: data.setdefault("url_match", b), "url_match": lambda b: data.setdefault("url_match", b),
"exceptions": lambda b: data.setdefault("rules", b) "exceptions": lambda b: data.setdefault("rules", b)
} }
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
if "prefix_search" in data and isinstance(data["prefix_search"], str): if "prefix_search" in data and isinstance(data["prefix_search"], str):
self.prefix_search = re.compile(data["prefix_search"]).search self.prefix_search = re.compile(data["prefix_search"]).search

View File

@ -1081,9 +1081,6 @@ cdef class Doc:
"cats": lambda: self.cats, "cats": lambda: self.cats,
"has_unknown_spaces": lambda: self.has_unknown_spaces "has_unknown_spaces": lambda: self.has_unknown_spaces
} }
for key in kwargs:
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
raise ValueError(Errors.E128.format(arg=key))
if "user_data" not in exclude and self.user_data: if "user_data" not in exclude and self.user_data:
user_data_keys, user_data_values = list(zip(*self.user_data.items())) user_data_keys, user_data_values = list(zip(*self.user_data.items()))
if "user_data_keys" not in exclude: if "user_data_keys" not in exclude:
@ -1114,9 +1111,6 @@ cdef class Doc:
"user_data_values": lambda b: None, "user_data_values": lambda b: None,
"has_unknown_spaces": lambda b: None "has_unknown_spaces": lambda b: None
} }
for key in kwargs:
if key in deserializers or key in ("user_data",):
raise ValueError(Errors.E128.format(arg=key))
# Msgpack doesn't distinguish between lists and tuples, which is # Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within # vexing for user data. As a best guess, we *know* that within
# keys, we must have tuples. In values we just have to hope # keys, we must have tuples. In values we just have to hope

View File

@ -686,21 +686,6 @@ cdef class Span:
"""RETURNS (str): The span's lemma.""" """RETURNS (str): The span's lemma."""
return " ".join([t.lemma_ for t in self]).strip() return " ".join([t.lemma_ for t in self]).strip()
@property
def upper_(self):
"""Deprecated. Use `Span.text.upper()` instead."""
return "".join([t.text_with_ws.upper() for t in self]).strip()
@property
def lower_(self):
"""Deprecated. Use `Span.text.lower()` instead."""
return "".join([t.text_with_ws.lower() for t in self]).strip()
@property
def string(self):
"""Deprecated: Use `Span.text_with_ws` instead."""
return "".join([t.text_with_ws for t in self])
property label_: property label_:
"""RETURNS (str): The span's label.""" """RETURNS (str): The span's label."""
def __get__(self): def __get__(self):

View File

@ -237,11 +237,6 @@ cdef class Token:
index into tables, e.g. for word vectors.""" index into tables, e.g. for word vectors."""
return self.c.lex.id return self.c.lex.id
@property
def string(self):
"""Deprecated: Use Token.text_with_ws instead."""
return self.text_with_ws
@property @property
def text(self): def text(self):
"""RETURNS (str): The original verbatim text of the token.""" """RETURNS (str): The original verbatim text of the token."""

View File

@ -4,9 +4,8 @@ import importlib
import importlib.util import importlib.util
import re import re
from pathlib import Path from pathlib import Path
import random
import thinc import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config from thinc.api import NumpyOps, get_current_ops, Adam, Config
import functools import functools
import itertools import itertools
import numpy.random import numpy.random
@ -34,6 +33,13 @@ try: # Python 3.8
except ImportError: except ImportError:
import importlib_metadata import importlib_metadata
# These are functions that were previously (v2.x) available from spacy.util
# and have since moved to Thinc. We're importing them here so people's code
# doesn't break, but they should always be imported from Thinc from now on,
# not from spacy.util.
from thinc.api import fix_random_seed, compounding, decaying # noqa: F401
from .symbols import ORTH from .symbols import ORTH
from .compat import cupy, CudaStream, is_windows from .compat import cupy, CudaStream, is_windows
from .errors import Errors, Warnings from .errors import Errors, Warnings
@ -595,15 +601,8 @@ def compile_prefix_regex(entries):
entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES. entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES.
RETURNS (regex object): The regex object. to be used for Tokenizer.prefix_search. RETURNS (regex object): The regex object. to be used for Tokenizer.prefix_search.
""" """
if "(" in entries: expression = "|".join(["^" + piece for piece in entries if piece.strip()])
# Handle deprecated data return re.compile(expression)
expression = "|".join(
["^" + re.escape(piece) for piece in entries if piece.strip()]
)
return re.compile(expression)
else:
expression = "|".join(["^" + piece for piece in entries if piece.strip()])
return re.compile(expression)
def compile_suffix_regex(entries): def compile_suffix_regex(entries):
@ -723,59 +722,6 @@ def minibatch(items, size=8):
yield list(batch) yield list(batch)
def compounding(start, stop, compound):
"""Yield an infinite series of compounding values. Each time the
generator is called, a value is produced by multiplying the previous
value by the compound rate.
EXAMPLE:
>>> sizes = compounding(1., 10., 1.5)
>>> assert next(sizes) == 1.
>>> assert next(sizes) == 1 * 1.5
>>> assert next(sizes) == 1.5 * 1.5
"""
def clip(value):
return max(value, stop) if (start > stop) else min(value, stop)
curr = float(start)
while True:
yield clip(curr)
curr *= compound
def stepping(start, stop, steps):
"""Yield an infinite series of values that step from a start value to a
final value over some number of steps. Each step is (stop-start)/steps.
After the final value is reached, the generator continues yielding that
value.
EXAMPLE:
>>> sizes = stepping(1., 200., 100)
>>> assert next(sizes) == 1.
>>> assert next(sizes) == 1 * (200.-1.) / 100
>>> assert next(sizes) == 1 + (200.-1.) / 100 + (200.-1.) / 100
"""
def clip(value):
return max(value, stop) if (start > stop) else min(value, stop)
curr = float(start)
while True:
yield clip(curr)
curr += (stop - start) / steps
def decaying(start, stop, decay):
"""Yield an infinite series of linearly decaying values."""
curr = float(start)
while True:
yield max(curr, stop)
curr -= decay
def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
"""Create minibatches of roughly a given number of words. If any examples """Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by are longer than the specified batch length, they will appear in a batch by
@ -854,35 +800,6 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
yield batch yield batch
def itershuffle(iterable, bufsize=1000):
"""Shuffle an iterator. This works by holding `bufsize` items back
and yielding them sometime later. Obviously, this is not unbiased
but should be good enough for batching. Larger bufsize means less bias.
From https://gist.github.com/andres-erbsen/1307752
iterable (iterable): Iterator to shuffle.
bufsize (int): Items to hold back.
YIELDS (iterable): The shuffled iterator.
"""
iterable = iter(iterable)
buf = []
try:
while True:
for i in range(random.randint(1, bufsize - len(buf))):
buf.append(next(iterable))
random.shuffle(buf)
for i in range(random.randint(1, bufsize)):
if buf:
yield buf.pop()
else:
break
except StopIteration:
random.shuffle(buf)
while buf:
yield buf.pop()
raise StopIteration
def filter_spans(spans): def filter_spans(spans):
"""Filter a sequence of spans and remove duplicates or overlaps. Useful for """Filter a sequence of spans and remove duplicates or overlaps. Useful for
creating named entities (where one token can only be part of one entity) or creating named entities (where one token can only be part of one entity) or
@ -989,34 +906,6 @@ def escape_html(text):
return text return text
def use_gpu(gpu_id):
return require_gpu(gpu_id)
def fix_random_seed(seed=0):
random.seed(seed)
numpy.random.seed(seed)
if cupy is not None:
cupy.random.seed(seed)
def get_serialization_exclude(serializers, exclude, kwargs):
"""Helper function to validate serialization args and manage transition from
keyword arguments (pre v2.1) to exclude argument.
"""
exclude = list(exclude)
# Split to support file names like meta.json
options = [name.split(".")[0] for name in serializers]
for key, value in kwargs.items():
if key in ("vocab",) and value is False:
warnings.warn(Warnings.W015.format(arg=key), DeprecationWarning)
exclude.append(key)
elif key.split(".")[0] in options:
raise ValueError(Errors.E128.format(arg=key))
# TODO: user warning?
return exclude
def get_words_and_spaces(words, text): def get_words_and_spaces(words, text):
if "".join("".join(words).split()) != "".join(text.split()): if "".join("".join(words).split()) != "".join(text.split()):
raise ValueError(Errors.E194.format(text=text, words=words)) raise ValueError(Errors.E194.format(text=text, words=words))

View File

@ -426,7 +426,7 @@ cdef class Vocab:
orth = self.strings.add(orth) orth = self.strings.add(orth)
return orth in self.vectors return orth in self.vectors
def to_disk(self, path, exclude=tuple(), **kwargs): def to_disk(self, path, exclude=tuple()):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
@ -439,7 +439,6 @@ cdef class Vocab:
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
setters = ["strings", "vectors"] setters = ["strings", "vectors"]
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
if "strings" not in exclude: if "strings" not in exclude:
self.strings.to_disk(path / "strings.json") self.strings.to_disk(path / "strings.json")
if "vectors" not in "exclude" and self.vectors is not None: if "vectors" not in "exclude" and self.vectors is not None:
@ -449,7 +448,7 @@ cdef class Vocab:
if "lookups_extra" not in "exclude" and self.lookups_extra is not None: if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
self.lookups_extra.to_disk(path, filename="lookups_extra.bin") self.lookups_extra.to_disk(path, filename="lookups_extra.bin")
def from_disk(self, path, exclude=tuple(), **kwargs): def from_disk(self, path, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it.
@ -461,7 +460,6 @@ cdef class Vocab:
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
getters = ["strings", "vectors"] getters = ["strings", "vectors"]
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
if "strings" not in exclude: if "strings" not in exclude:
self.strings.from_disk(path / "strings.json") # TODO: add exclude? self.strings.from_disk(path / "strings.json") # TODO: add exclude?
if "vectors" not in exclude: if "vectors" not in exclude:
@ -481,7 +479,7 @@ cdef class Vocab:
self._by_orth = PreshMap() self._by_orth = PreshMap()
return self return self
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, exclude=tuple()):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
exclude (list): String names of serialization fields to exclude. exclude (list): String names of serialization fields to exclude.
@ -501,10 +499,9 @@ cdef class Vocab:
"lookups": lambda: self.lookups.to_bytes(), "lookups": lambda: self.lookups.to_bytes(),
"lookups_extra": lambda: self.lookups_extra.to_bytes() "lookups_extra": lambda: self.lookups_extra.to_bytes()
} }
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
return util.to_bytes(getters, exclude) return util.to_bytes(getters, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def from_bytes(self, bytes_data, exclude=tuple()):
"""Load state from a binary string. """Load state from a binary string.
bytes_data (bytes): The data to load from. bytes_data (bytes): The data to load from.
@ -526,7 +523,6 @@ cdef class Vocab:
"lookups": lambda b: self.lookups.from_bytes(b), "lookups": lambda b: self.lookups.from_bytes(b),
"lookups_extra": lambda b: self.lookups_extra.from_bytes(b) "lookups_extra": lambda b: self.lookups_extra.from_bytes(b)
} }
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
util.from_bytes(bytes_data, setters, exclude) util.from_bytes(bytes_data, setters, exclude)
if "lexeme_norm" in self.lookups: if "lexeme_norm" in self.lookups:
self.lex_attr_getters[NORM] = util.add_lookups( self.lex_attr_getters[NORM] = util.add_lookups(