mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Remove dead and/or deprecated code (#5710)
* Remove dead and/or deprecated code * Remove n_threads Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
fcbf899b08
commit
412dbb1f38
|
@ -25,9 +25,6 @@ config = registry
|
||||||
|
|
||||||
|
|
||||||
def load(name, **overrides):
|
def load(name, **overrides):
|
||||||
depr_path = overrides.get("path")
|
|
||||||
if depr_path not in (True, False, None):
|
|
||||||
warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning)
|
|
||||||
return util.load_model(name, **overrides)
|
return util.load_model(name, **overrides)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ from wasabi import Printer
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import srsly
|
import srsly
|
||||||
|
from thinc.api import require_gpu, fix_random_seed
|
||||||
|
|
||||||
from ..gold import Corpus
|
from ..gold import Corpus
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
@ -52,9 +53,9 @@ def evaluate(
|
||||||
silent: bool = True,
|
silent: bool = True,
|
||||||
) -> Scorer:
|
) -> Scorer:
|
||||||
msg = Printer(no_print=silent, pretty=not silent)
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
util.fix_random_seed()
|
fix_random_seed()
|
||||||
if gpu_id >= 0:
|
if gpu_id >= 0:
|
||||||
util.use_gpu(gpu_id)
|
require_gpu(gpu_id)
|
||||||
util.set_env_log(False)
|
util.set_env_log(False)
|
||||||
data_path = util.ensure_path(data_path)
|
data_path = util.ensure_path(data_path)
|
||||||
output_path = util.ensure_path(output)
|
output_path = util.ensure_path(output)
|
||||||
|
|
|
@ -5,8 +5,8 @@ import time
|
||||||
import re
|
import re
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.api import use_pytorch_for_gpu_memory
|
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||||
from thinc.api import set_dropout_rate, to_categorical
|
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
||||||
from thinc.api import CosineDistance, L2Distance
|
from thinc.api import CosineDistance, L2Distance
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -36,7 +36,7 @@ def pretrain_cli(
|
||||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||||
using an approximate language-modelling objective. Two objective types
|
using an approximate language-modelling objective. Two objective types
|
||||||
are available, vector-based and character-based.
|
are available, vector-based and character-based.
|
||||||
|
|
||||||
In the vector-based objective, we load word vectors that have been trained
|
In the vector-based objective, we load word vectors that have been trained
|
||||||
using a word2vec-style distributional similarity algorithm, and train a
|
using a word2vec-style distributional similarity algorithm, and train a
|
||||||
component like a CNN, BiLSTM, etc to predict vectors which match the
|
component like a CNN, BiLSTM, etc to predict vectors which match the
|
||||||
|
@ -76,13 +76,13 @@ def pretrain(
|
||||||
|
|
||||||
if use_gpu >= 0:
|
if use_gpu >= 0:
|
||||||
msg.info("Using GPU")
|
msg.info("Using GPU")
|
||||||
util.use_gpu(use_gpu)
|
require_gpu(use_gpu)
|
||||||
else:
|
else:
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
|
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
config = util.load_config(config_path, create_objects=False)
|
config = util.load_config(config_path, create_objects=False)
|
||||||
util.fix_random_seed(config["pretraining"]["seed"])
|
fix_random_seed(config["pretraining"]["seed"])
|
||||||
if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
|
if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
|
||||||
use_pytorch_for_gpu_memory()
|
use_pytorch_for_gpu_memory()
|
||||||
|
|
||||||
|
@ -231,12 +231,12 @@ def make_docs(nlp, batch, min_length, max_length):
|
||||||
|
|
||||||
def create_objective(config):
|
def create_objective(config):
|
||||||
"""Create the objective for pretraining.
|
"""Create the objective for pretraining.
|
||||||
|
|
||||||
We'd like to replace this with a registry function but it's tricky because
|
We'd like to replace this with a registry function but it's tricky because
|
||||||
we're also making a model choice based on this. For now we hard-code support
|
we're also making a model choice based on this. For now we hard-code support
|
||||||
for two types (characters, vectors). For characters you can specify
|
for two types (characters, vectors). For characters you can specify
|
||||||
n_characters, for vectors you can specify the loss.
|
n_characters, for vectors you can specify the loss.
|
||||||
|
|
||||||
Bleh.
|
Bleh.
|
||||||
"""
|
"""
|
||||||
objective_type = config["type"]
|
objective_type = config["type"]
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from typing import Optional, Dict, List, Union, Sequence
|
from typing import Optional, Dict, List, Union, Sequence
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
import tqdm
|
import tqdm
|
||||||
from pydantic import BaseModel, FilePath
|
from pydantic import BaseModel, FilePath
|
||||||
|
@ -8,7 +7,7 @@ from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import thinc
|
import thinc
|
||||||
import thinc.schedules
|
import thinc.schedules
|
||||||
from thinc.api import Model, use_pytorch_for_gpu_memory
|
from thinc.api import Model, use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
|
||||||
import random
|
import random
|
||||||
|
|
||||||
from ._app import app, Arg, Opt
|
from ._app import app, Arg, Opt
|
||||||
|
@ -156,7 +155,7 @@ def train_cli(
|
||||||
|
|
||||||
if use_gpu >= 0:
|
if use_gpu >= 0:
|
||||||
msg.info("Using GPU: {use_gpu}")
|
msg.info("Using GPU: {use_gpu}")
|
||||||
util.use_gpu(use_gpu)
|
require_gpu(use_gpu)
|
||||||
else:
|
else:
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
|
|
||||||
|
@ -183,7 +182,7 @@ def train(
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
# Read the config first without creating objects, to get to the original nlp_config
|
# Read the config first without creating objects, to get to the original nlp_config
|
||||||
config = util.load_config(config_path, create_objects=False)
|
config = util.load_config(config_path, create_objects=False)
|
||||||
util.fix_random_seed(config["training"]["seed"])
|
fix_random_seed(config["training"]["seed"])
|
||||||
if config["training"].get("use_pytorch_for_gpu_memory"):
|
if config["training"].get("use_pytorch_for_gpu_memory"):
|
||||||
# It feels kind of weird to not have a default for this.
|
# It feels kind of weird to not have a default for this.
|
||||||
use_pytorch_for_gpu_memory()
|
use_pytorch_for_gpu_memory()
|
||||||
|
|
|
@ -16,16 +16,6 @@ def add_codes(err_cls):
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class Warnings(object):
|
class Warnings(object):
|
||||||
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
|
||||||
"You can now call spacy.load with the path as its first argument, "
|
|
||||||
"and the model's meta.json will be used to determine the language "
|
|
||||||
"to load. For example:\nnlp = spacy.load('{path}')")
|
|
||||||
W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object "
|
|
||||||
"instead and pass in the strings as the `words` keyword argument, "
|
|
||||||
"for example:\nfrom spacy.tokens import Doc\n"
|
|
||||||
"doc = Doc(nlp.vocab, words=[...])")
|
|
||||||
W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use "
|
|
||||||
"the keyword arguments, for example tag=, lemma= or ent_type=.")
|
|
||||||
W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
|
W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
|
||||||
"using ftfy.fix_text if necessary.")
|
"using ftfy.fix_text if necessary.")
|
||||||
W005 = ("Doc object not parsed. This means displaCy won't be able to "
|
W005 = ("Doc object not parsed. This means displaCy won't be able to "
|
||||||
|
@ -45,12 +35,6 @@ class Warnings(object):
|
||||||
"use context-sensitive tensors. You can always add your own word "
|
"use context-sensitive tensors. You can always add your own word "
|
||||||
"vectors, or use one of the larger models instead if available.")
|
"vectors, or use one of the larger models instead if available.")
|
||||||
W008 = ("Evaluating {obj}.similarity based on empty vectors.")
|
W008 = ("Evaluating {obj}.similarity based on empty vectors.")
|
||||||
W009 = ("Custom factory '{name}' provided by entry points of another "
|
|
||||||
"package overwrites built-in factory.")
|
|
||||||
W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length "
|
|
||||||
"limit anymore, so the max_length argument is now deprecated. "
|
|
||||||
"If you did not specify this parameter, make sure you call the "
|
|
||||||
"constructor with named arguments instead of positional ones.")
|
|
||||||
W011 = ("It looks like you're calling displacy.serve from within a "
|
W011 = ("It looks like you're calling displacy.serve from within a "
|
||||||
"Jupyter notebook or a similar environment. This likely means "
|
"Jupyter notebook or a similar environment. This likely means "
|
||||||
"you're already running a local web server, so there's no need to "
|
"you're already running a local web server, so there's no need to "
|
||||||
|
@ -64,23 +48,9 @@ class Warnings(object):
|
||||||
"components are applied. To only create tokenized Doc objects, "
|
"components are applied. To only create tokenized Doc objects, "
|
||||||
"try using `nlp.make_doc(text)` or process all texts as a stream "
|
"try using `nlp.make_doc(text)` or process all texts as a stream "
|
||||||
"using `list(nlp.tokenizer.pipe(all_texts))`.")
|
"using `list(nlp.tokenizer.pipe(all_texts))`.")
|
||||||
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
|
|
||||||
"efficient and less error-prone Doc.retokenize context manager "
|
|
||||||
"instead.")
|
|
||||||
W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
|
|
||||||
"methods is and should be replaced with `exclude`. This makes it "
|
|
||||||
"consistent with the other serializable objects.")
|
|
||||||
W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
|
|
||||||
"being serialized or deserialized is deprecated. Please use the "
|
|
||||||
"`exclude` argument instead. For example: exclude=['{arg}'].")
|
|
||||||
W016 = ("The keyword argument `n_threads` is now deprecated. As of v2.2.2, "
|
|
||||||
"the argument `n_process` controls parallel inference via "
|
|
||||||
"multiprocessing.")
|
|
||||||
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
|
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
|
||||||
W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
|
W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
|
||||||
"ignoring the duplicate entry.")
|
"ignoring the duplicate entry.")
|
||||||
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
|
|
||||||
"previously loaded vectors. See Issue #3853.")
|
|
||||||
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
|
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
|
||||||
"loaded. (Shape: {shape})")
|
"loaded. (Shape: {shape})")
|
||||||
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
||||||
|
@ -91,8 +61,6 @@ class Warnings(object):
|
||||||
"or the language you're using doesn't have lemmatization data, "
|
"or the language you're using doesn't have lemmatization data, "
|
||||||
"you can ignore this warning. If this is surprising, make sure you "
|
"you can ignore this warning. If this is surprising, make sure you "
|
||||||
"have the spacy-lookups-data package installed.")
|
"have the spacy-lookups-data package installed.")
|
||||||
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
|
|
||||||
"'n_process' will be set to 1.")
|
|
||||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
||||||
"the Knowledge Base.")
|
"the Knowledge Base.")
|
||||||
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
|
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
|
||||||
|
@ -101,28 +69,11 @@ class Warnings(object):
|
||||||
W027 = ("Found a large training file of {size} bytes. Note that it may "
|
W027 = ("Found a large training file of {size} bytes. Note that it may "
|
||||||
"be more efficient to split your training data into multiple "
|
"be more efficient to split your training data into multiple "
|
||||||
"smaller JSON files instead.")
|
"smaller JSON files instead.")
|
||||||
W028 = ("Doc.from_array was called with a vector of type '{type}', "
|
|
||||||
"but is expecting one of type 'uint64' instead. This may result "
|
|
||||||
"in problems with the vocab further on in the pipeline.")
|
|
||||||
W029 = ("Unable to align tokens with entities from character offsets. "
|
|
||||||
"Discarding entity annotation for the text: {text}.")
|
|
||||||
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
|
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
|
||||||
"entities \"{entities}\". Use "
|
"entities \"{entities}\". Use "
|
||||||
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
|
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
|
||||||
" to check the alignment. Misaligned entities ('-') will be "
|
" to check the alignment. Misaligned entities ('-') will be "
|
||||||
"ignored during training.")
|
"ignored during training.")
|
||||||
W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and "
|
|
||||||
"is incompatible with the current spaCy version ({current}). This "
|
|
||||||
"may lead to unexpected results or runtime errors. To resolve "
|
|
||||||
"this, download a newer compatible model or retrain your custom "
|
|
||||||
"model with the current spaCy version. For more details and "
|
|
||||||
"available updates, run: python -m spacy validate")
|
|
||||||
W032 = ("Unable to determine model compatibility for model '{model}' "
|
|
||||||
"({model_version}) with the current spaCy version ({current}). "
|
|
||||||
"This may lead to unexpected results or runtime errors. To resolve "
|
|
||||||
"this, download a newer compatible model or retrain your custom "
|
|
||||||
"model with the current spaCy version. For more details and "
|
|
||||||
"available updates, run: python -m spacy validate")
|
|
||||||
W033 = ("Training a new {model} using a model with no lexeme normalization "
|
W033 = ("Training a new {model} using a model with no lexeme normalization "
|
||||||
"table. This may degrade the performance of the model to some "
|
"table. This may degrade the performance of the model to some "
|
||||||
"degree. If this is intentional or the language you're using "
|
"degree. If this is intentional or the language you're using "
|
||||||
|
@ -236,9 +187,6 @@ class Errors(object):
|
||||||
"the HEAD attribute would potentially override the sentence "
|
"the HEAD attribute would potentially override the sentence "
|
||||||
"boundaries set by SENT_START.")
|
"boundaries set by SENT_START.")
|
||||||
E033 = ("Cannot load into non-empty Doc of length {length}.")
|
E033 = ("Cannot load into non-empty Doc of length {length}.")
|
||||||
E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected "
|
|
||||||
"either 3 arguments (deprecated), or 0 (use keyword arguments).\n"
|
|
||||||
"Arguments supplied:\n{args}\nKeyword arguments:{kwargs}")
|
|
||||||
E035 = ("Error creating span with start {start} and end {end} for Doc of "
|
E035 = ("Error creating span with start {start} and end {end} for Doc of "
|
||||||
"length {length}.")
|
"length {length}.")
|
||||||
E036 = ("Error calculating span: Can't find a token starting at character "
|
E036 = ("Error calculating span: Can't find a token starting at character "
|
||||||
|
@ -347,14 +295,9 @@ class Errors(object):
|
||||||
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
||||||
"token can only be part of one entity, so make sure the entities "
|
"token can only be part of one entity, so make sure the entities "
|
||||||
"you're setting don't overlap.")
|
"you're setting don't overlap.")
|
||||||
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
|
|
||||||
"Doc.to_json() instead or write your own function.")
|
|
||||||
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
||||||
"settings: {opts}")
|
"settings: {opts}")
|
||||||
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
||||||
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
|
|
||||||
"in favor of the pipe name `sentencizer`, which does the same "
|
|
||||||
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
|
|
||||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||||
"call begin_training()?")
|
"call begin_training()?")
|
||||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||||
|
@ -394,10 +337,6 @@ class Errors(object):
|
||||||
E125 = ("Unexpected value: {value}")
|
E125 = ("Unexpected value: {value}")
|
||||||
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
|
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||||
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
|
|
||||||
"arguments to exclude fields from being serialized or deserialized "
|
|
||||||
"is now deprecated. Please use the `exclude` argument instead. "
|
|
||||||
"For example: exclude=['{arg}'].")
|
|
||||||
E129 = ("Cannot write the label of an existing Span object because a Span "
|
E129 = ("Cannot write the label of an existing Span object because a Span "
|
||||||
"is a read-only view of the underlying Token objects stored in the "
|
"is a read-only view of the underlying Token objects stored in the "
|
||||||
"Doc. Instead, create a new Span object and specify the `label` "
|
"Doc. Instead, create a new Span object and specify the `label` "
|
||||||
|
@ -489,9 +428,6 @@ class Errors(object):
|
||||||
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
|
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
|
||||||
"Lemmatizer, initialize the class directly. See the docs for "
|
"Lemmatizer, initialize the class directly. See the docs for "
|
||||||
"details: https://spacy.io/api/lemmatizer")
|
"details: https://spacy.io/api/lemmatizer")
|
||||||
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
|
|
||||||
"Lookups containing the lemmatization tables. See the docs for "
|
|
||||||
"details: https://spacy.io/api/lemmatizer#init")
|
|
||||||
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
||||||
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
||||||
E177 = ("Ill-formed IOB input detected: {tag}")
|
E177 = ("Ill-formed IOB input detected: {tag}")
|
||||||
|
|
|
@ -8,7 +8,7 @@ from copy import copy, deepcopy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from thinc.api import get_current_ops, Config
|
from thinc.api import get_current_ops, Config, require_gpu
|
||||||
import srsly
|
import srsly
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
from itertools import chain, cycle
|
from itertools import chain, cycle
|
||||||
|
@ -233,32 +233,6 @@ class Language(object):
|
||||||
def config(self):
|
def config(self):
|
||||||
return self._config
|
return self._config
|
||||||
|
|
||||||
# Conveniences to access pipeline components
|
|
||||||
# Shouldn't be used anymore!
|
|
||||||
@property
|
|
||||||
def tagger(self):
|
|
||||||
return self.get_pipe("tagger")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def parser(self):
|
|
||||||
return self.get_pipe("parser")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def entity(self):
|
|
||||||
return self.get_pipe("ner")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def linker(self):
|
|
||||||
return self.get_pipe("entity_linker")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def senter(self):
|
|
||||||
return self.get_pipe("senter")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def matcher(self):
|
|
||||||
return self.get_pipe("matcher")
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def pipe_names(self):
|
def pipe_names(self):
|
||||||
"""Get names of available pipeline components.
|
"""Get names of available pipeline components.
|
||||||
|
@ -314,10 +288,7 @@ class Language(object):
|
||||||
DOCS: https://spacy.io/api/language#create_pipe
|
DOCS: https://spacy.io/api/language#create_pipe
|
||||||
"""
|
"""
|
||||||
if name not in self.factories:
|
if name not in self.factories:
|
||||||
if name == "sbd":
|
raise KeyError(Errors.E002.format(name=name))
|
||||||
raise KeyError(Errors.E108.format(name=name))
|
|
||||||
else:
|
|
||||||
raise KeyError(Errors.E002.format(name=name))
|
|
||||||
factory = self.factories[name]
|
factory = self.factories[name]
|
||||||
|
|
||||||
# transform the model's config to an actual Model
|
# transform the model's config to an actual Model
|
||||||
|
@ -661,7 +632,7 @@ class Language(object):
|
||||||
_ = self.vocab[word] # noqa: F841
|
_ = self.vocab[word] # noqa: F841
|
||||||
|
|
||||||
if cfg.get("device", -1) >= 0:
|
if cfg.get("device", -1) >= 0:
|
||||||
util.use_gpu(cfg["device"])
|
require_gpu(cfg["device"])
|
||||||
if self.vocab.vectors.data.shape[1] >= 1:
|
if self.vocab.vectors.data.shape[1] >= 1:
|
||||||
ops = get_current_ops()
|
ops = get_current_ops()
|
||||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||||
|
@ -691,7 +662,7 @@ class Language(object):
|
||||||
on, and call nlp.rehearse() with a batch of Example objects.
|
on, and call nlp.rehearse() with a batch of Example objects.
|
||||||
"""
|
"""
|
||||||
if cfg.get("device", -1) >= 0:
|
if cfg.get("device", -1) >= 0:
|
||||||
util.use_gpu(cfg["device"])
|
require_gpu(cfg["device"])
|
||||||
ops = get_current_ops()
|
ops = get_current_ops()
|
||||||
if self.vocab.vectors.data.shape[1] >= 1:
|
if self.vocab.vectors.data.shape[1] >= 1:
|
||||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||||
|
@ -782,7 +753,6 @@ class Language(object):
|
||||||
self,
|
self,
|
||||||
texts,
|
texts,
|
||||||
as_tuples=False,
|
as_tuples=False,
|
||||||
n_threads=-1,
|
|
||||||
batch_size=1000,
|
batch_size=1000,
|
||||||
disable=[],
|
disable=[],
|
||||||
cleanup=False,
|
cleanup=False,
|
||||||
|
@ -807,8 +777,6 @@ class Language(object):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#pipe
|
DOCS: https://spacy.io/api/language#pipe
|
||||||
"""
|
"""
|
||||||
if n_threads != -1:
|
|
||||||
warnings.warn(Warnings.W016, DeprecationWarning)
|
|
||||||
if n_process == -1:
|
if n_process == -1:
|
||||||
n_process = mp.cpu_count()
|
n_process = mp.cpu_count()
|
||||||
if as_tuples:
|
if as_tuples:
|
||||||
|
@ -935,7 +903,7 @@ class Language(object):
|
||||||
if hasattr(proc2, "model"):
|
if hasattr(proc2, "model"):
|
||||||
proc1.find_listeners(proc2.model)
|
proc1.find_listeners(proc2.model)
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), disable=None):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
"""Save the current state to a directory. If a model is loaded, this
|
"""Save the current state to a directory. If a model is loaded, this
|
||||||
will include the model.
|
will include the model.
|
||||||
|
|
||||||
|
@ -945,9 +913,6 @@ class Language(object):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#to_disk
|
DOCS: https://spacy.io/api/language#to_disk
|
||||||
"""
|
"""
|
||||||
if disable is not None:
|
|
||||||
warnings.warn(Warnings.W014, DeprecationWarning)
|
|
||||||
exclude = disable
|
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
serializers = {}
|
serializers = {}
|
||||||
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
|
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
|
||||||
|
@ -966,7 +931,7 @@ class Language(object):
|
||||||
serializers["vocab"] = lambda p: self.vocab.to_disk(p)
|
serializers["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||||
util.to_disk(path, serializers, exclude)
|
util.to_disk(path, serializers, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), disable=None):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
returns it. If the saved `Language` object contains a model, the
|
returns it. If the saved `Language` object contains a model, the
|
||||||
model will be loaded.
|
model will be loaded.
|
||||||
|
@ -991,9 +956,6 @@ class Language(object):
|
||||||
self.vocab.from_disk(path)
|
self.vocab.from_disk(path)
|
||||||
_fix_pretrained_vectors_name(self)
|
_fix_pretrained_vectors_name(self)
|
||||||
|
|
||||||
if disable is not None:
|
|
||||||
warnings.warn(Warnings.W014, DeprecationWarning)
|
|
||||||
exclude = disable
|
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
|
|
||||||
deserializers = {}
|
deserializers = {}
|
||||||
|
@ -1020,7 +982,7 @@ class Language(object):
|
||||||
self._link_components()
|
self._link_components()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), disable=None, **kwargs):
|
def to_bytes(self, exclude=tuple()):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
exclude (list): Names of components or serialization fields to exclude.
|
exclude (list): Names of components or serialization fields to exclude.
|
||||||
|
@ -1028,9 +990,6 @@ class Language(object):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#to_bytes
|
DOCS: https://spacy.io/api/language#to_bytes
|
||||||
"""
|
"""
|
||||||
if disable is not None:
|
|
||||||
warnings.warn(Warnings.W014, DeprecationWarning)
|
|
||||||
exclude = disable
|
|
||||||
serializers = {}
|
serializers = {}
|
||||||
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
||||||
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
||||||
|
@ -1042,10 +1001,9 @@ class Language(object):
|
||||||
if not hasattr(proc, "to_bytes"):
|
if not hasattr(proc, "to_bytes"):
|
||||||
continue
|
continue
|
||||||
serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
|
serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])
|
||||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
||||||
bytes_data (bytes): The data to load from.
|
bytes_data (bytes): The data to load from.
|
||||||
|
@ -1066,9 +1024,6 @@ class Language(object):
|
||||||
self.vocab.from_bytes(b)
|
self.vocab.from_bytes(b)
|
||||||
_fix_pretrained_vectors_name(self)
|
_fix_pretrained_vectors_name(self)
|
||||||
|
|
||||||
if disable is not None:
|
|
||||||
warnings.warn(Warnings.W014, DeprecationWarning)
|
|
||||||
exclude = disable
|
|
||||||
deserializers = {}
|
deserializers = {}
|
||||||
deserializers["config.cfg"] = lambda b: self.config.from_bytes(b)
|
deserializers["config.cfg"] = lambda b: self.config.from_bytes(b)
|
||||||
deserializers["meta.json"] = deserialize_meta
|
deserializers["meta.json"] = deserialize_meta
|
||||||
|
@ -1084,7 +1039,6 @@ class Language(object):
|
||||||
deserializers[name] = lambda b, proc=proc: proc.from_bytes(
|
deserializers[name] = lambda b, proc=proc: proc.from_bytes(
|
||||||
b, exclude=["vocab"]
|
b, exclude=["vocab"]
|
||||||
)
|
)
|
||||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
|
||||||
util.from_bytes(bytes_data, deserializers, exclude)
|
util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
self._link_components()
|
self._link_components()
|
||||||
return self
|
return self
|
||||||
|
@ -1206,7 +1160,7 @@ class DisabledPipes(list):
|
||||||
def _pipe(examples, proc, kwargs):
|
def _pipe(examples, proc, kwargs):
|
||||||
# We added some args for pipe that __call__ doesn't expect.
|
# We added some args for pipe that __call__ doesn't expect.
|
||||||
kwargs = dict(kwargs)
|
kwargs = dict(kwargs)
|
||||||
for arg in ["n_threads", "batch_size"]:
|
for arg in ["batch_size"]:
|
||||||
if arg in kwargs:
|
if arg in kwargs:
|
||||||
kwargs.pop(arg)
|
kwargs.pop(arg)
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
from .lookups import Lookups
|
|
||||||
from .parts_of_speech import NAMES as UPOS_NAMES
|
from .parts_of_speech import NAMES as UPOS_NAMES
|
||||||
|
|
||||||
|
|
||||||
|
@ -15,15 +14,13 @@ class Lemmatizer(object):
|
||||||
def load(cls, *args, **kwargs):
|
def load(cls, *args, **kwargs):
|
||||||
raise NotImplementedError(Errors.E172)
|
raise NotImplementedError(Errors.E172)
|
||||||
|
|
||||||
def __init__(self, lookups, *args, **kwargs):
|
def __init__(self, lookups):
|
||||||
"""Initialize a Lemmatizer.
|
"""Initialize a Lemmatizer.
|
||||||
|
|
||||||
lookups (Lookups): The lookups object containing the (optional) tables
|
lookups (Lookups): The lookups object containing the (optional) tables
|
||||||
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
|
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
|
||||||
RETURNS (Lemmatizer): The newly constructed object.
|
RETURNS (Lemmatizer): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
if args or kwargs or not isinstance(lookups, Lookups):
|
|
||||||
raise ValueError(Errors.E173)
|
|
||||||
self.lookups = lookups
|
self.lookups = lookups
|
||||||
|
|
||||||
def __call__(self, string, univ_pos, morphology=None):
|
def __call__(self, string, univ_pos, morphology=None):
|
||||||
|
|
|
@ -174,8 +174,7 @@ cdef class Matcher:
|
||||||
return default
|
return default
|
||||||
return (self._callbacks[key], self._patterns[key])
|
return (self._callbacks[key], self._patterns[key])
|
||||||
|
|
||||||
def pipe(self, docs, batch_size=1000, n_threads=-1, return_matches=False,
|
def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
|
||||||
as_tuples=False):
|
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""Match a stream of documents, yielding them in turn.
|
||||||
|
|
||||||
docs (iterable): A stream of documents.
|
docs (iterable): A stream of documents.
|
||||||
|
@ -188,9 +187,6 @@ cdef class Matcher:
|
||||||
be a sequence of ((doc, matches), context) tuples.
|
be a sequence of ((doc, matches), context) tuples.
|
||||||
YIELDS (Doc): Documents, in order.
|
YIELDS (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
if n_threads != -1:
|
|
||||||
warnings.warn(Warnings.W016, DeprecationWarning)
|
|
||||||
|
|
||||||
if as_tuples:
|
if as_tuples:
|
||||||
for doc, context in docs:
|
for doc, context in docs:
|
||||||
matches = self(doc)
|
matches = self(doc)
|
||||||
|
|
|
@ -26,7 +26,7 @@ cdef class PhraseMatcher:
|
||||||
Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com)
|
Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False):
|
def __init__(self, Vocab vocab, attr="ORTH", validate=False):
|
||||||
"""Initialize the PhraseMatcher.
|
"""Initialize the PhraseMatcher.
|
||||||
|
|
||||||
vocab (Vocab): The shared vocabulary.
|
vocab (Vocab): The shared vocabulary.
|
||||||
|
@ -36,8 +36,6 @@ cdef class PhraseMatcher:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#init
|
DOCS: https://spacy.io/api/phrasematcher#init
|
||||||
"""
|
"""
|
||||||
if max_length != 0:
|
|
||||||
warnings.warn(Warnings.W010, DeprecationWarning)
|
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self._callbacks = {}
|
self._callbacks = {}
|
||||||
self._docs = {}
|
self._docs = {}
|
||||||
|
@ -287,8 +285,7 @@ cdef class PhraseMatcher:
|
||||||
current_node = self.c_map
|
current_node = self.c_map
|
||||||
idx += 1
|
idx += 1
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=1000, n_threads=-1, return_matches=False,
|
def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
|
||||||
as_tuples=False):
|
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""Match a stream of documents, yielding them in turn.
|
||||||
|
|
||||||
docs (iterable): A stream of documents.
|
docs (iterable): A stream of documents.
|
||||||
|
@ -303,8 +300,6 @@ cdef class PhraseMatcher:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#pipe
|
DOCS: https://spacy.io/api/phrasematcher#pipe
|
||||||
"""
|
"""
|
||||||
if n_threads != -1:
|
|
||||||
warnings.warn(Warnings.W016, DeprecationWarning)
|
|
||||||
if as_tuples:
|
if as_tuples:
|
||||||
for doc, context in stream:
|
for doc, context in stream:
|
||||||
matches = self(doc)
|
matches = self(doc)
|
||||||
|
|
|
@ -120,15 +120,14 @@ class Morphologizer(Tagger):
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple()):
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
@ -140,20 +139,18 @@ class Morphologizer(Tagger):
|
||||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||||
"model": lambda b: load_model(b),
|
"model": lambda b: load_model(b),
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
||||||
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
try:
|
try:
|
||||||
|
@ -166,6 +163,5 @@ class Morphologizer(Tagger):
|
||||||
"cfg": lambda p: self.cfg.update(_load_cfg(p)),
|
"cfg": lambda p: self.cfg.update(_load_cfg(p)),
|
||||||
"model": load_model,
|
"model": load_model,
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -66,7 +66,7 @@ class Pipe(object):
|
||||||
self.set_annotations([doc], predictions)
|
self.set_annotations([doc], predictions)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128):
|
||||||
"""Apply the pipe to a stream of documents.
|
"""Apply the pipe to a stream of documents.
|
||||||
|
|
||||||
Both __call__ and pipe should delegate to the `predict()`
|
Both __call__ and pipe should delegate to the `predict()`
|
||||||
|
@ -151,7 +151,7 @@ class Pipe(object):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
yield
|
yield
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple()):
|
||||||
"""Serialize the pipe to a bytestring.
|
"""Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
exclude (list): String names of serialization fields to exclude.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
|
@ -162,10 +162,9 @@ class Pipe(object):
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
if hasattr(self, "vocab"):
|
if hasattr(self, "vocab"):
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
"""Load the pipe from a bytestring."""
|
"""Load the pipe from a bytestring."""
|
||||||
|
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
|
@ -179,20 +178,18 @@ class Pipe(object):
|
||||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
||||||
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
||||||
deserialize["model"] = load_model
|
deserialize["model"] = load_model
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
"""Serialize the pipe to disk."""
|
"""Serialize the pipe to disk."""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||||
serialize["model"] = lambda p: self.model.to_disk(p)
|
serialize["model"] = lambda p: self.model.to_disk(p)
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
"""Load the pipe from disk."""
|
"""Load the pipe from disk."""
|
||||||
|
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
|
@ -205,7 +202,6 @@ class Pipe(object):
|
||||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||||
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
||||||
deserialize["model"] = load_model
|
deserialize["model"] = load_model
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -232,7 +228,7 @@ class Tagger(Pipe):
|
||||||
self.set_annotations([doc], tags)
|
self.set_annotations([doc], tags)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128):
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
tag_ids = self.predict(docs)
|
tag_ids = self.predict(docs)
|
||||||
self.set_annotations(docs, tag_ids)
|
self.set_annotations(docs, tag_ids)
|
||||||
|
@ -421,17 +417,16 @@ class Tagger(Pipe):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
yield
|
yield
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple()):
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
||||||
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
|
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
@ -451,11 +446,10 @@ class Tagger(Pipe):
|
||||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||||
"model": lambda b: load_model(b),
|
"model": lambda b: load_model(b),
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
|
@ -463,10 +457,9 @@ class Tagger(Pipe):
|
||||||
"model": lambda p: self.model.to_disk(p),
|
"model": lambda p: self.model.to_disk(p),
|
||||||
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
try:
|
try:
|
||||||
|
@ -487,7 +480,6 @@ class Tagger(Pipe):
|
||||||
"tag_map": load_tag_map,
|
"tag_map": load_tag_map,
|
||||||
"model": load_model,
|
"model": load_model,
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -566,15 +558,14 @@ class SentenceRecognizer(Tagger):
|
||||||
def add_label(self, label, values=None):
|
def add_label(self, label, values=None):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple()):
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
@ -586,20 +577,18 @@ class SentenceRecognizer(Tagger):
|
||||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||||
"model": lambda b: load_model(b),
|
"model": lambda b: load_model(b),
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
||||||
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
try:
|
try:
|
||||||
|
@ -612,7 +601,6 @@ class SentenceRecognizer(Tagger):
|
||||||
"cfg": lambda p: self.cfg.update(_load_cfg(p)),
|
"cfg": lambda p: self.cfg.update(_load_cfg(p)),
|
||||||
"model": load_model,
|
"model": load_model,
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -825,7 +813,7 @@ class TextCategorizer(Pipe):
|
||||||
def labels(self, value):
|
def labels(self, value):
|
||||||
self.cfg["labels"] = tuple(value)
|
self.cfg["labels"] = tuple(value)
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128):
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
scores, tensors = self.predict(docs)
|
scores, tensors = self.predict(docs)
|
||||||
self.set_annotations(docs, scores, tensors=tensors)
|
self.set_annotations(docs, scores, tensors=tensors)
|
||||||
|
@ -1198,7 +1186,7 @@ class EntityLinker(Pipe):
|
||||||
self.set_annotations([doc], kb_ids, tensors=tensors)
|
self.set_annotations([doc], kb_ids, tensors=tensors)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128):
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
kb_ids, tensors = self.predict(docs)
|
kb_ids, tensors = self.predict(docs)
|
||||||
self.set_annotations(docs, kb_ids, tensors=tensors)
|
self.set_annotations(docs, kb_ids, tensors=tensors)
|
||||||
|
@ -1309,17 +1297,16 @@ class EntityLinker(Pipe):
|
||||||
for token in ent:
|
for token in ent:
|
||||||
token.ent_kb_id_ = kb_id
|
token.ent_kb_id_ = kb_id
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
serialize = {}
|
serialize = {}
|
||||||
self.cfg["entity_width"] = self.kb.entity_vector_length
|
self.cfg["entity_width"] = self.kb.entity_vector_length
|
||||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||||
serialize["kb"] = lambda p: self.kb.dump(p)
|
serialize["kb"] = lambda p: self.kb.dump(p)
|
||||||
serialize["model"] = lambda p: self.model.to_disk(p)
|
serialize["model"] = lambda p: self.model.to_disk(p)
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(p.open("rb").read())
|
self.model.from_bytes(p.open("rb").read())
|
||||||
|
@ -1335,7 +1322,6 @@ class EntityLinker(Pipe):
|
||||||
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
||||||
deserialize["kb"] = load_kb
|
deserialize["kb"] = load_kb
|
||||||
deserialize["model"] = load_model
|
deserialize["model"] = load_model
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -1411,7 +1397,7 @@ class Sentencizer(Pipe):
|
||||||
doc[start].is_sent_start = True
|
doc[start].is_sent_start = True
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128):
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
predictions = self.predict(docs)
|
predictions = self.predict(docs)
|
||||||
if isinstance(predictions, tuple) and len(tuple) == 2:
|
if isinstance(predictions, tuple) and len(tuple) == 2:
|
||||||
|
|
|
@ -51,11 +51,10 @@ class Tok2Vec(Pipe):
|
||||||
self.set_annotations([doc], tokvecses)
|
self.set_annotations([doc], tokvecses)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128):
|
||||||
"""Process `Doc` objects as a stream.
|
"""Process `Doc` objects as a stream.
|
||||||
stream (iterator): A sequence of `Doc` objects to process.
|
stream (iterator): A sequence of `Doc` objects to process.
|
||||||
batch_size (int): Number of `Doc` objects to group.
|
batch_size (int): Number of `Doc` objects to group.
|
||||||
n_threads (int): Number of threads.
|
|
||||||
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
|
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
|
||||||
"""
|
"""
|
||||||
for docs in minibatch(stream, batch_size):
|
for docs in minibatch(stream, batch_size):
|
||||||
|
|
|
@ -157,7 +157,7 @@ cdef class Parser:
|
||||||
self.set_annotations([doc], states, tensors=None)
|
self.set_annotations([doc], states, tensors=None)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, docs, int batch_size=256, int n_threads=-1):
|
def pipe(self, docs, int batch_size=256):
|
||||||
"""Process a stream of documents.
|
"""Process a stream of documents.
|
||||||
|
|
||||||
stream: The sequence of documents to process.
|
stream: The sequence of documents to process.
|
||||||
|
@ -461,24 +461,22 @@ cdef class Parser:
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
serializers = {
|
serializers = {
|
||||||
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
|
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
|
||||||
'vocab': lambda p: self.vocab.to_disk(p),
|
'vocab': lambda p: self.vocab.to_disk(p),
|
||||||
'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
|
'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
|
||||||
'cfg': lambda p: srsly.write_json(p, self.cfg)
|
'cfg': lambda p: srsly.write_json(p, self.cfg)
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
|
||||||
util.to_disk(path, serializers, exclude)
|
util.to_disk(path, serializers, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
deserializers = {
|
deserializers = {
|
||||||
'vocab': lambda p: self.vocab.from_disk(p),
|
'vocab': lambda p: self.vocab.from_disk(p),
|
||||||
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
|
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
|
||||||
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
|
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
|
||||||
'model': lambda p: None,
|
'model': lambda p: None,
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
|
||||||
util.from_disk(path, deserializers, exclude)
|
util.from_disk(path, deserializers, exclude)
|
||||||
if 'model' not in exclude:
|
if 'model' not in exclude:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
|
@ -491,24 +489,22 @@ cdef class Parser:
|
||||||
raise ValueError(Errors.E149)
|
raise ValueError(Errors.E149)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple()):
|
||||||
serializers = {
|
serializers = {
|
||||||
"model": lambda: (self.model.to_bytes()),
|
"model": lambda: (self.model.to_bytes()),
|
||||||
"vocab": lambda: self.vocab.to_bytes(),
|
"vocab": lambda: self.vocab.to_bytes(),
|
||||||
"moves": lambda: self.moves.to_bytes(exclude=["strings"]),
|
"moves": lambda: self.moves.to_bytes(exclude=["strings"]),
|
||||||
"cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
|
"cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
deserializers = {
|
deserializers = {
|
||||||
"vocab": lambda b: self.vocab.from_bytes(b),
|
"vocab": lambda b: self.vocab.from_bytes(b),
|
||||||
"moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
|
"moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
|
||||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||||
"model": lambda b: None,
|
"model": lambda b: None,
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
if 'model' not in exclude:
|
if 'model' not in exclude:
|
||||||
if 'model' in msg:
|
if 'model' in msg:
|
||||||
|
|
|
@ -60,7 +60,7 @@ cdef class TransitionSystem:
|
||||||
states.append(state)
|
states.append(state)
|
||||||
offset += len(doc)
|
offset += len(doc)
|
||||||
return states
|
return states
|
||||||
|
|
||||||
def get_oracle_sequence(self, Example example, _debug=False):
|
def get_oracle_sequence(self, Example example, _debug=False):
|
||||||
states, golds, _ = self.init_gold_batch([example])
|
states, golds, _ = self.init_gold_batch([example])
|
||||||
if not states:
|
if not states:
|
||||||
|
@ -227,22 +227,20 @@ cdef class TransitionSystem:
|
||||||
self.from_bytes(byte_data, **kwargs)
|
self.from_bytes(byte_data, **kwargs)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple()):
|
||||||
transitions = []
|
transitions = []
|
||||||
serializers = {
|
serializers = {
|
||||||
'moves': lambda: srsly.json_dumps(self.labels),
|
'moves': lambda: srsly.json_dumps(self.labels),
|
||||||
'strings': lambda: self.strings.to_bytes()
|
'strings': lambda: self.strings.to_bytes()
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
labels = {}
|
labels = {}
|
||||||
deserializers = {
|
deserializers = {
|
||||||
'moves': lambda b: labels.update(srsly.json_loads(b)),
|
'moves': lambda b: labels.update(srsly.json_loads(b)),
|
||||||
'strings': lambda b: self.strings.from_bytes(b)
|
'strings': lambda b: self.strings.from_bytes(b)
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
self.initialize_actions(labels)
|
self.initialize_actions(labels)
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -66,8 +66,6 @@ def test_spans_string_fn(doc):
|
||||||
span = doc[0:4]
|
span = doc[0:4]
|
||||||
assert len(span) == 4
|
assert len(span) == 4
|
||||||
assert span.text == "This is a sentence"
|
assert span.text == "This is a sentence"
|
||||||
assert span.upper_ == "THIS IS A SENTENCE"
|
|
||||||
assert span.lower_ == "this is a sentence"
|
|
||||||
|
|
||||||
|
|
||||||
def test_spans_root2(en_tokenizer):
|
def test_spans_root2(en_tokenizer):
|
||||||
|
|
|
@ -1,13 +1,11 @@
|
||||||
import pytest
|
import pytest
|
||||||
from thinc.api import Adam
|
from thinc.api import Adam, fix_random_seed
|
||||||
from spacy.attrs import NORM
|
from spacy.attrs import NORM
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
from spacy.gold import Example
|
from spacy.gold import Example
|
||||||
from spacy.pipeline.defaults import default_parser, default_ner
|
from spacy.pipeline.defaults import default_parser, default_ner
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.pipeline import DependencyParser, EntityRecognizer
|
from spacy.pipeline import DependencyParser, EntityRecognizer
|
||||||
from spacy.util import fix_random_seed
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
|
@ -1,18 +1,18 @@
|
||||||
import pytest
|
import pytest
|
||||||
import random
|
import random
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
from thinc.api import fix_random_seed
|
||||||
from spacy import util
|
from spacy import util
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.pipeline import TextCategorizer
|
from spacy.pipeline import TextCategorizer
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.util import fix_random_seed
|
from spacy.pipeline.defaults import default_tok2vec
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
from spacy.pipeline.defaults import default_tok2vec
|
|
||||||
from ...gold import Example
|
from ...gold import Example
|
||||||
|
|
||||||
|
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
|
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
|
||||||
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
|
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
|
||||||
|
|
|
@ -9,7 +9,6 @@ from spacy.vocab import Vocab
|
||||||
from spacy.attrs import ENT_IOB, ENT_TYPE
|
from spacy.attrs import ENT_IOB, ENT_TYPE
|
||||||
from spacy.compat import pickle
|
from spacy.compat import pickle
|
||||||
from spacy import displacy
|
from spacy import displacy
|
||||||
from spacy.util import decaying
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from spacy.vectors import Vectors
|
from spacy.vectors import Vectors
|
||||||
|
@ -216,21 +215,6 @@ def test_issue3345():
|
||||||
assert ner.moves.is_valid(state, "B-GPE")
|
assert ner.moves.is_valid(state, "B-GPE")
|
||||||
|
|
||||||
|
|
||||||
def test_issue3410():
|
|
||||||
texts = ["Hello world", "This is a test"]
|
|
||||||
nlp = English()
|
|
||||||
matcher = Matcher(nlp.vocab)
|
|
||||||
phrasematcher = PhraseMatcher(nlp.vocab)
|
|
||||||
with pytest.deprecated_call():
|
|
||||||
docs = list(nlp.pipe(texts, n_threads=4))
|
|
||||||
with pytest.deprecated_call():
|
|
||||||
docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
|
|
||||||
with pytest.deprecated_call():
|
|
||||||
list(matcher.pipe(docs, n_threads=4))
|
|
||||||
with pytest.deprecated_call():
|
|
||||||
list(phrasematcher.pipe(docs, n_threads=4))
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3412():
|
def test_issue3412():
|
||||||
data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
|
data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
|
||||||
vectors = Vectors(data=data, keys=["A", "B", "C"])
|
vectors = Vectors(data=data, keys=["A", "B", "C"])
|
||||||
|
@ -240,16 +224,6 @@ def test_issue3412():
|
||||||
assert best_rows[0] == 2
|
assert best_rows[0] == 2
|
||||||
|
|
||||||
|
|
||||||
def test_issue3447():
|
|
||||||
sizes = decaying(10.0, 1.0, 0.5)
|
|
||||||
size = next(sizes)
|
|
||||||
assert size == 10.0
|
|
||||||
size = next(sizes)
|
|
||||||
assert size == 10.0 - 0.5
|
|
||||||
size = next(sizes)
|
|
||||||
assert size == 10.0 - 0.5 - 0.5
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
|
@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
|
||||||
def test_issue3449():
|
def test_issue3449():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch
|
||||||
|
from thinc.api import compounding
|
||||||
from spacy.gold import Example
|
from spacy.gold import Example
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3611():
|
def test_issue3611():
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch
|
||||||
|
from thinc.api import compounding
|
||||||
from spacy.gold import Example
|
from spacy.gold import Example
|
||||||
from spacy.util import minibatch, compounding
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue4030():
|
def test_issue4030():
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from spacy.gold import Example
|
from spacy.gold import Example
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch
|
||||||
|
from thinc.api import compounding
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -52,10 +52,6 @@ def test_serialize_doc_exclude(en_vocab):
|
||||||
assert not new_doc.user_data
|
assert not new_doc.user_data
|
||||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
|
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
|
||||||
assert not new_doc.user_data
|
assert not new_doc.user_data
|
||||||
with pytest.raises(ValueError):
|
|
||||||
doc.to_bytes(user_data=False)
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_doc_bin():
|
def test_serialize_doc_bin():
|
||||||
|
|
|
@ -62,7 +62,3 @@ def test_serialize_language_exclude(meta_data):
|
||||||
assert not new_nlp.meta["name"] == name
|
assert not new_nlp.meta["name"] == name
|
||||||
new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
|
new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
|
||||||
assert not new_nlp.meta["name"] == name
|
assert not new_nlp.meta["name"] == name
|
||||||
with pytest.raises(ValueError):
|
|
||||||
nlp.to_bytes(meta=False)
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
Language().from_bytes(nlp.to_bytes(), meta=False)
|
|
||||||
|
|
|
@ -127,10 +127,6 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
|
||||||
parser.to_bytes(exclude=["cfg"]), exclude=["vocab"]
|
parser.to_bytes(exclude=["cfg"]), exclude=["vocab"]
|
||||||
)
|
)
|
||||||
assert "foo" not in new_parser.cfg
|
assert "foo" not in new_parser.cfg
|
||||||
with pytest.raises(ValueError):
|
|
||||||
parser.to_bytes(cfg=False, exclude=["vocab"])
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]), cfg=False)
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_sentencerecognizer(en_vocab):
|
def test_serialize_sentencerecognizer(en_vocab):
|
||||||
|
|
|
@ -5,9 +5,9 @@ from spacy.gold import Corpus, docs_to_json
|
||||||
from spacy.gold.example import Example
|
from spacy.gold.example import Example
|
||||||
from spacy.gold.converters import json2docs
|
from spacy.gold.converters import json2docs
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.syntax.nonproj import is_nonproj_tree
|
|
||||||
from spacy.tokens import Doc, DocBin
|
from spacy.tokens import Doc, DocBin
|
||||||
from spacy.util import get_words_and_spaces, compounding, minibatch
|
from spacy.util import get_words_and_spaces, minibatch
|
||||||
|
from thinc.api import compounding
|
||||||
import pytest
|
import pytest
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
@ -511,9 +511,7 @@ def test_make_orth_variants(doc):
|
||||||
|
|
||||||
# due to randomness, test only that this runs with no errors for now
|
# due to randomness, test only that this runs with no errors for now
|
||||||
train_example = next(goldcorpus.train_dataset(nlp))
|
train_example = next(goldcorpus.train_dataset(nlp))
|
||||||
variant_example = make_orth_variants_example(
|
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
||||||
nlp, train_example, orth_variant_level=0.2
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|
|
@ -26,8 +26,6 @@ cdef class Tokenizer:
|
||||||
cdef int _property_init_count
|
cdef int _property_init_count
|
||||||
cdef int _property_init_max
|
cdef int _property_init_max
|
||||||
|
|
||||||
cpdef Doc tokens_from_list(self, list strings)
|
|
||||||
|
|
||||||
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
|
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
|
||||||
cdef int _apply_special_cases(self, Doc doc) except -1
|
cdef int _apply_special_cases(self, Doc doc) except -1
|
||||||
cdef void _filter_special_spans(self, vector[SpanC] &original,
|
cdef void _filter_special_spans(self, vector[SpanC] &original,
|
||||||
|
|
|
@ -140,10 +140,6 @@ cdef class Tokenizer:
|
||||||
self.url_match)
|
self.url_match)
|
||||||
return (self.__class__, args, None, None)
|
return (self.__class__, args, None, None)
|
||||||
|
|
||||||
cpdef Doc tokens_from_list(self, list strings):
|
|
||||||
warnings.warn(Warnings.W002, DeprecationWarning)
|
|
||||||
return Doc(self.vocab, words=strings)
|
|
||||||
|
|
||||||
def __call__(self, unicode string):
|
def __call__(self, unicode string):
|
||||||
"""Tokenize a string.
|
"""Tokenize a string.
|
||||||
|
|
||||||
|
@ -218,7 +214,7 @@ cdef class Tokenizer:
|
||||||
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
|
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, texts, batch_size=1000, n_threads=-1):
|
def pipe(self, texts, batch_size=1000):
|
||||||
"""Tokenize a stream of texts.
|
"""Tokenize a stream of texts.
|
||||||
|
|
||||||
texts: A sequence of unicode texts.
|
texts: A sequence of unicode texts.
|
||||||
|
@ -228,8 +224,6 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#pipe
|
DOCS: https://spacy.io/api/tokenizer#pipe
|
||||||
"""
|
"""
|
||||||
if n_threads != -1:
|
|
||||||
warnings.warn(Warnings.W016, DeprecationWarning)
|
|
||||||
for text in texts:
|
for text in texts:
|
||||||
yield self(text)
|
yield self(text)
|
||||||
|
|
||||||
|
@ -746,7 +740,7 @@ cdef class Tokenizer:
|
||||||
self.from_bytes(bytes_data, **kwargs)
|
self.from_bytes(bytes_data, **kwargs)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple()):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
exclude (list): String names of serialization fields to exclude.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
|
@ -763,10 +757,9 @@ cdef class Tokenizer:
|
||||||
"url_match": lambda: _get_regex_pattern(self.url_match),
|
"url_match": lambda: _get_regex_pattern(self.url_match),
|
||||||
"exceptions": lambda: dict(sorted(self._rules.items()))
|
"exceptions": lambda: dict(sorted(self._rules.items()))
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
|
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
||||||
bytes_data (bytes): The data to load from.
|
bytes_data (bytes): The data to load from.
|
||||||
|
@ -785,7 +778,6 @@ cdef class Tokenizer:
|
||||||
"url_match": lambda b: data.setdefault("url_match", b),
|
"url_match": lambda b: data.setdefault("url_match", b),
|
||||||
"exceptions": lambda b: data.setdefault("rules", b)
|
"exceptions": lambda b: data.setdefault("rules", b)
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
if "prefix_search" in data and isinstance(data["prefix_search"], str):
|
if "prefix_search" in data and isinstance(data["prefix_search"], str):
|
||||||
self.prefix_search = re.compile(data["prefix_search"]).search
|
self.prefix_search = re.compile(data["prefix_search"]).search
|
||||||
|
|
|
@ -1081,9 +1081,6 @@ cdef class Doc:
|
||||||
"cats": lambda: self.cats,
|
"cats": lambda: self.cats,
|
||||||
"has_unknown_spaces": lambda: self.has_unknown_spaces
|
"has_unknown_spaces": lambda: self.has_unknown_spaces
|
||||||
}
|
}
|
||||||
for key in kwargs:
|
|
||||||
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
|
|
||||||
raise ValueError(Errors.E128.format(arg=key))
|
|
||||||
if "user_data" not in exclude and self.user_data:
|
if "user_data" not in exclude and self.user_data:
|
||||||
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
|
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
|
||||||
if "user_data_keys" not in exclude:
|
if "user_data_keys" not in exclude:
|
||||||
|
@ -1114,9 +1111,6 @@ cdef class Doc:
|
||||||
"user_data_values": lambda b: None,
|
"user_data_values": lambda b: None,
|
||||||
"has_unknown_spaces": lambda b: None
|
"has_unknown_spaces": lambda b: None
|
||||||
}
|
}
|
||||||
for key in kwargs:
|
|
||||||
if key in deserializers or key in ("user_data",):
|
|
||||||
raise ValueError(Errors.E128.format(arg=key))
|
|
||||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||||
# vexing for user data. As a best guess, we *know* that within
|
# vexing for user data. As a best guess, we *know* that within
|
||||||
# keys, we must have tuples. In values we just have to hope
|
# keys, we must have tuples. In values we just have to hope
|
||||||
|
|
|
@ -686,21 +686,6 @@ cdef class Span:
|
||||||
"""RETURNS (str): The span's lemma."""
|
"""RETURNS (str): The span's lemma."""
|
||||||
return " ".join([t.lemma_ for t in self]).strip()
|
return " ".join([t.lemma_ for t in self]).strip()
|
||||||
|
|
||||||
@property
|
|
||||||
def upper_(self):
|
|
||||||
"""Deprecated. Use `Span.text.upper()` instead."""
|
|
||||||
return "".join([t.text_with_ws.upper() for t in self]).strip()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def lower_(self):
|
|
||||||
"""Deprecated. Use `Span.text.lower()` instead."""
|
|
||||||
return "".join([t.text_with_ws.lower() for t in self]).strip()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def string(self):
|
|
||||||
"""Deprecated: Use `Span.text_with_ws` instead."""
|
|
||||||
return "".join([t.text_with_ws for t in self])
|
|
||||||
|
|
||||||
property label_:
|
property label_:
|
||||||
"""RETURNS (str): The span's label."""
|
"""RETURNS (str): The span's label."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -237,11 +237,6 @@ cdef class Token:
|
||||||
index into tables, e.g. for word vectors."""
|
index into tables, e.g. for word vectors."""
|
||||||
return self.c.lex.id
|
return self.c.lex.id
|
||||||
|
|
||||||
@property
|
|
||||||
def string(self):
|
|
||||||
"""Deprecated: Use Token.text_with_ws instead."""
|
|
||||||
return self.text_with_ws
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text(self):
|
def text(self):
|
||||||
"""RETURNS (str): The original verbatim text of the token."""
|
"""RETURNS (str): The original verbatim text of the token."""
|
||||||
|
|
131
spacy/util.py
131
spacy/util.py
|
@ -4,9 +4,8 @@ import importlib
|
||||||
import importlib.util
|
import importlib.util
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import random
|
|
||||||
import thinc
|
import thinc
|
||||||
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
|
from thinc.api import NumpyOps, get_current_ops, Adam, Config
|
||||||
import functools
|
import functools
|
||||||
import itertools
|
import itertools
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
@ -34,6 +33,13 @@ try: # Python 3.8
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import importlib_metadata
|
import importlib_metadata
|
||||||
|
|
||||||
|
# These are functions that were previously (v2.x) available from spacy.util
|
||||||
|
# and have since moved to Thinc. We're importing them here so people's code
|
||||||
|
# doesn't break, but they should always be imported from Thinc from now on,
|
||||||
|
# not from spacy.util.
|
||||||
|
from thinc.api import fix_random_seed, compounding, decaying # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
from .symbols import ORTH
|
from .symbols import ORTH
|
||||||
from .compat import cupy, CudaStream, is_windows
|
from .compat import cupy, CudaStream, is_windows
|
||||||
from .errors import Errors, Warnings
|
from .errors import Errors, Warnings
|
||||||
|
@ -595,15 +601,8 @@ def compile_prefix_regex(entries):
|
||||||
entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES.
|
entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES.
|
||||||
RETURNS (regex object): The regex object. to be used for Tokenizer.prefix_search.
|
RETURNS (regex object): The regex object. to be used for Tokenizer.prefix_search.
|
||||||
"""
|
"""
|
||||||
if "(" in entries:
|
expression = "|".join(["^" + piece for piece in entries if piece.strip()])
|
||||||
# Handle deprecated data
|
return re.compile(expression)
|
||||||
expression = "|".join(
|
|
||||||
["^" + re.escape(piece) for piece in entries if piece.strip()]
|
|
||||||
)
|
|
||||||
return re.compile(expression)
|
|
||||||
else:
|
|
||||||
expression = "|".join(["^" + piece for piece in entries if piece.strip()])
|
|
||||||
return re.compile(expression)
|
|
||||||
|
|
||||||
|
|
||||||
def compile_suffix_regex(entries):
|
def compile_suffix_regex(entries):
|
||||||
|
@ -723,59 +722,6 @@ def minibatch(items, size=8):
|
||||||
yield list(batch)
|
yield list(batch)
|
||||||
|
|
||||||
|
|
||||||
def compounding(start, stop, compound):
|
|
||||||
"""Yield an infinite series of compounding values. Each time the
|
|
||||||
generator is called, a value is produced by multiplying the previous
|
|
||||||
value by the compound rate.
|
|
||||||
|
|
||||||
EXAMPLE:
|
|
||||||
>>> sizes = compounding(1., 10., 1.5)
|
|
||||||
>>> assert next(sizes) == 1.
|
|
||||||
>>> assert next(sizes) == 1 * 1.5
|
|
||||||
>>> assert next(sizes) == 1.5 * 1.5
|
|
||||||
"""
|
|
||||||
|
|
||||||
def clip(value):
|
|
||||||
return max(value, stop) if (start > stop) else min(value, stop)
|
|
||||||
|
|
||||||
curr = float(start)
|
|
||||||
while True:
|
|
||||||
yield clip(curr)
|
|
||||||
curr *= compound
|
|
||||||
|
|
||||||
|
|
||||||
def stepping(start, stop, steps):
|
|
||||||
"""Yield an infinite series of values that step from a start value to a
|
|
||||||
final value over some number of steps. Each step is (stop-start)/steps.
|
|
||||||
|
|
||||||
After the final value is reached, the generator continues yielding that
|
|
||||||
value.
|
|
||||||
|
|
||||||
EXAMPLE:
|
|
||||||
>>> sizes = stepping(1., 200., 100)
|
|
||||||
>>> assert next(sizes) == 1.
|
|
||||||
>>> assert next(sizes) == 1 * (200.-1.) / 100
|
|
||||||
>>> assert next(sizes) == 1 + (200.-1.) / 100 + (200.-1.) / 100
|
|
||||||
"""
|
|
||||||
|
|
||||||
def clip(value):
|
|
||||||
return max(value, stop) if (start > stop) else min(value, stop)
|
|
||||||
|
|
||||||
curr = float(start)
|
|
||||||
while True:
|
|
||||||
yield clip(curr)
|
|
||||||
curr += (stop - start) / steps
|
|
||||||
|
|
||||||
|
|
||||||
def decaying(start, stop, decay):
|
|
||||||
"""Yield an infinite series of linearly decaying values."""
|
|
||||||
|
|
||||||
curr = float(start)
|
|
||||||
while True:
|
|
||||||
yield max(curr, stop)
|
|
||||||
curr -= decay
|
|
||||||
|
|
||||||
|
|
||||||
def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
|
def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
|
||||||
"""Create minibatches of roughly a given number of words. If any examples
|
"""Create minibatches of roughly a given number of words. If any examples
|
||||||
are longer than the specified batch length, they will appear in a batch by
|
are longer than the specified batch length, they will appear in a batch by
|
||||||
|
@ -854,35 +800,6 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
|
||||||
yield batch
|
yield batch
|
||||||
|
|
||||||
|
|
||||||
def itershuffle(iterable, bufsize=1000):
|
|
||||||
"""Shuffle an iterator. This works by holding `bufsize` items back
|
|
||||||
and yielding them sometime later. Obviously, this is not unbiased –
|
|
||||||
but should be good enough for batching. Larger bufsize means less bias.
|
|
||||||
From https://gist.github.com/andres-erbsen/1307752
|
|
||||||
|
|
||||||
iterable (iterable): Iterator to shuffle.
|
|
||||||
bufsize (int): Items to hold back.
|
|
||||||
YIELDS (iterable): The shuffled iterator.
|
|
||||||
"""
|
|
||||||
iterable = iter(iterable)
|
|
||||||
buf = []
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
for i in range(random.randint(1, bufsize - len(buf))):
|
|
||||||
buf.append(next(iterable))
|
|
||||||
random.shuffle(buf)
|
|
||||||
for i in range(random.randint(1, bufsize)):
|
|
||||||
if buf:
|
|
||||||
yield buf.pop()
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
except StopIteration:
|
|
||||||
random.shuffle(buf)
|
|
||||||
while buf:
|
|
||||||
yield buf.pop()
|
|
||||||
raise StopIteration
|
|
||||||
|
|
||||||
|
|
||||||
def filter_spans(spans):
|
def filter_spans(spans):
|
||||||
"""Filter a sequence of spans and remove duplicates or overlaps. Useful for
|
"""Filter a sequence of spans and remove duplicates or overlaps. Useful for
|
||||||
creating named entities (where one token can only be part of one entity) or
|
creating named entities (where one token can only be part of one entity) or
|
||||||
|
@ -989,34 +906,6 @@ def escape_html(text):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def use_gpu(gpu_id):
|
|
||||||
return require_gpu(gpu_id)
|
|
||||||
|
|
||||||
|
|
||||||
def fix_random_seed(seed=0):
|
|
||||||
random.seed(seed)
|
|
||||||
numpy.random.seed(seed)
|
|
||||||
if cupy is not None:
|
|
||||||
cupy.random.seed(seed)
|
|
||||||
|
|
||||||
|
|
||||||
def get_serialization_exclude(serializers, exclude, kwargs):
|
|
||||||
"""Helper function to validate serialization args and manage transition from
|
|
||||||
keyword arguments (pre v2.1) to exclude argument.
|
|
||||||
"""
|
|
||||||
exclude = list(exclude)
|
|
||||||
# Split to support file names like meta.json
|
|
||||||
options = [name.split(".")[0] for name in serializers]
|
|
||||||
for key, value in kwargs.items():
|
|
||||||
if key in ("vocab",) and value is False:
|
|
||||||
warnings.warn(Warnings.W015.format(arg=key), DeprecationWarning)
|
|
||||||
exclude.append(key)
|
|
||||||
elif key.split(".")[0] in options:
|
|
||||||
raise ValueError(Errors.E128.format(arg=key))
|
|
||||||
# TODO: user warning?
|
|
||||||
return exclude
|
|
||||||
|
|
||||||
|
|
||||||
def get_words_and_spaces(words, text):
|
def get_words_and_spaces(words, text):
|
||||||
if "".join("".join(words).split()) != "".join(text.split()):
|
if "".join("".join(words).split()) != "".join(text.split()):
|
||||||
raise ValueError(Errors.E194.format(text=text, words=words))
|
raise ValueError(Errors.E194.format(text=text, words=words))
|
||||||
|
|
|
@ -426,7 +426,7 @@ cdef class Vocab:
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth)
|
||||||
return orth in self.vectors
|
return orth in self.vectors
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory, which will be created if
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
|
@ -439,7 +439,6 @@ cdef class Vocab:
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
path.mkdir()
|
path.mkdir()
|
||||||
setters = ["strings", "vectors"]
|
setters = ["strings", "vectors"]
|
||||||
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
|
|
||||||
if "strings" not in exclude:
|
if "strings" not in exclude:
|
||||||
self.strings.to_disk(path / "strings.json")
|
self.strings.to_disk(path / "strings.json")
|
||||||
if "vectors" not in "exclude" and self.vectors is not None:
|
if "vectors" not in "exclude" and self.vectors is not None:
|
||||||
|
@ -449,7 +448,7 @@ cdef class Vocab:
|
||||||
if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
|
if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
|
||||||
self.lookups_extra.to_disk(path, filename="lookups_extra.bin")
|
self.lookups_extra.to_disk(path, filename="lookups_extra.bin")
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
returns it.
|
returns it.
|
||||||
|
|
||||||
|
@ -461,7 +460,6 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
getters = ["strings", "vectors"]
|
getters = ["strings", "vectors"]
|
||||||
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
|
|
||||||
if "strings" not in exclude:
|
if "strings" not in exclude:
|
||||||
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
|
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
|
||||||
if "vectors" not in exclude:
|
if "vectors" not in exclude:
|
||||||
|
@ -481,7 +479,7 @@ cdef class Vocab:
|
||||||
self._by_orth = PreshMap()
|
self._by_orth = PreshMap()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple()):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
exclude (list): String names of serialization fields to exclude.
|
exclude (list): String names of serialization fields to exclude.
|
||||||
|
@ -501,10 +499,9 @@ cdef class Vocab:
|
||||||
"lookups": lambda: self.lookups.to_bytes(),
|
"lookups": lambda: self.lookups.to_bytes(),
|
||||||
"lookups_extra": lambda: self.lookups_extra.to_bytes()
|
"lookups_extra": lambda: self.lookups_extra.to_bytes()
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
|
|
||||||
return util.to_bytes(getters, exclude)
|
return util.to_bytes(getters, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
||||||
bytes_data (bytes): The data to load from.
|
bytes_data (bytes): The data to load from.
|
||||||
|
@ -526,7 +523,6 @@ cdef class Vocab:
|
||||||
"lookups": lambda b: self.lookups.from_bytes(b),
|
"lookups": lambda b: self.lookups.from_bytes(b),
|
||||||
"lookups_extra": lambda b: self.lookups_extra.from_bytes(b)
|
"lookups_extra": lambda b: self.lookups_extra.from_bytes(b)
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
|
|
||||||
util.from_bytes(bytes_data, setters, exclude)
|
util.from_bytes(bytes_data, setters, exclude)
|
||||||
if "lexeme_norm" in self.lookups:
|
if "lexeme_norm" in self.lookups:
|
||||||
self.lex_attr_getters[NORM] = util.add_lookups(
|
self.lex_attr_getters[NORM] = util.add_lookups(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user