mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Default settings to configurations (#4995)
* fix grad_clip naming * cleaning up pretrained_vectors out of cfg * further refactoring Model init's * move Model building out of pipes * further refactor to require a model config when creating a pipe * small fixes * making cfg in nn_parser more consistent * fixing nr_class for parser * fixing nn_parser's nO * fix printing of loss * architectures in own file per type, consistent naming * convenience methods default_tagger_config and default_tok2vec_config * let create_pipe access default config if available for that component * default_parser_config * move defaults to separate folder * allow reading nlp from package or dir with argument 'name' * architecture spacy.VocabVectors.v1 to read static vectors from file * cleanup * default configs for nel, textcat, morphologizer, tensorizer * fix imports * fixing unit tests * fixes and clean up * fixing defaults, nO, fix unit tests * restore parser IO * fix IO * 'fix' serialization test * add *.cfg to manifest * fix example configs with additional arguments * replace Morphologizer with Tagger * add IO bit when testing overfitting of tagger (currently failing) * fix IO - don't initialize when reading from disk * expand overfitting tests to also check IO goes OK * remove dropout from HashEmbed to fix Tagger performance * add defaults for sentrec * update thinc * always pass a Model instance to a Pipe * fix pipes_added statement * remove obsolete W029 * remove obsolete errors * restore byte checking tests (work again) * clean up test * further test cleanup * convert from config to Model in create_pipe * bring back error when component is not initialized * cleanup * remove calls for nlp2.begin_training * use thinc.api in imports * allow setting charembed's nM and nC * fix for hardcoded nM/nC + unit test * formatting fixes * trigger build
This commit is contained in:
parent
f39ddda193
commit
06f0a8daa0
|
@ -1,5 +1,5 @@
|
||||||
recursive-include include *.h
|
recursive-include include *.h
|
||||||
recursive-include spacy *.txt *.pyx *.pxd
|
recursive-include spacy *.pyx *.pxd *.txt *.cfg
|
||||||
include LICENSE
|
include LICENSE
|
||||||
include README.md
|
include README.md
|
||||||
include bin/spacy
|
include bin/spacy
|
||||||
|
|
|
@ -386,8 +386,8 @@ def _load_pretrained_tok2vec(nlp, loc):
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
loaded = []
|
loaded = []
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
if hasattr(component, "model") and component.model.has_ref("tok2vec"):
|
||||||
component.tok2vec.from_bytes(weights_data)
|
component.get_ref("tok2vec").from_bytes(weights_data)
|
||||||
loaded.append(name)
|
loaded.append(name)
|
||||||
return loaded
|
return loaded
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,9 @@
|
||||||
# coding: utf-8
|
|
||||||
from random import shuffle
|
from random import shuffle
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from thinc.model import Model
|
from thinc.api import Model, chain, CosineDistance, Linear
|
||||||
from thinc.api import chain
|
|
||||||
from thinc.loss import CosineDistance
|
|
||||||
from thinc.layers import Linear
|
|
||||||
|
|
||||||
from spacy.util import create_default_optimizer
|
from spacy.util import create_default_optimizer
|
||||||
|
|
||||||
|
|
|
@ -39,25 +39,27 @@ factory = "tagger"
|
||||||
factory = "parser"
|
factory = "parser"
|
||||||
|
|
||||||
[nlp.pipeline.tagger.model]
|
[nlp.pipeline.tagger.model]
|
||||||
@architectures = "tagger_model.v1"
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
[nlp.pipeline.tagger.model.tok2vec]
|
[nlp.pipeline.tagger.model.tok2vec]
|
||||||
@architectures = "tok2vec_tensors.v1"
|
@architectures = "spacy.Tok2VecTensors.v1"
|
||||||
width = ${nlp.pipeline.tok2vec.model:width}
|
width = ${nlp.pipeline.tok2vec.model:width}
|
||||||
|
|
||||||
[nlp.pipeline.parser.model]
|
[nlp.pipeline.parser.model]
|
||||||
@architectures = "transition_based_parser.v1"
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
nr_feature_tokens = 8
|
nr_feature_tokens = 8
|
||||||
hidden_width = 64
|
hidden_width = 64
|
||||||
maxout_pieces = 3
|
maxout_pieces = 3
|
||||||
|
|
||||||
[nlp.pipeline.parser.model.tok2vec]
|
[nlp.pipeline.parser.model.tok2vec]
|
||||||
@architectures = "tok2vec_tensors.v1"
|
@architectures = "spacy.Tok2VecTensors.v1"
|
||||||
width = ${nlp.pipeline.tok2vec.model:width}
|
width = ${nlp.pipeline.tok2vec.model:width}
|
||||||
|
|
||||||
[nlp.pipeline.tok2vec.model]
|
[nlp.pipeline.tok2vec.model]
|
||||||
@architectures = "hash_embed_bilstm.v1"
|
@architectures = "spacy.HashEmbedBiLSTM.v1"
|
||||||
pretrained_vectors = ${nlp:vectors}
|
pretrained_vectors = ${nlp:vectors}
|
||||||
width = 96
|
width = 96
|
||||||
depth = 4
|
depth = 4
|
||||||
embed_size = 2000
|
embed_size = 2000
|
||||||
|
subword_features = true
|
||||||
|
char_embed = false
|
||||||
|
|
|
@ -39,27 +39,28 @@ factory = "tagger"
|
||||||
factory = "parser"
|
factory = "parser"
|
||||||
|
|
||||||
[nlp.pipeline.tagger.model]
|
[nlp.pipeline.tagger.model]
|
||||||
@architectures = "tagger_model.v1"
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
[nlp.pipeline.tagger.model.tok2vec]
|
[nlp.pipeline.tagger.model.tok2vec]
|
||||||
@architectures = "tok2vec_tensors.v1"
|
@architectures = "spacy.Tok2VecTensors.v1"
|
||||||
width = ${nlp.pipeline.tok2vec.model:width}
|
width = ${nlp.pipeline.tok2vec.model:width}
|
||||||
|
|
||||||
[nlp.pipeline.parser.model]
|
[nlp.pipeline.parser.model]
|
||||||
@architectures = "transition_based_parser.v1"
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
nr_feature_tokens = 8
|
nr_feature_tokens = 8
|
||||||
hidden_width = 64
|
hidden_width = 64
|
||||||
maxout_pieces = 3
|
maxout_pieces = 3
|
||||||
|
|
||||||
[nlp.pipeline.parser.model.tok2vec]
|
[nlp.pipeline.parser.model.tok2vec]
|
||||||
@architectures = "tok2vec_tensors.v1"
|
@architectures = "spacy.Tok2VecTensors.v1"
|
||||||
width = ${nlp.pipeline.tok2vec.model:width}
|
width = ${nlp.pipeline.tok2vec.model:width}
|
||||||
|
|
||||||
[nlp.pipeline.tok2vec.model]
|
[nlp.pipeline.tok2vec.model]
|
||||||
@architectures = "hash_embed_cnn.v1"
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
pretrained_vectors = ${nlp:vectors}
|
pretrained_vectors = ${nlp:vectors}
|
||||||
width = 96
|
width = 96
|
||||||
depth = 4
|
depth = 4
|
||||||
window_size = 1
|
window_size = 1
|
||||||
embed_size = 2000
|
embed_size = 2000
|
||||||
maxout_pieces = 3
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
||||||
|
|
|
@ -20,9 +20,9 @@ import random
|
||||||
import ml_datasets
|
import ml_datasets
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.util import minibatch, use_gpu, compounding
|
from spacy.util import minibatch
|
||||||
from spacy.pipeline import TextCategorizer
|
from spacy.pipeline import TextCategorizer
|
||||||
from spacy.ml.tok2vec import Tok2Vec
|
from spacy.ml.models.tok2vec import build_Tok2Vec_model
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
|
||||||
|
@ -65,9 +65,7 @@ def prefer_gpu():
|
||||||
|
|
||||||
|
|
||||||
def build_textcat_model(tok2vec, nr_class, width):
|
def build_textcat_model(tok2vec, nr_class, width):
|
||||||
from thinc.model import Model
|
from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged
|
||||||
from thinc.layers import Softmax, chain, reduce_mean
|
|
||||||
from thinc.layers import list2ragged
|
|
||||||
|
|
||||||
with Model.define_operators({">>": chain}):
|
with Model.define_operators({">>": chain}):
|
||||||
model = (
|
model = (
|
||||||
|
@ -76,7 +74,7 @@ def build_textcat_model(tok2vec, nr_class, width):
|
||||||
>> reduce_mean()
|
>> reduce_mean()
|
||||||
>> Softmax(nr_class, width)
|
>> Softmax(nr_class, width)
|
||||||
)
|
)
|
||||||
model.tok2vec = tok2vec
|
model.set_ref("tok2vec", tok2vec)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,8 +95,9 @@ def create_pipeline(width, embed_size, vectors_model):
|
||||||
textcat = TextCategorizer(
|
textcat = TextCategorizer(
|
||||||
nlp.vocab,
|
nlp.vocab,
|
||||||
labels=["POSITIVE", "NEGATIVE"],
|
labels=["POSITIVE", "NEGATIVE"],
|
||||||
|
# TODO: replace with config version
|
||||||
model=build_textcat_model(
|
model=build_textcat_model(
|
||||||
Tok2Vec(width=width, embed_size=embed_size), 2, width
|
build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -121,7 +120,7 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||||
|
|
||||||
def train_textcat(nlp, n_texts, n_iter=10):
|
def train_textcat(nlp, n_texts, n_iter=10):
|
||||||
textcat = nlp.get_pipe("textcat")
|
textcat = nlp.get_pipe("textcat")
|
||||||
tok2vec_weights = textcat.model.tok2vec.to_bytes()
|
tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes()
|
||||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
||||||
print(
|
print(
|
||||||
"Using {} examples ({} training, {} evaluation)".format(
|
"Using {} examples ({} training, {} evaluation)".format(
|
||||||
|
@ -135,7 +134,7 @@ def train_textcat(nlp, n_texts, n_iter=10):
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||||
with nlp.disable_pipes(*other_pipes): # only train textcat
|
with nlp.disable_pipes(*other_pipes): # only train textcat
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
textcat.model.tok2vec.from_bytes(tok2vec_weights)
|
textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
|
||||||
print("Training the model...")
|
print("Training the model...")
|
||||||
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
|
|
|
@ -74,7 +74,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
if init_tok2vec is not None:
|
if init_tok2vec is not None:
|
||||||
with init_tok2vec.open("rb") as file_:
|
with init_tok2vec.open("rb") as file_:
|
||||||
textcat.model.tok2vec.from_bytes(file_.read())
|
textcat.model.get_ref("tok2vec").from_bytes(file_.read())
|
||||||
print("Training the model...")
|
print("Training the model...")
|
||||||
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
||||||
batch_sizes = compounding(4.0, 32.0, 1.001)
|
batch_sizes = compounding(4.0, 32.0, 1.001)
|
||||||
|
|
|
@ -6,7 +6,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc==8.0.0a0",
|
"thinc==8.0.0a1",
|
||||||
"blis>=0.4.0,<0.5.0"
|
"blis>=0.4.0,<0.5.0"
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc==8.0.0a0
|
thinc==8.0.0a1
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
ml_datasets>=0.1.1
|
ml_datasets>=0.1.1
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
|
|
|
@ -36,13 +36,13 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc==8.0.0a0
|
thinc==8.0.0a1
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc==8.0.0a0
|
thinc==8.0.0a1
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
wasabi>=0.4.0,<1.1.0
|
wasabi>=0.4.0,<1.1.0
|
||||||
srsly>=2.0.0,<3.0.0
|
srsly>=2.0.0,<3.0.0
|
||||||
|
|
|
@ -11,10 +11,10 @@ import srsly
|
||||||
|
|
||||||
from ..gold import Example
|
from ..gold import Example
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
from ..ml.models.multi_task import build_masked_language_model
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..attrs import ID, HEAD
|
from ..attrs import ID, HEAD
|
||||||
from ..ml.component_models import Tok2Vec
|
from ..ml.models.tok2vec import build_Tok2Vec_model
|
||||||
from ..ml.component_models import masked_language_model
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..util import create_default_optimizer
|
from ..util import create_default_optimizer
|
||||||
from .train import _load_pretrained_tok2vec
|
from .train import _load_pretrained_tok2vec
|
||||||
|
@ -108,14 +108,19 @@ def pretrain(
|
||||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors
|
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors
|
||||||
model = create_pretraining_model(
|
model = create_pretraining_model(
|
||||||
nlp,
|
nlp,
|
||||||
Tok2Vec(
|
# TODO: replace with config
|
||||||
|
build_Tok2Vec_model(
|
||||||
width,
|
width,
|
||||||
embed_rows,
|
embed_rows,
|
||||||
conv_depth=conv_depth,
|
conv_depth=conv_depth,
|
||||||
pretrained_vectors=pretrained_vectors,
|
pretrained_vectors=pretrained_vectors,
|
||||||
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
|
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
|
||||||
subword_features=not use_chars, # Set to False for Chinese etc
|
subword_features=not use_chars, # Set to False for Chinese etc
|
||||||
cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
|
maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
|
||||||
|
window_size=1,
|
||||||
|
char_embed=False,
|
||||||
|
nM=64,
|
||||||
|
nC=8
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
# Load in pretrained weights
|
# Load in pretrained weights
|
||||||
|
@ -152,7 +157,7 @@ def pretrain(
|
||||||
is_temp_str = ".temp" if is_temp else ""
|
is_temp_str = ".temp" if is_temp else ""
|
||||||
with model.use_params(optimizer.averages):
|
with model.use_params(optimizer.averages):
|
||||||
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
||||||
file_.write(model.tok2vec.to_bytes())
|
file_.write(model.get_ref("tok2vec").to_bytes())
|
||||||
log = {
|
log = {
|
||||||
"nr_word": tracker.nr_word,
|
"nr_word": tracker.nr_word,
|
||||||
"loss": tracker.loss,
|
"loss": tracker.loss,
|
||||||
|
@ -284,7 +289,7 @@ def create_pretraining_model(nlp, tok2vec):
|
||||||
# "tok2vec" has to be the same set of processes as what the components do.
|
# "tok2vec" has to be the same set of processes as what the components do.
|
||||||
tok2vec = chain(tok2vec, list2array())
|
tok2vec = chain(tok2vec, list2array())
|
||||||
model = chain(tok2vec, output_layer)
|
model = chain(tok2vec, output_layer)
|
||||||
model = masked_language_model(nlp.vocab, model)
|
model = build_masked_language_model(nlp.vocab, model)
|
||||||
model.set_ref("tok2vec", tok2vec)
|
model.set_ref("tok2vec", tok2vec)
|
||||||
model.set_ref("output_layer", output_layer)
|
model.set_ref("output_layer", output_layer)
|
||||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||||
|
|
|
@ -9,7 +9,7 @@ from wasabi import msg
|
||||||
import contextlib
|
import contextlib
|
||||||
import random
|
import random
|
||||||
|
|
||||||
from ..util import create_default_optimizer
|
from ..util import create_default_optimizer, registry
|
||||||
from ..util import use_gpu as set_gpu
|
from ..util import use_gpu as set_gpu
|
||||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||||
from ..gold import GoldCorpus
|
from ..gold import GoldCorpus
|
||||||
|
@ -111,6 +111,8 @@ def train(
|
||||||
eval_beam_widths.sort()
|
eval_beam_widths.sort()
|
||||||
has_beam_widths = eval_beam_widths != [1]
|
has_beam_widths = eval_beam_widths != [1]
|
||||||
|
|
||||||
|
default_dir = Path(__file__).parent.parent / "ml" / "models" / "defaults"
|
||||||
|
|
||||||
# Set up the base model and pipeline. If a base model is specified, load
|
# Set up the base model and pipeline. If a base model is specified, load
|
||||||
# the model and make sure the pipeline matches the pipeline setting. If
|
# the model and make sure the pipeline matches the pipeline setting. If
|
||||||
# training starts from a blank model, initialize the language class.
|
# training starts from a blank model, initialize the language class.
|
||||||
|
@ -118,7 +120,6 @@ def train(
|
||||||
msg.text(f"Training pipeline: {pipeline}")
|
msg.text(f"Training pipeline: {pipeline}")
|
||||||
disabled_pipes = None
|
disabled_pipes = None
|
||||||
pipes_added = False
|
pipes_added = False
|
||||||
msg.text(f"Training pipeline: {pipeline}")
|
|
||||||
if use_gpu >= 0:
|
if use_gpu >= 0:
|
||||||
activated_gpu = None
|
activated_gpu = None
|
||||||
try:
|
try:
|
||||||
|
@ -140,16 +141,36 @@ def train(
|
||||||
f"specified as `lang` argument ('{lang}') ",
|
f"specified as `lang` argument ('{lang}') ",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
if vectors:
|
||||||
|
msg.text(f"Loading vectors from model '{vectors}'")
|
||||||
|
|
||||||
|
nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
|
||||||
for pipe in pipeline:
|
for pipe in pipeline:
|
||||||
pipe_cfg = {}
|
# first, create the model.
|
||||||
|
# Bit of a hack after the refactor to get the vectors into a default config
|
||||||
|
# use train-from-config instead :-)
|
||||||
if pipe == "parser":
|
if pipe == "parser":
|
||||||
pipe_cfg = {"learn_tokens": learn_tokens}
|
config_loc = default_dir / "parser_defaults.cfg"
|
||||||
|
elif pipe == "tagger":
|
||||||
|
config_loc = default_dir / "tagger_defaults.cfg"
|
||||||
|
elif pipe == "ner":
|
||||||
|
config_loc = default_dir / "ner_defaults.cfg"
|
||||||
elif pipe == "textcat":
|
elif pipe == "textcat":
|
||||||
pipe_cfg = {
|
config_loc = default_dir / "textcat_defaults.cfg"
|
||||||
"exclusive_classes": not textcat_multilabel,
|
else:
|
||||||
"architecture": textcat_arch,
|
raise ValueError(f"Component {pipe} currently not supported.")
|
||||||
"positive_label": textcat_positive_label,
|
pipe_cfg = util.load_config(config_loc, create_objects=False)
|
||||||
}
|
if vectors:
|
||||||
|
pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
|
||||||
|
pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
|
||||||
|
|
||||||
|
if pipe == "parser":
|
||||||
|
pipe_cfg["learn_tokens"] = learn_tokens
|
||||||
|
elif pipe == "textcat":
|
||||||
|
pipe_cfg["exclusive_classes"] = not textcat_multilabel
|
||||||
|
pipe_cfg["architecture"] = textcat_arch
|
||||||
|
pipe_cfg["positive_label"] = textcat_positive_label
|
||||||
|
|
||||||
if pipe not in nlp.pipe_names:
|
if pipe not in nlp.pipe_names:
|
||||||
msg.text(f"Adding component to base model '{pipe}'")
|
msg.text(f"Adding component to base model '{pipe}'")
|
||||||
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
||||||
|
@ -181,26 +202,42 @@ def train(
|
||||||
msg.text(f"Starting with blank model '{lang}'")
|
msg.text(f"Starting with blank model '{lang}'")
|
||||||
lang_cls = util.get_lang_class(lang)
|
lang_cls = util.get_lang_class(lang)
|
||||||
nlp = lang_cls()
|
nlp = lang_cls()
|
||||||
|
|
||||||
|
if vectors:
|
||||||
|
msg.text(f"Loading vectors from model '{vectors}'")
|
||||||
|
|
||||||
for pipe in pipeline:
|
for pipe in pipeline:
|
||||||
|
# first, create the model.
|
||||||
|
# Bit of a hack after the refactor to get the vectors into a default config
|
||||||
|
# use train-from-config instead :-)
|
||||||
if pipe == "parser":
|
if pipe == "parser":
|
||||||
pipe_cfg = {"learn_tokens": learn_tokens}
|
config_loc = default_dir / "parser_defaults.cfg"
|
||||||
|
elif pipe == "tagger":
|
||||||
|
config_loc = default_dir / "tagger_defaults.cfg"
|
||||||
|
elif pipe == "ner":
|
||||||
|
config_loc = default_dir / "ner_defaults.cfg"
|
||||||
elif pipe == "textcat":
|
elif pipe == "textcat":
|
||||||
pipe_cfg = {
|
config_loc = default_dir / "textcat_defaults.cfg"
|
||||||
"exclusive_classes": not textcat_multilabel,
|
|
||||||
"architecture": textcat_arch,
|
|
||||||
"positive_label": textcat_positive_label,
|
|
||||||
}
|
|
||||||
else:
|
else:
|
||||||
pipe_cfg = {}
|
raise ValueError(f"Component {pipe} currently not supported.")
|
||||||
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
pipe_cfg = util.load_config(config_loc, create_objects=False)
|
||||||
|
if vectors:
|
||||||
|
pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
|
||||||
|
pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
|
||||||
|
|
||||||
|
if pipe == "parser":
|
||||||
|
pipe_cfg["learn_tokens"] = learn_tokens
|
||||||
|
elif pipe == "textcat":
|
||||||
|
pipe_cfg["exclusive_classes"] = not textcat_multilabel
|
||||||
|
pipe_cfg["architecture"] = textcat_arch
|
||||||
|
pipe_cfg["positive_label"] = textcat_positive_label
|
||||||
|
|
||||||
|
pipe = nlp.create_pipe(pipe, config=pipe_cfg)
|
||||||
|
nlp.add_pipe(pipe)
|
||||||
|
|
||||||
# Update tag map with provided mapping
|
# Update tag map with provided mapping
|
||||||
nlp.vocab.morphology.tag_map.update(tag_map)
|
nlp.vocab.morphology.tag_map.update(tag_map)
|
||||||
|
|
||||||
if vectors:
|
|
||||||
msg.text(f"Loading vector from model '{vectors}'")
|
|
||||||
_load_vectors(nlp, vectors)
|
|
||||||
|
|
||||||
# Multitask objectives
|
# Multitask objectives
|
||||||
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
|
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
|
||||||
for pipe_name, multitasks in multitask_options:
|
for pipe_name, multitasks in multitask_options:
|
||||||
|
@ -228,7 +265,7 @@ def train(
|
||||||
optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg)
|
optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg)
|
||||||
nlp._optimizer = None
|
nlp._optimizer = None
|
||||||
|
|
||||||
# Load in pretrained weights
|
# Load in pretrained weights (TODO: this may be broken in the config rewrite)
|
||||||
if init_tok2vec is not None:
|
if init_tok2vec is not None:
|
||||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||||
msg.text(f"Loaded pretrained tok2vec for: {components}")
|
msg.text(f"Loaded pretrained tok2vec for: {components}")
|
||||||
|
@ -531,7 +568,7 @@ def _create_progress_bar(total):
|
||||||
|
|
||||||
|
|
||||||
def _load_vectors(nlp, vectors):
|
def _load_vectors(nlp, vectors):
|
||||||
util.load_model(vectors, vocab=nlp.vocab)
|
loaded_model = util.load_model(vectors, vocab=nlp.vocab)
|
||||||
for lex in nlp.vocab:
|
for lex in nlp.vocab:
|
||||||
values = {}
|
values = {}
|
||||||
for attr, func in nlp.vocab.lex_attr_getters.items():
|
for attr, func in nlp.vocab.lex_attr_getters.items():
|
||||||
|
@ -541,6 +578,7 @@ def _load_vectors(nlp, vectors):
|
||||||
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
||||||
lex.set_attrs(**values)
|
lex.set_attrs(**values)
|
||||||
lex.is_oov = False
|
lex.is_oov = False
|
||||||
|
return loaded_model
|
||||||
|
|
||||||
|
|
||||||
def _load_pretrained_tok2vec(nlp, loc):
|
def _load_pretrained_tok2vec(nlp, loc):
|
||||||
|
@ -551,8 +589,8 @@ def _load_pretrained_tok2vec(nlp, loc):
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
loaded = []
|
loaded = []
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
if hasattr(component, "model") and component.model.has_ref("tok2vec"):
|
||||||
component.tok2vec.from_bytes(weights_data)
|
component.get_ref("tok2vec").from_bytes(weights_data)
|
||||||
loaded.append(name)
|
loaded.append(name)
|
||||||
return loaded
|
return loaded
|
||||||
|
|
||||||
|
|
|
@ -1,19 +1,17 @@
|
||||||
from typing import Optional, Dict, List, Union, Sequence
|
from typing import Optional, Dict, List, Union, Sequence
|
||||||
|
from pydantic import BaseModel, FilePath, StrictInt
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
from wasabi import msg
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from wasabi import msg
|
||||||
import thinc
|
import thinc
|
||||||
import thinc.schedules
|
import thinc.schedules
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from pydantic import BaseModel, FilePath, StrictInt
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
# TODO: relative imports?
|
from ..gold import GoldCorpus
|
||||||
import spacy
|
from .. import util
|
||||||
from spacy.gold import GoldCorpus
|
|
||||||
from spacy.pipeline.tok2vec import Tok2VecListener
|
|
||||||
from spacy.ml import component_models
|
|
||||||
from spacy import util
|
|
||||||
|
|
||||||
|
|
||||||
registry = util.registry
|
registry = util.registry
|
||||||
|
@ -57,23 +55,24 @@ factory = "tok2vec"
|
||||||
factory = "ner"
|
factory = "ner"
|
||||||
|
|
||||||
[nlp.pipeline.ner.model]
|
[nlp.pipeline.ner.model]
|
||||||
@architectures = "transition_based_ner.v1"
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
nr_feature_tokens = 3
|
nr_feature_tokens = 3
|
||||||
hidden_width = 64
|
hidden_width = 64
|
||||||
maxout_pieces = 3
|
maxout_pieces = 3
|
||||||
|
|
||||||
[nlp.pipeline.ner.model.tok2vec]
|
[nlp.pipeline.ner.model.tok2vec]
|
||||||
@architectures = "tok2vec_tensors.v1"
|
@architectures = "spacy.Tok2VecTensors.v1"
|
||||||
width = ${nlp.pipeline.tok2vec.model:width}
|
width = ${nlp.pipeline.tok2vec.model:width}
|
||||||
|
|
||||||
[nlp.pipeline.tok2vec.model]
|
[nlp.pipeline.tok2vec.model]
|
||||||
@architectures = "hash_embed_cnn.v1"
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
pretrained_vectors = ${nlp:vectors}
|
pretrained_vectors = ${nlp:vectors}
|
||||||
width = 128
|
width = 128
|
||||||
depth = 4
|
depth = 4
|
||||||
window_size = 1
|
window_size = 1
|
||||||
embed_size = 10000
|
embed_size = 10000
|
||||||
maxout_pieces = 3
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@ -113,65 +112,6 @@ class ConfigSchema(BaseModel):
|
||||||
extra = "allow"
|
extra = "allow"
|
||||||
|
|
||||||
|
|
||||||
# Of course, these would normally decorate the functions where they're defined.
|
|
||||||
# But for now...
|
|
||||||
@registry.architectures.register("hash_embed_cnn.v1")
|
|
||||||
def hash_embed_cnn(
|
|
||||||
pretrained_vectors, width, depth, embed_size, maxout_pieces, window_size
|
|
||||||
):
|
|
||||||
return component_models.Tok2Vec(
|
|
||||||
width=width,
|
|
||||||
embed_size=embed_size,
|
|
||||||
pretrained_vectors=pretrained_vectors,
|
|
||||||
conv_depth=depth,
|
|
||||||
cnn_maxout_pieces=maxout_pieces,
|
|
||||||
bilstm_depth=0,
|
|
||||||
window_size=window_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("hash_embed_bilstm.v1")
|
|
||||||
def hash_embed_bilstm_v1(pretrained_vectors, width, depth, embed_size):
|
|
||||||
return component_models.Tok2Vec(
|
|
||||||
width=width,
|
|
||||||
embed_size=embed_size,
|
|
||||||
pretrained_vectors=pretrained_vectors,
|
|
||||||
bilstm_depth=depth,
|
|
||||||
conv_depth=0,
|
|
||||||
cnn_maxout_pieces=0,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("tagger_model.v1")
|
|
||||||
def build_tagger_model_v1(tok2vec):
|
|
||||||
return component_models.build_tagger_model(nr_class=None, tok2vec=tok2vec)
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("transition_based_parser.v1")
|
|
||||||
def create_tb_parser_model(
|
|
||||||
tok2vec: Model,
|
|
||||||
nr_feature_tokens: StrictInt = 3,
|
|
||||||
hidden_width: StrictInt = 64,
|
|
||||||
maxout_pieces: StrictInt = 3,
|
|
||||||
):
|
|
||||||
from thinc.api import Linear, chain, list2array, use_ops, zero_init
|
|
||||||
from spacy.ml._layers import PrecomputableAffine
|
|
||||||
from spacy.syntax._parser_model import ParserModel
|
|
||||||
|
|
||||||
token_vector_width = tok2vec.get_dim("nO")
|
|
||||||
tok2vec = chain(tok2vec, list2array())
|
|
||||||
tok2vec.set_dim("nO", token_vector_width)
|
|
||||||
|
|
||||||
lower = PrecomputableAffine(
|
|
||||||
hidden_width, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), nP=maxout_pieces
|
|
||||||
)
|
|
||||||
lower.set_dim("nP", maxout_pieces)
|
|
||||||
with use_ops("numpy"):
|
|
||||||
# Initialize weights at zero, as it's a classification layer.
|
|
||||||
upper = Linear(init_W=zero_init)
|
|
||||||
return ParserModel(tok2vec, lower, upper)
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
||||||
|
@ -224,23 +164,25 @@ def train_from_config(
|
||||||
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
|
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
|
||||||
):
|
):
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
config = util.load_from_config(config_path, create_objects=True)
|
config = util.load_config(config_path, create_objects=True)
|
||||||
use_gpu = config["training"]["use_gpu"]
|
use_gpu = config["training"]["use_gpu"]
|
||||||
if use_gpu >= 0:
|
if use_gpu >= 0:
|
||||||
msg.info("Using GPU")
|
msg.info("Using GPU")
|
||||||
else:
|
else:
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
msg.info("Creating nlp from config")
|
msg.info("Creating nlp from config")
|
||||||
nlp = create_nlp_from_config(**config["nlp"])
|
nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
|
||||||
|
nlp = util.load_model_from_config(nlp_config)
|
||||||
optimizer = config["optimizer"]
|
optimizer = config["optimizer"]
|
||||||
limit = config["training"]["limit"]
|
training = config["training"]
|
||||||
|
limit = training["limit"]
|
||||||
msg.info("Loading training corpus")
|
msg.info("Loading training corpus")
|
||||||
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
|
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
|
||||||
msg.info("Initializing the nlp pipeline")
|
msg.info("Initializing the nlp pipeline")
|
||||||
nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
|
nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
|
||||||
|
|
||||||
train_batches = create_train_batches(nlp, corpus, config["training"])
|
train_batches = create_train_batches(nlp, corpus, training)
|
||||||
evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
|
evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
|
||||||
|
|
||||||
# Create iterator, which yields out info after each optimization step.
|
# Create iterator, which yields out info after each optimization step.
|
||||||
msg.info("Start training")
|
msg.info("Start training")
|
||||||
|
@ -249,16 +191,16 @@ def train_from_config(
|
||||||
optimizer,
|
optimizer,
|
||||||
train_batches,
|
train_batches,
|
||||||
evaluate,
|
evaluate,
|
||||||
config["training"]["dropout"],
|
training["dropout"],
|
||||||
config["training"]["patience"],
|
training["patience"],
|
||||||
config["training"]["eval_frequency"],
|
training["eval_frequency"],
|
||||||
)
|
)
|
||||||
|
|
||||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
||||||
print_row = setup_printer(config)
|
print_row = setup_printer(training, nlp)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False)
|
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
|
||||||
for batch, info, is_best_checkpoint in training_step_iterator:
|
for batch, info, is_best_checkpoint in training_step_iterator:
|
||||||
progress.update(1)
|
progress.update(1)
|
||||||
if is_best_checkpoint is not None:
|
if is_best_checkpoint is not None:
|
||||||
|
@ -266,9 +208,7 @@ def train_from_config(
|
||||||
print_row(info)
|
print_row(info)
|
||||||
if is_best_checkpoint and output_path is not None:
|
if is_best_checkpoint and output_path is not None:
|
||||||
nlp.to_disk(output_path)
|
nlp.to_disk(output_path)
|
||||||
progress = tqdm.tqdm(
|
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
|
||||||
total=config["training"]["eval_frequency"], leave=False
|
|
||||||
)
|
|
||||||
finally:
|
finally:
|
||||||
if output_path is not None:
|
if output_path is not None:
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
|
@ -280,18 +220,6 @@ def train_from_config(
|
||||||
# msg.good("Created best model", best_model_path)
|
# msg.good("Created best model", best_model_path)
|
||||||
|
|
||||||
|
|
||||||
def create_nlp_from_config(lang, vectors, pipeline):
    """Create a blank pipeline for `lang` and add the configured components.

    lang: language code handed to spacy.util.get_lang_class.
    vectors: if not None, passed with the new nlp object to
        spacy.cli.train._load_vectors.
    pipeline (dict): maps component name -> settings dict. Each settings
        dict must contain a "factory" key; the remaining entries are passed
        to nlp.create_pipe as `config`.
    RETURNS: the nlp object with every component added under its name.
    """
    nlp = spacy.util.get_lang_class(lang)()
    if vectors is not None:
        spacy.cli.train._load_vectors(nlp, vectors)
    for name, component_cfg in pipeline.items():
        # Pop "factory" from a shallow copy: removing it from the caller's
        # dict would make the same config raise KeyError on a second call.
        component_cfg = dict(component_cfg)
        factory = component_cfg.pop("factory")
        component = nlp.create_pipe(factory, config=component_cfg)
        nlp.add_pipe(component, name=name)
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def create_train_batches(nlp, corpus, cfg):
|
def create_train_batches(nlp, corpus, cfg):
|
||||||
while True:
|
while True:
|
||||||
train_examples = corpus.train_dataset(
|
train_examples = corpus.train_dataset(
|
||||||
|
@ -405,10 +333,10 @@ def subdivide_batch(batch):
|
||||||
return [batch]
|
return [batch]
|
||||||
|
|
||||||
|
|
||||||
def setup_printer(config):
|
def setup_printer(training, nlp):
|
||||||
score_cols = config["training"]["scores"]
|
score_cols = training["scores"]
|
||||||
score_widths = [max(len(col), 6) for col in score_cols]
|
score_widths = [max(len(col), 6) for col in score_cols]
|
||||||
loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]]
|
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
|
||||||
loss_widths = [max(len(col), 8) for col in loss_cols]
|
loss_widths = [max(len(col), 8) for col in loss_cols]
|
||||||
table_header = ["#"] + loss_cols + score_cols + ["Score"]
|
table_header = ["#"] + loss_cols + score_cols + ["Score"]
|
||||||
table_header = [col.upper() for col in table_header]
|
table_header = [col.upper() for col in table_header]
|
||||||
|
@ -420,20 +348,13 @@ def setup_printer(config):
|
||||||
|
|
||||||
def print_row(info):
|
def print_row(info):
|
||||||
losses = [
|
losses = [
|
||||||
"{0:.2f}".format(info["losses"].get(col, 0.0))
|
"{0:.2f}".format(info["losses"].get(pipe_name, 0.0))
|
||||||
for col in config["nlp"]["pipeline"]
|
for pipe_name in nlp.pipe_names
|
||||||
]
|
]
|
||||||
scores = [
|
scores = [
|
||||||
"{0:.2f}".format(info["other_scores"].get(col, 0.0))
|
"{0:.2f}".format(info["other_scores"].get(col, 0.0)) for col in score_cols
|
||||||
for col in config["training"]["scores"]
|
|
||||||
]
|
]
|
||||||
data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])]
|
data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])]
|
||||||
msg.row(data, widths=table_widths, aligns=table_aligns)
|
msg.row(data, widths=table_widths, aligns=table_aligns)
|
||||||
|
|
||||||
return print_row
|
return print_row
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("tok2vec_tensors.v1")
def tok2vec_tensors_v1(width):
    """Registered as "tok2vec_tensors.v1".

    width: forwarded to Tok2VecListener as its `width` keyword.
    RETURNS: a Tok2VecListener created with the name "tok2vec".
    """
    return Tok2VecListener("tok2vec", width=width)
|
|
||||||
|
|
|
@ -106,6 +106,12 @@ class Warnings(object):
|
||||||
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
|
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
|
||||||
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
||||||
|
|
||||||
|
# TODO: fix numbering after merging develop into master
|
||||||
|
W098 = ("No Model config was provided to create the '{name}' component, "
|
||||||
|
"so a default configuration was used.")
|
||||||
|
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
|
||||||
|
"but got '{type}' instead, so ignoring it.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class Errors(object):
|
class Errors(object):
|
||||||
|
@ -227,7 +233,7 @@ class Errors(object):
|
||||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
||||||
"package or a valid path to a data directory.")
|
"package or a valid path to a data directory.")
|
||||||
E052 = ("Can't find model directory: {path}")
|
E052 = ("Can't find model directory: {path}")
|
||||||
E053 = ("Could not read meta.json from {path}")
|
E053 = ("Could not read {name} from {path}")
|
||||||
E054 = ("No valid '{setting}' setting found in model meta.json.")
|
E054 = ("No valid '{setting}' setting found in model meta.json.")
|
||||||
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
|
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
|
||||||
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
|
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
|
||||||
|
@ -345,8 +351,8 @@ class Errors(object):
|
||||||
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
|
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
|
||||||
"in favor of the pipe name `sentencizer`, which does the same "
|
"in favor of the pipe name `sentencizer`, which does the same "
|
||||||
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
|
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
|
||||||
E109 = ("Model for component '{name}' not initialized. Did you forget to "
|
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||||
"load a model, or forget to call begin_training()?")
|
"call begin_training()?")
|
||||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||||
E111 = ("Pickling a token is not supported, because tokens are only views "
|
E111 = ("Pickling a token is not supported, because tokens are only views "
|
||||||
"of the parent Doc and can't exist on their own. A pickled token "
|
"of the parent Doc and can't exist on their own. A pickled token "
|
||||||
|
@ -532,6 +538,9 @@ class Errors(object):
|
||||||
"make sure the gold EL data refers to valid results of the "
|
"make sure the gold EL data refers to valid results of the "
|
||||||
"named entity recognizer in the `nlp` pipeline.")
|
"named entity recognizer in the `nlp` pipeline.")
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E993 = ("The config for 'nlp' should include either a key 'name' to "
|
||||||
|
"refer to an existing model by name or path, or a key 'lang' "
|
||||||
|
"to create a new blank model.")
|
||||||
E996 = ("Could not parse {file}: {msg}")
|
E996 = ("Could not parse {file}: {msg}")
|
||||||
E997 = ("Tokenizer special cases are not allowed to modify the text. "
|
E997 = ("Tokenizer special cases are not allowed to modify the text. "
|
||||||
"This would map '{chunk}' to '{orth}' given token attributes "
|
"This would map '{chunk}' to '{orth}' given token attributes "
|
||||||
|
|
|
@ -4,7 +4,9 @@ import weakref
|
||||||
import functools
|
import functools
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from copy import copy, deepcopy
|
from copy import copy, deepcopy
|
||||||
from thinc.api import get_current_ops
|
from pathlib import Path
|
||||||
|
|
||||||
|
from thinc.api import get_current_ops, Config
|
||||||
import srsly
|
import srsly
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
from itertools import chain, cycle
|
from itertools import chain, cycle
|
||||||
|
@ -16,7 +18,7 @@ from .lookups import Lookups
|
||||||
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
|
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
|
||||||
from .gold import Example
|
from .gold import Example
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
from .util import link_vectors_to_models, create_default_optimizer
|
from .util import link_vectors_to_models, create_default_optimizer, registry
|
||||||
from .attrs import IS_STOP, LANG
|
from .attrs import IS_STOP, LANG
|
||||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
from .lang.punctuation import TOKENIZER_INFIXES
|
from .lang.punctuation import TOKENIZER_INFIXES
|
||||||
|
@ -24,7 +26,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||||
from .lang.tag_map import TAG_MAP
|
from .lang.tag_map import TAG_MAP
|
||||||
from .tokens import Doc
|
from .tokens import Doc
|
||||||
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
||||||
from .errors import Errors, Warnings, deprecation_warning
|
from .errors import Errors, Warnings, deprecation_warning, user_warning
|
||||||
from . import util
|
from . import util
|
||||||
from . import about
|
from . import about
|
||||||
|
|
||||||
|
@ -128,7 +130,7 @@ class Language(object):
|
||||||
factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}
|
factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, **kwargs
|
self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs
|
||||||
):
|
):
|
||||||
"""Initialise a Language object.
|
"""Initialise a Language object.
|
||||||
|
|
||||||
|
@ -138,6 +140,7 @@ class Language(object):
|
||||||
object. Usually a `Tokenizer`.
|
object. Usually a `Tokenizer`.
|
||||||
meta (dict): Custom meta data for the Language class. Is written to by
|
meta (dict): Custom meta data for the Language class. Is written to by
|
||||||
models to add model meta data.
|
models to add model meta data.
|
||||||
|
config (Config): Configuration data for creating the pipeline components.
|
||||||
max_length (int) :
|
max_length (int) :
|
||||||
Maximum number of characters in a single text. The current v2 models
|
Maximum number of characters in a single text. The current v2 models
|
||||||
may run out memory on extremely long texts, due to large internal
|
may run out memory on extremely long texts, due to large internal
|
||||||
|
@ -152,6 +155,9 @@ class Language(object):
|
||||||
user_factories = util.registry.factories.get_all()
|
user_factories = util.registry.factories.get_all()
|
||||||
self.factories.update(user_factories)
|
self.factories.update(user_factories)
|
||||||
self._meta = dict(meta)
|
self._meta = dict(meta)
|
||||||
|
self._config = config
|
||||||
|
if not self._config:
|
||||||
|
self._config = Config()
|
||||||
self._path = None
|
self._path = None
|
||||||
if vocab is True:
|
if vocab is True:
|
||||||
factory = self.Defaults.create_vocab
|
factory = self.Defaults.create_vocab
|
||||||
|
@ -170,6 +176,21 @@ class Language(object):
|
||||||
self.max_length = max_length
|
self.max_length = max_length
|
||||||
self._optimizer = None
|
self._optimizer = None
|
||||||
|
|
||||||
|
from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \
|
||||||
|
default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \
|
||||||
|
default_tensorizer_config, default_tok2vec_config
|
||||||
|
|
||||||
|
self.defaults = {"tagger": default_tagger_config(),
|
||||||
|
"parser": default_parser_config(),
|
||||||
|
"ner": default_ner_config(),
|
||||||
|
"textcat": default_textcat_config(),
|
||||||
|
"entity_linker": default_nel_config(),
|
||||||
|
"morphologizer": default_morphologizer_config(),
|
||||||
|
"sentrec": default_sentrec_config(),
|
||||||
|
"tensorizer": default_tensorizer_config(),
|
||||||
|
"tok2vec": default_tok2vec_config(),
|
||||||
|
}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def path(self):
|
def path(self):
|
||||||
return self._path
|
return self._path
|
||||||
|
@ -203,6 +224,10 @@ class Language(object):
|
||||||
def meta(self, value):
|
def meta(self, value):
|
||||||
self._meta = value
|
self._meta = value
|
||||||
|
|
||||||
|
@property
def config(self):
    """Read-only access to the object stored on this instance as `_config`."""
    stored = self._config
    return stored
|
||||||
|
|
||||||
# Conveniences to access pipeline components
|
# Conveniences to access pipeline components
|
||||||
# Shouldn't be used anymore!
|
# Shouldn't be used anymore!
|
||||||
@property
|
@property
|
||||||
|
@ -293,7 +318,24 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
raise KeyError(Errors.E002.format(name=name))
|
raise KeyError(Errors.E002.format(name=name))
|
||||||
factory = self.factories[name]
|
factory = self.factories[name]
|
||||||
return factory(self, **config)
|
default_config = self.defaults.get(name, None)
|
||||||
|
|
||||||
|
# transform the model's config to an actual Model
|
||||||
|
model_cfg = None
|
||||||
|
if "model" in config:
|
||||||
|
model_cfg = config["model"]
|
||||||
|
if not isinstance(model_cfg, dict):
|
||||||
|
user_warning(Warnings.W099.format(type=type(model_cfg), pipe=name))
|
||||||
|
model_cfg = None
|
||||||
|
del config["model"]
|
||||||
|
if model_cfg is None and default_config is not None:
|
||||||
|
user_warning(Warnings.W098)
|
||||||
|
model_cfg = default_config["model"]
|
||||||
|
model = None
|
||||||
|
if model_cfg is not None:
|
||||||
|
self.config[name] = {"model": model_cfg}
|
||||||
|
model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]
|
||||||
|
return factory(self, model, **config)
|
||||||
|
|
||||||
def add_pipe(
|
def add_pipe(
|
||||||
self, component, name=None, before=None, after=None, first=None, last=None
|
self, component, name=None, before=None, after=None, first=None, last=None
|
||||||
|
@ -430,7 +472,10 @@ class Language(object):
|
||||||
continue
|
continue
|
||||||
if not hasattr(proc, "__call__"):
|
if not hasattr(proc, "__call__"):
|
||||||
raise ValueError(Errors.E003.format(component=type(proc), name=name))
|
raise ValueError(Errors.E003.format(component=type(proc), name=name))
|
||||||
doc = proc(doc, **component_cfg.get(name, {}))
|
try:
|
||||||
|
doc = proc(doc, **component_cfg.get(name, {}))
|
||||||
|
except KeyError:
|
||||||
|
raise ValueError(Errors.E109.format(name=name))
|
||||||
if doc is None:
|
if doc is None:
|
||||||
raise ValueError(Errors.E005.format(name=name))
|
raise ValueError(Errors.E005.format(name=name))
|
||||||
return doc
|
return doc
|
||||||
|
@ -578,9 +623,6 @@ class Language(object):
|
||||||
ops = get_current_ops()
|
ops = get_current_ops()
|
||||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
if self.vocab.vectors.data.shape[1]:
|
|
||||||
cfg["pretrained_vectors"] = self.vocab.vectors.name
|
|
||||||
cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1]
|
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = create_default_optimizer()
|
sgd = create_default_optimizer()
|
||||||
self._optimizer = sgd
|
self._optimizer = sgd
|
||||||
|
@ -611,8 +653,6 @@ class Language(object):
|
||||||
if self.vocab.vectors.data.shape[1] >= 1:
|
if self.vocab.vectors.data.shape[1] >= 1:
|
||||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
if self.vocab.vectors.data.shape[1]:
|
|
||||||
cfg["pretrained_vectors"] = self.vocab.vectors
|
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = create_default_optimizer()
|
sgd = create_default_optimizer()
|
||||||
self._optimizer = sgd
|
self._optimizer = sgd
|
||||||
|
@ -868,6 +908,7 @@ class Language(object):
|
||||||
serializers["meta.json"] = lambda p: p.open("w").write(
|
serializers["meta.json"] = lambda p: p.open("w").write(
|
||||||
srsly.json_dumps(self.meta)
|
srsly.json_dumps(self.meta)
|
||||||
)
|
)
|
||||||
|
serializers["config.cfg"] = lambda p: self.config.to_disk(p)
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if not hasattr(proc, "name"):
|
if not hasattr(proc, "name"):
|
||||||
continue
|
continue
|
||||||
|
@ -895,6 +936,8 @@ class Language(object):
|
||||||
exclude = disable
|
exclude = disable
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
deserializers = {}
|
deserializers = {}
|
||||||
|
if Path(path / "config.cfg").exists():
|
||||||
|
deserializers["config.cfg"] = lambda p: self.config.from_disk(p)
|
||||||
deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
|
deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
|
||||||
deserializers["vocab"] = lambda p: self.vocab.from_disk(
|
deserializers["vocab"] = lambda p: self.vocab.from_disk(
|
||||||
p
|
p
|
||||||
|
@ -933,6 +976,7 @@ class Language(object):
|
||||||
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
||||||
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
||||||
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
|
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
|
||||||
|
serializers["config.cfg"] = lambda: self.config.to_bytes()
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if name in exclude:
|
if name in exclude:
|
||||||
continue
|
continue
|
||||||
|
@ -955,6 +999,7 @@ class Language(object):
|
||||||
deprecation_warning(Warnings.W014)
|
deprecation_warning(Warnings.W014)
|
||||||
exclude = disable
|
exclude = disable
|
||||||
deserializers = {}
|
deserializers = {}
|
||||||
|
deserializers["config.cfg"] = lambda b: self.config.from_bytes(b)
|
||||||
deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
|
deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
|
||||||
deserializers["vocab"] = lambda b: self.vocab.from_bytes(
|
deserializers["vocab"] = lambda b: self.vocab.from_bytes(
|
||||||
b
|
b
|
||||||
|
@ -981,8 +1026,8 @@ class component(object):
|
||||||
and class components and will automatically register components in the
|
and class components and will automatically register components in the
|
||||||
Language.factories. If the component is a class and needs access to the
|
Language.factories. If the component is a class and needs access to the
|
||||||
nlp object or config parameters, it can expose a from_nlp classmethod
|
nlp object or config parameters, it can expose a from_nlp classmethod
|
||||||
that takes the nlp object and **cfg arguments and returns the initialized
|
that takes the nlp & model objects and **cfg arguments, and returns the
|
||||||
component.
|
initialized component.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# NB: This decorator needs to live here, because it needs to write to
|
# NB: This decorator needs to live here, because it needs to write to
|
||||||
|
@ -1011,9 +1056,9 @@ class component(object):
|
||||||
obj.requires = self.requires
|
obj.requires = self.requires
|
||||||
obj.retokenizes = self.retokenizes
|
obj.retokenizes = self.retokenizes
|
||||||
|
|
||||||
def factory(nlp, **cfg):
|
def factory(nlp, model, **cfg):
|
||||||
if hasattr(obj, "from_nlp"):
|
if hasattr(obj, "from_nlp"):
|
||||||
return obj.from_nlp(nlp, **cfg)
|
return obj.from_nlp(nlp, model, **cfg)
|
||||||
elif isinstance(obj, type):
|
elif isinstance(obj, type):
|
||||||
return obj()
|
return obj()
|
||||||
return obj
|
return obj
|
||||||
|
|
|
@ -1,227 +0,0 @@
|
||||||
from spacy import util
|
|
||||||
from spacy.ml.extract_ngrams import extract_ngrams
|
|
||||||
|
|
||||||
from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
|
||||||
from ..errors import Errors
|
|
||||||
from ._character_embed import CharacterEmbed
|
|
||||||
|
|
||||||
from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
|
|
||||||
from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
|
|
||||||
from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
|
|
||||||
from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
|
|
||||||
from thinc.api import zero_init
|
|
||||||
|
|
||||||
|
|
||||||
def build_text_classifier(arch, config):
    """Pick a text classifier builder by architecture name and call it.

    arch (str): "cnn" selects build_simple_cnn_text_classifier, "bow"
        selects build_bow_text_classifier.
    config (dict): keyword arguments forwarded to the selected builder.
    RETURNS: whatever the selected builder returns.
    RAISES: ValueError for any other value of `arch`.
    """
    if arch == "cnn":
        builder = build_simple_cnn_text_classifier
    elif arch == "bow":
        builder = build_bow_text_classifier
    else:
        raise ValueError("Unexpected textcat arch")
    return builder(**config)
|
|
||||||
|
|
||||||
|
|
||||||
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg):
    """
    Build a simple CNN text classifier on top of a token-to-vector model.

    The tok2vec output is converted with list2ragged, pooled with
    reduce_mean and fed to an output layer of size nr_class. With
    exclusive_classes=True the output layer is a Softmax (outputs sum to 1);
    otherwise it is a Linear layer followed by Logistic (outputs in [0, 1]).
    Extra keyword arguments in **cfg are accepted but not used here.
    """
    token_width = tok2vec.get_dim("nO")
    with Model.define_operators({">>": chain}):
        if not exclusive_classes:
            # TODO: experiment with init_w=zero_init
            output_layer = Linear(nO=nr_class, nI=token_width) >> Logistic()
        else:
            output_layer = Softmax(nO=nr_class, nI=token_width)
        pooled = tok2vec >> list2ragged() >> reduce_mean()
        model = pooled >> output_layer
    model.set_ref("tok2vec", tok2vec)
    model.set_dim("nO", nr_class)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def build_bow_text_classifier(
    nr_class, exclusive_classes, ngram_size=1, no_output_layer=False, **cfg
):
    """Build a bag-of-ngrams text classifier.

    nr_class: output size of the SparseLinear layer and the model's "nO" dim.
    exclusive_classes: choose Softmax (true) or Logistic (false) as the
        output layer.
    ngram_size: passed to extract_ngrams, which reads the ORTH attribute.
    no_output_layer: if true, the model ends at the SparseLinear layer.
    **cfg: accepted but not used here.
    RETURNS: the composed model; each part has had to_cpu() called on it.
    """
    with Model.define_operators({">>": chain}):
        ngrams = extract_ngrams(ngram_size, attr=ORTH)
        model = ngrams >> SparseLinear(nr_class)
        model.to_cpu()
        if not no_output_layer:
            if exclusive_classes:
                output_layer = Softmax(nO=nr_class)
            else:
                output_layer = Logistic(nO=nr_class)
            output_layer.to_cpu()
            model = model >> output_layer
    model.set_dim("nO", nr_class)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
    """Build the encoder model for the entity linker.

    embed_width: passed to Tok2Vec as `embed_size`.
    hidden_width: passed to Tok2Vec as `width`; also the size of the
        residual Maxout layer and the input size of the final Linear layer.
    ner_types: accepted but not used in this function.
    **cfg: must contain "entity_width" (output size). May contain
        "conv_depth" (default 2), "cnn_maxout_pieces" (default 3) and
        "pretrained_vectors" (default None), all forwarded to Tok2Vec.
    RETURNS: the model after initialize(), with ref "tok2vec" and dim "nO".
    RAISES: ValueError (Errors.E144) if "entity_width" is not in cfg.
    """
    if "entity_width" not in cfg:
        raise ValueError(Errors.E144.format(param="entity_width"))
    context_width = cfg["entity_width"]

    tok2vec_settings = {
        "width": hidden_width,
        "embed_size": embed_width,
        "pretrained_vectors": cfg.get("pretrained_vectors", None),
        "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3),
        "subword_features": True,
        "conv_depth": cfg.get("conv_depth", 2),
        "bilstm_depth": 0,
    }

    with Model.define_operators({">>": chain, "**": clone}):
        nel_tok2vec = Tok2Vec(**tok2vec_settings)
        # Extend the chain one layer at a time so layers are created in the
        # same left-to-right order as a single `a >> b >> c` expression.
        model = nel_tok2vec >> list2ragged() >> reduce_mean()
        model = model >> residual(
            Maxout(nO=hidden_width, nI=hidden_width, nP=2, dropout=0.0)
        )
        model = model >> Linear(nO=context_width, nI=hidden_width)
        model.initialize()

    model.set_ref("tok2vec", nel_tok2vec)
    model.set_dim("nO", context_width)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def masked_language_model(*args, **kwargs):
    """Placeholder: raises NotImplementedError whatever arguments it is given."""
    raise NotImplementedError()
|
|
||||||
|
|
||||||
|
|
||||||
def build_tagger_model(nr_class, tok2vec):
    """Chain a token-to-vector model with a per-token Softmax layer.

    nr_class: output size (nO) of the Softmax layer.
    tok2vec: model whose "nO" dim is used as the Softmax input size.
    RETURNS: chain(tok2vec, softmax) with refs "tok2vec" and "softmax" set.
    """
    # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
    softmax = with_array(
        Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"), init_W=zero_init)
    )
    model = chain(tok2vec, softmax)
    for ref_name, layer in (("tok2vec", tok2vec), ("softmax", softmax)):
        model.set_ref(ref_name, layer)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def build_morphologizer_model(class_nums, **cfg):
    """Build a tok2vec model followed by a per-token MultiSoftmax layer.

    class_nums: passed to MultiSoftmax as `nOs`.
    **cfg: optional settings read here:
        "token_vector_width": MultiSoftmax input size; when absent, taken
            from util.env_opt("token_vector_width", 128).
        "pretrained_vectors", "char_embed" (default True): forwarded to
            Tok2Vec when a new one is built.
        "tok2vec": an existing model to use instead of building a Tok2Vec.
    RETURNS: the chained model with refs "tok2vec" and "softmax" set.
    """
    embed_size = util.env_opt("embed_size", 7000)
    token_vector_width = (
        cfg["token_vector_width"]
        if "token_vector_width" in cfg
        else util.env_opt("token_vector_width", 128)
    )
    pretrained_vectors = cfg.get("pretrained_vectors")
    char_embed = cfg.get("char_embed", True)
    with Model.define_operators({">>": chain, "+": add, "**": clone}):
        if "tok2vec" not in cfg:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                char_embed=char_embed,
                pretrained_vectors=pretrained_vectors,
            )
        else:
            tok2vec = cfg["tok2vec"]
        softmax = with_array(MultiSoftmax(nOs=class_nums, nI=token_vector_width))
        model = tok2vec >> softmax
    for ref_name, layer in (("tok2vec", tok2vec), ("softmax", softmax)):
        model.set_ref(ref_name, layer)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(
    width,
    embed_size,
    pretrained_vectors=None,
    window_size=1,
    cnn_maxout_pieces=3,
    subword_features=True,
    char_embed=False,
    conv_depth=4,
    bilstm_depth=0,
):
    """Assemble a token-to-vector model: an embedding stage followed by
    `conv_depth` clones of a residual expand_window + Maxout layer, and
    optionally a PyTorchLSTM.

    width: nO of every embedding, Maxout and LSTM layer built here; also set
        as the returned model's "nO" dim.
    embed_size: nV of the NORM HashEmbed; the PREFIX/SUFFIX/SHAPE tables use
        embed_size // 2.
    pretrained_vectors: if not None, passed to StaticVectors as `vectors`
        and concatenated with the hash embeddings.
    window_size: passed to expand_window.
    cnn_maxout_pieces: nP of the convolution Maxout and of the
        `reduce_dimensions` Maxout used with char_embed.
    subword_features: build PREFIX/SUFFIX/SHAPE embeddings; forced to False
        when char_embed is true.
    char_embed: use CharacterEmbed(nM=64, nC=8) alongside the NORM embedding.
    conv_depth: number of clones of the convolution layer, and the `pad`
        value passed to with_array.
    bilstm_depth: if >= 1, append PyTorchLSTM(depth=bilstm_depth, bi=True).
    RETURNS: the composed model with dim "nO" and ref "embed" set.
    """
    if char_embed:
        # Character embeddings replace the prefix/suffix/shape features.
        subword_features = False
    # Position in this list is the column index each embedding layer reads.
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
        if subword_features:
            prefix = HashEmbed(
                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0
            )
            suffix = HashEmbed(
                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0
            )
            shape = HashEmbed(
                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0
            )
        else:
            prefix, suffix, shape = (None, None, None)
        if pretrained_vectors is not None:
            glove = StaticVectors(
                vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0
            )

            if subword_features:
                # Five layers of nO=width are concatenated, hence nI=width * 5.
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape)
                    >> Maxout(
                        nO=width, nI=width * 5, nP=3, dropout=0.0, normalize=True
                    ),
                    column=cols.index(ORTH),
                )
            else:
                embed = uniqued(
                    (glove | norm)
                    >> Maxout(
                        nO=width, nI=width * 2, nP=3, dropout=0.0, normalize=True
                    ),
                    column=cols.index(ORTH),
                )
        elif subword_features:
            embed = uniqued(
                concatenate(norm, prefix, suffix, shape)
                >> Maxout(nO=width, nI=width * 4, nP=3, dropout=0.0, normalize=True),
                column=cols.index(ORTH),
            )
        elif char_embed:
            # Python's `>>` binds tighter than `|`, so this parses as
            # CharacterEmbed(...) | (FeatureExtractor(cols) >> with_array(norm)).
            embed = CharacterEmbed(nM=64, nC=8) | FeatureExtractor(cols) >> with_array(
                norm
            )
            # NOTE(review): 64 * 8 presumably mirrors the hard-coded nM=64,
            # nC=8 above - confirm before changing either value.
            reduce_dimensions = Maxout(
                nO=width,
                nI=64 * 8 + width,
                nP=cnn_maxout_pieces,
                dropout=0.0,
                normalize=True,
            )
        else:
            embed = norm

        # NOTE(review): nI is fixed at width * 3 whatever window_size is;
        # presumably only window_size=1 is consistent with it - confirm.
        convolution = residual(
            expand_window(window_size=window_size)
            >> Maxout(
                nO=width,
                nI=width * 3,
                nP=cnn_maxout_pieces,
                dropout=0.0,
                normalize=True,
            )
        )
        if char_embed:
            # NOTE(review): `reduce_dimensions` is assigned only in the
            # `elif char_embed` branch above. With char_embed true and
            # pretrained_vectors not None that branch is skipped, so this
            # line looks like it would hit an unbound local - confirm.
            # `**` binds tighter than `>>`: the clone happens before chaining.
            tok2vec = embed >> with_array(
                reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
            )
        else:
            tok2vec = FeatureExtractor(cols) >> with_array(
                embed >> convolution ** conv_depth, pad=conv_depth
            )

        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchLSTM(
                nO=width, nI=width, depth=bilstm_depth, bi=True
            )
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.set_dim("nO", width)
        tok2vec.set_ref("embed", embed)
    return tok2vec
|
|
6
spacy/ml/models/__init__.py
Normal file
6
spacy/ml/models/__init__.py
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
from .entity_linker import *
|
||||||
|
from .parser import *
|
||||||
|
from .tagger import *
|
||||||
|
from .tensorizer import *
|
||||||
|
from .textcat import *
|
||||||
|
from .tok2vec import *
|
93
spacy/ml/models/defaults/__init__.py
Normal file
93
spacy/ml/models/defaults/__init__.py
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .... import util
|
||||||
|
|
||||||
|
|
||||||
|
def default_nel_config():
    """Return util.load_config's result for entity_linker_defaults.cfg (the
    file next to this module), loaded with create_objects=False."""
    cfg_path = Path(__file__).with_name("entity_linker_defaults.cfg")
    return util.load_config(cfg_path, create_objects=False)
|
||||||
|
|
||||||
|
|
||||||
|
def default_nel():
    """Load entity_linker_defaults.cfg (next to this module) with
    create_objects=True and return its "model" entry."""
    cfg_path = Path(__file__).with_name("entity_linker_defaults.cfg")
    loaded = util.load_config(cfg_path, create_objects=True)
    return loaded["model"]
|
||||||
|
|
||||||
|
|
||||||
|
def default_morphologizer_config():
    """Return util.load_config's result for morphologizer_defaults.cfg (the
    file next to this module), loaded with create_objects=False."""
    cfg_path = Path(__file__).with_name("morphologizer_defaults.cfg")
    return util.load_config(cfg_path, create_objects=False)
|
||||||
|
|
||||||
|
|
||||||
|
def default_morphologizer():
    """Load morphologizer_defaults.cfg (next to this module) with
    create_objects=True and return its "model" entry."""
    cfg_path = Path(__file__).with_name("morphologizer_defaults.cfg")
    loaded = util.load_config(cfg_path, create_objects=True)
    return loaded["model"]
|
||||||
|
|
||||||
|
|
||||||
|
def default_parser_config():
    """Return util.load_config's result for parser_defaults.cfg (the file
    next to this module), loaded with create_objects=False."""
    cfg_path = Path(__file__).with_name("parser_defaults.cfg")
    return util.load_config(cfg_path, create_objects=False)
|
||||||
|
|
||||||
|
|
||||||
|
def default_parser():
    """Load parser_defaults.cfg (next to this module) with
    create_objects=True and return its "model" entry."""
    cfg_path = Path(__file__).with_name("parser_defaults.cfg")
    loaded = util.load_config(cfg_path, create_objects=True)
    return loaded["model"]
|
||||||
|
|
||||||
|
|
||||||
|
def default_ner_config():
|
||||||
|
loc = Path(__file__).parent / "ner_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=False)
|
||||||
|
|
||||||
|
|
||||||
|
def default_ner():
|
||||||
|
loc = Path(__file__).parent / "ner_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=True)["model"]
|
||||||
|
|
||||||
|
|
||||||
|
def default_sentrec_config():
|
||||||
|
loc = Path(__file__).parent / "sentrec_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=False)
|
||||||
|
|
||||||
|
|
||||||
|
def default_sentrec():
|
||||||
|
loc = Path(__file__).parent / "sentrec_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=True)["model"]
|
||||||
|
|
||||||
|
|
||||||
|
def default_tagger_config():
|
||||||
|
loc = Path(__file__).parent / "tagger_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=False)
|
||||||
|
|
||||||
|
|
||||||
|
def default_tagger():
|
||||||
|
loc = Path(__file__).parent / "tagger_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=True)["model"]
|
||||||
|
|
||||||
|
|
||||||
|
def default_tensorizer_config():
|
||||||
|
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=False)
|
||||||
|
|
||||||
|
|
||||||
|
def default_tensorizer():
|
||||||
|
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=True)["model"]
|
||||||
|
|
||||||
|
|
||||||
|
def default_textcat_config():
|
||||||
|
loc = Path(__file__).parent / "textcat_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=False)
|
||||||
|
|
||||||
|
|
||||||
|
def default_textcat():
|
||||||
|
loc = Path(__file__).parent / "textcat_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=True)["model"]
|
||||||
|
|
||||||
|
|
||||||
|
def default_tok2vec_config():
|
||||||
|
loc = Path(__file__).parent / "tok2vec_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=False)
|
||||||
|
|
||||||
|
|
||||||
|
def default_tok2vec():
|
||||||
|
loc = Path(__file__).parent / "tok2vec_defaults.cfg"
|
||||||
|
return util.load_config(loc, create_objects=True)["model"]
|
12
spacy/ml/models/defaults/entity_linker_defaults.cfg
Normal file
12
spacy/ml/models/defaults/entity_linker_defaults.cfg
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.EntityLinker.v1"
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 96
|
||||||
|
depth = 2
|
||||||
|
embed_size = 300
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
14
spacy/ml/models/defaults/morphologizer_defaults.cfg
Normal file
14
spacy/ml/models/defaults/morphologizer_defaults.cfg
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.HashCharEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 128
|
||||||
|
depth = 4
|
||||||
|
embed_size = 7000
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
||||||
|
nM = 64
|
||||||
|
nC = 8
|
15
spacy/ml/models/defaults/ner_defaults.cfg
Normal file
15
spacy/ml/models/defaults/ner_defaults.cfg
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 6
|
||||||
|
hidden_width = 64
|
||||||
|
maxout_pieces = 2
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 96
|
||||||
|
depth = 4
|
||||||
|
embed_size = 2000
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
15
spacy/ml/models/defaults/parser_defaults.cfg
Normal file
15
spacy/ml/models/defaults/parser_defaults.cfg
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 8
|
||||||
|
hidden_width = 64
|
||||||
|
maxout_pieces = 2
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 96
|
||||||
|
depth = 4
|
||||||
|
embed_size = 2000
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
14
spacy/ml/models/defaults/sentrec_defaults.cfg
Normal file
14
spacy/ml/models/defaults/sentrec_defaults.cfg
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.HashCharEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 12
|
||||||
|
depth = 1
|
||||||
|
embed_size = 2000
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 2
|
||||||
|
subword_features = true
|
||||||
|
nM = 64
|
||||||
|
nC = 8
|
12
spacy/ml/models/defaults/tagger_defaults.cfg
Normal file
12
spacy/ml/models/defaults/tagger_defaults.cfg
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 96
|
||||||
|
depth = 4
|
||||||
|
embed_size = 2000
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
4
spacy/ml/models/defaults/tensorizer_defaults.cfg
Normal file
4
spacy/ml/models/defaults/tensorizer_defaults.cfg
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.Tensorizer.v1"
|
||||||
|
input_size=96
|
||||||
|
output_size=300
|
13
spacy/ml/models/defaults/textcat_defaults.cfg
Normal file
13
spacy/ml/models/defaults/textcat_defaults.cfg
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.TextCatCNN.v1"
|
||||||
|
exclusive_classes = false
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 96
|
||||||
|
depth = 4
|
||||||
|
embed_size = 2000
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
9
spacy/ml/models/defaults/tok2vec_defaults.cfg
Normal file
9
spacy/ml/models/defaults/tok2vec_defaults.cfg
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 96
|
||||||
|
depth = 4
|
||||||
|
embed_size = 2000
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
23
spacy/ml/models/entity_linker.py
Normal file
23
spacy/ml/models/entity_linker.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
|
||||||
|
from thinc.api import Model, Maxout, Linear
|
||||||
|
|
||||||
|
from spacy.util import registry
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.EntityLinker.v1")
def build_nel_encoder(tok2vec, nO=None):
    """Build the entity linker encoder around a token-to-vector model.

    The tok2vec output is converted with list2ragged, mean-reduced, passed
    through a residual Maxout block and finally through a Linear layer.

    tok2vec (Model): The token-to-vector layer; its "nO" dim sets the widths.
    nO (int): Output width of the final Linear layer (may be None).
    RETURNS (Model): The chained model, with refs "output_layer" and "tok2vec".
    """
    width = tok2vec.get_dim("nO")
    with Model.define_operators({">>": chain, "**": clone}):
        final_linear = Linear(nO=nO, nI=width)
        encoder = (
            tok2vec
            >> list2ragged()
            >> reduce_mean()
            >> residual(Maxout(nO=width, nI=width, nP=2, dropout=0.0))
            >> final_linear
        )
        # Expose the layers other code looks up by name.
        encoder.set_ref("output_layer", final_linear)
        encoder.set_ref("tok2vec", tok2vec)
    return encoder
|
29
spacy/ml/models/multi_task.py
Normal file
29
spacy/ml/models/multi_task.py
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init
|
||||||
|
|
||||||
|
|
||||||
|
def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96):
    """Chain tok2vec with a Maxout, LayerNorm and Softmax tagging head.

    n_tags (int): Output size of the Softmax layer.
    tok2vec (Model): First layer of the chain.
    token_vector_width (int): Input width of the Maxout layer; the hidden
        layers are twice this wide.
    RETURNS (Model): The chained model.
    """
    hidden_width = token_vector_width * 2
    layers = [
        tok2vec,
        Maxout(nO=hidden_width, nI=token_vector_width, nP=3, dropout=0.0),
        LayerNorm(hidden_width),
        Softmax(nO=n_tags, nI=hidden_width),
    ]
    return chain(*layers)
|
||||||
|
|
||||||
|
|
||||||
|
def build_cloze_multi_task_model(vocab, tok2vec):
    """Build a cloze-style model predicting vectors of the vocab's width.

    vocab: Object whose ``vectors.data.shape[1]`` gives the output width.
    tok2vec (Model): Token-to-vector layer; its "nO" dim is the Maxout input.
    RETURNS: Whatever ``build_masked_language_model`` returns for the chain.
    """
    vector_width = vocab.vectors.data.shape[1]
    hidden = Maxout(
        nO=vector_width,
        nI=tok2vec.get_dim("nO"),
        nP=3,
        normalize=True,
        dropout=0.0,
    )
    projection = Linear(nO=vector_width, nI=vector_width, init_W=zero_init)
    predictor = chain(tok2vec, chain(hidden, projection))
    return build_masked_language_model(vocab, predictor)
|
||||||
|
|
||||||
|
|
||||||
|
def build_masked_language_model(*args, **kwargs):
    """Placeholder for the masked language model wrapper.

    Not ported yet: every call raises NotImplementedError, whatever the
    arguments.
    """
    # TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828
    raise NotImplementedError
|
33
spacy/ml/models/parser.py
Normal file
33
spacy/ml/models/parser.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
from pydantic import StrictInt
|
||||||
|
|
||||||
|
from spacy.util import registry
|
||||||
|
from spacy.ml._layers import PrecomputableAffine
|
||||||
|
from spacy.syntax._parser_model import ParserModel
|
||||||
|
|
||||||
|
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.TransitionBasedParser.v1")
def build_tb_parser_model(
    tok2vec: Model,
    nr_feature_tokens: StrictInt,
    hidden_width: StrictInt,
    maxout_pieces: StrictInt,
    nO=None,
):
    """Assemble the transition-based parser model.

    tok2vec (Model): Token-to-vector layer; chained with list2array here.
    nr_feature_tokens (int): Passed as nF to the PrecomputableAffine layer.
    hidden_width (int): Passed as nO to the PrecomputableAffine layer.
    maxout_pieces (int): Passed as nP to the PrecomputableAffine layer.
    nO (int): Output width of the upper Linear layer (may be None).
    RETURNS (ParserModel): Model wrapping the tok2vec, lower and upper layers.
    """
    width = tok2vec.get_dim("nO")
    encoder = chain(tok2vec, list2array())
    # The chain does not carry the width over, so set it explicitly.
    encoder.set_dim("nO", width)

    lower = PrecomputableAffine(
        nO=hidden_width,
        nF=nr_feature_tokens,
        nI=encoder.get_dim("nO"),
        nP=maxout_pieces,
    )
    lower.set_dim("nP", maxout_pieces)
    with use_ops("numpy"):
        # Initialize weights at zero, as it's a classification layer.
        upper = Linear(nO=nO, init_W=zero_init)
    return ParserModel(encoder, lower, upper)
|
16
spacy/ml/models/tagger.py
Normal file
16
spacy/ml/models/tagger.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
from thinc.api import zero_init, with_array, Softmax, chain, Model
|
||||||
|
|
||||||
|
from spacy.util import registry
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.Tagger.v1")
def build_tagger_model(tok2vec, nO=None) -> Model:
    """Chain a tok2vec layer with a zero-initialised Softmax output.

    tok2vec (Model): Token-to-vector layer; its "nO" dim is the Softmax input.
    nO (int): Output width of the Softmax layer (may be None).
    RETURNS (Model): The chain, with refs "tok2vec", "softmax" and
        "output_layer".
    """
    width = tok2vec.get_dim("nO")
    # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
    output_layer = Softmax(nO, nI=width, init_W=zero_init)
    softmax = with_array(output_layer)
    tagger = chain(tok2vec, softmax)
    named_layers = (
        ("tok2vec", tok2vec),
        ("softmax", softmax),
        ("output_layer", output_layer),
    )
    for ref_name, layer in named_layers:
        tagger.set_ref(ref_name, layer)
    return tagger
|
10
spacy/ml/models/tensorizer.py
Normal file
10
spacy/ml/models/tensorizer.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
from thinc.api import Linear, zero_init
|
||||||
|
|
||||||
|
from ... import util
|
||||||
|
from ...util import registry
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.Tensorizer.v1")
def build_tensorizer(input_size, output_size):
    """Build the tensorizer as a single zero-initialised Linear layer.

    input_size (int): Fallback passed to util.env_opt("token_vector_width").
    output_size (int): Output width of the Linear layer.
    RETURNS (Model): The Linear layer.
    """
    width = util.env_opt("token_vector_width", input_size)
    return Linear(output_size, width, init_W=zero_init)
|
42
spacy/ml/models/textcat.py
Normal file
42
spacy/ml/models/textcat.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
from spacy.attrs import ORTH
|
||||||
|
from spacy.util import registry
|
||||||
|
from spacy.ml.extract_ngrams import extract_ngrams
|
||||||
|
|
||||||
|
from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.TextCatCNN.v1")
def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
    """
    Build a simple CNN text classifier on top of a token-to-vector model.
    With exclusive_classes=True the output layer is a Softmax, so the scores
    sum to 1. With exclusive_classes=False a Linear layer followed by a
    Logistic non-linearity is used instead, giving scores in the range [0, 1].
    """
    with Model.define_operators({">>": chain}):
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO"))
            model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
        else:
            # TODO: experiment with init_w=zero_init
            output_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
            model = (
                tok2vec
                >> list2ragged()
                >> reduce_mean()
                >> output_layer
                >> Logistic()
            )
        # In both variants the ref points at the layer that holds the weights.
        model.set_ref("output_layer", output_layer)
    model.set_ref("tok2vec", tok2vec)
    model.set_dim("nO", nO)
    return model
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.TextCatBOW.v1")
def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):
    """Build a bag-of-ngrams text classifier.

    exclusive_classes (bool): Use Softmax if true, Logistic otherwise.
    ngram_size (int): Passed to extract_ngrams.
    no_output_layer (bool): If true, return the sparse linear model as is.
    nO (int): Width passed to SparseLinear and to the output layer.
    RETURNS (Model): The classifier; has an "output_layer" ref only when an
        output layer was added.
    """
    # Note: original defaults were ngram_size=1 and no_output_layer=False
    with Model.define_operators({">>": chain}):
        model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nO)
        model.to_cpu()
        if no_output_layer:
            return model
        if exclusive_classes:
            output_layer = Softmax(nO)
        else:
            output_layer = Logistic(nO)
        output_layer.to_cpu()
        model = model >> output_layer
        model.set_ref("output_layer", output_layer)
    return model
|
390
spacy/ml/models/tok2vec.py
Normal file
390
spacy/ml/models/tok2vec.py
Normal file
|
@ -0,0 +1,390 @@
|
||||||
|
from thinc.api import chain, clone, concatenate, with_array, uniqued
|
||||||
|
from thinc.api import Model, noop, with_padded, Maxout, expand_window
|
||||||
|
from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM
|
||||||
|
from thinc.api import residual, LayerNorm, FeatureExtractor, Mish
|
||||||
|
|
||||||
|
from ... import util
|
||||||
|
from ...util import registry, make_layer
|
||||||
|
from ...ml import _character_embed
|
||||||
|
from ...pipeline.tok2vec import Tok2VecListener
|
||||||
|
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.Tok2VecTensors.v1")
def tok2vec_tensors_v1(width):
    """Create a Tok2VecListener named "tok2vec" with the given width."""
    return Tok2VecListener("tok2vec", width=width)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.VocabVectors.v1")
def get_vocab_vectors(name):
    """Load the pipeline `name` via util.load_model and return its vocab's vectors."""
    return util.load_model(name).vocab.vectors
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.Tok2Vec.v1")
def Tok2Vec(config):
    """Compose a tok2vec model from the "@doc2feats", "@embed" and "@encode"
    sub-configs.

    config (dict): Must contain the three sub-configs named above.
    RETURNS (Model): The chain, with refs "embed" and "encode", the config
        stored under attrs["cfg"] and "nO" taken from the encoder.
    """
    doc2feats, embed, encode = (
        make_layer(config[key]) for key in ("@doc2feats", "@embed", "@encode")
    )
    # Pad by the encoder's receptive field when it declares one.
    if encode.has_attr("receptive_field"):
        padding = encode.attrs["receptive_field"]
    else:
        padding = 0
    model = chain(doc2feats, with_array(chain(embed, encode), pad=padding))
    model.attrs["cfg"] = config
    model.set_dim("nO", encode.get_dim("nO"))
    model.set_ref("embed", embed)
    model.set_ref("encode", encode)
    return model
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.Doc2Feats.v1")
def Doc2Feats(config):
    """Create a FeatureExtractor for the attributes in config["columns"]."""
    return FeatureExtractor(config["columns"])
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.HashEmbedCNN.v1")
def hash_embed_cnn(
    pretrained_vectors,
    width,
    depth,
    embed_size,
    maxout_pieces,
    window_size,
    subword_features,
):
    """Build a hash-embedding CNN tok2vec via build_Tok2Vec_model.

    `depth` is used as the convolution depth; no BiLSTM layers are added.
    """
    settings = {
        "width": width,
        "embed_size": embed_size,
        "pretrained_vectors": pretrained_vectors,
        "conv_depth": depth,
        "bilstm_depth": 0,
        "maxout_pieces": maxout_pieces,
        "window_size": window_size,
        "subword_features": subword_features,
        # Does not use character embeddings: set to False by default
        "char_embed": False,
        "nM": 0,
        "nC": 0,
    }
    return build_Tok2Vec_model(**settings)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.HashCharEmbedCNN.v1")
def hash_charembed_cnn(
    pretrained_vectors,
    width,
    depth,
    embed_size,
    maxout_pieces,
    window_size,
    subword_features,
    nM=0,
    nC=0,
):
    """Build a character-embedding CNN tok2vec via build_Tok2Vec_model.

    `depth` is used as the convolution depth; no BiLSTM layers are added.
    """
    settings = {
        "width": width,
        "embed_size": embed_size,
        "pretrained_vectors": pretrained_vectors,
        "conv_depth": depth,
        "bilstm_depth": 0,
        "maxout_pieces": maxout_pieces,
        "window_size": window_size,
        "subword_features": subword_features,
        # Allows using character embeddings by setting nC, nM and char_embed=True
        "char_embed": True,
        "nM": nM,
        "nC": nC,
    }
    return build_Tok2Vec_model(**settings)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
def hash_embed_bilstm_v1(
    pretrained_vectors, width, depth, embed_size, subword_features
):
    """Build a hash-embedding BiLSTM tok2vec via build_Tok2Vec_model.

    `depth` is used as the BiLSTM depth; the convolution depth is 0.
    """
    settings = {
        "width": width,
        "embed_size": embed_size,
        "pretrained_vectors": pretrained_vectors,
        "bilstm_depth": depth,
        "conv_depth": 0,
        "maxout_pieces": 0,
        "window_size": 1,
        "subword_features": subword_features,
        # Does not use character embeddings: set to False by default
        "char_embed": False,
        "nM": 0,
        "nC": 0,
    }
    return build_Tok2Vec_model(**settings)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
def hash_embed_bilstm_v1(
    pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
):
    """Build a character-embedding BiLSTM tok2vec via build_Tok2Vec_model.

    `depth` is used as the BiLSTM depth; the convolution depth is 0.
    """
    # NOTE(review): this def has the same Python name as the function
    # registered for "spacy.HashEmbedBiLSTM.v1" earlier in this module, so it
    # rebinds that module-level name. Both registrations still happen via the
    # decorators; confirm whether a distinct function name was intended.
    settings = {
        "width": width,
        "embed_size": embed_size,
        "pretrained_vectors": pretrained_vectors,
        "bilstm_depth": depth,
        "conv_depth": 0,
        "maxout_pieces": 0,
        "window_size": 1,
        "subword_features": subword_features,
        # Allows using character embeddings by setting nC, nM and char_embed=True
        "char_embed": True,
        "nM": nM,
        "nC": nC,
    }
    return build_Tok2Vec_model(**settings)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(config):
    """Build an embedding layer from several hashed feature tables.

    config (dict): Reads "columns", "width", "rows", "use_subwords", "@mix"
        and, optionally, "@pretrained_vectors".
    RETURNS (Model): The embedding layer, with the config in attrs["cfg"].
    """
    # For backwards compatibility with models before the architecture registry,
    # we have to be careful to get exactly the same model structure. One subtle
    # trick is that when we define concatenation with the operator, the operator
    # is actually binary associative. So when we write (a | b | c), we're actually
    # getting concatenate(concatenate(a, b), c). That's why the implementation
    # is a bit ugly here.
    cols = config["columns"]
    width = config["width"]
    rows = config["rows"]
    use_subwords = config["use_subwords"]
    # The key is optional: read it once with .get() so that a config without
    # it selects the vector-free branches instead of raising KeyError.
    vectors_cfg = config.get("@pretrained_vectors")

    norm = HashEmbed(width, rows, column=cols.index("NORM"))
    if use_subwords:
        prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"))
        suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"))
        shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"))
    if vectors_cfg:
        glove = make_layer(vectors_cfg)
    mix = make_layer(config["@mix"])

    with Model.define_operators({">>": chain, "|": concatenate}):
        if use_subwords and vectors_cfg:
            mix._layers[0].set_dim("nI", width * 5)
            layer = uniqued(
                (glove | norm | prefix | suffix | shape) >> mix,
                column=cols.index("ORTH"),
            )
        elif use_subwords:
            mix._layers[0].set_dim("nI", width * 4)
            layer = uniqued(
                (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
            )
        elif vectors_cfg:
            mix._layers[0].set_dim("nI", width * 2)
            layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"))
        else:
            layer = norm
    layer.attrs["cfg"] = config
    return layer
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(config):
    """Concatenate character embeddings with other feature tables and mix them.

    config (dict): Reads "width", "chars", "@embed_features" and "@mix".
    RETURNS (Model): The chained model, with the config in attrs["cfg"].
    """
    n_dims = config["width"]
    n_chars = config["chars"]

    char_table = _character_embed.CharacterEmbed(nM=n_dims, nC=n_chars)
    feature_tables = make_layer(config["@embed_features"])
    mixer = make_layer(config["@mix"])

    embedder = chain(concatenate(char_table, feature_tables), mixer)
    embedder.attrs["cfg"] = config
    return embedder
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(config):
    """Build a stack of residual window-convolution layers with Maxout units.

    config (dict): Reads "width", "window_size", "pieces" and "depth".
    RETURNS (Model): `depth` clones of the residual block, with "nO" set to
        the width and attrs["receptive_field"] set to window_size * depth.
    """
    nO = config["width"]
    nW = config["window_size"]
    nP = config["pieces"]
    depth = config["depth"]

    # residual() wraps a single layer, so the two steps have to be combined
    # with chain() (as MishWindowEncoder does) rather than passed as a tuple.
    cnn = chain(
        expand_window(window_size=nW),
        Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
    )
    model = clone(residual(cnn), depth)
    model.set_dim("nO", nO)
    model.attrs["receptive_field"] = nW * depth
    return model
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(config):
    """Build a stack of residual window-convolution layers with Mish units.

    config (dict): Reads "width", "window_size" and "depth".
    RETURNS (Model): `depth` clones of the residual block, with "nO" set to
        the width.
    """
    width = config["width"]
    window = config["window_size"]
    n_layers = config["depth"]

    block = chain(
        expand_window(window_size=window),
        Mish(nO=width, nI=width * ((window * 2) + 1)),
        LayerNorm(width),
    )
    encoder = clone(residual(block), n_layers)
    encoder.set_dim("nO", width)
    return encoder
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config):
    """Build a bidirectional PyTorch LSTM encoder, or a no-op for depth 0.

    config (dict): Reads "width" and "depth".
    RETURNS (Model): noop() when depth is 0, otherwise the wrapped LSTM
        applied through with_padded.
    """
    width = config["width"]
    depth = config["depth"]
    if depth == 0:
        # No recurrent layers requested: return before touching torch, so the
        # no-op encoder does not depend on the imports below.
        return noop()

    import torch.nn

    # TODO FIX
    from thinc.api import PyTorchRNNWrapper

    return with_padded(
        PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
    )
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: update
|
||||||
|
# Example of the nested config layout consumed by the registered
# architectures: each "@..." key holds an {"arch": ..., "config": ...} pair.
_EXAMPLE_CONFIG = {
    "@doc2feats": {
        "arch": "Doc2Feats",
        "config": {
            "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
        },
    },
    "@embed": {
        "arch": "spacy.MultiHashEmbed.v1",
        "config": {
            "width": 96,
            "rows": 2000,
            "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
            "use_subwords": True,
            "@pretrained_vectors": {
                "arch": "TransformedStaticVectors",
                "config": {
                    "vectors_name": "en_vectors_web_lg.vectors",
                    "width": 96,
                    "column": 0,
                },
            },
            "@mix": {
                "arch": "LayerNormalizedMaxout",
                "config": {
                    "width": 96,
                    "pieces": 3,
                },
            },
        },
    },
    "@encode": {
        "arch": "MaxoutWindowEncode",
        "config": {
            "width": 96,
            "window_size": 1,
            "depth": 4,
            "pieces": 3,
        },
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
def build_Tok2Vec_model(
    width,
    embed_size,
    pretrained_vectors,
    window_size,
    maxout_pieces,
    subword_features,
    char_embed,
    nM,
    nC,
    conv_depth,
    bilstm_depth,
) -> Model:
    """Build a tok2vec model from hashed, static-vector or character embeddings.

    width (int): Output width of the embedding, convolution and LSTM layers.
    embed_size (int): Number of rows of the NORM table; the subword tables
        get half of that.
    pretrained_vectors: Object whose ``.data`` feeds StaticVectors, or None.
    window_size (int): Window size of each convolution layer.
    maxout_pieces (int): nP of every Maxout layer built here.
    subword_features (bool): Add PREFIX/SUFFIX/SHAPE tables. Forced to False
        when char_embed is set.
    char_embed (bool): Use character embeddings of shape nM x nC.
    nM (int), nC (int): Passed to CharacterEmbed.
    conv_depth (int): Number of cloned convolution layers, also used as pad.
    bilstm_depth (int): Depth of the PyTorchLSTM; skipped if below 1.
    RETURNS (Model): The model, with dim "nO" and ref "embed" set.
    """

    def make_maxout(n_inputs):
        # Every Maxout in this model differs only in its input width.
        return Maxout(
            nO=width,
            nI=n_inputs,
            nP=maxout_pieces,
            dropout=0.0,
            normalize=True,
        )

    if char_embed:
        subword_features = False
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    orth_column = cols.index(ORTH)
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM))
        prefix = suffix = shape = None
        if subword_features:
            prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX))
            suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX))
            shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE))

        if pretrained_vectors is not None:
            glove = StaticVectors(
                vectors=pretrained_vectors.data,
                nO=width,
                column=cols.index(ID),
                dropout=0.0,
            )
            if subword_features:
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape)
                    >> make_maxout(width * 5),
                    column=orth_column,
                )
            else:
                embed = uniqued(
                    (glove | norm) >> make_maxout(width * 2),
                    column=orth_column,
                )
        elif subword_features:
            embed = uniqued(
                concatenate(norm, prefix, suffix, shape) >> make_maxout(width * 4),
                column=orth_column,
            )
        elif char_embed:
            embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) | FeatureExtractor(
                cols
            ) >> with_array(norm)
            reduce_dimensions = make_maxout(nM * nC + width)
        else:
            embed = norm

        convolution = residual(
            expand_window(window_size=window_size)
            >> make_maxout(width * ((window_size * 2) + 1))
        )
        if char_embed:
            # NOTE(review): reduce_dimensions is only assigned in the
            # `elif char_embed` branch above, which is skipped when
            # pretrained_vectors is given - that combination would fail here.
            tok2vec = embed >> with_array(
                reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
            )
        else:
            tok2vec = FeatureExtractor(cols) >> with_array(
                embed >> convolution ** conv_depth, pad=conv_depth
            )

        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchLSTM(
                nO=width, nI=width, depth=bilstm_depth, bi=True
            )
        tok2vec.set_dim("nO", width)
        tok2vec.set_ref("embed", embed)
    return tok2vec
|
|
@ -1,178 +0,0 @@
|
||||||
from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop
|
|
||||||
from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors
|
|
||||||
from thinc.api import residual, LayerNorm, FeatureExtractor
|
|
||||||
|
|
||||||
from ..ml import _character_embed
|
|
||||||
from ..util import make_layer, registry
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Tok2Vec.v1")
def Tok2Vec(config):
    """Compose a tok2vec model from the "@doc2feats", "@embed" and "@encode"
    sub-configs.

    config (dict): Must contain the three sub-configs named above.
    RETURNS (Model): The chain, with refs "embed" and "encode", the config
        stored under attrs["cfg"] and "nO" taken from the encoder.
    """
    doc2feats, embed, encode = (
        make_layer(config[key]) for key in ("@doc2feats", "@embed", "@encode")
    )
    # Pad by the encoder's receptive field when it declares one.
    if encode.has_attr("receptive_field"):
        padding = encode.attrs["receptive_field"]
    else:
        padding = 0
    model = chain(doc2feats, with_array(chain(embed, encode), pad=padding))
    model.attrs["cfg"] = config
    model.set_dim("nO", encode.get_dim("nO"))
    model.set_ref("embed", embed)
    model.set_ref("encode", encode)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Doc2Feats.v1")
def Doc2Feats(config):
    """Create a FeatureExtractor for the attributes in config["columns"]."""
    return FeatureExtractor(config["columns"])
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(config):
    """Build an embedding layer from several hashed feature tables.

    config (dict): Reads "columns", "width", "rows", "use_subwords", "@mix"
        and, optionally, "@pretrained_vectors".
    RETURNS (Model): The embedding layer, with the config in attrs["cfg"].
    """
    # For backwards compatibility with models before the architecture registry,
    # we have to be careful to get exactly the same model structure. One subtle
    # trick is that when we define concatenation with the operator, the operator
    # is actually binary associative. So when we write (a | b | c), we're actually
    # getting concatenate(concatenate(a, b), c). That's why the implementation
    # is a bit ugly here.
    cols = config["columns"]
    width = config["width"]
    rows = config["rows"]
    use_subwords = config["use_subwords"]
    # The key is optional: read it once with .get() so that a config without
    # it selects the vector-free branches instead of raising KeyError.
    vectors_cfg = config.get("@pretrained_vectors")

    norm = HashEmbed(width, rows, column=cols.index("NORM"), dropout=0.0)
    if use_subwords:
        prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"), dropout=0.0)
        suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"), dropout=0.0)
        shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"), dropout=0.0)
    if vectors_cfg:
        glove = make_layer(vectors_cfg)
    mix = make_layer(config["@mix"])

    with Model.define_operators({">>": chain, "|": concatenate}):
        if use_subwords and vectors_cfg:
            mix._layers[0].set_dim("nI", width * 5)
            layer = uniqued(
                (glove | norm | prefix | suffix | shape) >> mix,
                column=cols.index("ORTH"),
            )
        elif use_subwords:
            mix._layers[0].set_dim("nI", width * 4)
            layer = uniqued(
                (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
            )
        elif vectors_cfg:
            mix._layers[0].set_dim("nI", width * 2)
            layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
        else:
            layer = norm
    layer.attrs["cfg"] = config
    return layer
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(config):
    """Concatenate character embeddings with other feature tables and mix them.

    config (dict): Reads "width", "chars", "@embed_features" and "@mix".
    RETURNS (Model): The chained model, with the config in attrs["cfg"].
    """
    n_dims = config["width"]
    n_chars = config["chars"]

    char_table = _character_embed.CharacterEmbed(nM=n_dims, nC=n_chars)
    feature_tables = make_layer(config["@embed_features"])
    mixer = make_layer(config["@mix"])

    embedder = chain(concatenate(char_table, feature_tables), mixer)
    embedder.attrs["cfg"] = config
    return embedder
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(config):
    """Build a stack of residual window-convolution layers with Maxout units.

    config (dict): Reads "width", "window_size", "pieces" and "depth".
    RETURNS (Model): `depth` clones of the residual block, with "nO" set to
        the width and attrs["receptive_field"] set to window_size * depth.
    """
    nO = config["width"]
    nW = config["window_size"]
    nP = config["pieces"]
    depth = config["depth"]
    # residual() wraps a single layer, so the two steps have to be combined
    # with chain() (as MishWindowEncoder does) rather than passed as a tuple.
    cnn = chain(
        expand_window(window_size=nW),
        Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
    )
    model = clone(residual(cnn), depth)
    model.set_dim("nO", nO)
    model.attrs["receptive_field"] = nW * depth
    return model
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(config):
    """Build a stack of residual window-convolution blocks with Mish units.

    config (dict): Must contain "width" (input and output size of every
        block), "window_size" (tokens on each side passed to
        `expand_window`) and "depth" (number of times the residual block is
        cloned).
    RETURNS: The cloned residual stack, with its "nO" dim set to the width.
    """
    from thinc.api import Mish

    width = config["width"]
    window = config["window_size"]
    n_blocks = config["depth"]
    windowed = expand_window(window_size=window)
    # The Mish input is sized for the token plus `window` neighbours on
    # each side.
    hidden = Mish(nO=width, nI=width * ((window * 2) + 1))
    normed = LayerNorm(width)
    block = residual(chain(windowed, hidden, normed))
    encoder = clone(block, n_blocks)
    encoder.set_dim("nO", width)
    return encoder
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.PretrainedVectors.v1")
def PretrainedVectors(config):
    """Build a `StaticVectors` layer from a config dict.

    config (dict): Must contain "vectors_name", "width" and "column", which
        are forwarded as `vectors`, `nO` and `column` respectively.
    RETURNS: The `StaticVectors` layer, created with dropout disabled.
    """
    # TODO: actual vectors instead of name
    settings = {
        "vectors": config["vectors_name"],
        "nO": config["width"],
        "column": config["column"],
        "dropout": 0.0,
    }
    return StaticVectors(**settings)
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config):
    """Build a bidirectional PyTorch LSTM encoder, or a no-op for depth 0.

    config (dict): Must contain "width" (LSTM input size; each direction
        uses a hidden size of `width // 2`) and "depth" (number of LSTM
        layers).
    RETURNS: `noop()` when depth is 0, otherwise the wrapped LSTM applied
        through `with_padded`.
    """
    width = config["width"]
    depth = config["depth"]
    if depth == 0:
        return noop()

    # Import lazily, after the depth check, so that configurations without
    # a BiLSTM do not need PyTorch (or the wrapper below) to be importable.
    import torch.nn

    # TODO: FIX
    from thinc.api import PyTorchRNNWrapper

    return with_padded(
        PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
    )
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: update
|
|
||||||
_EXAMPLE_CONFIG = {
|
|
||||||
"@doc2feats": {
|
|
||||||
"arch": "Doc2Feats",
|
|
||||||
"config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]},
|
|
||||||
},
|
|
||||||
"@embed": {
|
|
||||||
"arch": "spacy.MultiHashEmbed.v1",
|
|
||||||
"config": {
|
|
||||||
"width": 96,
|
|
||||||
"rows": 2000,
|
|
||||||
"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
|
|
||||||
"use_subwords": True,
|
|
||||||
"@pretrained_vectors": {
|
|
||||||
"arch": "TransformedStaticVectors",
|
|
||||||
"config": {
|
|
||||||
"vectors_name": "en_vectors_web_lg.vectors",
|
|
||||||
"width": 96,
|
|
||||||
"column": 0,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"@mix": {
|
|
||||||
"arch": "LayerNormalizedMaxout",
|
|
||||||
"config": {"width": 96, "pieces": 3},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"@encode": {
|
|
||||||
"arch": "MaxoutWindowEncode",
|
|
||||||
"config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3},
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -66,7 +66,7 @@ class EntityRuler(object):
|
||||||
self.add_patterns(patterns)
|
self.add_patterns(patterns)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_nlp(cls, nlp, **cfg):
|
def from_nlp(cls, nlp, model=None, **cfg):
|
||||||
return cls(nlp, **cfg)
|
return cls(nlp, **cfg)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
|
|
|
@ -76,11 +76,9 @@ class SimilarityHook(Pipe):
|
||||||
yield self(doc)
|
yield self(doc)
|
||||||
|
|
||||||
def predict(self, doc1, doc2):
|
def predict(self, doc1, doc2):
|
||||||
self.require_model()
|
|
||||||
return self.model.predict([(doc1, doc2)])
|
return self.model.predict([(doc1, doc2)])
|
||||||
|
|
||||||
def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
|
def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
|
||||||
self.require_model()
|
|
||||||
sims, bp_sims = self.model.begin_update(doc1_doc2)
|
sims, bp_sims = self.model.begin_update(doc1_doc2)
|
||||||
|
|
||||||
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
|
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
|
||||||
|
|
|
@ -15,25 +15,15 @@ from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..morphology cimport Morphology
|
from ..morphology cimport Morphology
|
||||||
|
|
||||||
from ..ml.component_models import build_morphologizer_model
|
|
||||||
|
|
||||||
|
|
||||||
@component("morphologizer", assigns=["token.morph", "token.pos"])
|
@component("morphologizer", assigns=["token.morph", "token.pos"])
|
||||||
class Morphologizer(Pipe):
|
class Morphologizer(Pipe):
|
||||||
|
|
||||||
@classmethod
|
def __init__(self, vocab, model, **cfg):
|
||||||
def Model(cls, **cfg):
|
|
||||||
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
|
|
||||||
raise ValueError(TempErrors.T008)
|
|
||||||
class_map = Morphology.create_class_map()
|
|
||||||
return build_morphologizer_model(class_map.field_sizes, **cfg)
|
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.cfg = dict(sorted(cfg.items()))
|
self.cfg = dict(sorted(cfg.items()))
|
||||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
self._class_map = self.vocab.morphology.create_class_map() # Morphology.create_class_map() ?
|
||||||
self._class_map = self.vocab.morphology.create_class_map()
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -58,6 +48,14 @@ class Morphologizer(Pipe):
|
||||||
self.set_annotations(docs, features, tensors=tokvecs)
|
self.set_annotations(docs, features, tensors=tokvecs)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
||||||
|
**kwargs):
|
||||||
|
self.set_output(len(self.labels))
|
||||||
|
self.model.initialize()
|
||||||
|
if sgd is None:
|
||||||
|
sgd = self.create_optimizer()
|
||||||
|
return sgd
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle case where there are no tokens in any docs.
|
# Handle case where there are no tokens in any docs.
|
||||||
|
@ -65,8 +63,8 @@ class Morphologizer(Pipe):
|
||||||
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
|
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
|
||||||
tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO")))
|
tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||||
return guesses, tokvecs
|
return guesses, tokvecs
|
||||||
tokvecs = self.model.tok2vec(docs)
|
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||||
scores = self.model.softmax(tokvecs)
|
scores = self.model.get_ref("softmax")(tokvecs)
|
||||||
return scores, tokvecs
|
return scores, tokvecs
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_scores, tensors=None):
|
def set_annotations(self, docs, batch_scores, tensors=None):
|
||||||
|
|
|
@ -3,8 +3,7 @@
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
import random
|
import random
|
||||||
from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array
|
from thinc.api import CosineDistance, to_categorical, get_array_module
|
||||||
from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module
|
|
||||||
from thinc.api import set_dropout_rate
|
from thinc.api import set_dropout_rate
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
@ -22,11 +21,6 @@ from ..attrs import POS, ID
|
||||||
from ..util import link_vectors_to_models, create_default_optimizer
|
from ..util import link_vectors_to_models, create_default_optimizer
|
||||||
from ..parts_of_speech import X
|
from ..parts_of_speech import X
|
||||||
from ..kb import KnowledgeBase
|
from ..kb import KnowledgeBase
|
||||||
from ..ml.component_models import Tok2Vec, build_tagger_model
|
|
||||||
from ..ml.component_models import build_text_classifier
|
|
||||||
from ..ml.component_models import build_simple_cnn_text_classifier
|
|
||||||
from ..ml.component_models import build_bow_text_classifier, build_nel_encoder
|
|
||||||
from ..ml.component_models import masked_language_model
|
|
||||||
from ..errors import Errors, TempErrors, user_warning, Warnings
|
from ..errors import Errors, TempErrors, user_warning, Warnings
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
@ -47,13 +41,8 @@ class Pipe(object):
|
||||||
name = None
|
name = None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, *shape, **kwargs):
|
def from_nlp(cls, nlp, model, **cfg):
|
||||||
"""Initialize a model for the pipe."""
|
return cls(nlp.vocab, model, **cfg)
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_nlp(cls, nlp, **cfg):
|
|
||||||
return cls(nlp.vocab, **cfg)
|
|
||||||
|
|
||||||
def _get_doc(self, example):
|
def _get_doc(self, example):
|
||||||
""" Use this method if the `example` can be both a Doc or an Example """
|
""" Use this method if the `example` can be both a Doc or an Example """
|
||||||
|
@ -61,7 +50,7 @@ class Pipe(object):
|
||||||
return example
|
return example
|
||||||
return example.doc
|
return example.doc
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model, **cfg):
|
||||||
"""Create a new pipe instance."""
|
"""Create a new pipe instance."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -72,7 +61,6 @@ class Pipe(object):
|
||||||
Both __call__ and pipe should delegate to the `predict()`
|
Both __call__ and pipe should delegate to the `predict()`
|
||||||
and `set_annotations()` methods.
|
and `set_annotations()` methods.
|
||||||
"""
|
"""
|
||||||
self.require_model()
|
|
||||||
doc = self._get_doc(example)
|
doc = self._get_doc(example)
|
||||||
predictions = self.predict([doc])
|
predictions = self.predict([doc])
|
||||||
if isinstance(predictions, tuple) and len(predictions) == 2:
|
if isinstance(predictions, tuple) and len(predictions) == 2:
|
||||||
|
@ -85,11 +73,6 @@ class Pipe(object):
|
||||||
return example
|
return example
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def require_model(self):
|
|
||||||
"""Raise an error if the component's model is not initialized."""
|
|
||||||
if getattr(self, "model", None) in (None, True, False):
|
|
||||||
raise ValueError(Errors.E109.format(name=self.name))
|
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||||
"""Apply the pipe to a stream of documents.
|
"""Apply the pipe to a stream of documents.
|
||||||
|
|
||||||
|
@ -116,7 +99,6 @@ class Pipe(object):
|
||||||
"""Apply the pipeline's model to a batch of docs, without
|
"""Apply the pipeline's model to a batch of docs, without
|
||||||
modifying them.
|
modifying them.
|
||||||
"""
|
"""
|
||||||
self.require_model()
|
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def set_annotations(self, docs, scores, tensors=None):
|
def set_annotations(self, docs, scores, tensors=None):
|
||||||
|
@ -158,22 +140,23 @@ class Pipe(object):
|
||||||
):
|
):
|
||||||
"""Initialize the pipe for training, using data exampes if available.
|
"""Initialize the pipe for training, using data exampes if available.
|
||||||
If no model has been initialized yet, the model is added."""
|
If no model has been initialized yet, the model is added."""
|
||||||
if self.model is True:
|
self.model.initialize()
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
if hasattr(self, "vocab"):
|
if hasattr(self, "vocab"):
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
self.model.initialize()
|
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
|
def set_output(self, nO):
|
||||||
|
self.model.set_dim("nO", nO)
|
||||||
|
if self.model.has_ref("output_layer"):
|
||||||
|
self.model.get_ref("output_layer").set_dim("nO", nO)
|
||||||
|
|
||||||
def get_gradients(self):
|
def get_gradients(self):
|
||||||
"""Get non-zero gradients of the model's parameters, as a dictionary
|
"""Get non-zero gradients of the model's parameters, as a dictionary
|
||||||
keyed by the parameter ID. The values are (weights, gradients) tuples.
|
keyed by the parameter ID. The values are (weights, gradients) tuples.
|
||||||
"""
|
"""
|
||||||
gradients = {}
|
gradients = {}
|
||||||
if self.model in (None, True, False):
|
|
||||||
return gradients
|
|
||||||
queue = [self.model]
|
queue = [self.model]
|
||||||
seen = set()
|
seen = set()
|
||||||
for node in queue:
|
for node in queue:
|
||||||
|
@ -199,8 +182,7 @@ class Pipe(object):
|
||||||
"""
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
if self.model not in (True, False, None):
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["model"] = self.model.to_bytes
|
|
||||||
if hasattr(self, "vocab"):
|
if hasattr(self, "vocab"):
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
|
@ -210,20 +192,15 @@ class Pipe(object):
|
||||||
"""Load the pipe from a bytestring."""
|
"""Load the pipe from a bytestring."""
|
||||||
|
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
|
||||||
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
|
||||||
self.cfg["pretrained_vectors"] = self.vocab.vectors
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError(Errors.E149)
|
raise ValueError(Errors.E149)
|
||||||
|
|
||||||
deserialize = {}
|
deserialize = {}
|
||||||
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
|
||||||
if hasattr(self, "vocab"):
|
if hasattr(self, "vocab"):
|
||||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
||||||
|
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
||||||
deserialize["model"] = load_model
|
deserialize["model"] = load_model
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
|
@ -234,8 +211,7 @@ class Pipe(object):
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||||
if self.model not in (None, True, False):
|
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
|
||||||
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
|
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
|
@ -243,19 +219,14 @@ class Pipe(object):
|
||||||
"""Load the pipe from disk."""
|
"""Load the pipe from disk."""
|
||||||
|
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
|
||||||
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
|
||||||
self.cfg["pretrained_vectors"] = self.vocab.vectors
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(p.open("rb").read())
|
self.model.from_bytes(p.open("rb").read())
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError(Errors.E149)
|
raise ValueError(Errors.E149)
|
||||||
|
|
||||||
deserialize = {}
|
deserialize = {}
|
||||||
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
|
||||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||||
|
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
||||||
deserialize["model"] = load_model
|
deserialize["model"] = load_model
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
|
@ -266,31 +237,13 @@ class Pipe(object):
|
||||||
class Tensorizer(Pipe):
|
class Tensorizer(Pipe):
|
||||||
"""Pre-train position-sensitive vectors for tokens."""
|
"""Pre-train position-sensitive vectors for tokens."""
|
||||||
|
|
||||||
@classmethod
|
def __init__(self, vocab, model, **cfg):
|
||||||
def Model(cls, output_size=300, **cfg):
|
|
||||||
"""Create a new statistical model for the class.
|
|
||||||
|
|
||||||
width (int): Output size of the model.
|
|
||||||
embed_size (int): Number of vectors in the embedding table.
|
|
||||||
**cfg: Config parameters.
|
|
||||||
RETURNS (Model): A `thinc.model.Model` or similar instance.
|
|
||||||
"""
|
|
||||||
input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96))
|
|
||||||
return Linear(output_size, input_size, init_W=zero_init)
|
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
|
||||||
"""Construct a new statistical model. Weights are not allocated on
|
"""Construct a new statistical model. Weights are not allocated on
|
||||||
initialisation.
|
initialisation.
|
||||||
|
|
||||||
vocab (Vocab): A `Vocab` instance. The model must share the same
|
vocab (Vocab): A `Vocab` instance. The model must share the same
|
||||||
`Vocab` instance with the `Doc` objects it will process.
|
`Vocab` instance with the `Doc` objects it will process.
|
||||||
model (Model): A `Model` instance or `True` to allocate one later.
|
|
||||||
**cfg: Config parameters.
|
**cfg: Config parameters.
|
||||||
|
|
||||||
EXAMPLE:
|
|
||||||
>>> from spacy.pipeline import TokenVectorEncoder
|
|
||||||
>>> tok2vec = TokenVectorEncoder(nlp.vocab)
|
|
||||||
>>> tok2vec.model = tok2vec.Model(128, 5000)
|
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -337,7 +290,6 @@ class Tensorizer(Pipe):
|
||||||
docs (iterable): A sequence of `Doc` objects.
|
docs (iterable): A sequence of `Doc` objects.
|
||||||
RETURNS (object): Vector representations for each token in the docs.
|
RETURNS (object): Vector representations for each token in the docs.
|
||||||
"""
|
"""
|
||||||
self.require_model()
|
|
||||||
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
|
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
|
||||||
outputs = self.model(inputs)
|
outputs = self.model(inputs)
|
||||||
return self.model.ops.unflatten(outputs, [len(d) for d in docs])
|
return self.model.ops.unflatten(outputs, [len(d) for d in docs])
|
||||||
|
@ -362,7 +314,6 @@ class Tensorizer(Pipe):
|
||||||
sgd (callable): An optimizer.
|
sgd (callable): An optimizer.
|
||||||
RETURNS (dict): Results from the update.
|
RETURNS (dict): Results from the update.
|
||||||
"""
|
"""
|
||||||
self.require_model()
|
|
||||||
examples = Example.to_example_objects(examples)
|
examples = Example.to_example_objects(examples)
|
||||||
inputs = []
|
inputs = []
|
||||||
bp_inputs = []
|
bp_inputs = []
|
||||||
|
@ -405,10 +356,8 @@ class Tensorizer(Pipe):
|
||||||
"""
|
"""
|
||||||
if pipeline is not None:
|
if pipeline is not None:
|
||||||
for name, model in pipeline:
|
for name, model in pipeline:
|
||||||
if getattr(model, "tok2vec", None):
|
if model.has_ref("tok2vec"):
|
||||||
self.input_models.append(model.tok2vec)
|
self.input_models.append(model.get_ref("tok2vec"))
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
|
@ -423,7 +372,7 @@ class Tagger(Pipe):
|
||||||
DOCS: https://spacy.io/api/tagger
|
DOCS: https://spacy.io/api/tagger
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
|
@ -433,13 +382,6 @@ class Tagger(Pipe):
|
||||||
def labels(self):
|
def labels(self):
|
||||||
return tuple(self.vocab.morphology.tag_names)
|
return tuple(self.vocab.morphology.tag_names)
|
||||||
|
|
||||||
@property
|
|
||||||
def tok2vec(self):
|
|
||||||
if self.model in (None, True, False):
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
return chain(self.model.get_ref("tok2vec"), list2array())
|
|
||||||
|
|
||||||
def __call__(self, example):
|
def __call__(self, example):
|
||||||
doc = self._get_doc(example)
|
doc = self._get_doc(example)
|
||||||
tags = self.predict([doc])
|
tags = self.predict([doc])
|
||||||
|
@ -465,7 +407,6 @@ class Tagger(Pipe):
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
self.require_model()
|
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
n_labels = len(self.labels)
|
n_labels = len(self.labels)
|
||||||
|
@ -513,7 +454,6 @@ class Tagger(Pipe):
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
|
def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
|
||||||
self.require_model()
|
|
||||||
examples = Example.to_example_objects(examples)
|
examples = Example.to_example_objects(examples)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
|
@ -600,52 +540,21 @@ class Tagger(Pipe):
|
||||||
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
||||||
vocab.morphology.lemmatizer,
|
vocab.morphology.lemmatizer,
|
||||||
exc=vocab.morphology.exc)
|
exc=vocab.morphology.exc)
|
||||||
self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
|
self.set_output(len(self.labels))
|
||||||
if self.model is True:
|
self.model.initialize()
|
||||||
for hp in ["token_vector_width", "conv_depth"]:
|
|
||||||
if hp in kwargs:
|
|
||||||
self.cfg[hp] = kwargs[hp]
|
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
|
||||||
# Get batch of example docs, example outputs to call begin_training().
|
# Get batch of example docs, example outputs to call begin_training().
|
||||||
# This lets the model infer shapes.
|
# This lets the model infer shapes.
|
||||||
n_tags = self.vocab.morphology.n_tags
|
|
||||||
for node in self.model.walk():
|
|
||||||
# TODO: softmax hack ?
|
|
||||||
if node.name == "softmax" and node.has_dim("nO") is None:
|
|
||||||
node.set_dim("nO", n_tags)
|
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
self.model.initialize()
|
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def Model(cls, n_tags=None, **cfg):
|
|
||||||
if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"):
|
|
||||||
raise ValueError(TempErrors.T008)
|
|
||||||
if "tok2vec" in cfg:
|
|
||||||
tok2vec = cfg["tok2vec"]
|
|
||||||
else:
|
|
||||||
config = {
|
|
||||||
"width": cfg.get("token_vector_width", 96),
|
|
||||||
"embed_size": cfg.get("embed_size", 2000),
|
|
||||||
"pretrained_vectors": cfg.get("pretrained_vectors", None),
|
|
||||||
"window_size": cfg.get("window_size", 1),
|
|
||||||
"cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3),
|
|
||||||
"subword_features": cfg.get("subword_features", True),
|
|
||||||
"char_embed": cfg.get("char_embed", False),
|
|
||||||
"conv_depth": cfg.get("conv_depth", 4),
|
|
||||||
"bilstm_depth": cfg.get("bilstm_depth", 0),
|
|
||||||
}
|
|
||||||
tok2vec = Tok2Vec(**config)
|
|
||||||
return build_tagger_model(n_tags, tok2vec)
|
|
||||||
|
|
||||||
def add_label(self, label, values=None):
|
def add_label(self, label, values=None):
|
||||||
if not isinstance(label, str):
|
if not isinstance(label, str):
|
||||||
raise ValueError(Errors.E187)
|
raise ValueError(Errors.E187)
|
||||||
if label in self.labels:
|
if label in self.labels:
|
||||||
return 0
|
return 0
|
||||||
if self.model not in (True, False, None):
|
if self.model.has_dim("nO"):
|
||||||
# Here's how the model resizing will work, once the
|
# Here's how the model resizing will work, once the
|
||||||
# neuron-to-tag mapping is no longer controlled by
|
# neuron-to-tag mapping is no longer controlled by
|
||||||
# the Morphology class, which sorts the tag names.
|
# the Morphology class, which sorts the tag names.
|
||||||
|
@ -672,8 +581,7 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
serialize = {}
|
serialize = {}
|
||||||
if self.model not in (None, True, False):
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["model"] = self.model.to_bytes
|
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
||||||
|
@ -683,14 +591,6 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
|
||||||
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
|
||||||
self.cfg["pretrained_vectors"] = self.vocab.vectors
|
|
||||||
if self.model is True:
|
|
||||||
token_vector_width = util.env_opt(
|
|
||||||
"token_vector_width",
|
|
||||||
self.cfg.get("token_vector_width", 96))
|
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
|
@ -719,18 +619,13 @@ class Tagger(Pipe):
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
"tag_map": lambda p: srsly.write_msgpack(p, tag_map),
|
"tag_map": lambda p: srsly.write_msgpack(p, tag_map),
|
||||||
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
||||||
"cfg": lambda p: srsly.write_json(p, self.cfg)
|
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
|
||||||
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
|
||||||
self.cfg["pretrained_vectors"] = self.vocab.vectors
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(file_.read())
|
self.model.from_bytes(file_.read())
|
||||||
|
@ -745,8 +640,8 @@ class Tagger(Pipe):
|
||||||
exc=self.vocab.morphology.exc)
|
exc=self.vocab.morphology.exc)
|
||||||
|
|
||||||
deserialize = {
|
deserialize = {
|
||||||
"cfg": lambda p: self.cfg.update(_load_cfg(p)),
|
|
||||||
"vocab": lambda p: self.vocab.from_disk(p),
|
"vocab": lambda p: self.vocab.from_disk(p),
|
||||||
|
"cfg": lambda p: self.cfg.update(_load_cfg(p)),
|
||||||
"tag_map": load_tag_map,
|
"tag_map": load_tag_map,
|
||||||
"model": load_model,
|
"model": load_model,
|
||||||
}
|
}
|
||||||
|
@ -762,16 +657,11 @@ class SentenceRecognizer(Tagger):
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer
|
DOCS: https://spacy.io/api/sentencerecognizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
self.cfg = dict(sorted(cfg.items()))
|
self.cfg = dict(sorted(cfg.items()))
|
||||||
self.cfg.setdefault("cnn_maxout_pieces", 2)
|
|
||||||
self.cfg.setdefault("subword_features", True)
|
|
||||||
self.cfg.setdefault("token_vector_width", 12)
|
|
||||||
self.cfg.setdefault("conv_depth", 1)
|
|
||||||
self.cfg.setdefault("pretrained_vectors", None)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -797,7 +687,6 @@ class SentenceRecognizer(Tagger):
|
||||||
doc.c[j].sent_start = -1
|
doc.c[j].sent_start = -1
|
||||||
|
|
||||||
def update(self, examples, drop=0., sgd=None, losses=None):
|
def update(self, examples, drop=0., sgd=None, losses=None):
|
||||||
self.require_model()
|
|
||||||
examples = Example.to_example_objects(examples)
|
examples = Example.to_example_objects(examples)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
|
@ -844,20 +733,12 @@ class SentenceRecognizer(Tagger):
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
if self.model is True:
|
self.set_output(len(self.labels))
|
||||||
for hp in ["token_vector_width", "conv_depth"]:
|
self.model.initialize()
|
||||||
if hp in kwargs:
|
|
||||||
self.cfg[hp] = kwargs[hp]
|
|
||||||
self.model = self.Model(len(self.labels), **self.cfg)
|
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
self.model.initialize()
|
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def Model(cls, n_tags, **cfg):
|
|
||||||
return build_tagger_model(n_tags, **cfg)
|
|
||||||
|
|
||||||
def add_label(self, label, values=None):
|
def add_label(self, label, values=None):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -867,8 +748,7 @@ class SentenceRecognizer(Tagger):
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
serialize = {}
|
serialize = {}
|
||||||
if self.model not in (None, True, False):
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["model"] = self.model.to_bytes
|
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
|
@ -876,8 +756,6 @@ class SentenceRecognizer(Tagger):
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(len(self.labels), **self.cfg)
|
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
|
@ -896,15 +774,13 @@ class SentenceRecognizer(Tagger):
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
||||||
"cfg": lambda p: srsly.write_json(p, self.cfg)
|
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(len(self.labels), **self.cfg)
|
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(file_.read())
|
self.model.from_bytes(file_.read())
|
||||||
|
@ -912,8 +788,8 @@ class SentenceRecognizer(Tagger):
|
||||||
raise ValueError(Errors.E149)
|
raise ValueError(Errors.E149)
|
||||||
|
|
||||||
deserialize = {
|
deserialize = {
|
||||||
"cfg": lambda p: self.cfg.update(_load_cfg(p)),
|
|
||||||
"vocab": lambda p: self.vocab.from_disk(p),
|
"vocab": lambda p: self.vocab.from_disk(p),
|
||||||
|
"cfg": lambda p: self.cfg.update(_load_cfg(p)),
|
||||||
"model": load_model,
|
"model": load_model,
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||||
|
@ -927,7 +803,7 @@ class MultitaskObjective(Tagger):
|
||||||
side-objective.
|
side-objective.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
|
def __init__(self, vocab, model, target='dep_tag_offset', **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
if target == "dep":
|
if target == "dep":
|
||||||
|
@ -947,7 +823,8 @@ class MultitaskObjective(Tagger):
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E016)
|
raise ValueError(Errors.E016)
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault("cnn_maxout_pieces", 2)
|
# TODO: remove - put in config
|
||||||
|
self.cfg.setdefault("maxout_pieces", 2)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -969,30 +846,15 @@ class MultitaskObjective(Tagger):
|
||||||
label = self.make_label(i, example.token_annotation)
|
label = self.make_label(i, example.token_annotation)
|
||||||
if label is not None and label not in self.labels:
|
if label is not None and label not in self.labels:
|
||||||
self.labels[label] = len(self.labels)
|
self.labels[label] = len(self.labels)
|
||||||
if self.model is True:
|
|
||||||
token_vector_width = util.env_opt("token_vector_width")
|
|
||||||
self.model = self.Model(len(self.labels), tok2vec=tok2vec)
|
|
||||||
link_vectors_to_models(self.vocab)
|
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def Model(cls, n_tags, tok2vec=None, **cfg):
|
|
||||||
token_vector_width = util.env_opt("token_vector_width", 96)
|
|
||||||
model = chain(
|
|
||||||
tok2vec,
|
|
||||||
Maxout(nO=token_vector_width*2, nI=token_vector_width, nP=3, dropout=0.0),
|
|
||||||
LayerNorm(token_vector_width*2),
|
|
||||||
Softmax(nO=n_tags, nI=token_vector_width*2)
|
|
||||||
)
|
|
||||||
return model
|
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
self.require_model()
|
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||||
tokvecs = self.model.tok2vec(docs)
|
scores = self.model.get_ref("softmax")(tokvecs)
|
||||||
scores = self.model.softmax(tokvecs)
|
|
||||||
return tokvecs, scores
|
return tokvecs, scores
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
|
@ -1097,18 +959,7 @@ class MultitaskObjective(Tagger):
|
||||||
|
|
||||||
|
|
||||||
class ClozeMultitask(Pipe):
|
class ClozeMultitask(Pipe):
|
||||||
@classmethod
|
def __init__(self, vocab, model, **cfg):
|
||||||
def Model(cls, vocab, tok2vec, **cfg):
|
|
||||||
output_size = vocab.vectors.data.shape[1]
|
|
||||||
output_layer = chain(
|
|
||||||
Maxout(nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0),
|
|
||||||
Linear(nO=output_size, nI=output_size, init_W=zero_init)
|
|
||||||
)
|
|
||||||
model = chain(tok2vec, output_layer)
|
|
||||||
model = masked_language_model(vocab, model)
|
|
||||||
return model
|
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
|
@ -1120,19 +971,16 @@ class ClozeMultitask(Pipe):
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
||||||
tok2vec=None, sgd=None, **kwargs):
|
tok2vec=None, sgd=None, **kwargs):
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(self.vocab, tok2vec)
|
|
||||||
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
|
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||||
self.model.output_layer.begin_training(X)
|
self.model.output_layer.begin_training(X)
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
self.require_model()
|
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||||
tokvecs = self.model.tok2vec(docs)
|
vectors = self.model.get_ref("output_layer")(tokvecs)
|
||||||
vectors = self.model.output_layer(tokvecs)
|
|
||||||
return tokvecs, vectors
|
return tokvecs, vectors
|
||||||
|
|
||||||
def get_loss(self, examples, vectors, prediction):
|
def get_loss(self, examples, vectors, prediction):
|
||||||
|
@ -1150,7 +998,6 @@ class ClozeMultitask(Pipe):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||||
self.require_model()
|
|
||||||
examples = Example.to_example_objects(examples)
|
examples = Example.to_example_objects(examples)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
|
@ -1171,62 +1018,11 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/textcategorizer
|
DOCS: https://spacy.io/api/textcategorizer
|
||||||
"""
|
"""
|
||||||
|
def __init__(self, vocab, model, **cfg):
|
||||||
@classmethod
|
|
||||||
def Model(cls, nr_class=1, exclusive_classes=None, **cfg):
|
|
||||||
if nr_class == 1:
|
|
||||||
exclusive_classes = False
|
|
||||||
if exclusive_classes is None:
|
|
||||||
raise ValueError(
|
|
||||||
"TextCategorizer Model must specify 'exclusive_classes'. "
|
|
||||||
"This setting determines whether the model will output "
|
|
||||||
"scores that sum to 1 for each example. If only one class "
|
|
||||||
"is true for each example, you should set exclusive_classes=True. "
|
|
||||||
"For 'multi_label' classification, set exclusive_classes=False."
|
|
||||||
)
|
|
||||||
if "embed_size" not in cfg:
|
|
||||||
cfg["embed_size"] = util.env_opt("embed_size", 2000)
|
|
||||||
if "token_vector_width" not in cfg:
|
|
||||||
cfg["token_vector_width"] = util.env_opt("token_vector_width", 96)
|
|
||||||
if cfg.get("architecture") == "bow":
|
|
||||||
return build_bow_text_classifier(nr_class, exclusive_classes, **cfg)
|
|
||||||
else:
|
|
||||||
if "tok2vec" in cfg:
|
|
||||||
tok2vec = cfg["tok2vec"]
|
|
||||||
else:
|
|
||||||
config = {
|
|
||||||
"width": cfg.get("token_vector_width", 96),
|
|
||||||
"embed_size": cfg.get("embed_size", 2000),
|
|
||||||
"pretrained_vectors": cfg.get("pretrained_vectors", None),
|
|
||||||
"window_size": cfg.get("window_size", 1),
|
|
||||||
"cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3),
|
|
||||||
"subword_features": cfg.get("subword_features", True),
|
|
||||||
"char_embed": cfg.get("char_embed", False),
|
|
||||||
"conv_depth": cfg.get("conv_depth", 4),
|
|
||||||
"bilstm_depth": cfg.get("bilstm_depth", 0),
|
|
||||||
}
|
|
||||||
tok2vec = Tok2Vec(**config)
|
|
||||||
return build_simple_cnn_text_classifier(
|
|
||||||
tok2vec,
|
|
||||||
nr_class,
|
|
||||||
exclusive_classes,
|
|
||||||
**cfg
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def tok2vec(self):
|
|
||||||
if self.model in (None, True, False):
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
return self.model.tok2vec
|
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
if "exclusive_classes" not in cfg:
|
|
||||||
self.cfg["exclusive_classes"] = True
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -1255,7 +1051,6 @@ class TextCategorizer(Pipe):
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
self.require_model()
|
|
||||||
tensors = [doc.tensor for doc in docs]
|
tensors = [doc.tensor for doc in docs]
|
||||||
|
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
|
@ -1274,7 +1069,6 @@ class TextCategorizer(Pipe):
|
||||||
doc.cats[label] = float(scores[i, j])
|
doc.cats[label] = float(scores[i, j])
|
||||||
|
|
||||||
def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
|
def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
|
||||||
self.require_model()
|
|
||||||
examples = Example.to_example_objects(examples)
|
examples = Example.to_example_objects(examples)
|
||||||
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
|
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
|
@ -1311,7 +1105,7 @@ class TextCategorizer(Pipe):
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
losses[self.name] += (gradient**2).sum()
|
losses[self.name] += (gradient**2).sum()
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def _examples_to_truth(self, examples):
|
||||||
golds = [ex.gold for ex in examples]
|
golds = [ex.gold for ex in examples]
|
||||||
truths = numpy.zeros((len(golds), len(self.labels)), dtype="f")
|
truths = numpy.zeros((len(golds), len(self.labels)), dtype="f")
|
||||||
not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f")
|
not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f")
|
||||||
|
@ -1322,6 +1116,10 @@ class TextCategorizer(Pipe):
|
||||||
else:
|
else:
|
||||||
not_missing[i, j] = 0.
|
not_missing[i, j] = 0.
|
||||||
truths = self.model.ops.asarray(truths)
|
truths = self.model.ops.asarray(truths)
|
||||||
|
return truths, not_missing
|
||||||
|
|
||||||
|
def get_loss(self, examples, scores):
|
||||||
|
truths, not_missing = self._examples_to_truth(examples)
|
||||||
not_missing = self.model.ops.asarray(not_missing)
|
not_missing = self.model.ops.asarray(not_missing)
|
||||||
d_scores = (scores-truths) / scores.shape[0]
|
d_scores = (scores-truths) / scores.shape[0]
|
||||||
d_scores *= not_missing
|
d_scores *= not_missing
|
||||||
|
@ -1333,7 +1131,7 @@ class TextCategorizer(Pipe):
|
||||||
raise ValueError(Errors.E187)
|
raise ValueError(Errors.E187)
|
||||||
if label in self.labels:
|
if label in self.labels:
|
||||||
return 0
|
return 0
|
||||||
if self.model not in (None, True, False):
|
if self.model.has_dim("nO"):
|
||||||
# This functionality was available previously, but was broken.
|
# This functionality was available previously, but was broken.
|
||||||
# The problem is that we resize the last layer, but the last layer
|
# The problem is that we resize the last layer, but the last layer
|
||||||
# is actually just an ensemble. We're not resizing the child layers
|
# is actually just an ensemble. We're not resizing the child layers
|
||||||
|
@ -1348,19 +1146,18 @@ class TextCategorizer(Pipe):
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
||||||
for example in get_examples():
|
# TODO: begin_training is not guaranteed to see all data / labels ?
|
||||||
|
examples = list(get_examples())
|
||||||
|
for example in examples:
|
||||||
for cat in example.doc_annotation.cats:
|
for cat in example.doc_annotation.cats:
|
||||||
self.add_label(cat)
|
self.add_label(cat)
|
||||||
if self.model is True:
|
self.require_labels()
|
||||||
self.cfg.update(kwargs)
|
docs = [Doc(Vocab(), words=["hello"])]
|
||||||
self.require_labels()
|
truths, _ = self._examples_to_truth(examples)
|
||||||
self.model = self.Model(len(self.labels), **self.cfg)
|
self.set_output(len(self.labels))
|
||||||
link_vectors_to_models(self.vocab)
|
self.model.initialize(X=docs, Y=truths)
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
# TODO: use get_examples instead
|
|
||||||
docs = [Doc(Vocab(), words=["hello"])]
|
|
||||||
self.model.initialize(X=docs)
|
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
|
|
||||||
|
@ -1393,7 +1190,7 @@ cdef class DependencyParser(Parser):
|
||||||
|
|
||||||
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
||||||
for labeller in self._multitasks:
|
for labeller in self._multitasks:
|
||||||
tok2vec = self.model.tok2vec
|
tok2vec = self.model.get_ref("tok2vec")
|
||||||
labeller.begin_training(get_examples, pipeline=pipeline,
|
labeller.begin_training(get_examples, pipeline=pipeline,
|
||||||
tok2vec=tok2vec, sgd=sgd)
|
tok2vec=tok2vec, sgd=sgd)
|
||||||
|
|
||||||
|
@ -1423,7 +1220,6 @@ cdef class EntityRecognizer(Parser):
|
||||||
assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
|
assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
|
||||||
requires = []
|
requires = []
|
||||||
TransitionSystem = BiluoPushDown
|
TransitionSystem = BiluoPushDown
|
||||||
nr_feature = 6
|
|
||||||
|
|
||||||
def add_multitask_objective(self, target):
|
def add_multitask_objective(self, target):
|
||||||
if target == "cloze":
|
if target == "cloze":
|
||||||
|
@ -1435,7 +1231,7 @@ cdef class EntityRecognizer(Parser):
|
||||||
|
|
||||||
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
||||||
for labeller in self._multitasks:
|
for labeller in self._multitasks:
|
||||||
tok2vec = self.model.tok2vec
|
tok2vec = self.model.get_ref("tok2vec")
|
||||||
labeller.begin_training(get_examples, pipeline=pipeline,
|
labeller.begin_training(get_examples, pipeline=pipeline,
|
||||||
tok2vec=tok2vec)
|
tok2vec=tok2vec)
|
||||||
|
|
||||||
|
@ -1464,18 +1260,9 @@ class EntityLinker(Pipe):
|
||||||
"""
|
"""
|
||||||
NIL = "NIL" # string used to refer to a non-existing link
|
NIL = "NIL" # string used to refer to a non-existing link
|
||||||
|
|
||||||
@classmethod
|
def __init__(self, vocab, model, **cfg):
|
||||||
def Model(cls, **cfg):
|
|
||||||
embed_width = cfg.get("embed_width", 300)
|
|
||||||
hidden_width = cfg.get("hidden_width", 128)
|
|
||||||
type_to_int = cfg.get("type_to_int", dict())
|
|
||||||
|
|
||||||
model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg)
|
|
||||||
return model
|
|
||||||
|
|
||||||
def __init__(self, vocab, **cfg):
|
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = True
|
self.model = model
|
||||||
self.kb = None
|
self.kb = None
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.distance = CosineDistance(normalize=False)
|
self.distance = CosineDistance(normalize=False)
|
||||||
|
@ -1483,11 +1270,6 @@ class EntityLinker(Pipe):
|
||||||
def set_kb(self, kb):
|
def set_kb(self, kb):
|
||||||
self.kb = kb
|
self.kb = kb
|
||||||
|
|
||||||
def require_model(self):
|
|
||||||
# Raise an error if the component's model is not initialized.
|
|
||||||
if getattr(self, "model", None) in (None, True, False):
|
|
||||||
raise ValueError(Errors.E109.format(name=self.name))
|
|
||||||
|
|
||||||
def require_kb(self):
|
def require_kb(self):
|
||||||
# Raise an error if the knowledge base is not initialized.
|
# Raise an error if the knowledge base is not initialized.
|
||||||
if getattr(self, "kb", None) in (None, True, False):
|
if getattr(self, "kb", None) in (None, True, False):
|
||||||
|
@ -1495,16 +1277,14 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
self.cfg["entity_width"] = self.kb.entity_vector_length
|
nO = self.kb.entity_vector_length
|
||||||
if self.model is True:
|
self.set_output(nO)
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
|
def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
|
||||||
self.require_model()
|
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
|
@ -1614,7 +1394,6 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
""" Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
|
""" Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
|
||||||
self.require_model()
|
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
|
|
||||||
entity_count = 0
|
entity_count = 0
|
||||||
|
@ -1714,15 +1493,12 @@ class EntityLinker(Pipe):
|
||||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||||
serialize["kb"] = lambda p: self.kb.dump(p)
|
serialize["kb"] = lambda p: self.kb.dump(p)
|
||||||
if self.model not in (None, True, False):
|
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
|
||||||
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
|
|
||||||
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(p.open("rb").read())
|
self.model.from_bytes(p.open("rb").read())
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
|
@ -1734,8 +1510,8 @@ class EntityLinker(Pipe):
|
||||||
self.set_kb(kb)
|
self.set_kb(kb)
|
||||||
|
|
||||||
deserialize = {}
|
deserialize = {}
|
||||||
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
|
||||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||||
|
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
|
||||||
deserialize["kb"] = load_kb
|
deserialize["kb"] = load_kb
|
||||||
deserialize["model"] = load_model
|
deserialize["model"] = load_model
|
||||||
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
|
||||||
|
@ -1782,7 +1558,7 @@ class Sentencizer(Pipe):
|
||||||
self.punct_chars = set(self.default_punct_chars)
|
self.punct_chars = set(self.default_punct_chars)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_nlp(cls, nlp, **cfg):
|
def from_nlp(cls, nlp, model=None, **cfg):
|
||||||
return cls(**cfg)
|
return cls(**cfg)
|
||||||
|
|
||||||
def __call__(self, example):
|
def __call__(self, example):
|
||||||
|
@ -1915,8 +1691,8 @@ class Sentencizer(Pipe):
|
||||||
|
|
||||||
|
|
||||||
# Cython classes can't be decorated, so we need to add the factories here
|
# Cython classes can't be decorated, so we need to add the factories here
|
||||||
Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
|
Language.factories["parser"] = lambda nlp, model, **cfg: DependencyParser.from_nlp(nlp, model, **cfg)
|
||||||
Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
|
Language.factories["ner"] = lambda nlp, model, **cfg: EntityRecognizer.from_nlp(nlp, model, **cfg)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||||
|
|
|
@ -5,32 +5,21 @@ from ..gold import Example
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..language import component
|
from ..language import component
|
||||||
from ..util import link_vectors_to_models, minibatch, registry, eg2doc
|
from ..util import link_vectors_to_models, minibatch, eg2doc
|
||||||
|
|
||||||
|
|
||||||
@component("tok2vec", assigns=["doc.tensor"])
|
@component("tok2vec", assigns=["doc.tensor"])
|
||||||
class Tok2Vec(Pipe):
|
class Tok2Vec(Pipe):
|
||||||
@classmethod
|
|
||||||
def from_nlp(cls, nlp, **cfg):
|
|
||||||
return cls(nlp.vocab, **cfg)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, architecture, **cfg):
|
def from_nlp(cls, nlp, model, **cfg):
|
||||||
"""Create a new statistical model for the class.
|
return cls(nlp.vocab, model, **cfg)
|
||||||
|
|
||||||
architecture (str): The registered model architecture to use.
|
def __init__(self, vocab, model, **cfg):
|
||||||
**cfg: Config parameters.
|
|
||||||
RETURNS (Model): A `thinc.model.Model` or similar instance.
|
|
||||||
"""
|
|
||||||
model = registry.architectures.get(architecture)
|
|
||||||
return model(**cfg)
|
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
|
||||||
"""Construct a new statistical model. Weights are not allocated on
|
"""Construct a new statistical model. Weights are not allocated on
|
||||||
initialisation.
|
initialisation.
|
||||||
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
|
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
|
||||||
instance with the `Doc` objects it will process.
|
instance with the `Doc` objects it will process.
|
||||||
model (Model): A `Model` instance or `True` to allocate one later.
|
|
||||||
**cfg: Config parameters.
|
**cfg: Config parameters.
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
@ -143,8 +132,6 @@ class Tok2Vec(Pipe):
|
||||||
get_examples (function): Function returning example training data.
|
get_examples (function): Function returning example training data.
|
||||||
pipeline (list): The pipeline the model is part of.
|
pipeline (list): The pipeline the model is part of.
|
||||||
"""
|
"""
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
# TODO: use examples instead ?
|
# TODO: use examples instead ?
|
||||||
docs = [Doc(Vocab(), words=["hello"])]
|
docs = [Doc(Vocab(), words=["hello"])]
|
||||||
self.model.initialize(X=docs)
|
self.model.initialize(X=docs)
|
||||||
|
|
|
@ -221,7 +221,10 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
|
||||||
|
|
||||||
class ParserModel(Model):
|
class ParserModel(Model):
|
||||||
def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
|
def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
|
||||||
Model.__init__(self, name="parser_model", forward=forward)
|
# don't define nO for this object, because we can't dynamically change it
|
||||||
|
Model.__init__(self, name="parser_model", forward=forward, dims={"nI": None})
|
||||||
|
if tok2vec.has_dim("nI"):
|
||||||
|
self.set_dim("nI", tok2vec.get_dim("nI"))
|
||||||
self._layers = [tok2vec, lower_model]
|
self._layers = [tok2vec, lower_model]
|
||||||
if upper_model is not None:
|
if upper_model is not None:
|
||||||
self._layers.append(upper_model)
|
self._layers.append(upper_model)
|
||||||
|
@ -229,6 +232,7 @@ class ParserModel(Model):
|
||||||
if unseen_classes:
|
if unseen_classes:
|
||||||
for class_ in unseen_classes:
|
for class_ in unseen_classes:
|
||||||
self.unseen_classes.add(class_)
|
self.unseen_classes.add(class_)
|
||||||
|
self.set_ref("tok2vec", tok2vec)
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
step_model = ParserStepModel(docs, self._layers,
|
step_model = ParserStepModel(docs, self._layers,
|
||||||
|
@ -238,25 +242,32 @@ class ParserModel(Model):
|
||||||
def resize_output(self, new_nO):
|
def resize_output(self, new_nO):
|
||||||
if len(self._layers) == 2:
|
if len(self._layers) == 2:
|
||||||
return
|
return
|
||||||
if new_nO == self.upper.get_dim("nO"):
|
if self.upper.has_dim("nO") and (new_nO == self.upper.get_dim("nO")):
|
||||||
return
|
return
|
||||||
smaller = self.upper
|
smaller = self.upper
|
||||||
nI = smaller.get_dim("nI")
|
nI = None
|
||||||
|
if smaller.has_dim("nI"):
|
||||||
|
nI = smaller.get_dim("nI")
|
||||||
with use_ops('numpy'):
|
with use_ops('numpy'):
|
||||||
larger = Linear(new_nO, nI)
|
larger = Linear(nO=new_nO, nI=nI)
|
||||||
larger_W = larger.ops.alloc2f(new_nO, nI)
|
larger._init = smaller._init
|
||||||
larger_b = larger.ops.alloc1f(new_nO)
|
# it could be that the model is not initialized yet, then skip this bit
|
||||||
smaller_W = smaller.get_param("W")
|
if nI:
|
||||||
smaller_b = smaller.get_param("b")
|
larger_W = larger.ops.alloc2f(new_nO, nI)
|
||||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
larger_b = larger.ops.alloc1f(new_nO)
|
||||||
# just adding rows here.
|
smaller_W = smaller.get_param("W")
|
||||||
larger_W[:smaller.get_dim("nO")] = smaller_W
|
smaller_b = smaller.get_param("b")
|
||||||
larger_b[:smaller.get_dim("nO")] = smaller_b
|
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||||
larger.set_param("W", larger_W)
|
# just adding rows here.
|
||||||
larger.set_param("b", larger_b)
|
if smaller.has_dim("nO"):
|
||||||
|
larger_W[:smaller.get_dim("nO")] = smaller_W
|
||||||
|
larger_b[:smaller.get_dim("nO")] = smaller_b
|
||||||
|
for i in range(smaller.get_dim("nO"), new_nO):
|
||||||
|
self.unseen_classes.add(i)
|
||||||
|
|
||||||
|
larger.set_param("W", larger_W)
|
||||||
|
larger.set_param("b", larger_b)
|
||||||
self._layers[-1] = larger
|
self._layers[-1] = larger
|
||||||
for i in range(smaller.get_dim("nO"), new_nO):
|
|
||||||
self.unseen_classes.add(i)
|
|
||||||
|
|
||||||
def initialize(self, X=None, Y=None):
|
def initialize(self, X=None, Y=None):
|
||||||
self.tok2vec.initialize()
|
self.tok2vec.initialize()
|
||||||
|
@ -412,7 +423,7 @@ cdef class precompute_hiddens:
|
||||||
we can do all our hard maths up front, packed into large multiplications,
|
we can do all our hard maths up front, packed into large multiplications,
|
||||||
and do the hard-to-program parsing on the CPU.
|
and do the hard-to-program parsing on the CPU.
|
||||||
"""
|
"""
|
||||||
cdef readonly int nF, nO, nP # TODO: make these more like the dimensions in thinc
|
cdef readonly int nF, nO, nP
|
||||||
cdef bint _is_synchronized
|
cdef bint _is_synchronized
|
||||||
cdef public object ops
|
cdef public object ops
|
||||||
cdef np.ndarray _features
|
cdef np.ndarray _features
|
||||||
|
@ -458,6 +469,16 @@ cdef class precompute_hiddens:
|
||||||
self._is_synchronized = True
|
self._is_synchronized = True
|
||||||
return <float*>self._cached.data
|
return <float*>self._cached.data
|
||||||
|
|
||||||
|
def has_dim(self, name):
|
||||||
|
if name == "nF":
|
||||||
|
return self.nF if self.nF is not None else True
|
||||||
|
elif name == "nP":
|
||||||
|
return self.nP if self.nP is not None else True
|
||||||
|
elif name == "nO":
|
||||||
|
return self.nO if self.nO is not None else True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
def get_dim(self, name):
|
def get_dim(self, name):
|
||||||
if name == "nF":
|
if name == "nF":
|
||||||
return self.nF
|
return self.nF
|
||||||
|
@ -468,6 +489,16 @@ cdef class precompute_hiddens:
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
|
raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
|
||||||
|
|
||||||
|
def set_dim(self, name, value):
|
||||||
|
if name == "nF":
|
||||||
|
self.nF = value
|
||||||
|
elif name == "nP":
|
||||||
|
self.nP = value
|
||||||
|
elif name == "nO":
|
||||||
|
self.nO = value
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
|
||||||
|
|
||||||
def __call__(self, X, bint is_train):
|
def __call__(self, X, bint is_train):
|
||||||
if is_train:
|
if is_train:
|
||||||
return self.begin_update(X)
|
return self.begin_update(X)
|
||||||
|
|
|
@ -27,11 +27,11 @@ from ._parser_model cimport predict_states, arg_max_if_valid
|
||||||
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
||||||
from ._parser_model cimport get_c_weights, get_c_sizes
|
from ._parser_model cimport get_c_weights, get_c_sizes
|
||||||
from ._parser_model import ParserModel
|
from ._parser_model import ParserModel
|
||||||
from ..util import link_vectors_to_models, create_default_optimizer
|
from ..util import link_vectors_to_models, create_default_optimizer, registry
|
||||||
from ..compat import copy_array
|
from ..compat import copy_array
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..errors import Errors, TempErrors
|
from ..errors import Errors, user_warning, Warnings
|
||||||
from .. import util
|
from .. import util
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
@ -41,114 +41,42 @@ from . import _beam_utils
|
||||||
from . import nonproj
|
from . import nonproj
|
||||||
|
|
||||||
|
|
||||||
from ..ml._layers import PrecomputableAffine
|
|
||||||
from ..ml.component_models import Tok2Vec
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Parser:
|
cdef class Parser:
|
||||||
"""
|
"""
|
||||||
Base class of the DependencyParser and EntityRecognizer.
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
"""
|
"""
|
||||||
@classmethod
|
|
||||||
def Model(cls, nr_class, **cfg):
|
|
||||||
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
|
||||||
subword_features = util.env_opt('subword_features',
|
|
||||||
cfg.get('subword_features', True))
|
|
||||||
conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4))
|
|
||||||
conv_window = util.env_opt('conv_window', cfg.get('conv_window', 1))
|
|
||||||
t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
|
|
||||||
bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
|
|
||||||
self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
|
|
||||||
nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature)
|
|
||||||
if depth not in (0, 1):
|
|
||||||
raise ValueError(TempErrors.T004.format(value=depth))
|
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
|
||||||
cfg.get('maxout_pieces', 2))
|
|
||||||
token_vector_width = util.env_opt('token_vector_width',
|
|
||||||
cfg.get('token_vector_width', 96))
|
|
||||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
|
|
||||||
if depth == 0:
|
|
||||||
hidden_width = nr_class
|
|
||||||
parser_maxout_pieces = 1
|
|
||||||
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
|
|
||||||
pretrained_vectors = cfg.get('pretrained_vectors', None)
|
|
||||||
tok2vec = Tok2Vec(width=token_vector_width,
|
|
||||||
embed_size=embed_size,
|
|
||||||
conv_depth=conv_depth,
|
|
||||||
window_size=conv_window,
|
|
||||||
cnn_maxout_pieces=t2v_pieces,
|
|
||||||
subword_features=subword_features,
|
|
||||||
pretrained_vectors=pretrained_vectors,
|
|
||||||
bilstm_depth=bilstm_depth)
|
|
||||||
tok2vec = chain(tok2vec, list2array())
|
|
||||||
tok2vec.set_dim("nO", token_vector_width)
|
|
||||||
lower = PrecomputableAffine(hidden_width,
|
|
||||||
nF=nr_feature_tokens, nI=token_vector_width,
|
|
||||||
nP=parser_maxout_pieces)
|
|
||||||
lower.set_dim("nP", parser_maxout_pieces)
|
|
||||||
if depth == 1:
|
|
||||||
with use_ops('numpy'):
|
|
||||||
upper = Linear(nr_class, hidden_width, init_W=zero_init)
|
|
||||||
else:
|
|
||||||
upper = None
|
|
||||||
|
|
||||||
cfg = {
|
|
||||||
'nr_class': nr_class,
|
|
||||||
'nr_feature_tokens': nr_feature_tokens,
|
|
||||||
'hidden_depth': depth,
|
|
||||||
'token_vector_width': token_vector_width,
|
|
||||||
'hidden_width': hidden_width,
|
|
||||||
'maxout_pieces': parser_maxout_pieces,
|
|
||||||
'pretrained_vectors': pretrained_vectors,
|
|
||||||
'bilstm_depth': bilstm_depth,
|
|
||||||
'self_attn_depth': self_attn_depth,
|
|
||||||
'conv_depth': conv_depth,
|
|
||||||
'window_size': conv_window,
|
|
||||||
'embed_size': embed_size,
|
|
||||||
'cnn_maxout_pieces': t2v_pieces
|
|
||||||
}
|
|
||||||
model = ParserModel(tok2vec, lower, upper)
|
|
||||||
model.initialize()
|
|
||||||
return model, cfg
|
|
||||||
|
|
||||||
name = 'base_parser'
|
name = 'base_parser'
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
|
||||||
|
def __init__(self, Vocab vocab, model, **cfg):
|
||||||
"""Create a Parser.
|
"""Create a Parser.
|
||||||
|
|
||||||
vocab (Vocab): The vocabulary object. Must be shared with documents
|
vocab (Vocab): The vocabulary object. Must be shared with documents
|
||||||
to be processed. The value is set to the `.vocab` attribute.
|
to be processed. The value is set to the `.vocab` attribute.
|
||||||
moves (TransitionSystem): Defines how the parse-state is created,
|
**cfg: Configuration parameters. Set to the `.cfg` attribute.
|
||||||
updated and evaluated. The value is set to the .moves attribute
|
If it doesn't include a value for 'moves', a new instance is
|
||||||
unless True (default), in which case a new instance is created with
|
created with `self.TransitionSystem()`. This defines how the
|
||||||
`Parser.Moves()`.
|
parse-state is created, updated and evaluated.
|
||||||
model (object): Defines how the parse-state is created, updated and
|
|
||||||
evaluated. The value is set to the .model attribute. If set to True
|
|
||||||
(default), a new instance will be created with `Parser.Model()`
|
|
||||||
in parser.begin_training(), parser.from_disk() or parser.from_bytes().
|
|
||||||
**cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute
|
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
if moves is True:
|
moves = cfg.get("moves", None)
|
||||||
self.moves = self.TransitionSystem(self.vocab.strings)
|
if moves is None:
|
||||||
else:
|
# defined by EntityRecognizer as a BiluoPushDown
|
||||||
self.moves = moves
|
moves = self.TransitionSystem(self.vocab.strings)
|
||||||
if 'beam_width' not in cfg:
|
self.moves = moves
|
||||||
cfg['beam_width'] = util.env_opt('beam_width', 1)
|
cfg.setdefault('min_action_freq', 30)
|
||||||
if 'beam_density' not in cfg:
|
cfg.setdefault('learn_tokens', False)
|
||||||
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
cfg.setdefault('beam_width', 1)
|
||||||
if 'beam_update_prob' not in cfg:
|
cfg.setdefault('beam_update_prob', 1.0) # or 0.5 (both defaults were previously used)
|
||||||
cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
|
|
||||||
cfg.setdefault('cnn_maxout_pieces', 3)
|
|
||||||
cfg.setdefault("nr_feature_tokens", self.nr_feature)
|
|
||||||
self.cfg = cfg
|
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.set_output(self.moves.n_moves)
|
||||||
|
self.cfg = cfg
|
||||||
self._multitasks = []
|
self._multitasks = []
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_nlp(cls, nlp, **cfg):
|
def from_nlp(cls, nlp, model, **cfg):
|
||||||
return cls(nlp.vocab, **cfg)
|
return cls(nlp.vocab, model, **cfg)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
@ -163,8 +91,6 @@ cdef class Parser:
|
||||||
names.append(name)
|
names.append(name)
|
||||||
return names
|
return names
|
||||||
|
|
||||||
nr_feature = 8
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
|
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
|
||||||
|
@ -173,7 +99,7 @@ cdef class Parser:
|
||||||
@property
|
@property
|
||||||
def tok2vec(self):
|
def tok2vec(self):
|
||||||
'''Return the embedding and convolutional layer of the model.'''
|
'''Return the embedding and convolutional layer of the model.'''
|
||||||
return None if self.model in (None, True, False) else self.model.tok2vec
|
return self.model.tok2vec
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def postprocesses(self):
|
def postprocesses(self):
|
||||||
|
@ -190,10 +116,7 @@ cdef class Parser:
|
||||||
self._resize()
|
self._resize()
|
||||||
|
|
||||||
def _resize(self):
|
def _resize(self):
|
||||||
if "nr_class" in self.cfg:
|
self.model.resize_output(self.moves.n_moves)
|
||||||
self.cfg["nr_class"] = self.moves.n_moves
|
|
||||||
if self.model not in (True, False, None):
|
|
||||||
self.model.resize_output(self.moves.n_moves)
|
|
||||||
if self._rehearsal_model not in (True, False, None):
|
if self._rehearsal_model not in (True, False, None):
|
||||||
self._rehearsal_model.resize_output(self.moves.n_moves)
|
self._rehearsal_model.resize_output(self.moves.n_moves)
|
||||||
|
|
||||||
|
@ -227,7 +150,7 @@ cdef class Parser:
|
||||||
doc (Doc): The document to be processed.
|
doc (Doc): The document to be processed.
|
||||||
"""
|
"""
|
||||||
if beam_width is None:
|
if beam_width is None:
|
||||||
beam_width = self.cfg.get('beam_width', 1)
|
beam_width = self.cfg['beam_width']
|
||||||
beam_density = self.cfg.get('beam_density', 0.)
|
beam_density = self.cfg.get('beam_density', 0.)
|
||||||
states = self.predict([doc], beam_width=beam_width,
|
states = self.predict([doc], beam_width=beam_width,
|
||||||
beam_density=beam_density)
|
beam_density=beam_density)
|
||||||
|
@ -243,7 +166,7 @@ cdef class Parser:
|
||||||
YIELDS (Doc): Documents, in order.
|
YIELDS (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
if beam_width is None:
|
if beam_width is None:
|
||||||
beam_width = self.cfg.get('beam_width', 1)
|
beam_width = self.cfg['beam_width']
|
||||||
beam_density = self.cfg.get('beam_density', 0.)
|
beam_density = self.cfg.get('beam_density', 0.)
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
for batch in util.minibatch(docs, size=batch_size):
|
for batch in util.minibatch(docs, size=batch_size):
|
||||||
|
@ -264,13 +187,7 @@ cdef class Parser:
|
||||||
else:
|
else:
|
||||||
yield from batch_in_order
|
yield from batch_in_order
|
||||||
|
|
||||||
def require_model(self):
|
|
||||||
"""Raise an error if the component's model is not initialized."""
|
|
||||||
if getattr(self, 'model', None) in (None, True, False):
|
|
||||||
raise ValueError(Errors.E109.format(name=self.name))
|
|
||||||
|
|
||||||
def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.):
|
def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.):
|
||||||
self.require_model()
|
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
|
@ -313,11 +230,11 @@ cdef class Parser:
|
||||||
# if labels are missing. We therefore have to check whether we need to
|
# if labels are missing. We therefore have to check whether we need to
|
||||||
# expand our model output.
|
# expand our model output.
|
||||||
self._resize()
|
self._resize()
|
||||||
|
cdef int nr_feature = self.model.lower.get_dim("nF")
|
||||||
model = self.model.predict(docs)
|
model = self.model.predict(docs)
|
||||||
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
|
token_ids = numpy.zeros((len(docs) * beam_width, nr_feature),
|
||||||
dtype='i', order='C')
|
dtype='i', order='C')
|
||||||
cdef int* c_ids
|
cdef int* c_ids
|
||||||
cdef int nr_feature = self.cfg["nr_feature_tokens"]
|
|
||||||
cdef int n_states
|
cdef int n_states
|
||||||
model = self.model.predict(docs)
|
model = self.model.predict(docs)
|
||||||
todo = [beam for beam in beams if not beam.is_done]
|
todo = [beam for beam in beams if not beam.is_done]
|
||||||
|
@ -430,7 +347,6 @@ cdef class Parser:
|
||||||
return [b for b in beams if not b.is_done]
|
return [b for b in beams if not b.is_done]
|
||||||
|
|
||||||
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
|
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
|
||||||
self.require_model()
|
|
||||||
examples = Example.to_example_objects(examples)
|
examples = Example.to_example_objects(examples)
|
||||||
|
|
||||||
if losses is None:
|
if losses is None:
|
||||||
|
@ -440,9 +356,9 @@ cdef class Parser:
|
||||||
multitask.update(examples, drop=drop, sgd=sgd)
|
multitask.update(examples, drop=drop, sgd=sgd)
|
||||||
# The probability we use beam update, instead of falling back to
|
# The probability we use beam update, instead of falling back to
|
||||||
# a greedy update
|
# a greedy update
|
||||||
beam_update_prob = self.cfg.get('beam_update_prob', 0.5)
|
beam_update_prob = self.cfg['beam_update_prob']
|
||||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob:
|
if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
|
||||||
return self.update_beam(examples, self.cfg.get('beam_width', 1),
|
return self.update_beam(examples, self.cfg['beam_width'],
|
||||||
drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations,
|
drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations,
|
||||||
beam_density=self.cfg.get('beam_density', 0.001))
|
beam_density=self.cfg.get('beam_density', 0.001))
|
||||||
|
|
||||||
|
@ -533,7 +449,7 @@ cdef class Parser:
|
||||||
set_dropout_rate(self.model, drop)
|
set_dropout_rate(self.model, drop)
|
||||||
model, backprop_tok2vec = self.model.begin_update(docs)
|
model, backprop_tok2vec = self.model.begin_update(docs)
|
||||||
states_d_scores, backprops, beams = _beam_utils.update_beam(
|
states_d_scores, backprops, beams = _beam_utils.update_beam(
|
||||||
self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds,
|
self.moves, self.model.lower.get_dim("nF"), 10000, states, golds,
|
||||||
model.state2vec, model.vec2scores, width, losses=losses,
|
model.state2vec, model.vec2scores, width, losses=losses,
|
||||||
beam_density=beam_density)
|
beam_density=beam_density)
|
||||||
for i, d_scores in enumerate(states_d_scores):
|
for i, d_scores in enumerate(states_d_scores):
|
||||||
|
@ -562,8 +478,6 @@ cdef class Parser:
|
||||||
keyed by the parameter ID. The values are (weights, gradients) tuples.
|
keyed by the parameter ID. The values are (weights, gradients) tuples.
|
||||||
"""
|
"""
|
||||||
gradients = {}
|
gradients = {}
|
||||||
if self.model in (None, True, False):
|
|
||||||
return gradients
|
|
||||||
queue = [self.model]
|
queue = [self.model]
|
||||||
seen = set()
|
seen = set()
|
||||||
for node in queue:
|
for node in queue:
|
||||||
|
@ -647,45 +561,40 @@ cdef class Parser:
|
||||||
def create_optimizer(self):
|
def create_optimizer(self):
|
||||||
return create_default_optimizer()
|
return create_default_optimizer()
|
||||||
|
|
||||||
def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg):
|
def set_output(self, nO):
|
||||||
if 'model' in cfg:
|
if self.model.upper.has_dim("nO") is None:
|
||||||
self.model = cfg['model']
|
self.model.upper.set_dim("nO", nO)
|
||||||
|
|
||||||
|
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
|
||||||
|
self.cfg.update(kwargs)
|
||||||
if not hasattr(get_examples, '__call__'):
|
if not hasattr(get_examples, '__call__'):
|
||||||
gold_tuples = get_examples
|
gold_tuples = get_examples
|
||||||
get_examples = lambda: gold_tuples
|
get_examples = lambda: gold_tuples
|
||||||
cfg.setdefault('min_action_freq', 30)
|
|
||||||
actions = self.moves.get_actions(gold_parses=get_examples(),
|
actions = self.moves.get_actions(gold_parses=get_examples(),
|
||||||
min_freq=cfg.get('min_action_freq', 30),
|
min_freq=self.cfg['min_action_freq'],
|
||||||
learn_tokens=self.cfg.get("learn_tokens", False))
|
learn_tokens=self.cfg["learn_tokens"])
|
||||||
for action, labels in self.moves.labels.items():
|
for action, labels in self.moves.labels.items():
|
||||||
actions.setdefault(action, {})
|
actions.setdefault(action, {})
|
||||||
for label, freq in labels.items():
|
for label, freq in labels.items():
|
||||||
if label not in actions[action]:
|
if label not in actions[action]:
|
||||||
actions[action][label] = freq
|
actions[action][label] = freq
|
||||||
self.moves.initialize_actions(actions)
|
self.moves.initialize_actions(actions)
|
||||||
cfg.setdefault('token_vector_width', 96)
|
# make sure we resize so we have an appropriate upper layer
|
||||||
if self.model is True:
|
self._resize()
|
||||||
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
if sgd is None:
|
||||||
if sgd is None:
|
sgd = self.create_optimizer()
|
||||||
sgd = self.create_optimizer()
|
doc_sample = []
|
||||||
doc_sample = []
|
gold_sample = []
|
||||||
gold_sample = []
|
for example in islice(get_examples(), 1000):
|
||||||
for example in islice(get_examples(), 1000):
|
parses = example.get_gold_parses(merge=False, vocab=self.vocab)
|
||||||
parses = example.get_gold_parses(merge=False, vocab=self.vocab)
|
for doc, gold in parses:
|
||||||
for doc, gold in parses:
|
doc_sample.append(doc)
|
||||||
doc_sample.append(doc)
|
gold_sample.append(gold)
|
||||||
gold_sample.append(gold)
|
|
||||||
self.model.initialize(doc_sample, gold_sample)
|
self.model.initialize(doc_sample, gold_sample)
|
||||||
if pipeline is not None:
|
if pipeline is not None:
|
||||||
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg)
|
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
else:
|
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
if self.model.upper.has_dim("nO") is None:
|
|
||||||
self.model.upper.set_dim("nO", self.moves.n_moves)
|
|
||||||
self.model.initialize()
|
|
||||||
self.cfg.update(cfg)
|
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
def _get_doc(self, example):
|
def _get_doc(self, example):
|
||||||
|
@ -709,28 +618,24 @@ cdef class Parser:
|
||||||
'vocab': lambda p: self.vocab.from_disk(p),
|
'vocab': lambda p: self.vocab.from_disk(p),
|
||||||
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
|
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
|
||||||
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
|
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
|
||||||
'model': lambda p: None
|
'model': lambda p: None,
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||||
util.from_disk(path, deserializers, exclude)
|
util.from_disk(path, deserializers, exclude)
|
||||||
if 'model' not in exclude:
|
if 'model' not in exclude:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
if self.model is True:
|
|
||||||
self.model, cfg = self.Model(**self.cfg)
|
|
||||||
else:
|
|
||||||
cfg = {}
|
|
||||||
with (path / 'model').open('rb') as file_:
|
with (path / 'model').open('rb') as file_:
|
||||||
bytes_data = file_.read()
|
bytes_data = file_.read()
|
||||||
try:
|
try:
|
||||||
|
self._resize()
|
||||||
self.model.from_bytes(bytes_data)
|
self.model.from_bytes(bytes_data)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError(Errors.E149)
|
raise ValueError(Errors.E149)
|
||||||
self.cfg.update(cfg)
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
serializers = {
|
serializers = {
|
||||||
"model": lambda: (self.model.to_bytes() if self.model is not True else True),
|
"model": lambda: (self.model.to_bytes()),
|
||||||
"vocab": lambda: self.vocab.to_bytes(),
|
"vocab": lambda: self.vocab.to_bytes(),
|
||||||
"moves": lambda: self.moves.to_bytes(exclude=["strings"]),
|
"moves": lambda: self.moves.to_bytes(exclude=["strings"]),
|
||||||
"cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
|
"cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
|
||||||
|
@ -743,22 +648,14 @@ cdef class Parser:
|
||||||
"vocab": lambda b: self.vocab.from_bytes(b),
|
"vocab": lambda b: self.vocab.from_bytes(b),
|
||||||
"moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
|
"moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
|
||||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||||
"model": lambda b: None
|
"model": lambda b: None,
|
||||||
}
|
}
|
||||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
if 'model' not in exclude:
|
if 'model' not in exclude:
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
|
||||||
if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
|
|
||||||
self.cfg['pretrained_vectors'] = self.vocab.vectors
|
|
||||||
if self.model is True:
|
|
||||||
self.model, cfg = self.Model(**self.cfg)
|
|
||||||
else:
|
|
||||||
cfg = {}
|
|
||||||
if 'model' in msg:
|
if 'model' in msg:
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(msg['model'])
|
self.model.from_bytes(msg['model'])
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError(Errors.E149)
|
raise ValueError(Errors.E149)
|
||||||
self.cfg.update(cfg)
|
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -3,12 +3,13 @@ from spacy.tokens import Span
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
from ...ml.models.defaults import default_ner
|
||||||
|
|
||||||
|
|
||||||
def test_doc_add_entities_set_ents_iob(en_vocab):
|
def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||||
text = ["This", "is", "a", "lion"]
|
text = ["This", "is", "a", "lion"]
|
||||||
doc = get_doc(en_vocab, text)
|
doc = get_doc(en_vocab, text)
|
||||||
ner = EntityRecognizer(en_vocab)
|
ner = EntityRecognizer(en_vocab, default_ner())
|
||||||
ner.begin_training([])
|
ner.begin_training([])
|
||||||
ner(doc)
|
ner(doc)
|
||||||
assert len(list(doc.ents)) == 0
|
assert len(list(doc.ents)) == 0
|
||||||
|
@ -24,7 +25,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||||
def test_ents_reset(en_vocab):
|
def test_ents_reset(en_vocab):
|
||||||
text = ["This", "is", "a", "lion"]
|
text = ["This", "is", "a", "lion"]
|
||||||
doc = get_doc(en_vocab, text)
|
doc = get_doc(en_vocab, text)
|
||||||
ner = EntityRecognizer(en_vocab)
|
ner = EntityRecognizer(en_vocab, default_ner())
|
||||||
ner.begin_training([])
|
ner.begin_training([])
|
||||||
ner(doc)
|
ner(doc)
|
||||||
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
||||||
|
|
|
@ -3,6 +3,8 @@ from thinc.api import Adam, NumpyOps
|
||||||
from spacy.attrs import NORM
|
from spacy.attrs import NORM
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
from spacy.ml.models.defaults import default_parser, default_ner
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.pipeline import DependencyParser, EntityRecognizer
|
from spacy.pipeline import DependencyParser, EntityRecognizer
|
||||||
from spacy.util import fix_random_seed
|
from spacy.util import fix_random_seed
|
||||||
|
@ -15,7 +17,7 @@ def vocab():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(vocab):
|
def parser(vocab):
|
||||||
parser = DependencyParser(vocab)
|
parser = DependencyParser(vocab, default_parser())
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
@ -55,27 +57,31 @@ def test_add_label(parser):
|
||||||
|
|
||||||
|
|
||||||
def test_add_label_deserializes_correctly():
|
def test_add_label_deserializes_correctly():
|
||||||
ner1 = EntityRecognizer(Vocab())
|
ner1 = EntityRecognizer(Vocab(), default_ner())
|
||||||
ner1.add_label("C")
|
ner1.add_label("C")
|
||||||
ner1.add_label("B")
|
ner1.add_label("B")
|
||||||
ner1.add_label("A")
|
ner1.add_label("A")
|
||||||
ner1.begin_training([])
|
ner1.begin_training([])
|
||||||
ner2 = EntityRecognizer(Vocab()).from_bytes(ner1.to_bytes())
|
ner2 = EntityRecognizer(Vocab(), default_ner())
|
||||||
|
|
||||||
|
# the second model needs to be resized before we can call from_bytes
|
||||||
|
ner2.model.resize_output(ner1.moves.n_moves)
|
||||||
|
ner2.from_bytes(ner1.to_bytes())
|
||||||
assert ner1.moves.n_moves == ner2.moves.n_moves
|
assert ner1.moves.n_moves == ner2.moves.n_moves
|
||||||
for i in range(ner1.moves.n_moves):
|
for i in range(ner1.moves.n_moves):
|
||||||
assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i)
|
assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"pipe_cls,n_moves", [(DependencyParser, 5), (EntityRecognizer, 4)]
|
"pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())]
|
||||||
)
|
)
|
||||||
def test_add_label_get_label(pipe_cls, n_moves):
|
def test_add_label_get_label(pipe_cls, n_moves, model):
|
||||||
"""Test that added labels are returned correctly. This test was added to
|
"""Test that added labels are returned correctly. This test was added to
|
||||||
test for a bug in DependencyParser.labels that'd cause it to fail when
|
test for a bug in DependencyParser.labels that'd cause it to fail when
|
||||||
splitting the move names.
|
splitting the move names.
|
||||||
"""
|
"""
|
||||||
labels = ["A", "B", "C"]
|
labels = ["A", "B", "C"]
|
||||||
pipe = pipe_cls(Vocab())
|
pipe = pipe_cls(Vocab(), model)
|
||||||
for label in labels:
|
for label in labels:
|
||||||
pipe.add_label(label)
|
pipe.add_label(label)
|
||||||
assert len(pipe.move_names) == len(labels) * n_moves
|
assert len(pipe.move_names) == len(labels) * n_moves
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
from spacy.ml.models.defaults import default_parser
|
||||||
from spacy.pipeline import DependencyParser
|
from spacy.pipeline import DependencyParser
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
|
@ -136,7 +138,7 @@ def test_get_oracle_actions():
|
||||||
deps.append(dep)
|
deps.append(dep)
|
||||||
ents.append(ent)
|
ents.append(ent)
|
||||||
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
|
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
|
||||||
parser = DependencyParser(doc.vocab)
|
parser = DependencyParser(doc.vocab, default_parser())
|
||||||
parser.moves.add_action(0, "")
|
parser.moves.add_action(0, "")
|
||||||
parser.moves.add_action(1, "")
|
parser.moves.add_action(1, "")
|
||||||
parser.moves.add_action(1, "")
|
parser.moves.add_action(1, "")
|
||||||
|
|
|
@ -1,10 +1,15 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from spacy import util
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
from spacy.ml.models.defaults import default_ner
|
||||||
|
|
||||||
from spacy.pipeline import EntityRecognizer, EntityRuler
|
from spacy.pipeline import EntityRecognizer, EntityRuler
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.syntax.ner import BiluoPushDown
|
from spacy.syntax.ner import BiluoPushDown
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
|
|
||||||
|
from spacy.tests.util import make_tempdir
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
|
@ -134,7 +139,7 @@ def test_accept_blocked_token():
|
||||||
# 1. test normal behaviour
|
# 1. test normal behaviour
|
||||||
nlp1 = English()
|
nlp1 = English()
|
||||||
doc1 = nlp1("I live in New York")
|
doc1 = nlp1("I live in New York")
|
||||||
ner1 = EntityRecognizer(doc1.vocab)
|
ner1 = EntityRecognizer(doc1.vocab, default_ner())
|
||||||
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
|
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
|
||||||
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
|
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
|
||||||
|
|
||||||
|
@ -152,7 +157,7 @@ def test_accept_blocked_token():
|
||||||
# 2. test blocking behaviour
|
# 2. test blocking behaviour
|
||||||
nlp2 = English()
|
nlp2 = English()
|
||||||
doc2 = nlp2("I live in New York")
|
doc2 = nlp2("I live in New York")
|
||||||
ner2 = EntityRecognizer(doc2.vocab)
|
ner2 = EntityRecognizer(doc2.vocab, default_ner())
|
||||||
|
|
||||||
# set "New York" to a blocked entity
|
# set "New York" to a blocked entity
|
||||||
doc2.ents = [(0, 3, 5)]
|
doc2.ents = [(0, 3, 5)]
|
||||||
|
@ -188,7 +193,7 @@ def test_overwrite_token():
|
||||||
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
|
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
|
||||||
|
|
||||||
# Check that a new ner can overwrite O
|
# Check that a new ner can overwrite O
|
||||||
ner2 = EntityRecognizer(doc.vocab)
|
ner2 = EntityRecognizer(doc.vocab, default_ner())
|
||||||
ner2.moves.add_action(5, "")
|
ner2.moves.add_action(5, "")
|
||||||
ner2.add_label("GPE")
|
ner2.add_label("GPE")
|
||||||
state = ner2.moves.init_batch([doc])[0]
|
state = ner2.moves.init_batch([doc])[0]
|
||||||
|
@ -199,6 +204,17 @@ def test_overwrite_token():
|
||||||
assert ner2.moves.is_valid(state, "L-GPE")
|
assert ner2.moves.is_valid(state, "L-GPE")
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_ner():
|
||||||
|
nlp = English()
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
ner.add_label("MY_LABEL")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
doc = nlp("John is watching the news about Croatia's elections")
|
||||||
|
# if this goes wrong, the initialization of the parser's upper layer is probably broken
|
||||||
|
assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
|
||||||
|
|
||||||
|
|
||||||
def test_ruler_before_ner():
|
def test_ruler_before_ner():
|
||||||
""" Test that an NER works after an entity_ruler: the second can add annotations """
|
""" Test that an NER works after an entity_ruler: the second can add annotations """
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
@ -214,7 +230,6 @@ def test_ruler_before_ner():
|
||||||
untrained_ner.add_label("MY_LABEL")
|
untrained_ner.add_label("MY_LABEL")
|
||||||
nlp.add_pipe(untrained_ner)
|
nlp.add_pipe(untrained_ner)
|
||||||
nlp.begin_training()
|
nlp.begin_training()
|
||||||
|
|
||||||
doc = nlp("This is Antti Korhonen speaking in Finland")
|
doc = nlp("This is Antti Korhonen speaking in Finland")
|
||||||
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
||||||
expected_types = ["THING", "", "", "", "", "", ""]
|
expected_types = ["THING", "", "", "", "", "", ""]
|
||||||
|
@ -261,28 +276,7 @@ def test_block_ner():
|
||||||
assert [token.ent_type_ for token in doc] == expected_types
|
assert [token.ent_type_ for token in doc] == expected_types
|
||||||
|
|
||||||
|
|
||||||
def test_change_number_features():
|
def test_overfitting_IO():
|
||||||
# Test the default number features
|
|
||||||
nlp = English()
|
|
||||||
ner = nlp.create_pipe("ner")
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
ner.add_label("PERSON")
|
|
||||||
nlp.begin_training()
|
|
||||||
assert ner.model.lower.get_dim("nF") == ner.nr_feature
|
|
||||||
# Test we can change it
|
|
||||||
nlp = English()
|
|
||||||
ner = nlp.create_pipe("ner")
|
|
||||||
nlp.add_pipe(ner)
|
|
||||||
ner.add_label("PERSON")
|
|
||||||
nlp.begin_training(
|
|
||||||
component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
|
|
||||||
)
|
|
||||||
assert ner.model.lower.get_dim("nF") == 3
|
|
||||||
# Test the model runs
|
|
||||||
nlp("hello world")
|
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting():
|
|
||||||
# Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly
|
# Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ner = nlp.create_pipe("ner")
|
ner = nlp.create_pipe("ner")
|
||||||
|
@ -301,11 +295,20 @@ def test_overfitting():
|
||||||
test_text = "I like London."
|
test_text = "I like London."
|
||||||
doc = nlp(test_text)
|
doc = nlp(test_text)
|
||||||
ents = doc.ents
|
ents = doc.ents
|
||||||
|
|
||||||
assert len(ents) == 1
|
assert len(ents) == 1
|
||||||
assert ents[0].text == "London"
|
assert ents[0].text == "London"
|
||||||
assert ents[0].label_ == "LOC"
|
assert ents[0].label_ == "LOC"
|
||||||
|
|
||||||
|
# Also test the results are still the same after IO
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
nlp.to_disk(tmp_dir)
|
||||||
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
|
doc2 = nlp2(test_text)
|
||||||
|
ents2 = doc2.ents
|
||||||
|
assert len(ents2) == 1
|
||||||
|
assert ents2[0].text == "London"
|
||||||
|
assert ents2[0].label_ == "LOC"
|
||||||
|
|
||||||
|
|
||||||
class BlockerComponent1(object):
|
class BlockerComponent1(object):
|
||||||
name = "my_blocker"
|
name = "my_blocker"
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.ml.component_models import Tok2Vec
|
from spacy.ml.models.defaults import default_parser, default_tok2vec
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.syntax.arc_eager import ArcEager
|
from spacy.syntax.arc_eager import ArcEager
|
||||||
from spacy.syntax.nn_parser import Parser
|
from spacy.syntax.nn_parser import Parser
|
||||||
|
from spacy.syntax._parser_model import ParserModel
|
||||||
from spacy.tokens.doc import Doc
|
from spacy.tokens.doc import Doc
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
|
|
||||||
|
@ -20,19 +21,22 @@ def arc_eager(vocab):
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def tok2vec():
|
def tok2vec():
|
||||||
tok2vec = Tok2Vec(8, 100)
|
tok2vec = default_tok2vec()
|
||||||
tok2vec.initialize()
|
tok2vec.initialize()
|
||||||
return tok2vec
|
return tok2vec
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(vocab, arc_eager):
|
def parser(vocab, arc_eager):
|
||||||
return Parser(vocab, moves=arc_eager, model=None)
|
return Parser(vocab, model=default_parser(), moves=arc_eager)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def model(arc_eager, tok2vec):
|
def model(arc_eager, tok2vec, vocab):
|
||||||
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.get_dim("nO"))[0]
|
model = default_parser()
|
||||||
|
model.resize_output(arc_eager.n_moves)
|
||||||
|
model.initialize()
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -46,11 +50,11 @@ def gold(doc):
|
||||||
|
|
||||||
|
|
||||||
def test_can_init_nn_parser(parser):
|
def test_can_init_nn_parser(parser):
|
||||||
assert parser.model is None
|
assert isinstance(parser.model, ParserModel)
|
||||||
|
|
||||||
|
|
||||||
def test_build_model(parser):
|
def test_build_model(parser, vocab):
|
||||||
parser.model = Parser.Model(parser.moves.n_moves, hist_size=0)[0]
|
parser.model = Parser(vocab, model=default_parser(), moves=parser.moves).model
|
||||||
assert parser.model is not None
|
assert parser.model is not None
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ import pytest
|
||||||
import numpy
|
import numpy
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
|
from spacy.ml.models.defaults import default_parser
|
||||||
from spacy.pipeline import DependencyParser
|
from spacy.pipeline import DependencyParser
|
||||||
from spacy.syntax.arc_eager import ArcEager
|
from spacy.syntax.arc_eager import ArcEager
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
@ -93,7 +94,7 @@ def test_beam_advance_too_few_scores(beam, scores):
|
||||||
|
|
||||||
def test_beam_parse():
|
def test_beam_parse():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
|
nlp.add_pipe(DependencyParser(nlp.vocab, default_parser()), name="parser")
|
||||||
nlp.parser.add_label("nsubj")
|
nlp.parser.add_label("nsubj")
|
||||||
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
|
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
|
||||||
doc = nlp.make_doc("Australia is a country")
|
doc = nlp.make_doc("Australia is a country")
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from ..util import get_doc, apply_transition_sequence
|
from ..util import get_doc, apply_transition_sequence, make_tempdir
|
||||||
|
from ... import util
|
||||||
|
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
(
|
(
|
||||||
|
@ -182,7 +183,7 @@ def test_parser_set_sent_starts(en_vocab):
|
||||||
assert token.head in sent
|
assert token.head in sent
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting():
|
def test_overfitting_IO():
|
||||||
# Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
|
# Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
|
||||||
nlp = English()
|
nlp = English()
|
||||||
parser = nlp.create_pipe("parser")
|
parser = nlp.create_pipe("parser")
|
||||||
|
@ -200,7 +201,15 @@ def test_overfitting():
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like securities."
|
test_text = "I like securities."
|
||||||
doc = nlp(test_text)
|
doc = nlp(test_text)
|
||||||
|
|
||||||
assert doc[0].dep_ is "nsubj"
|
assert doc[0].dep_ is "nsubj"
|
||||||
assert doc[2].dep_ is "dobj"
|
assert doc[2].dep_ is "dobj"
|
||||||
assert doc[3].dep_ is "punct"
|
assert doc[3].dep_ is "punct"
|
||||||
|
|
||||||
|
# Also test the results are still the same after IO
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
nlp.to_disk(tmp_dir)
|
||||||
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
|
doc2 = nlp2(test_text)
|
||||||
|
assert doc2[0].dep_ is "nsubj"
|
||||||
|
assert doc2[2].dep_ is "dobj"
|
||||||
|
assert doc2[3].dep_ is "punct"
|
||||||
|
|
|
@ -3,6 +3,8 @@ from thinc.api import Adam
|
||||||
from spacy.attrs import NORM
|
from spacy.attrs import NORM
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
from spacy.ml.models.defaults import default_parser
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.pipeline import DependencyParser
|
from spacy.pipeline import DependencyParser
|
||||||
|
|
||||||
|
@ -14,7 +16,7 @@ def vocab():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(vocab):
|
def parser(vocab):
|
||||||
parser = DependencyParser(vocab)
|
parser = DependencyParser(vocab, default_parser())
|
||||||
parser.cfg["token_vector_width"] = 4
|
parser.cfg["token_vector_width"] = 4
|
||||||
parser.cfg["hidden_width"] = 32
|
parser.cfg["hidden_width"] = 32
|
||||||
# parser.add_label('right')
|
# parser.add_label('right')
|
||||||
|
|
|
@ -111,7 +111,8 @@ def test_component_factories_from_nlp():
|
||||||
nlp.add_pipe(pipe)
|
nlp.add_pipe(pipe)
|
||||||
assert nlp("hello world")
|
assert nlp("hello world")
|
||||||
# The first argument here is the class itself, so we're accepting any here
|
# The first argument here is the class itself, so we're accepting any here
|
||||||
mock.assert_called_once_with(ANY, nlp, foo="bar")
|
# The model will be initialized to None by the factory
|
||||||
|
mock.assert_called_once_with(ANY, nlp, None, foo="bar")
|
||||||
|
|
||||||
|
|
||||||
def test_analysis_validate_attrs_valid():
|
def test_analysis_validate_attrs_valid():
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from spacy import util
|
||||||
|
from spacy.lang.en import English
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
|
from spacy.tests.util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
def test_label_types():
|
def test_label_types():
|
||||||
|
@ -18,9 +22,9 @@ TRAIN_DATA = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting():
|
def test_overfitting_IO():
|
||||||
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
|
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
|
||||||
nlp = Language()
|
nlp = English()
|
||||||
tagger = nlp.create_pipe("tagger")
|
tagger = nlp.create_pipe("tagger")
|
||||||
for tag, values in TAG_MAP.items():
|
for tag, values in TAG_MAP.items():
|
||||||
tagger.add_label(tag, values)
|
tagger.add_label(tag, values)
|
||||||
|
@ -35,8 +39,17 @@ def test_overfitting():
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like blue eggs"
|
test_text = "I like blue eggs"
|
||||||
doc = nlp(test_text)
|
doc = nlp(test_text)
|
||||||
|
|
||||||
assert doc[0].tag_ is "N"
|
assert doc[0].tag_ is "N"
|
||||||
assert doc[1].tag_ is "V"
|
assert doc[1].tag_ is "V"
|
||||||
assert doc[2].tag_ is "J"
|
assert doc[2].tag_ is "J"
|
||||||
assert doc[3].tag_ is "N"
|
assert doc[3].tag_ is "N"
|
||||||
|
|
||||||
|
# Also test the results are still the same after IO
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
nlp.to_disk(tmp_dir)
|
||||||
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
|
doc2 = nlp2(test_text)
|
||||||
|
assert doc2[0].tag_ is "N"
|
||||||
|
assert doc2[1].tag_ is "V"
|
||||||
|
assert doc2[2].tag_ is "J"
|
||||||
|
assert doc2[3].tag_ is "N"
|
||||||
|
|
|
@ -1,8 +1,12 @@
|
||||||
import pytest
|
import pytest
|
||||||
import random
|
import random
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
|
||||||
|
from spacy import util
|
||||||
|
from spacy.lang.en import English
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.pipeline import TextCategorizer
|
from spacy.pipeline import TextCategorizer
|
||||||
|
from spacy.tests.util import make_tempdir
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
|
|
||||||
|
@ -74,9 +78,9 @@ def test_label_types():
|
||||||
nlp.get_pipe("textcat").add_label(9)
|
nlp.get_pipe("textcat").add_label(9)
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting():
|
def test_overfitting_IO():
|
||||||
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
|
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
|
||||||
nlp = Language()
|
nlp = English()
|
||||||
textcat = nlp.create_pipe("textcat")
|
textcat = nlp.create_pipe("textcat")
|
||||||
for _, annotations in TRAIN_DATA:
|
for _, annotations in TRAIN_DATA:
|
||||||
for label, value in annotations.get("cats").items():
|
for label, value in annotations.get("cats").items():
|
||||||
|
@ -87,11 +91,21 @@ def test_overfitting():
|
||||||
for i in range(50):
|
for i in range(50):
|
||||||
losses = {}
|
losses = {}
|
||||||
nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
|
nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
|
||||||
assert losses["textcat"] < 0.00001
|
assert losses["textcat"] < 0.01
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I am happy."
|
test_text = "I am happy."
|
||||||
doc = nlp(test_text)
|
doc = nlp(test_text)
|
||||||
cats = doc.cats
|
cats = doc.cats
|
||||||
|
# note that by default, exclusive_classes = false so we need a bigger error margin
|
||||||
assert cats["POSITIVE"] > 0.9
|
assert cats["POSITIVE"] > 0.9
|
||||||
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
|
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)
|
||||||
|
|
||||||
|
# Also test the results are still the same after IO
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
nlp.to_disk(tmp_dir)
|
||||||
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
|
doc2 = nlp2(test_text)
|
||||||
|
cats2 = doc2.cats
|
||||||
|
assert cats2["POSITIVE"] > 0.9
|
||||||
|
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
|
||||||
|
|
|
@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import is_stop
|
||||||
from spacy.vectors import Vectors
|
from spacy.vectors import Vectors
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
|
from spacy.ml.models.defaults import default_ner, default_tagger
|
||||||
from spacy.tokens import Doc, Span, Token
|
from spacy.tokens import Doc, Span, Token
|
||||||
from spacy.pipeline import Tagger, EntityRecognizer
|
from spacy.pipeline import Tagger, EntityRecognizer
|
||||||
from spacy.attrs import HEAD, DEP
|
from spacy.attrs import HEAD, DEP
|
||||||
|
@ -123,7 +124,7 @@ def test_issue1727():
|
||||||
correctly after vectors are added."""
|
correctly after vectors are added."""
|
||||||
data = numpy.ones((3, 300), dtype="f")
|
data = numpy.ones((3, 300), dtype="f")
|
||||||
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
|
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
|
||||||
tagger = Tagger(Vocab())
|
tagger = Tagger(Vocab(), default_tagger())
|
||||||
tagger.add_label("PRP")
|
tagger.add_label("PRP")
|
||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
tagger.begin_training()
|
tagger.begin_training()
|
||||||
|
@ -131,7 +132,7 @@ def test_issue1727():
|
||||||
tagger.vocab.vectors = vectors
|
tagger.vocab.vectors = vectors
|
||||||
with make_tempdir() as path:
|
with make_tempdir() as path:
|
||||||
tagger.to_disk(path)
|
tagger.to_disk(path)
|
||||||
tagger = Tagger(Vocab()).from_disk(path)
|
tagger = Tagger(Vocab(), default_tagger()).from_disk(path)
|
||||||
assert tagger.cfg.get("pretrained_dims", 0) == 0
|
assert tagger.cfg.get("pretrained_dims", 0) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@ -236,6 +237,7 @@ def test_issue1889(word):
|
||||||
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
|
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3")
|
||||||
def test_issue1915():
|
def test_issue1915():
|
||||||
cfg = {"hidden_depth": 2} # should error out
|
cfg = {"hidden_depth": 2} # should error out
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
|
@ -268,7 +270,7 @@ def test_issue1963(en_tokenizer):
|
||||||
|
|
||||||
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
|
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
|
||||||
def test_issue1967(label):
|
def test_issue1967(label):
|
||||||
ner = EntityRecognizer(Vocab())
|
ner = EntityRecognizer(Vocab(), default_ner())
|
||||||
example = Example(doc=None)
|
example = Example(doc=None)
|
||||||
example.set_token_annotation(
|
example.set_token_annotation(
|
||||||
ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
|
ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
|
||||||
|
|
|
@ -32,6 +32,9 @@ def test_issue2179():
|
||||||
nlp.begin_training()
|
nlp.begin_training()
|
||||||
nlp2 = Italian()
|
nlp2 = Italian()
|
||||||
nlp2.add_pipe(nlp2.create_pipe("ner"))
|
nlp2.add_pipe(nlp2.create_pipe("ner"))
|
||||||
|
|
||||||
|
assert len(nlp2.get_pipe("ner").labels) == 0
|
||||||
|
nlp2.get_pipe("ner").model.resize_output(nlp.get_pipe("ner").moves.n_moves)
|
||||||
nlp2.from_bytes(nlp.to_bytes())
|
nlp2.from_bytes(nlp.to_bytes())
|
||||||
assert "extra_labels" not in nlp2.get_pipe("ner").cfg
|
assert "extra_labels" not in nlp2.get_pipe("ner").cfg
|
||||||
assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
|
assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
|
from spacy.ml.models.defaults import default_ner
|
||||||
from spacy.pipeline import EntityRuler, EntityRecognizer
|
from spacy.pipeline import EntityRuler, EntityRecognizer
|
||||||
from spacy.matcher import Matcher, PhraseMatcher
|
from spacy.matcher import Matcher, PhraseMatcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
@ -103,6 +104,7 @@ def test_issue3209():
|
||||||
assert ner.move_names == move_names
|
assert ner.move_names == move_names
|
||||||
nlp2 = English()
|
nlp2 = English()
|
||||||
nlp2.add_pipe(nlp2.create_pipe("ner"))
|
nlp2.add_pipe(nlp2.create_pipe("ner"))
|
||||||
|
nlp2.get_pipe("ner").model.resize_output(ner.moves.n_moves)
|
||||||
nlp2.from_bytes(nlp.to_bytes())
|
nlp2.from_bytes(nlp.to_bytes())
|
||||||
assert nlp2.get_pipe("ner").move_names == move_names
|
assert nlp2.get_pipe("ner").move_names == move_names
|
||||||
|
|
||||||
|
@ -193,7 +195,7 @@ def test_issue3345():
|
||||||
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
|
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
|
||||||
doc[4].is_sent_start = True
|
doc[4].is_sent_start = True
|
||||||
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
|
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
|
||||||
ner = EntityRecognizer(doc.vocab)
|
ner = EntityRecognizer(doc.vocab, default_ner())
|
||||||
# Add the OUT action. I wouldn't have thought this would be necessary...
|
# Add the OUT action. I wouldn't have thought this would be necessary...
|
||||||
ner.moves.add_action(5, "")
|
ner.moves.add_action(5, "")
|
||||||
ner.add_label("GPE")
|
ner.add_label("GPE")
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
from spacy.pipeline.pipes import DependencyParser
|
from spacy.pipeline.pipes import DependencyParser
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
from spacy.ml.models.defaults import default_parser
|
||||||
|
|
||||||
|
|
||||||
def test_issue3830_no_subtok():
|
def test_issue3830_no_subtok():
|
||||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||||
parser = DependencyParser(Vocab())
|
parser = DependencyParser(Vocab(), default_parser())
|
||||||
parser.add_label("nsubj")
|
parser.add_label("nsubj")
|
||||||
assert "subtok" not in parser.labels
|
assert "subtok" not in parser.labels
|
||||||
parser.begin_training(lambda: [])
|
parser.begin_training(lambda: [])
|
||||||
|
@ -13,7 +15,7 @@ def test_issue3830_no_subtok():
|
||||||
|
|
||||||
def test_issue3830_with_subtok():
|
def test_issue3830_with_subtok():
|
||||||
"""Test that the parser does have subtok label if learn_tokens=True."""
|
"""Test that the parser does have subtok label if learn_tokens=True."""
|
||||||
parser = DependencyParser(Vocab(), learn_tokens=True)
|
parser = DependencyParser(Vocab(), default_parser(), learn_tokens=True)
|
||||||
parser.add_label("nsubj")
|
parser.add_label("nsubj")
|
||||||
assert "subtok" not in parser.labels
|
assert "subtok" not in parser.labels
|
||||||
parser.begin_training(lambda: [])
|
parser.begin_training(lambda: [])
|
||||||
|
|
|
@ -3,6 +3,7 @@ from spacy.pipeline import EntityRecognizer, EntityRuler
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Span
|
from spacy.tokens import Span
|
||||||
from spacy.util import ensure_path
|
from spacy.util import ensure_path
|
||||||
|
from spacy.ml.models.defaults import default_ner
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
@ -73,6 +74,6 @@ def test_issue4042_bug2():
|
||||||
output_dir.mkdir()
|
output_dir.mkdir()
|
||||||
ner1.to_disk(output_dir)
|
ner1.to_disk(output_dir)
|
||||||
|
|
||||||
ner2 = EntityRecognizer(vocab)
|
ner2 = EntityRecognizer(vocab, default_ner())
|
||||||
ner2.from_disk(output_dir)
|
ner2.from_disk(output_dir)
|
||||||
assert len(ner2.labels) == 2
|
assert len(ner2.labels) == 2
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from spacy.ml.models.defaults import default_ner
|
||||||
from spacy.pipeline import EntityRecognizer
|
from spacy.pipeline import EntityRecognizer
|
||||||
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
@ -11,7 +12,7 @@ def test_issue4313():
|
||||||
beam_width = 16
|
beam_width = 16
|
||||||
beam_density = 0.0001
|
beam_density = 0.0001
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ner = EntityRecognizer(nlp.vocab)
|
ner = EntityRecognizer(nlp.vocab, default_ner())
|
||||||
ner.add_label("SOME_LABEL")
|
ner.add_label("SOME_LABEL")
|
||||||
ner.begin_training([])
|
ner.begin_training([])
|
||||||
nlp.add_pipe(ner)
|
nlp.add_pipe(ner)
|
||||||
|
|
126
spacy/tests/serialize/test_serialize_config.py
Normal file
126
spacy/tests/serialize/test_serialize_config.py
Normal file
|
@ -0,0 +1,126 @@
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
from spacy import util
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.util import registry
|
||||||
|
|
||||||
|
from ..util import make_tempdir
|
||||||
|
from ...ml.models import build_Tok2Vec_model, build_tb_parser_model
|
||||||
|
|
||||||
|
nlp_config_string = """
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
|
||||||
|
[nlp.pipeline.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[nlp.pipeline.tok2vec.model]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 342
|
||||||
|
depth = 4
|
||||||
|
window_size = 1
|
||||||
|
embed_size = 2000
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
||||||
|
|
||||||
|
[nlp.pipeline.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[nlp.pipeline.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
|
[nlp.pipeline.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecTensors.v1"
|
||||||
|
width = ${nlp.pipeline.tok2vec.model:width}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
parser_config_string = """
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 99
|
||||||
|
hidden_width = 66
|
||||||
|
maxout_pieces = 2
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 333
|
||||||
|
depth = 4
|
||||||
|
embed_size = 5555
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 7
|
||||||
|
subword_features = false
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.architectures.register("my_test_parser")
|
||||||
|
def my_parser():
|
||||||
|
tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3,
|
||||||
|
maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8,
|
||||||
|
conv_depth=2, bilstm_depth=0)
|
||||||
|
parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5)
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def test_serialize_nlp():
|
||||||
|
""" Create a custom nlp pipeline from config and ensure it serializes it correctly """
|
||||||
|
nlp_config = Config().from_str(nlp_config_string)
|
||||||
|
nlp = util.load_model_from_config(nlp_config["nlp"])
|
||||||
|
nlp.begin_training()
|
||||||
|
assert "tok2vec" in nlp.pipe_names
|
||||||
|
assert "tagger" in nlp.pipe_names
|
||||||
|
assert "parser" not in nlp.pipe_names
|
||||||
|
assert nlp.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
nlp.to_disk(d)
|
||||||
|
nlp2 = spacy.load(d)
|
||||||
|
assert "tok2vec" in nlp2.pipe_names
|
||||||
|
assert "tagger" in nlp2.pipe_names
|
||||||
|
assert "parser" not in nlp2.pipe_names
|
||||||
|
assert nlp2.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342
|
||||||
|
|
||||||
|
|
||||||
|
def test_serialize_custom_nlp():
|
||||||
|
""" Create a custom nlp pipeline and ensure it serializes it correctly"""
|
||||||
|
nlp = English()
|
||||||
|
parser_cfg = dict()
|
||||||
|
parser_cfg["model"] = {'@architectures': "my_test_parser"}
|
||||||
|
parser = nlp.create_pipe("parser", parser_cfg)
|
||||||
|
nlp.add_pipe(parser)
|
||||||
|
nlp.begin_training()
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
nlp.to_disk(d)
|
||||||
|
nlp2 = spacy.load(d)
|
||||||
|
model = nlp2.get_pipe("parser").model
|
||||||
|
tok2vec = model.get_ref("tok2vec")
|
||||||
|
upper = model.upper
|
||||||
|
|
||||||
|
# check that we have the correct settings, not the default ones
|
||||||
|
assert tok2vec.get_dim("nO") == 321
|
||||||
|
assert upper.get_dim("nI") == 65
|
||||||
|
|
||||||
|
|
||||||
|
def test_serialize_parser():
|
||||||
|
""" Create a non-default parser config to check nlp serializes it correctly """
|
||||||
|
nlp = English()
|
||||||
|
model_config = Config().from_str(parser_config_string)
|
||||||
|
parser = nlp.create_pipe("parser", config=model_config)
|
||||||
|
parser.add_label("nsubj")
|
||||||
|
nlp.add_pipe(parser)
|
||||||
|
nlp.begin_training()
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
nlp.to_disk(d)
|
||||||
|
nlp2 = spacy.load(d)
|
||||||
|
model = nlp2.get_pipe("parser").model
|
||||||
|
tok2vec = model.get_ref("tok2vec")
|
||||||
|
upper = model.upper
|
||||||
|
|
||||||
|
# check that we have the correct settings, not the default ones
|
||||||
|
assert upper.get_dim("nI") == 66
|
||||||
|
assert tok2vec.get_dim("nO") == 333
|
|
@ -1,5 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.tokenizer import Tokenizer
|
from spacy.tokenizer import Tokenizer
|
||||||
|
|
||||||
|
@ -56,7 +57,7 @@ def test_serialize_language_exclude(meta_data):
|
||||||
nlp = Language(meta=meta_data)
|
nlp = Language(meta=meta_data)
|
||||||
assert nlp.meta["name"] == name
|
assert nlp.meta["name"] == name
|
||||||
new_nlp = Language().from_bytes(nlp.to_bytes())
|
new_nlp = Language().from_bytes(nlp.to_bytes())
|
||||||
assert nlp.meta["name"] == name
|
assert new_nlp.meta["name"] == name
|
||||||
new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
|
new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
|
||||||
assert not new_nlp.meta["name"] == name
|
assert not new_nlp.meta["name"] == name
|
||||||
new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
|
new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
|
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
|
||||||
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
|
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
|
||||||
|
from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
@ -10,58 +11,58 @@ test_parsers = [DependencyParser, EntityRecognizer]
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(en_vocab):
|
def parser(en_vocab):
|
||||||
parser = DependencyParser(en_vocab)
|
parser = DependencyParser(en_vocab, default_parser())
|
||||||
parser.add_label("nsubj")
|
parser.add_label("nsubj")
|
||||||
parser.model, cfg = parser.Model(parser.moves.n_moves)
|
|
||||||
parser.cfg.update(cfg)
|
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def blank_parser(en_vocab):
|
def blank_parser(en_vocab):
|
||||||
parser = DependencyParser(en_vocab)
|
parser = DependencyParser(en_vocab, default_parser())
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def taggers(en_vocab):
|
def taggers(en_vocab):
|
||||||
tagger1 = Tagger(en_vocab)
|
model = default_tagger()
|
||||||
tagger2 = Tagger(en_vocab)
|
tagger1 = Tagger(en_vocab, model)
|
||||||
tagger1.model = tagger1.Model(8)
|
tagger2 = Tagger(en_vocab, model)
|
||||||
tagger2.model = tagger1.model
|
return tagger1, tagger2
|
||||||
return (tagger1, tagger2)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("Parser", test_parsers)
|
@pytest.mark.parametrize("Parser", test_parsers)
|
||||||
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
|
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
|
||||||
parser = Parser(en_vocab)
|
parser = Parser(en_vocab, default_parser())
|
||||||
parser.model, _ = parser.Model(10)
|
new_parser = Parser(en_vocab, default_parser())
|
||||||
new_parser = Parser(en_vocab)
|
|
||||||
new_parser.model, _ = new_parser.Model(10)
|
|
||||||
new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
|
new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
|
||||||
assert new_parser.to_bytes(exclude=["vocab"]) == parser.to_bytes(exclude=["vocab"])
|
bytes_2 = new_parser.to_bytes(exclude=["vocab"])
|
||||||
|
bytes_3 = parser.to_bytes(exclude=["vocab"])
|
||||||
|
assert len(bytes_2) == len(bytes_3)
|
||||||
|
assert bytes_2 == bytes_3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("Parser", test_parsers)
|
@pytest.mark.parametrize("Parser", test_parsers)
|
||||||
def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
|
def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
|
||||||
parser = Parser(en_vocab)
|
parser = Parser(en_vocab, default_parser())
|
||||||
parser.model, _ = parser.Model(0)
|
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
file_path = d / "parser"
|
file_path = d / "parser"
|
||||||
parser.to_disk(file_path)
|
parser.to_disk(file_path)
|
||||||
parser_d = Parser(en_vocab)
|
parser_d = Parser(en_vocab, default_parser())
|
||||||
parser_d.model, _ = parser_d.Model(0)
|
|
||||||
parser_d = parser_d.from_disk(file_path)
|
parser_d = parser_d.from_disk(file_path)
|
||||||
parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
|
parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
|
||||||
parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
|
parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
|
||||||
|
assert len(parser_bytes) == len(parser_d_bytes)
|
||||||
assert parser_bytes == parser_d_bytes
|
assert parser_bytes == parser_d_bytes
|
||||||
|
|
||||||
|
|
||||||
def test_to_from_bytes(parser, blank_parser):
|
def test_to_from_bytes(parser, blank_parser):
|
||||||
assert parser.model is not True
|
assert parser.model is not True
|
||||||
assert blank_parser.model is True
|
assert blank_parser.model is not True
|
||||||
assert blank_parser.moves.n_moves != parser.moves.n_moves
|
assert blank_parser.moves.n_moves != parser.moves.n_moves
|
||||||
bytes_data = parser.to_bytes(exclude=["vocab"])
|
bytes_data = parser.to_bytes(exclude=["vocab"])
|
||||||
|
|
||||||
|
# the blank parser needs to be resized before we can call from_bytes
|
||||||
|
blank_parser.model.resize_output(parser.moves.n_moves)
|
||||||
blank_parser.from_bytes(bytes_data)
|
blank_parser.from_bytes(bytes_data)
|
||||||
assert blank_parser.model is not True
|
assert blank_parser.model is not True
|
||||||
assert blank_parser.moves.n_moves == parser.moves.n_moves
|
assert blank_parser.moves.n_moves == parser.moves.n_moves
|
||||||
|
@ -75,8 +76,10 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
|
||||||
tagger1_b = tagger1.to_bytes()
|
tagger1_b = tagger1.to_bytes()
|
||||||
tagger1 = tagger1.from_bytes(tagger1_b)
|
tagger1 = tagger1.from_bytes(tagger1_b)
|
||||||
assert tagger1.to_bytes() == tagger1_b
|
assert tagger1.to_bytes() == tagger1_b
|
||||||
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
|
new_tagger1 = Tagger(en_vocab, default_tagger()).from_bytes(tagger1_b)
|
||||||
assert new_tagger1.to_bytes() == tagger1_b
|
new_tagger1_b = new_tagger1.to_bytes()
|
||||||
|
assert len(new_tagger1_b) == len(tagger1_b)
|
||||||
|
assert new_tagger1_b == tagger1_b
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
|
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
|
||||||
|
@ -86,26 +89,24 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
|
||||||
file_path2 = d / "tagger2"
|
file_path2 = d / "tagger2"
|
||||||
tagger1.to_disk(file_path1)
|
tagger1.to_disk(file_path1)
|
||||||
tagger2.to_disk(file_path2)
|
tagger2.to_disk(file_path2)
|
||||||
tagger1_d = Tagger(en_vocab).from_disk(file_path1)
|
tagger1_d = Tagger(en_vocab, default_tagger()).from_disk(file_path1)
|
||||||
tagger2_d = Tagger(en_vocab).from_disk(file_path2)
|
tagger2_d = Tagger(en_vocab, default_tagger()).from_disk(file_path2)
|
||||||
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
|
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
|
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
|
||||||
tensorizer = Tensorizer(en_vocab)
|
tensorizer = Tensorizer(en_vocab, default_tensorizer())
|
||||||
tensorizer.model = tensorizer.Model()
|
|
||||||
tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
|
tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
|
||||||
new_tensorizer = Tensorizer(en_vocab).from_bytes(tensorizer_b)
|
new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
|
||||||
assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
|
assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_tensorizer_roundtrip_disk(en_vocab):
|
def test_serialize_tensorizer_roundtrip_disk(en_vocab):
|
||||||
tensorizer = Tensorizer(en_vocab)
|
tensorizer = Tensorizer(en_vocab, default_tensorizer())
|
||||||
tensorizer.model = tensorizer.Model()
|
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
file_path = d / "tensorizer"
|
file_path = d / "tensorizer"
|
||||||
tensorizer.to_disk(file_path)
|
tensorizer.to_disk(file_path)
|
||||||
tensorizer_d = Tensorizer(en_vocab).from_disk(file_path)
|
tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
|
||||||
assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
|
assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
|
||||||
exclude=["vocab"]
|
exclude=["vocab"]
|
||||||
)
|
)
|
||||||
|
@ -113,19 +114,17 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
|
||||||
|
|
||||||
def test_serialize_textcat_empty(en_vocab):
|
def test_serialize_textcat_empty(en_vocab):
|
||||||
# See issue #1105
|
# See issue #1105
|
||||||
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
|
textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"])
|
||||||
textcat.to_bytes(exclude=["vocab"])
|
textcat.to_bytes(exclude=["vocab"])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("Parser", test_parsers)
|
@pytest.mark.parametrize("Parser", test_parsers)
|
||||||
def test_serialize_pipe_exclude(en_vocab, Parser):
|
def test_serialize_pipe_exclude(en_vocab, Parser):
|
||||||
def get_new_parser():
|
def get_new_parser():
|
||||||
new_parser = Parser(en_vocab)
|
new_parser = Parser(en_vocab, default_parser())
|
||||||
new_parser.model, _ = new_parser.Model(0)
|
|
||||||
return new_parser
|
return new_parser
|
||||||
|
|
||||||
parser = Parser(en_vocab)
|
parser = Parser(en_vocab, default_parser())
|
||||||
parser.model, _ = parser.Model(0)
|
|
||||||
parser.cfg["foo"] = "bar"
|
parser.cfg["foo"] = "bar"
|
||||||
new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
|
new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
|
||||||
assert "foo" in new_parser.cfg
|
assert "foo" in new_parser.cfg
|
||||||
|
@ -144,7 +143,7 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_sentencerecognizer(en_vocab):
|
def test_serialize_sentencerecognizer(en_vocab):
|
||||||
sr = SentenceRecognizer(en_vocab)
|
sr = SentenceRecognizer(en_vocab, default_sentrec())
|
||||||
sr_b = sr.to_bytes()
|
sr_b = sr.to_bytes()
|
||||||
sr_d = SentenceRecognizer(en_vocab).from_bytes(sr_b)
|
sr_d = SentenceRecognizer(en_vocab, default_sentrec()).from_bytes(sr_b)
|
||||||
assert sr.to_bytes() == sr_d.to_bytes()
|
assert sr.to_bytes() == sr_d.to_bytes()
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.ml.component_models import Tok2Vec
|
from spacy.ml.models.tok2vec import build_Tok2Vec_model
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
@ -25,7 +25,8 @@ def test_empty_doc():
|
||||||
embed_size = 2000
|
embed_size = 2000
|
||||||
vocab = Vocab()
|
vocab = Vocab()
|
||||||
doc = Doc(vocab, words=[])
|
doc = Doc(vocab, words=[])
|
||||||
tok2vec = Tok2Vec(width, embed_size)
|
# TODO: fix tok2vec arguments
|
||||||
|
tok2vec = build_Tok2Vec_model(width, embed_size)
|
||||||
vectors, backprop = tok2vec.begin_update([doc])
|
vectors, backprop = tok2vec.begin_update([doc])
|
||||||
assert len(vectors) == 1
|
assert len(vectors) == 1
|
||||||
assert vectors[0].shape == (0, width)
|
assert vectors[0].shape == (0, width)
|
||||||
|
@ -36,7 +37,19 @@ def test_empty_doc():
|
||||||
)
|
)
|
||||||
def test_tok2vec_batch_sizes(batch_size, width, embed_size):
|
def test_tok2vec_batch_sizes(batch_size, width, embed_size):
|
||||||
batch = get_batch(batch_size)
|
batch = get_batch(batch_size)
|
||||||
tok2vec = Tok2Vec(width, embed_size)
|
tok2vec = build_Tok2Vec_model(
|
||||||
|
width,
|
||||||
|
embed_size,
|
||||||
|
pretrained_vectors=None,
|
||||||
|
conv_depth=4,
|
||||||
|
bilstm_depth=0,
|
||||||
|
window_size=1,
|
||||||
|
maxout_pieces=3,
|
||||||
|
subword_features=True,
|
||||||
|
char_embed=False,
|
||||||
|
nM=64,
|
||||||
|
nC=8,
|
||||||
|
)
|
||||||
tok2vec.initialize()
|
tok2vec.initialize()
|
||||||
vectors, backprop = tok2vec.begin_update(batch)
|
vectors, backprop = tok2vec.begin_update(batch)
|
||||||
assert len(vectors) == len(batch)
|
assert len(vectors) == len(batch)
|
||||||
|
@ -44,19 +57,24 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
|
||||||
assert doc_vec.shape == (len(doc), width)
|
assert doc_vec.shape == (len(doc), width)
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"tok2vec_config",
|
"tok2vec_config",
|
||||||
[
|
[
|
||||||
{"width": 8, "embed_size": 100, "char_embed": False},
|
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
|
||||||
{"width": 8, "embed_size": 100, "char_embed": True},
|
{"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
|
||||||
{"width": 8, "embed_size": 100, "conv_depth": 6},
|
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
|
||||||
{"width": 8, "embed_size": 100, "conv_depth": 6},
|
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
|
||||||
{"width": 8, "embed_size": 100, "subword_features": False},
|
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
|
||||||
|
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
|
||||||
|
{"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
|
||||||
|
{"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
def test_tok2vec_configs(tok2vec_config):
|
def test_tok2vec_configs(tok2vec_config):
|
||||||
docs = get_batch(3)
|
docs = get_batch(3)
|
||||||
tok2vec = Tok2Vec(**tok2vec_config)
|
tok2vec = build_Tok2Vec_model(**tok2vec_config)
|
||||||
tok2vec.initialize()
|
tok2vec.initialize()
|
||||||
vectors, backprop = tok2vec.begin_update(docs)
|
vectors, backprop = tok2vec.begin_update(docs)
|
||||||
assert len(vectors) == len(docs)
|
assert len(vectors) == len(docs)
|
||||||
|
|
|
@ -6,8 +6,7 @@ from pathlib import Path
|
||||||
import random
|
import random
|
||||||
from typing import List
|
from typing import List
|
||||||
import thinc
|
import thinc
|
||||||
import thinc.config
|
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
|
||||||
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu
|
|
||||||
import functools
|
import functools
|
||||||
import itertools
|
import itertools
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
@ -146,6 +145,10 @@ def load_model_from_path(model_path, meta=False, **overrides):
|
||||||
pipeline from meta.json and then calls from_disk() with path."""
|
pipeline from meta.json and then calls from_disk() with path."""
|
||||||
if not meta:
|
if not meta:
|
||||||
meta = get_model_meta(model_path)
|
meta = get_model_meta(model_path)
|
||||||
|
nlp_config = get_model_config(model_path)
|
||||||
|
if nlp_config.get("nlp", None):
|
||||||
|
return load_model_from_config(nlp_config["nlp"])
|
||||||
|
|
||||||
# Support language factories registered via entry points (e.g. custom
|
# Support language factories registered via entry points (e.g. custom
|
||||||
# language subclass) while keeping top-level language identifier "lang"
|
# language subclass) while keeping top-level language identifier "lang"
|
||||||
lang = meta.get("lang_factory", meta["lang"])
|
lang = meta.get("lang_factory", meta["lang"])
|
||||||
|
@ -162,11 +165,30 @@ def load_model_from_path(model_path, meta=False, **overrides):
|
||||||
if name not in disable:
|
if name not in disable:
|
||||||
config = meta.get("pipeline_args", {}).get(name, {})
|
config = meta.get("pipeline_args", {}).get(name, {})
|
||||||
factory = factories.get(name, name)
|
factory = factories.get(name, name)
|
||||||
|
if nlp_config.get(name, None):
|
||||||
|
model_config = nlp_config[name]["model"]
|
||||||
|
config["model"] = model_config
|
||||||
component = nlp.create_pipe(factory, config=config)
|
component = nlp.create_pipe(factory, config=config)
|
||||||
nlp.add_pipe(component, name=name)
|
nlp.add_pipe(component, name=name)
|
||||||
return nlp.from_disk(model_path, exclude=disable)
|
return nlp.from_disk(model_path, exclude=disable)
|
||||||
|
|
||||||
|
|
||||||
|
def load_model_from_config(nlp_config):
|
||||||
|
if "name" in nlp_config:
|
||||||
|
nlp = load_model(**nlp_config)
|
||||||
|
elif "lang" in nlp_config:
|
||||||
|
lang_class = get_lang_class(nlp_config["lang"])
|
||||||
|
nlp = lang_class()
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E993)
|
||||||
|
if "pipeline" in nlp_config:
|
||||||
|
for name, component_cfg in nlp_config["pipeline"].items():
|
||||||
|
factory = component_cfg.pop("factory")
|
||||||
|
component = nlp.create_pipe(factory, config=component_cfg)
|
||||||
|
nlp.add_pipe(component, name=name)
|
||||||
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def load_model_from_init_py(init_file, **overrides):
|
def load_model_from_init_py(init_file, **overrides):
|
||||||
"""Helper function to use in the `load()` method of a model package's
|
"""Helper function to use in the `load()` method of a model package's
|
||||||
__init__.py.
|
__init__.py.
|
||||||
|
@ -184,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides):
|
||||||
return load_model_from_path(data_path, meta, **overrides)
|
return load_model_from_path(data_path, meta, **overrides)
|
||||||
|
|
||||||
|
|
||||||
def load_from_config(path, create_objects=False):
|
def load_config(path, create_objects=False):
|
||||||
"""Load a Thinc-formatted config file, optionally filling in objects where
|
"""Load a Thinc-formatted config file, optionally filling in objects where
|
||||||
the config references registry entries. See "Thinc config files" for details.
|
the config references registry entries. See "Thinc config files" for details.
|
||||||
|
|
||||||
|
@ -212,7 +234,7 @@ def get_model_meta(path):
|
||||||
raise IOError(Errors.E052.format(path=model_path))
|
raise IOError(Errors.E052.format(path=model_path))
|
||||||
meta_path = model_path / "meta.json"
|
meta_path = model_path / "meta.json"
|
||||||
if not meta_path.is_file():
|
if not meta_path.is_file():
|
||||||
raise IOError(Errors.E053.format(path=meta_path))
|
raise IOError(Errors.E053.format(path=meta_path, name="meta.json"))
|
||||||
meta = srsly.read_json(meta_path)
|
meta = srsly.read_json(meta_path)
|
||||||
for setting in ["lang", "name", "version"]:
|
for setting in ["lang", "name", "version"]:
|
||||||
if setting not in meta or not meta[setting]:
|
if setting not in meta or not meta[setting]:
|
||||||
|
@ -220,6 +242,23 @@ def get_model_meta(path):
|
||||||
return meta
|
return meta
|
||||||
|
|
||||||
|
|
||||||
|
def get_model_config(path):
|
||||||
|
"""Get the model's config from a directory path.
|
||||||
|
|
||||||
|
path (unicode or Path): Path to model directory.
|
||||||
|
RETURNS (Config): The model's config data.
|
||||||
|
"""
|
||||||
|
model_path = ensure_path(path)
|
||||||
|
if not model_path.exists():
|
||||||
|
raise IOError(Errors.E052.format(path=model_path))
|
||||||
|
config_path = model_path / "config.cfg"
|
||||||
|
# model directories are allowed not to have config files ?
|
||||||
|
if not config_path.is_file():
|
||||||
|
return Config({})
|
||||||
|
# raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
||||||
|
return Config().from_disk(config_path)
|
||||||
|
|
||||||
|
|
||||||
def is_package(name):
|
def is_package(name):
|
||||||
"""Check if string maps to a package installed via pip.
|
"""Check if string maps to a package installed via pip.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user