mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 08:14:15 +03:00
Merge pull request #5617 from explosion/chore/tidy-auto-format
This commit is contained in:
commit
dbe9c29f61
|
@ -24,8 +24,8 @@ from ..gold import Example
|
|||
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
|
||||
config_path=("Path to config file", "positional", None, Path),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path),
|
||||
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int),
|
||||
resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path),
|
||||
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int),
|
||||
# fmt: on
|
||||
)
|
||||
def pretrain(
|
||||
|
|
|
@ -3,7 +3,6 @@ from timeit import default_timer as timer
|
|||
|
||||
import srsly
|
||||
from pydantic import BaseModel, FilePath
|
||||
import plac
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
@ -16,7 +15,9 @@ from ..gold import GoldCorpus
|
|||
from ..lookups import Lookups
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
from ..ml import models # don't remove - required to load the built-in architectures
|
||||
|
||||
# Don't remove - required to load the built-in architectures
|
||||
from ..ml import models # noqa: F401
|
||||
|
||||
registry = util.registry
|
||||
|
||||
|
@ -114,33 +115,19 @@ class ConfigSchema(BaseModel):
|
|||
extra = "allow"
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
# fmt: off
|
||||
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
||||
config_path=("Path to config file", "positional", None, Path),
|
||||
output_path=("Output directory to store model in", "option", "o", Path),
|
||||
init_tok2vec=(
|
||||
"Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
|
||||
Path),
|
||||
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
|
||||
verbose=("Display more information for debugging purposes", "flag", "VV", bool),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
||||
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
|
||||
# fmt: on
|
||||
)
|
||||
def train_cli(
|
||||
train_path,
|
||||
dev_path,
|
||||
config_path,
|
||||
output_path=None,
|
||||
init_tok2vec=None,
|
||||
raw_text=None,
|
||||
verbose=False,
|
||||
use_gpu=-1,
|
||||
tag_map_path=None,
|
||||
omit_extra_lookups=False,
|
||||
# fmt: off
|
||||
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
|
||||
config_path: ("Path to config file", "positional", None, Path),
|
||||
output_path: ("Output directory to store model in", "option", "o", Path) = None,
|
||||
init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
|
||||
raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
|
||||
verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False,
|
||||
use_gpu: ("Use GPU", "option", "g", int) = -1,
|
||||
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
|
||||
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||
|
@ -212,7 +199,7 @@ def train(
|
|||
config = util.load_config(config_path, create_objects=False)
|
||||
util.fix_random_seed(config["training"]["seed"])
|
||||
if config["training"].get("use_pytorch_for_gpu_memory"):
|
||||
# It feels kind of weird to not have a default for this.
|
||||
# It feels kind of weird to not have a default for this.
|
||||
use_pytorch_for_gpu_memory()
|
||||
nlp_config = config["nlp"]
|
||||
config = util.load_config(config_path, create_objects=True)
|
||||
|
@ -227,7 +214,9 @@ def train(
|
|||
# verify textcat config
|
||||
if "textcat" in nlp_config["pipeline"]:
|
||||
textcat_labels = set(nlp.get_pipe("textcat").labels)
|
||||
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"]
|
||||
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][
|
||||
"exclusive_classes"
|
||||
]
|
||||
|
||||
# check whether the setting 'exclusive_classes' corresponds to the provided training data
|
||||
if textcat_multilabel:
|
||||
|
@ -255,7 +244,9 @@ def train(
|
|||
"to 'false' in the config to train a classifier with classes "
|
||||
"that are not mutually exclusive."
|
||||
)
|
||||
msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels")
|
||||
msg.info(
|
||||
f"Initialized textcat component for {len(textcat_labels)} unique labels"
|
||||
)
|
||||
nlp.get_pipe("textcat").labels = tuple(textcat_labels)
|
||||
|
||||
# if 'positive_label' is provided: double check whether it's in the data and the task is binary
|
||||
|
@ -281,9 +272,7 @@ def train(
|
|||
nlp.resume_training()
|
||||
else:
|
||||
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
|
||||
nlp.begin_training(
|
||||
lambda: corpus.train_examples
|
||||
)
|
||||
nlp.begin_training(lambda: corpus.train_examples)
|
||||
|
||||
# Update tag map with provided mapping
|
||||
nlp.vocab.morphology.tag_map.update(tag_map)
|
||||
|
@ -310,8 +299,7 @@ def train(
|
|||
tok2vec = tok2vec.get(subpath)
|
||||
if not tok2vec:
|
||||
msg.fail(
|
||||
f"Could not locate the tok2vec model at {tok2vec_path}.",
|
||||
exits=1,
|
||||
f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
|
||||
)
|
||||
tok2vec.from_bytes(weights_data)
|
||||
|
||||
|
@ -376,7 +364,7 @@ def create_train_batches(nlp, corpus, cfg):
|
|||
train_examples = list(
|
||||
corpus.train_dataset(
|
||||
nlp,
|
||||
noise_level=0.0, # I think this is deprecated?
|
||||
noise_level=0.0, # I think this is deprecated?
|
||||
orth_variant_level=cfg["orth_variant_level"],
|
||||
gold_preproc=cfg["gold_preproc"],
|
||||
max_length=cfg["max_length"],
|
||||
|
@ -429,7 +417,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
|
|||
try:
|
||||
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
|
||||
except KeyError as e:
|
||||
raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys())))
|
||||
raise KeyError(
|
||||
Errors.E983.format(
|
||||
dict_name="score_weights", key=str(e), keys=list(scores.keys())
|
||||
)
|
||||
)
|
||||
|
||||
scores["speed"] = wps
|
||||
return weighted_score, scores
|
||||
|
@ -578,15 +570,25 @@ def setup_printer(training, nlp):
|
|||
]
|
||||
except KeyError as e:
|
||||
raise KeyError(
|
||||
Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys())))
|
||||
Errors.E983.format(
|
||||
dict_name="scores (losses)",
|
||||
key=str(e),
|
||||
keys=list(info["losses"].keys()),
|
||||
)
|
||||
)
|
||||
|
||||
try:
|
||||
scores = [
|
||||
"{0:.2f}".format(float(info["other_scores"][col]))
|
||||
for col in score_cols
|
||||
"{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols
|
||||
]
|
||||
except KeyError as e:
|
||||
raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
|
||||
raise KeyError(
|
||||
Errors.E983.format(
|
||||
dict_name="scores (other)",
|
||||
key=str(e),
|
||||
keys=list(info["other_scores"].keys()),
|
||||
)
|
||||
)
|
||||
data = (
|
||||
[info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
|
||||
)
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
|
||||
from .errors import Errors
|
||||
from .lookups import Lookups
|
||||
from .parts_of_speech import NAMES as UPOS_NAMES
|
||||
|
@ -51,7 +50,13 @@ class Lemmatizer(object):
|
|||
index_table = self.lookups.get_table("lemma_index", {})
|
||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||
if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
|
||||
if not any(
|
||||
(
|
||||
index_table.get(univ_pos),
|
||||
exc_table.get(univ_pos),
|
||||
rules_table.get(univ_pos),
|
||||
)
|
||||
):
|
||||
if univ_pos == "propn":
|
||||
return [string]
|
||||
else:
|
||||
|
|
|
@ -1 +1 @@
|
|||
from .models import *
|
||||
from .models import * # noqa: F401, F403
|
||||
|
|
|
@ -1,11 +1,8 @@
|
|||
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
|
||||
from typing import List, Tuple, Dict, Optional
|
||||
from typing import Dict, Optional
|
||||
import numpy
|
||||
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
|
||||
from thinc.api import to_numpy
|
||||
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
|
||||
|
||||
from ..tokens import Doc
|
||||
from thinc.api import Model
|
||||
from thinc.types import Padded, Floats3d
|
||||
|
||||
|
||||
def BILUO() -> Model[Padded, Padded]:
|
||||
|
@ -14,11 +11,11 @@ def BILUO() -> Model[Padded, Padded]:
|
|||
forward,
|
||||
init=init,
|
||||
dims={"nO": None},
|
||||
attrs={"get_num_actions": get_num_actions}
|
||||
attrs={"get_num_actions": get_num_actions},
|
||||
)
|
||||
|
||||
|
||||
def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
|
||||
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
|
||||
if X is not None and Y is not None:
|
||||
if X.data.shape != Y.data.shape:
|
||||
# TODO: Fix error
|
||||
|
@ -49,12 +46,12 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
|
|||
masks = model.ops.alloc3f(*Y.shape)
|
||||
max_value = Xp.data.max()
|
||||
for t in range(Xp.data.shape[0]):
|
||||
is_last = (Xp.lengths < (t+2)).astype("i")
|
||||
is_last = (Xp.lengths < (t + 2)).astype("i")
|
||||
masks[t] = valid_transitions[is_last, prev_actions]
|
||||
# Don't train the out-of-bounds sequences.
|
||||
masks[t, Xp.size_at_t[t]:] = 0
|
||||
masks[t, Xp.size_at_t[t] :] = 0
|
||||
# Valid actions get 0*10e8, invalid get large negative value
|
||||
Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10)
|
||||
Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10)
|
||||
prev_actions = Y[t].argmax(axis=-1)
|
||||
|
||||
def backprop_biluo(dY: Padded) -> Padded:
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
|
||||
from typing import List, Tuple, Dict, Optional
|
||||
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
|
||||
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
|
||||
|
||||
from ..tokens import Doc
|
||||
from typing import Dict, Optional
|
||||
from thinc.api import Ops, Model
|
||||
from thinc.types import Padded, Floats3d
|
||||
|
||||
|
||||
def IOB() -> Model[Padded, Padded]:
|
||||
|
@ -12,11 +10,11 @@ def IOB() -> Model[Padded, Padded]:
|
|||
forward,
|
||||
init=init,
|
||||
dims={"nO": None},
|
||||
attrs={"get_num_actions": get_num_actions}
|
||||
attrs={"get_num_actions": get_num_actions},
|
||||
)
|
||||
|
||||
|
||||
def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
|
||||
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
|
||||
if X is not None and Y is not None:
|
||||
if X.data.shape != Y.data.shape:
|
||||
# TODO: Fix error
|
||||
|
@ -48,14 +46,14 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
|
|||
for t in range(Xp.data.shape[0]):
|
||||
masks[t] = valid_transitions[prev_actions]
|
||||
# Don't train the out-of-bounds sequences.
|
||||
masks[t, Xp.size_at_t[t]:] = 0
|
||||
masks[t, Xp.size_at_t[t] :] = 0
|
||||
# Valid actions get 0*10e8, invalid get -1*10e8
|
||||
Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8)
|
||||
Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8)
|
||||
prev_actions = Y[t].argmax(axis=-1)
|
||||
|
||||
def backprop_biluo(dY: Padded) -> Padded:
|
||||
# Masking the gradient seems to do poorly here. But why?
|
||||
#dY.data *= masks
|
||||
# dY.data *= masks
|
||||
return dY
|
||||
|
||||
return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
|
||||
|
@ -83,10 +81,10 @@ def _get_transition_table(
|
|||
B_range = ops.xp.arange(B_start, B_end)
|
||||
I_range = ops.xp.arange(I_start, I_end)
|
||||
# B and O are always valid
|
||||
table[:, B_start : B_end] = 1
|
||||
table[:, B_start:B_end] = 1
|
||||
table[:, O_action] = 1
|
||||
# I can only follow a matching B
|
||||
table[B_range, I_range] = 1
|
||||
|
||||
|
||||
_cache[n_actions] = table
|
||||
return table
|
||||
|
|
|
@ -84,7 +84,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
|
|||
#
|
||||
# (ids < 0).T @ dY
|
||||
mask = model.ops.asarray(ids < 0, dtype="f")
|
||||
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True)
|
||||
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
|
||||
return d_pad.reshape((1, nF, nO, nP))
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from .entity_linker import * # noqa
|
||||
from .parser import * # noqa
|
||||
from .simple_ner import *
|
||||
from .simple_ner import * # noqa
|
||||
from .tagger import * # noqa
|
||||
from .textcat import * # noqa
|
||||
from .tok2vec import * # noqa
|
||||
|
|
|
@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
|
|||
softmax = Softmax(nO=nO, nI=token_vector_width * 2)
|
||||
model = chain(
|
||||
tok2vec,
|
||||
Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0),
|
||||
Maxout(
|
||||
nO=token_vector_width * 2,
|
||||
nI=token_vector_width,
|
||||
nP=maxout_pieces,
|
||||
dropout=0.0,
|
||||
),
|
||||
LayerNorm(token_vector_width * 2),
|
||||
softmax,
|
||||
)
|
||||
|
@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None):
|
|||
# nO = vocab.vectors.data.shape[1]
|
||||
output_layer = chain(
|
||||
Maxout(
|
||||
nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0
|
||||
nO=nO,
|
||||
nI=tok2vec.get_dim("nO"),
|
||||
nP=maxout_pieces,
|
||||
normalize=True,
|
||||
dropout=0.0,
|
||||
),
|
||||
Linear(nO=nO, nI=nO, init_W=zero_init),
|
||||
)
|
||||
|
@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
|
|||
def mlm_forward(model, docs, is_train):
|
||||
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
|
||||
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
|
||||
output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop
|
||||
output, backprop = model.get_ref("wrapped-model").begin_update(
|
||||
docs
|
||||
) # drop=drop
|
||||
|
||||
def mlm_backward(d_output):
|
||||
d_output *= 1 - mask
|
||||
|
|
|
@ -16,18 +16,14 @@ def build_tb_parser_model(
|
|||
nO=None,
|
||||
):
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
tok2vec = chain(
|
||||
tok2vec,
|
||||
with_array(Linear(hidden_width, t2v_width)),
|
||||
list2array(),
|
||||
)
|
||||
tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),)
|
||||
tok2vec.set_dim("nO", hidden_width)
|
||||
|
||||
lower = PrecomputableAffine(
|
||||
nO=hidden_width if use_upper else nO,
|
||||
nF=nr_feature_tokens,
|
||||
nI=tok2vec.get_dim("nO"),
|
||||
nP=maxout_pieces
|
||||
nP=maxout_pieces,
|
||||
)
|
||||
if use_upper:
|
||||
with use_ops("numpy"):
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
import functools
|
||||
from typing import List, Tuple, Dict, Optional
|
||||
from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list
|
||||
from typing import List
|
||||
from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
|
||||
from thinc.api import chain, list2padded, configure_normal_init
|
||||
from thinc.api import Dropout
|
||||
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ...tokens import Doc
|
||||
from .._biluo import BILUO
|
||||
|
@ -12,12 +11,12 @@ from ...util import registry
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.BiluoTagger.v1")
|
||||
def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
|
||||
def BiluoTagger(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]]
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
biluo = BILUO()
|
||||
linear = Linear(
|
||||
nO=None,
|
||||
nI=tok2vec.get_dim("nO"),
|
||||
init_W=configure_normal_init(mean=0.02)
|
||||
nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
|
||||
)
|
||||
model = chain(
|
||||
tok2vec,
|
||||
|
@ -25,7 +24,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
|
|||
with_array(chain(Dropout(0.1), linear)),
|
||||
biluo,
|
||||
with_array(softmax_activation()),
|
||||
padded2list()
|
||||
padded2list(),
|
||||
)
|
||||
|
||||
return Model(
|
||||
|
@ -35,11 +34,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
|
|||
layers=[model, linear],
|
||||
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
|
||||
dims={"nO": None},
|
||||
attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
|
||||
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.IOBTagger.v1")
|
||||
def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
|
||||
def IOBTagger(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]]
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
biluo = IOB()
|
||||
linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
|
||||
model = chain(
|
||||
|
@ -48,7 +50,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
|
|||
with_array(linear),
|
||||
biluo,
|
||||
with_array(softmax_activation()),
|
||||
padded2list()
|
||||
padded2list(),
|
||||
)
|
||||
|
||||
return Model(
|
||||
|
@ -58,11 +60,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
|
|||
layers=[model],
|
||||
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
|
||||
dims={"nO": None},
|
||||
attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
|
||||
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
|
||||
)
|
||||
|
||||
|
||||
|
||||
def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
|
||||
if model.get_dim("nO") is None and Y:
|
||||
model.set_dim("nO", Y[0].shape[1])
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout
|
||||
from thinc.api import glorot_uniform_init
|
||||
from thinc.api import zero_init, with_array, Softmax, chain, Model
|
||||
|
||||
from ...util import registry
|
||||
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention
|
||||
from thinc.api import chain, concatenate, clone, Dropout
|
||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window
|
||||
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
|
||||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||
from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout
|
||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout
|
||||
from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed
|
||||
from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
|
||||
|
||||
from ..spacy_vectors import SpacyVectors
|
||||
from ... import util
|
||||
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER
|
||||
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
|
||||
from ...util import registry
|
||||
from ..extract_ngrams import extract_ngrams
|
||||
|
||||
|
@ -50,14 +51,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.TextCat.v1")
|
||||
def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
|
||||
window_size, conv_depth, dropout, nO=None):
|
||||
def build_text_classifier(
|
||||
width,
|
||||
embed_size,
|
||||
pretrained_vectors,
|
||||
exclusive_classes,
|
||||
ngram_size,
|
||||
window_size,
|
||||
conv_depth,
|
||||
dropout,
|
||||
nO=None,
|
||||
):
|
||||
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout)
|
||||
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)
|
||||
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout)
|
||||
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout)
|
||||
lower = HashEmbed(
|
||||
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
|
||||
)
|
||||
prefix = HashEmbed(
|
||||
nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
|
||||
)
|
||||
suffix = HashEmbed(
|
||||
nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
|
||||
)
|
||||
shape = HashEmbed(
|
||||
nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
|
||||
)
|
||||
|
||||
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
|
||||
trained_vectors = FeatureExtractor(cols) >> with_array(
|
||||
|
@ -83,30 +101,38 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
|
|||
vectors_width = width
|
||||
tok2vec = vector_layer >> with_array(
|
||||
Maxout(width, vectors_width, normalize=True)
|
||||
>> residual((expand_window(window_size=window_size)
|
||||
>> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth,
|
||||
>> residual(
|
||||
(
|
||||
expand_window(window_size=window_size)
|
||||
>> Maxout(
|
||||
nO=width, nI=width * ((window_size * 2) + 1), normalize=True
|
||||
)
|
||||
)
|
||||
)
|
||||
** conv_depth,
|
||||
pad=conv_depth,
|
||||
)
|
||||
cnn_model = (
|
||||
tok2vec
|
||||
>> list2ragged()
|
||||
>> ParametricAttention(width)
|
||||
>> reduce_sum()
|
||||
>> residual(Maxout(nO=width, nI=width))
|
||||
>> Linear(nO=nO, nI=width)
|
||||
>> Dropout(0.0)
|
||||
tok2vec
|
||||
>> list2ragged()
|
||||
>> ParametricAttention(width)
|
||||
>> reduce_sum()
|
||||
>> residual(Maxout(nO=width, nI=width))
|
||||
>> Linear(nO=nO, nI=width)
|
||||
>> Dropout(0.0)
|
||||
)
|
||||
|
||||
linear_model = build_bow_text_classifier(
|
||||
nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False
|
||||
nO=nO,
|
||||
ngram_size=ngram_size,
|
||||
exclusive_classes=exclusive_classes,
|
||||
no_output_layer=False,
|
||||
)
|
||||
nO_double = nO*2 if nO else None
|
||||
nO_double = nO * 2 if nO else None
|
||||
if exclusive_classes:
|
||||
output_layer = Softmax(nO=nO, nI=nO_double)
|
||||
else:
|
||||
output_layer = (
|
||||
Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
|
||||
)
|
||||
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
|
||||
model = (linear_model | cnn_model) >> output_layer
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
if model.has_dim("nO") is not False:
|
||||
|
|
|
@ -99,7 +99,13 @@ def hash_charembed_cnn(
|
|||
|
||||
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
|
||||
def hash_embed_bilstm_v1(
|
||||
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout
|
||||
pretrained_vectors,
|
||||
width,
|
||||
depth,
|
||||
embed_size,
|
||||
subword_features,
|
||||
maxout_pieces,
|
||||
dropout,
|
||||
):
|
||||
# Does not use character embeddings: set to False by default
|
||||
return build_Tok2Vec_model(
|
||||
|
@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1(
|
|||
|
||||
@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
|
||||
def LayerNormalizedMaxout(width, maxout_pieces):
|
||||
return Maxout(
|
||||
nO=width,
|
||||
nP=maxout_pieces,
|
||||
dropout=0.0,
|
||||
normalize=True,
|
||||
)
|
||||
return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,)
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.MultiHashEmbed.v1")
|
||||
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout):
|
||||
def MultiHashEmbed(
|
||||
columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
|
||||
):
|
||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
|
||||
if use_subwords:
|
||||
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout)
|
||||
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout)
|
||||
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout)
|
||||
prefix = HashEmbed(
|
||||
nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
|
||||
)
|
||||
suffix = HashEmbed(
|
||||
nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
|
||||
)
|
||||
shape = HashEmbed(
|
||||
nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
|
||||
)
|
||||
|
||||
if pretrained_vectors:
|
||||
glove = StaticVectors(
|
||||
|
@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
|
|||
def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth):
|
||||
cnn = chain(
|
||||
expand_window(window_size=window_size),
|
||||
Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True),
|
||||
Maxout(
|
||||
nO=width,
|
||||
nI=width * ((window_size * 2) + 1),
|
||||
nP=maxout_pieces,
|
||||
dropout=0.0,
|
||||
normalize=True,
|
||||
),
|
||||
)
|
||||
model = clone(residual(cnn), depth)
|
||||
model.set_dim("nO", width)
|
||||
|
@ -247,11 +262,19 @@ def build_Tok2Vec_model(
|
|||
subword_features = False
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout)
|
||||
norm = HashEmbed(
|
||||
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout
|
||||
)
|
||||
if subword_features:
|
||||
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout)
|
||||
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout)
|
||||
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout)
|
||||
prefix = HashEmbed(
|
||||
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout
|
||||
)
|
||||
suffix = HashEmbed(
|
||||
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout
|
||||
)
|
||||
shape = HashEmbed(
|
||||
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout
|
||||
)
|
||||
else:
|
||||
prefix, suffix, shape = (None, None, None)
|
||||
if pretrained_vectors is not None:
|
||||
|
|
|
@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
|
|||
attrs={
|
||||
"has_upper": has_upper,
|
||||
"unseen_classes": set(unseen_classes),
|
||||
"resize_output": resize_output
|
||||
}
|
||||
"resize_output": resize_output,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
|
@ -31,7 +31,7 @@ def forward(model, X, is_train):
|
|||
model.layers,
|
||||
unseen_classes=model.attrs["unseen_classes"],
|
||||
train=is_train,
|
||||
has_upper=model.attrs["has_upper"]
|
||||
has_upper=model.attrs["has_upper"],
|
||||
)
|
||||
|
||||
return step_model, step_model.finish_steps
|
||||
|
@ -62,7 +62,7 @@ def resize_output(model, new_nO):
|
|||
nI = None
|
||||
if smaller.has_dim("nI"):
|
||||
nI = smaller.get_dim("nI")
|
||||
with use_ops('numpy'):
|
||||
with use_ops("numpy"):
|
||||
larger = Linear(nO=new_nO, nI=nI)
|
||||
larger.init = smaller.init
|
||||
# it could be that the model is not initialized yet, then skip this bit
|
||||
|
@ -74,8 +74,8 @@ def resize_output(model, new_nO):
|
|||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||
# just adding rows here.
|
||||
if smaller.has_dim("nO"):
|
||||
larger_W[:smaller.get_dim("nO")] = smaller_W
|
||||
larger_b[:smaller.get_dim("nO")] = smaller_b
|
||||
larger_W[: smaller.get_dim("nO")] = smaller_W
|
||||
larger_b[: smaller.get_dim("nO")] = smaller_b
|
||||
for i in range(smaller.get_dim("nO"), new_nO):
|
||||
model.attrs["unseen_classes"].add(i)
|
||||
|
||||
|
|
|
@ -21,9 +21,7 @@ class SimpleNER(Pipe):
|
|||
self.model = model
|
||||
self.cfg = {"labels": []}
|
||||
self.loss_func = SequenceCategoricalCrossentropy(
|
||||
names=self.get_tag_names(),
|
||||
normalize=True,
|
||||
missing_value=None
|
||||
names=self.get_tag_names(), normalize=True, missing_value=None
|
||||
)
|
||||
assert self.model is not None
|
||||
|
||||
|
@ -38,21 +36,21 @@ class SimpleNER(Pipe):
|
|||
def add_label(self, label):
|
||||
if label not in self.cfg["labels"]:
|
||||
self.cfg["labels"].append(label)
|
||||
|
||||
|
||||
def get_tag_names(self):
|
||||
if self.is_biluo:
|
||||
return (
|
||||
[f"B-{label}" for label in self.labels] +
|
||||
[f"I-{label}" for label in self.labels] +
|
||||
[f"L-{label}" for label in self.labels] +
|
||||
[f"U-{label}" for label in self.labels] +
|
||||
["O"]
|
||||
[f"B-{label}" for label in self.labels]
|
||||
+ [f"I-{label}" for label in self.labels]
|
||||
+ [f"L-{label}" for label in self.labels]
|
||||
+ [f"U-{label}" for label in self.labels]
|
||||
+ ["O"]
|
||||
)
|
||||
else:
|
||||
return (
|
||||
[f"B-{label}" for label in self.labels] +
|
||||
[f"I-{label}" for label in self.labels] +
|
||||
["O"]
|
||||
[f"B-{label}" for label in self.labels]
|
||||
+ [f"I-{label}" for label in self.labels]
|
||||
+ ["O"]
|
||||
)
|
||||
|
||||
def predict(self, docs: List[Doc]) -> List[Floats2d]:
|
||||
|
@ -108,7 +106,7 @@ class SimpleNER(Pipe):
|
|||
|
||||
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
|
||||
self.cfg.update(kwargs)
|
||||
if not hasattr(get_examples, '__call__'):
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
gold_tuples = get_examples
|
||||
get_examples = lambda: gold_tuples
|
||||
labels = _get_labels(get_examples())
|
||||
|
@ -117,14 +115,12 @@ class SimpleNER(Pipe):
|
|||
labels = self.labels
|
||||
n_actions = self.model.attrs["get_num_actions"](len(labels))
|
||||
self.model.set_dim("nO", n_actions)
|
||||
self.model.initialize()
|
||||
self.model.initialize()
|
||||
if pipeline is not None:
|
||||
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
|
||||
link_vectors_to_models(self.vocab)
|
||||
self.loss_func = SequenceCategoricalCrossentropy(
|
||||
names=self.get_tag_names(),
|
||||
normalize=True,
|
||||
missing_value=None
|
||||
names=self.get_tag_names(), normalize=True, missing_value=None
|
||||
)
|
||||
|
||||
return sgd
|
||||
|
@ -135,7 +131,7 @@ class SimpleNER(Pipe):
|
|||
|
||||
def _has_ner(eg):
|
||||
for ner_tag in eg.gold.ner:
|
||||
if ner_tag != "-" and ner_tag != None:
|
||||
if ner_tag != "-" and ner_tag is not None:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
@ -145,7 +141,7 @@ def _get_labels(examples):
|
|||
labels = set()
|
||||
for eg in examples:
|
||||
for ner_tag in eg.token_annotation.entities:
|
||||
if ner_tag != 'O' and ner_tag != '-':
|
||||
_, label = ner_tag.split('-', 1)
|
||||
if ner_tag != "O" and ner_tag != "-":
|
||||
_, label = ner_tag.split("-", 1)
|
||||
labels.add(label)
|
||||
return list(sorted(labels))
|
||||
|
|
|
@ -98,7 +98,9 @@ class Scorer(object):
|
|||
for name, component in pipeline:
|
||||
if name == "textcat":
|
||||
self.textcat_multilabel = component.model.attrs["multi_label"]
|
||||
self.textcat_positive_label = component.cfg.get("positive_label", None)
|
||||
self.textcat_positive_label = component.cfg.get(
|
||||
"positive_label", None
|
||||
)
|
||||
for label in component.cfg.get("labels", []):
|
||||
self.textcat_auc_per_cat[label] = ROCAUCScore()
|
||||
self.textcat_f_per_cat[label] = PRFScore()
|
||||
|
@ -119,19 +121,19 @@ class Scorer(object):
|
|||
|
||||
@property
|
||||
def morphs_acc(self):
|
||||
"""RETURNS (float): Morph tag accuracy (morphological features,
|
||||
"""RETURNS (float): Morph tag accuracy (morphological features,
|
||||
i.e. `Token.morph`).
|
||||
"""
|
||||
return self.morphs.fscore * 100
|
||||
return self.morphs.fscore * 100
|
||||
|
||||
@property
|
||||
def morphs_per_type(self):
|
||||
"""RETURNS (dict): Scores per dependency label.
|
||||
"""RETURNS (dict): Scores per dependency label.
|
||||
"""
|
||||
return {
|
||||
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
|
||||
for k, v in self.morphs_per_feat.items()
|
||||
}
|
||||
return {
|
||||
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
|
||||
for k, v in self.morphs_per_feat.items()
|
||||
}
|
||||
|
||||
@property
|
||||
def sent_p(self):
|
||||
|
@ -302,7 +304,15 @@ class Scorer(object):
|
|||
gold_morphs_per_feat = {}
|
||||
gold_sent_starts = set()
|
||||
gold_ents = set(tags_to_entities(orig.entities))
|
||||
for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts):
|
||||
for id_, tag, pos, morph, head, dep, sent_start in zip(
|
||||
orig.ids,
|
||||
orig.tags,
|
||||
orig.pos,
|
||||
orig.morphs,
|
||||
orig.heads,
|
||||
orig.deps,
|
||||
orig.sent_starts,
|
||||
):
|
||||
gold_tags.add((id_, tag))
|
||||
gold_pos.add((id_, pos))
|
||||
gold_morphs.add((id_, morph))
|
||||
|
@ -400,7 +410,10 @@ class Scorer(object):
|
|||
self.pos.score_set(cand_pos, gold_pos)
|
||||
self.morphs.score_set(cand_morphs, gold_morphs)
|
||||
for field in self.morphs_per_feat:
|
||||
self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set()))
|
||||
self.morphs_per_feat[field].score_set(
|
||||
cand_morphs_per_feat.get(field, set()),
|
||||
gold_morphs_per_feat.get(field, set()),
|
||||
)
|
||||
self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
|
||||
self.labelled.score_set(cand_deps, gold_deps)
|
||||
for dep in self.labelled_per_dep:
|
||||
|
@ -412,7 +425,9 @@ class Scorer(object):
|
|||
)
|
||||
if (
|
||||
len(gold.cats) > 0
|
||||
and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats)
|
||||
and set(self.textcat_f_per_cat)
|
||||
== set(self.textcat_auc_per_cat)
|
||||
== set(gold.cats)
|
||||
and set(gold.cats) == set(doc.cats)
|
||||
):
|
||||
goldcat = max(gold.cats, key=gold.cats.get)
|
||||
|
@ -424,10 +439,10 @@ class Scorer(object):
|
|||
)
|
||||
for label in set(gold.cats):
|
||||
self.textcat_auc_per_cat[label].score_set(
|
||||
doc.cats[label], gold.cats[label]
|
||||
doc.cats[label], gold.cats[label]
|
||||
)
|
||||
self.textcat_f_per_cat[label].score_set(
|
||||
set([label]) & set([candcat]), set([label]) & set([goldcat])
|
||||
set([label]) & set([candcat]), set([label]) & set([goldcat])
|
||||
)
|
||||
elif len(self.textcat_f_per_cat) > 0:
|
||||
model_labels = set(self.textcat_f_per_cat)
|
||||
|
|
|
@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner
|
|||
def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||
text = ["This", "is", "a", "lion"]
|
||||
doc = get_doc(en_vocab, text)
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner = EntityRecognizer(en_vocab, default_ner(), **config)
|
||||
ner.begin_training([])
|
||||
ner(doc)
|
||||
|
@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
|||
def test_ents_reset(en_vocab):
|
||||
text = ["This", "is", "a", "lion"]
|
||||
doc = get_doc(en_vocab, text)
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner = EntityRecognizer(en_vocab, default_ner(), **config)
|
||||
ner.begin_training([])
|
||||
ner(doc)
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
import pytest
|
||||
from thinc.api import Adam, NumpyOps
|
||||
from thinc.api import Adam
|
||||
from spacy.attrs import NORM
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
from spacy.pipeline.defaults import default_parser, default_ner
|
||||
from spacy.tokens import Doc
|
||||
from spacy.pipeline import DependencyParser, EntityRecognizer
|
||||
|
@ -17,7 +16,12 @@ def vocab():
|
|||
|
||||
@pytest.fixture
|
||||
def parser(vocab):
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(vocab, default_parser(), **config)
|
||||
return parser
|
||||
|
||||
|
@ -58,7 +62,12 @@ def test_add_label(parser):
|
|||
|
||||
|
||||
def test_add_label_deserializes_correctly():
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
|
||||
ner1.add_label("C")
|
||||
ner1.add_label("B")
|
||||
|
|
|
@ -138,7 +138,12 @@ def test_get_oracle_actions():
|
|||
deps.append(dep)
|
||||
ents.append(ent)
|
||||
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(doc.vocab, default_parser(), **config)
|
||||
parser.moves.add_action(0, "")
|
||||
parser.moves.add_action(1, "")
|
||||
|
|
|
@ -138,7 +138,12 @@ def test_accept_blocked_token():
|
|||
# 1. test normal behaviour
|
||||
nlp1 = English()
|
||||
doc1 = nlp1("I live in New York")
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
|
||||
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
|
||||
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
|
||||
|
@ -157,7 +162,12 @@ def test_accept_blocked_token():
|
|||
# 2. test blocking behaviour
|
||||
nlp2 = English()
|
||||
doc2 = nlp2("I live in New York")
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)
|
||||
|
||||
# set "New York" to a blocked entity
|
||||
|
@ -215,7 +225,12 @@ def test_overwrite_token():
|
|||
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
|
||||
|
||||
# Check that a new ner can overwrite O
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
|
||||
ner2.moves.add_action(5, "")
|
||||
ner2.add_label("GPE")
|
||||
|
|
|
@ -28,7 +28,12 @@ def tok2vec():
|
|||
|
||||
@pytest.fixture
|
||||
def parser(vocab, arc_eager):
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
return Parser(vocab, model=default_parser(), moves=arc_eager, **config)
|
||||
|
||||
|
||||
|
|
|
@ -94,7 +94,12 @@ def test_beam_advance_too_few_scores(beam, scores):
|
|||
|
||||
def test_beam_parse():
|
||||
nlp = Language()
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
|
||||
nlp.parser.add_label("nsubj")
|
||||
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
|
||||
|
|
|
@ -16,7 +16,12 @@ def vocab():
|
|||
|
||||
@pytest.fixture
|
||||
def parser(vocab):
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(vocab, default_parser(), **config)
|
||||
parser.cfg["token_vector_width"] = 4
|
||||
parser.cfg["hidden_width"] = 32
|
||||
|
|
|
@ -264,11 +264,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
|
|||
def test_overfitting_IO():
|
||||
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
|
||||
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
|
||||
patterns = [
|
||||
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
|
||||
]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
|
@ -285,7 +287,11 @@ def test_overfitting_IO():
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
||||
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
|
||||
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
|
||||
mykb.add_alias(
|
||||
alias="Russ Cochran",
|
||||
entities=["Q2146908", "Q7381115"],
|
||||
probabilities=[0.5, 0.5],
|
||||
)
|
||||
|
||||
# Create the Entity Linker component and add it to the pipeline
|
||||
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
|
||||
|
|
|
@ -15,8 +15,17 @@ def test_label_types():
|
|||
|
||||
|
||||
TRAIN_DATA = [
|
||||
("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}),
|
||||
("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}),
|
||||
(
|
||||
"I like green eggs",
|
||||
{
|
||||
"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"],
|
||||
"pos": ["NOUN", "VERB", "ADJ", "NOUN"],
|
||||
},
|
||||
),
|
||||
(
|
||||
"Eat blue ham",
|
||||
{"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
@ -38,7 +47,12 @@ def test_overfitting_IO():
|
|||
# test the trained model
|
||||
test_text = "I like blue eggs"
|
||||
doc = nlp(test_text)
|
||||
gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
|
||||
gold_morphs = [
|
||||
"Feat=N|POS=NOUN",
|
||||
"Feat=V|POS=VERB",
|
||||
"Feat=J|POS=ADJ",
|
||||
"Feat=N|POS=NOUN",
|
||||
]
|
||||
assert gold_morphs == [t.morph_ for t in doc]
|
||||
|
||||
# Also test the results are still the same after IO
|
||||
|
|
|
@ -1,30 +1,31 @@
|
|||
import pytest
|
||||
from collections import namedtuple
|
||||
|
||||
from thinc.api import NumpyOps
|
||||
from spacy.ml._biluo import BILUO, _get_transition_table
|
||||
from spacy.pipeline.simple_ner import SimpleNER
|
||||
import spacy
|
||||
|
||||
|
||||
@pytest.fixture(params=[
|
||||
["PER", "ORG", "LOC", "MISC"],
|
||||
["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"]
|
||||
])
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
["PER", "ORG", "LOC", "MISC"],
|
||||
["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"],
|
||||
]
|
||||
)
|
||||
def labels(request):
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ops():
|
||||
return NumpyOps()
|
||||
|
||||
|
||||
def _get_actions(labels):
|
||||
action_names = (
|
||||
[f"B{label}" for label in labels] + \
|
||||
[f"I{label}" for label in labels] + \
|
||||
[f"L{label}" for label in labels] + \
|
||||
[f"U{label}" for label in labels] + \
|
||||
["O"]
|
||||
[f"B{label}" for label in labels]
|
||||
+ [f"I{label}" for label in labels]
|
||||
+ [f"L{label}" for label in labels]
|
||||
+ [f"U{label}" for label in labels]
|
||||
+ ["O"]
|
||||
)
|
||||
A = namedtuple("actions", action_names)
|
||||
return A(**{name: i for i, name in enumerate(action_names)})
|
||||
|
@ -228,7 +229,7 @@ def test_transition_table(ops):
|
|||
assert table[0, a.O, a.Uloc] == 1
|
||||
assert table[0, a.O, a.Uorg] == 1
|
||||
assert table[0, a.O, a.O] == 1
|
||||
|
||||
|
||||
# Last token, prev action was B
|
||||
assert table[1, a.Bper, a.Bper] == 0
|
||||
assert table[1, a.Bper, a.Bloc] == 0
|
||||
|
|
|
@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer):
|
|||
|
||||
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
|
||||
def test_issue1967(label):
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner = EntityRecognizer(Vocab(), default_ner(), **config)
|
||||
example = Example(doc=None)
|
||||
example.set_token_annotation(
|
||||
|
|
|
@ -196,7 +196,12 @@ def test_issue3345():
|
|||
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
|
||||
doc[4].is_sent_start = True
|
||||
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner = EntityRecognizer(doc.vocab, default_ner(), **config)
|
||||
# Add the OUT action. I wouldn't have thought this would be necessary...
|
||||
ner.moves.add_action(5, "")
|
||||
|
|
|
@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser
|
|||
|
||||
def test_issue3830_no_subtok():
|
||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
|
@ -16,7 +21,12 @@ def test_issue3830_no_subtok():
|
|||
|
||||
def test_issue3830_with_subtok():
|
||||
"""Test that the parser does have subtok label if learn_tokens=True."""
|
||||
config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": True,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
|
|
|
@ -74,7 +74,12 @@ def test_issue4042_bug2():
|
|||
output_dir.mkdir()
|
||||
ner1.to_disk(output_dir)
|
||||
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner2 = EntityRecognizer(vocab, default_ner(), **config)
|
||||
ner2.from_disk(output_dir)
|
||||
assert len(ner2.labels) == 2
|
||||
|
|
|
@ -12,7 +12,12 @@ def test_issue4313():
|
|||
beam_width = 16
|
||||
beam_density = 0.0001
|
||||
nlp = English()
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
|
||||
ner.add_label("SOME_LABEL")
|
||||
ner.begin_training([])
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import pytest
|
||||
from spacy.language import Language
|
||||
|
||||
|
||||
|
|
|
@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer]
|
|||
|
||||
@pytest.fixture
|
||||
def parser(en_vocab):
|
||||
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(en_vocab, default_parser(), **config)
|
||||
parser.add_label("nsubj")
|
||||
return parser
|
||||
|
|
|
@ -35,8 +35,10 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
|
|||
assert vocab1.to_bytes() == vocab1_b
|
||||
new_vocab1 = Vocab().from_bytes(vocab1_b)
|
||||
assert new_vocab1.to_bytes() == vocab1_b
|
||||
assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
|
||||
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings))
|
||||
assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
|
||||
assert sorted([s for s in new_vocab1.strings]) == sorted(
|
||||
strings1 + list(default_strings)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||
|
|
|
@ -40,6 +40,7 @@ test_ner_apple = [
|
|||
]
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tagged_doc():
|
||||
text = "Sarah's sister flew to Silicon Valley via London."
|
||||
|
@ -184,7 +185,7 @@ def test_tag_score(tagged_doc):
|
|||
tagged_doc,
|
||||
tags=[t.tag_ for t in tagged_doc],
|
||||
pos=[t.pos_ for t in tagged_doc],
|
||||
morphs=[t.morph_ for t in tagged_doc]
|
||||
morphs=[t.morph_ for t in tagged_doc],
|
||||
)
|
||||
scorer.score((tagged_doc, gold))
|
||||
results = scorer.scores
|
||||
|
|
|
@ -13,7 +13,7 @@ from spacy.util import minibatch_by_words
|
|||
([400, 400, 199, 3], [4]),
|
||||
([400, 400, 199, 3, 200], [3, 2]),
|
||||
([400, 400, 199, 3, 1], [5]),
|
||||
([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
|
||||
([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
|
||||
([400, 400, 199, 3, 1, 200], [3, 3]),
|
||||
([400, 400, 199, 3, 1, 999], [3, 3]),
|
||||
([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
|
||||
|
@ -28,7 +28,11 @@ def test_util_minibatch(doc_sizes, expected_batches):
|
|||
examples = [Example(doc=doc) for doc in docs]
|
||||
tol = 0.2
|
||||
batch_size = 1000
|
||||
batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True))
|
||||
batches = list(
|
||||
minibatch_by_words(
|
||||
examples=examples, size=batch_size, tolerance=tol, discard_oversize=True
|
||||
)
|
||||
)
|
||||
assert [len(batch) for batch in batches] == expected_batches
|
||||
|
||||
max_size = batch_size + batch_size * tol
|
||||
|
@ -53,7 +57,9 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches):
|
|||
examples = [Example(doc=doc) for doc in docs]
|
||||
tol = 0.2
|
||||
batch_size = 1000
|
||||
batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False))
|
||||
batches = list(
|
||||
minibatch_by_words(
|
||||
examples=examples, size=batch_size, tolerance=tol, discard_oversize=False
|
||||
)
|
||||
)
|
||||
assert [len(batch) for batch in batches] == expected_batches
|
||||
|
||||
|
||||
|
|
|
@ -697,7 +697,9 @@ def decaying(start, stop, decay):
|
|||
curr -= decay
|
||||
|
||||
|
||||
def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
|
||||
def minibatch_by_words(
|
||||
examples, size, count_words=len, tolerance=0.2, discard_oversize=False
|
||||
):
|
||||
"""Create minibatches of roughly a given number of words. If any examples
|
||||
are longer than the specified batch length, they will appear in a batch by
|
||||
themselves, or be discarded if discard_oversize=True."""
|
||||
|
|
Loading…
Reference in New Issue
Block a user