Merge pull request #5617 from explosion/chore/tidy-auto-format
Commit dbe9c29f61
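The hunks below are mechanical tidying of the spaCy codebase, apparently the output of an auto-formatter (black, judging by the `# fmt: off` / `# fmt: on` markers): long calls and literals are wrapped to one argument per line with trailing commas, single quotes become double quotes, operator and slice spacing is normalized, unused imports are dropped, and `# noqa` markers are added where star imports or side-effect imports are intentional. As a minimal, self-contained sketch of the recurring wrapping pattern — the `msg.info` call is taken from the train CLI hunk below, while the example label set is an assumed value, not part of the diff:

from wasabi import msg

textcat_labels = {"POSITIVE", "NEGATIVE"}  # assumed example value, not from the diff

# Before: the whole call sits on one long line
msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels")

# After: the formatter wraps the argument onto its own line so the call fits the line-length limit
msg.info(
    f"Initialized textcat component for {len(textcat_labels)} unique labels"
)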
@@ -24,8 +24,8 @@ from ..gold import Example
     output_dir=("Directory to write models to on each epoch", "positional", None, Path),
     config_path=("Path to config file", "positional", None, Path),
     use_gpu=("Use GPU", "option", "g", int),
-    resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path),
-    epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int),
+    resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path),
+    epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int),
     # fmt: on
 )
 def pretrain(
@@ -3,7 +3,6 @@ from timeit import default_timer as timer

 import srsly
 from pydantic import BaseModel, FilePath
-import plac
 import tqdm
 from pathlib import Path
 from wasabi import msg
@@ -16,7 +15,9 @@ from ..gold import GoldCorpus
 from ..lookups import Lookups
 from .. import util
 from ..errors import Errors
-from ..ml import models  # don't remove - required to load the built-in architectures
+
+# Don't remove - required to load the built-in architectures
+from ..ml import models  # noqa: F401

 registry = util.registry

@@ -114,33 +115,19 @@ class ConfigSchema(BaseModel):
         extra = "allow"


-@plac.annotations(
-    # fmt: off
-    train_path=("Location of JSON-formatted training data", "positional", None, Path),
-    dev_path=("Location of JSON-formatted development data", "positional", None, Path),
-    config_path=("Path to config file", "positional", None, Path),
-    output_path=("Output directory to store model in", "option", "o", Path),
-    init_tok2vec=(
-        "Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
-        Path),
-    raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
-    verbose=("Display more information for debugging purposes", "flag", "VV", bool),
-    use_gpu=("Use GPU", "option", "g", int),
-    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
-    omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
-    # fmt: on
-)
 def train_cli(
-    train_path,
-    dev_path,
-    config_path,
-    output_path=None,
-    init_tok2vec=None,
-    raw_text=None,
-    verbose=False,
-    use_gpu=-1,
-    tag_map_path=None,
-    omit_extra_lookups=False,
+    # fmt: off
+    train_path: ("Location of JSON-formatted training data", "positional", None, Path),
+    dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
+    config_path: ("Path to config file", "positional", None, Path),
+    output_path: ("Output directory to store model in", "option", "o", Path) = None,
+    init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
+    raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
+    verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False,
+    use_gpu: ("Use GPU", "option", "g", int) = -1,
+    tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
+    omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
+    # fmt: on
 ):
     """
     Train or update a spaCy model. Requires data to be formatted in spaCy's
@@ -212,7 +199,7 @@ def train(
     config = util.load_config(config_path, create_objects=False)
     util.fix_random_seed(config["training"]["seed"])
     if config["training"].get("use_pytorch_for_gpu_memory"):
         # It feels kind of weird to not have a default for this.
         use_pytorch_for_gpu_memory()
     nlp_config = config["nlp"]
     config = util.load_config(config_path, create_objects=True)
@@ -227,7 +214,9 @@ def train(
     # verify textcat config
     if "textcat" in nlp_config["pipeline"]:
         textcat_labels = set(nlp.get_pipe("textcat").labels)
-        textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"]
+        textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][
+            "exclusive_classes"
+        ]

         # check whether the setting 'exclusive_classes' corresponds to the provided training data
         if textcat_multilabel:
@@ -255,7 +244,9 @@ def train(
                     "to 'false' in the config to train a classifier with classes "
                     "that are not mutually exclusive."
                 )
-        msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels")
+        msg.info(
+            f"Initialized textcat component for {len(textcat_labels)} unique labels"
+        )
         nlp.get_pipe("textcat").labels = tuple(textcat_labels)

     # if 'positive_label' is provided: double check whether it's in the data and the task is binary
@@ -281,9 +272,7 @@ def train(
         nlp.resume_training()
     else:
         msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
-        nlp.begin_training(
-            lambda: corpus.train_examples
-        )
+        nlp.begin_training(lambda: corpus.train_examples)

     # Update tag map with provided mapping
     nlp.vocab.morphology.tag_map.update(tag_map)
@@ -310,8 +299,7 @@ def train(
             tok2vec = tok2vec.get(subpath)
         if not tok2vec:
             msg.fail(
-                f"Could not locate the tok2vec model at {tok2vec_path}.",
-                exits=1,
+                f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
             )
         tok2vec.from_bytes(weights_data)

@@ -376,7 +364,7 @@ def create_train_batches(nlp, corpus, cfg):
     train_examples = list(
         corpus.train_dataset(
             nlp,
             noise_level=0.0,  # I think this is deprecated?
             orth_variant_level=cfg["orth_variant_level"],
             gold_preproc=cfg["gold_preproc"],
             max_length=cfg["max_length"],
@@ -429,7 +417,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
         try:
             weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
         except KeyError as e:
-            raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys())))
+            raise KeyError(
+                Errors.E983.format(
+                    dict_name="score_weights", key=str(e), keys=list(scores.keys())
+                )
+            )

         scores["speed"] = wps
         return weighted_score, scores
@@ -578,15 +570,25 @@ def setup_printer(training, nlp):
             ]
         except KeyError as e:
             raise KeyError(
-                Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys())))
+                Errors.E983.format(
+                    dict_name="scores (losses)",
+                    key=str(e),
+                    keys=list(info["losses"].keys()),
+                )
+            )

         try:
             scores = [
-                "{0:.2f}".format(float(info["other_scores"][col]))
-                for col in score_cols
+                "{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols
             ]
         except KeyError as e:
-            raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
+            raise KeyError(
+                Errors.E983.format(
+                    dict_name="scores (other)",
+                    key=str(e),
+                    keys=list(info["other_scores"].keys()),
+                )
+            )
         data = (
             [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
         )
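Besides the reformatting, the `train_cli` hunks above move the plac argument declarations out of a `@plac.annotations(...)` decorator and into inline parameter annotations, which plac reads directly from the function signature. A minimal, runnable sketch of that pattern with a hypothetical `greet` command (not part of the diff; the tuple format is plac's (help, kind, abbreviation, type)):

import plac
from pathlib import Path


def greet(
    # fmt: off
    name: ("Name to greet", "positional", None, str),
    out_path: ("Optional file to write the greeting to", "option", "o", Path) = None,
    loud: ("Shout the greeting", "flag", "l", bool) = False,
    # fmt: on
):
    # Build the greeting and either print it or write it to the given file.
    text = f"HELLO {name}!" if loud else f"Hello {name}"
    if out_path is not None:
        Path(out_path).write_text(text + "\n")
    else:
        print(text)


if __name__ == "__main__":
    # plac parses sys.argv according to the annotations above.
    plac.call(greet)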
@@ -1,4 +1,3 @@
-from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
 from .errors import Errors
 from .lookups import Lookups
 from .parts_of_speech import NAMES as UPOS_NAMES
@@ -51,7 +50,13 @@ class Lemmatizer(object):
         index_table = self.lookups.get_table("lemma_index", {})
         exc_table = self.lookups.get_table("lemma_exc", {})
         rules_table = self.lookups.get_table("lemma_rules", {})
-        if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
+        if not any(
+            (
+                index_table.get(univ_pos),
+                exc_table.get(univ_pos),
+                rules_table.get(univ_pos),
+            )
+        ):
             if univ_pos == "propn":
                 return [string]
             else:
@@ -1 +1 @@
-from .models import *
+from .models import *  # noqa: F401, F403
@@ -1,11 +1,8 @@
 """Thinc layer to do simpler transition-based parsing, NER, etc."""
-from typing import List, Tuple, Dict, Optional
+from typing import Dict, Optional
 import numpy
-from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
-from thinc.api import to_numpy
-from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
-
-from ..tokens import Doc
+from thinc.api import Model
+from thinc.types import Padded, Floats3d


 def BILUO() -> Model[Padded, Padded]:
@@ -14,11 +11,11 @@ def BILUO() -> Model[Padded, Padded]:
         forward,
         init=init,
         dims={"nO": None},
-        attrs={"get_num_actions": get_num_actions}
+        attrs={"get_num_actions": get_num_actions},
     )


-def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
+def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
     if X is not None and Y is not None:
         if X.data.shape != Y.data.shape:
             # TODO: Fix error
@@ -49,12 +46,12 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
     masks = model.ops.alloc3f(*Y.shape)
     max_value = Xp.data.max()
     for t in range(Xp.data.shape[0]):
-        is_last = (Xp.lengths < (t+2)).astype("i")
+        is_last = (Xp.lengths < (t + 2)).astype("i")
         masks[t] = valid_transitions[is_last, prev_actions]
         # Don't train the out-of-bounds sequences.
-        masks[t, Xp.size_at_t[t]:] = 0
+        masks[t, Xp.size_at_t[t] :] = 0
         # Valid actions get 0*10e8, invalid get large negative value
-        Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10)
+        Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10)
         prev_actions = Y[t].argmax(axis=-1)

     def backprop_biluo(dY: Padded) -> Padded:
@@ -1,9 +1,7 @@
 """Thinc layer to do simpler transition-based parsing, NER, etc."""
-from typing import List, Tuple, Dict, Optional
-from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
-from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
-
-from ..tokens import Doc
+from typing import Dict, Optional
+from thinc.api import Ops, Model
+from thinc.types import Padded, Floats3d


 def IOB() -> Model[Padded, Padded]:
@@ -12,11 +10,11 @@ def IOB() -> Model[Padded, Padded]:
         forward,
         init=init,
         dims={"nO": None},
-        attrs={"get_num_actions": get_num_actions}
+        attrs={"get_num_actions": get_num_actions},
     )


-def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
+def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
     if X is not None and Y is not None:
         if X.data.shape != Y.data.shape:
             # TODO: Fix error
@@ -48,14 +46,14 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
     for t in range(Xp.data.shape[0]):
         masks[t] = valid_transitions[prev_actions]
         # Don't train the out-of-bounds sequences.
-        masks[t, Xp.size_at_t[t]:] = 0
+        masks[t, Xp.size_at_t[t] :] = 0
         # Valid actions get 0*10e8, invalid get -1*10e8
-        Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8)
+        Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8)
         prev_actions = Y[t].argmax(axis=-1)

     def backprop_biluo(dY: Padded) -> Padded:
         # Masking the gradient seems to do poorly here. But why?
-        #dY.data *= masks
+        # dY.data *= masks
         return dY

     return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
@@ -83,10 +81,10 @@ def _get_transition_table(
     B_range = ops.xp.arange(B_start, B_end)
     I_range = ops.xp.arange(I_start, I_end)
     # B and O are always valid
-    table[:, B_start : B_end] = 1
+    table[:, B_start:B_end] = 1
     table[:, O_action] = 1
     # I can only follow a matching B
     table[B_range, I_range] = 1

     _cache[n_actions] = table
     return table
@@ -84,7 +84,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
     #
     # (ids < 0).T @ dY
     mask = model.ops.asarray(ids < 0, dtype="f")
-    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True)
+    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
     return d_pad.reshape((1, nF, nO, nP))

@@ -1,6 +1,6 @@
 from .entity_linker import *  # noqa
 from .parser import *  # noqa
-from .simple_ner import *
+from .simple_ner import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa
 from .tok2vec import *  # noqa
@@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
     softmax = Softmax(nO=nO, nI=token_vector_width * 2)
     model = chain(
         tok2vec,
-        Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0),
+        Maxout(
+            nO=token_vector_width * 2,
+            nI=token_vector_width,
+            nP=maxout_pieces,
+            dropout=0.0,
+        ),
         LayerNorm(token_vector_width * 2),
         softmax,
     )
@@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None):
     # nO = vocab.vectors.data.shape[1]
     output_layer = chain(
         Maxout(
-            nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0
+            nO=nO,
+            nI=tok2vec.get_dim("nO"),
+            nP=maxout_pieces,
+            normalize=True,
+            dropout=0.0,
         ),
         Linear(nO=nO, nI=nO, init_W=zero_init),
     )
@@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
     def mlm_forward(model, docs, is_train):
         mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
         mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
-        output, backprop = model.get_ref("wrapped-model").begin_update(docs)  # drop=drop
+        output, backprop = model.get_ref("wrapped-model").begin_update(
+            docs
+        )  # drop=drop

         def mlm_backward(d_output):
             d_output *= 1 - mask
@@ -16,18 +16,14 @@ def build_tb_parser_model(
     nO=None,
 ):
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(
-        tok2vec,
-        with_array(Linear(hidden_width, t2v_width)),
-        list2array(),
-    )
+    tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),)
     tok2vec.set_dim("nO", hidden_width)

     lower = PrecomputableAffine(
         nO=hidden_width if use_upper else nO,
         nF=nr_feature_tokens,
         nI=tok2vec.get_dim("nO"),
-        nP=maxout_pieces
+        nP=maxout_pieces,
     )
     if use_upper:
         with use_ops("numpy"):
@@ -1,9 +1,8 @@
-import functools
-from typing import List, Tuple, Dict, Optional
-from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list
+from typing import List
+from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
 from thinc.api import chain, list2padded, configure_normal_init
 from thinc.api import Dropout
-from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
+from thinc.types import Floats2d

 from ...tokens import Doc
 from .._biluo import BILUO
@@ -12,12 +11,12 @@ from ...util import registry


 @registry.architectures.register("spacy.BiluoTagger.v1")
-def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
+def BiluoTagger(
+    tok2vec: Model[List[Doc], List[Floats2d]]
+) -> Model[List[Doc], List[Floats2d]]:
     biluo = BILUO()
     linear = Linear(
-        nO=None,
-        nI=tok2vec.get_dim("nO"),
-        init_W=configure_normal_init(mean=0.02)
+        nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
     )
     model = chain(
         tok2vec,
@@ -25,7 +24,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
         with_array(chain(Dropout(0.1), linear)),
         biluo,
         with_array(softmax_activation()),
-        padded2list()
+        padded2list(),
     )

     return Model(
@@ -35,11 +34,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
         layers=[model, linear],
         refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
         dims={"nO": None},
-        attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
+        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
     )

+
 @registry.architectures.register("spacy.IOBTagger.v1")
-def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
+def IOBTagger(
+    tok2vec: Model[List[Doc], List[Floats2d]]
+) -> Model[List[Doc], List[Floats2d]]:
     biluo = IOB()
     linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
     model = chain(
@@ -48,7 +50,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
         with_array(linear),
         biluo,
         with_array(softmax_activation()),
-        padded2list()
+        padded2list(),
     )

     return Model(
@@ -58,11 +60,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
         layers=[model],
         refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
         dims={"nO": None},
-        attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
+        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
     )
-


 def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
     if model.get_dim("nO") is None and Y:
         model.set_dim("nO", Y[0].shape[1])
@@ -1,5 +1,4 @@
-from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout
-from thinc.api import glorot_uniform_init
+from thinc.api import zero_init, with_array, Softmax, chain, Model

 from ...util import registry

@@ -1,11 +1,12 @@
-from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention
-from thinc.api import chain, concatenate, clone, Dropout
-from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window
-from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
+from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout
+from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout
+from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed
+from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor

 from ..spacy_vectors import SpacyVectors
 from ... import util
-from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER
+from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams

@@ -50,14 +51,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO


 @registry.architectures.register("spacy.TextCat.v1")
-def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
-                          window_size, conv_depth, dropout, nO=None):
+def build_text_classifier(
+    width,
+    embed_size,
+    pretrained_vectors,
+    exclusive_classes,
+    ngram_size,
+    window_size,
+    conv_depth,
+    dropout,
+    nO=None,
+):
     cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
-        lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout)
-        prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)
-        suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout)
-        shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout)
+        lower = HashEmbed(
+            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
+        )
+        prefix = HashEmbed(
+            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
+        )
+        suffix = HashEmbed(
+            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
+        )
+        shape = HashEmbed(
+            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
+        )

         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
         trained_vectors = FeatureExtractor(cols) >> with_array(
@@ -83,30 +101,38 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
             vectors_width = width
         tok2vec = vector_layer >> with_array(
             Maxout(width, vectors_width, normalize=True)
-            >> residual((expand_window(window_size=window_size)
-            >> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth,
+            >> residual(
+                (
+                    expand_window(window_size=window_size)
+                    >> Maxout(
+                        nO=width, nI=width * ((window_size * 2) + 1), normalize=True
+                    )
+                )
+            )
+            ** conv_depth,
             pad=conv_depth,
         )
         cnn_model = (
             tok2vec
             >> list2ragged()
             >> ParametricAttention(width)
            >> reduce_sum()
             >> residual(Maxout(nO=width, nI=width))
             >> Linear(nO=nO, nI=width)
             >> Dropout(0.0)
         )

         linear_model = build_bow_text_classifier(
-            nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False
+            nO=nO,
+            ngram_size=ngram_size,
+            exclusive_classes=exclusive_classes,
+            no_output_layer=False,
         )
-        nO_double = nO*2 if nO else None
+        nO_double = nO * 2 if nO else None
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=nO_double)
         else:
-            output_layer = (
-                Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
-            )
+            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
         model = (linear_model | cnn_model) >> output_layer
         model.set_ref("tok2vec", tok2vec)
         if model.has_dim("nO") is not False:
@@ -99,7 +99,13 @@ def hash_charembed_cnn(

 @registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
 def hash_embed_bilstm_v1(
-    pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout
+    pretrained_vectors,
+    width,
+    depth,
+    embed_size,
+    subword_features,
+    maxout_pieces,
+    dropout,
 ):
     # Does not use character embeddings: set to False by default
     return build_Tok2Vec_model(
@@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1(

 @registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
 def LayerNormalizedMaxout(width, maxout_pieces):
-    return Maxout(
-        nO=width,
-        nP=maxout_pieces,
-        dropout=0.0,
-        normalize=True,
-    )
+    return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,)


 @registry.architectures.register("spacy.MultiHashEmbed.v1")
-def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout):
+def MultiHashEmbed(
+    columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
+):
     norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
     if use_subwords:
-        prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout)
-        suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout)
-        shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout)
+        prefix = HashEmbed(
+            nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
+        )
+        suffix = HashEmbed(
+            nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
+        )
+        shape = HashEmbed(
+            nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
+        )

     if pretrained_vectors:
         glove = StaticVectors(
@@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
 def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth):
     cnn = chain(
         expand_window(window_size=window_size),
-        Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True),
+        Maxout(
+            nO=width,
+            nI=width * ((window_size * 2) + 1),
+            nP=maxout_pieces,
+            dropout=0.0,
+            normalize=True,
+        ),
     )
     model = clone(residual(cnn), depth)
     model.set_dim("nO", width)
@@ -247,11 +262,19 @@ def build_Tok2Vec_model(
         subword_features = False
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
-        norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout)
+        norm = HashEmbed(
+            nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout
+        )
         if subword_features:
-            prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout)
-            suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout)
-            shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout)
+            prefix = HashEmbed(
+                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout
+            )
+            suffix = HashEmbed(
+                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout
+            )
+            shape = HashEmbed(
+                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout
+            )
        else:
             prefix, suffix, shape = (None, None, None)
         if pretrained_vectors is not None:
@@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
         attrs={
             "has_upper": has_upper,
             "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output
-        }
+            "resize_output": resize_output,
+        },
     )


@@ -31,7 +31,7 @@ def forward(model, X, is_train):
         model.layers,
         unseen_classes=model.attrs["unseen_classes"],
         train=is_train,
-        has_upper=model.attrs["has_upper"]
+        has_upper=model.attrs["has_upper"],
     )

     return step_model, step_model.finish_steps
@@ -62,7 +62,7 @@ def resize_output(model, new_nO):
     nI = None
     if smaller.has_dim("nI"):
         nI = smaller.get_dim("nI")
-    with use_ops('numpy'):
+    with use_ops("numpy"):
         larger = Linear(nO=new_nO, nI=nI)
     larger.init = smaller.init
     # it could be that the model is not initialized yet, then skip this bit
@@ -74,8 +74,8 @@ def resize_output(model, new_nO):
     # Weights are stored in (nr_out, nr_in) format, so we're basically
     # just adding rows here.
     if smaller.has_dim("nO"):
-        larger_W[:smaller.get_dim("nO")] = smaller_W
-        larger_b[:smaller.get_dim("nO")] = smaller_b
+        larger_W[: smaller.get_dim("nO")] = smaller_W
+        larger_b[: smaller.get_dim("nO")] = smaller_b
         for i in range(smaller.get_dim("nO"), new_nO):
             model.attrs["unseen_classes"].add(i)

@@ -21,9 +21,7 @@ class SimpleNER(Pipe):
         self.model = model
         self.cfg = {"labels": []}
         self.loss_func = SequenceCategoricalCrossentropy(
-            names=self.get_tag_names(),
-            normalize=True,
-            missing_value=None
+            names=self.get_tag_names(), normalize=True, missing_value=None
         )
         assert self.model is not None

@@ -38,21 +36,21 @@ class SimpleNER(Pipe):
     def add_label(self, label):
         if label not in self.cfg["labels"]:
             self.cfg["labels"].append(label)

     def get_tag_names(self):
         if self.is_biluo:
             return (
-                [f"B-{label}" for label in self.labels] +
-                [f"I-{label}" for label in self.labels] +
-                [f"L-{label}" for label in self.labels] +
-                [f"U-{label}" for label in self.labels] +
-                ["O"]
+                [f"B-{label}" for label in self.labels]
+                + [f"I-{label}" for label in self.labels]
+                + [f"L-{label}" for label in self.labels]
+                + [f"U-{label}" for label in self.labels]
+                + ["O"]
             )
         else:
             return (
-                [f"B-{label}" for label in self.labels] +
-                [f"I-{label}" for label in self.labels] +
-                ["O"]
+                [f"B-{label}" for label in self.labels]
+                + [f"I-{label}" for label in self.labels]
+                + ["O"]
             )

     def predict(self, docs: List[Doc]) -> List[Floats2d]:
@@ -108,7 +106,7 @@ class SimpleNER(Pipe):

     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self.cfg.update(kwargs)
-        if not hasattr(get_examples, '__call__'):
+        if not hasattr(get_examples, "__call__"):
             gold_tuples = get_examples
             get_examples = lambda: gold_tuples
         labels = _get_labels(get_examples())
@@ -117,14 +115,12 @@ class SimpleNER(Pipe):
             labels = self.labels
         n_actions = self.model.attrs["get_num_actions"](len(labels))
         self.model.set_dim("nO", n_actions)
         self.model.initialize()
         if pipeline is not None:
             self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
         link_vectors_to_models(self.vocab)
         self.loss_func = SequenceCategoricalCrossentropy(
-            names=self.get_tag_names(),
-            normalize=True,
-            missing_value=None
+            names=self.get_tag_names(), normalize=True, missing_value=None
         )

         return sgd
@@ -135,7 +131,7 @@ class SimpleNER(Pipe):

 def _has_ner(eg):
     for ner_tag in eg.gold.ner:
-        if ner_tag != "-" and ner_tag != None:
+        if ner_tag != "-" and ner_tag is not None:
             return True
     else:
         return False
@@ -145,7 +141,7 @@ def _get_labels(examples):
     labels = set()
     for eg in examples:
         for ner_tag in eg.token_annotation.entities:
-            if ner_tag != 'O' and ner_tag != '-':
-                _, label = ner_tag.split('-', 1)
+            if ner_tag != "O" and ner_tag != "-":
+                _, label = ner_tag.split("-", 1)
                 labels.add(label)
     return list(sorted(labels))
@@ -98,7 +98,9 @@ class Scorer(object):
         for name, component in pipeline:
             if name == "textcat":
                 self.textcat_multilabel = component.model.attrs["multi_label"]
-                self.textcat_positive_label = component.cfg.get("positive_label", None)
+                self.textcat_positive_label = component.cfg.get(
+                    "positive_label", None
+                )
                 for label in component.cfg.get("labels", []):
                     self.textcat_auc_per_cat[label] = ROCAUCScore()
                     self.textcat_f_per_cat[label] = PRFScore()
@@ -119,19 +121,19 @@ class Scorer(object):

     @property
     def morphs_acc(self):
         """RETURNS (float): Morph tag accuracy (morphological features,
        i.e. `Token.morph`).
        """
         return self.morphs.fscore * 100

     @property
     def morphs_per_type(self):
         """RETURNS (dict): Scores per dependency label.
        """
         return {
             k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
             for k, v in self.morphs_per_feat.items()
         }

     @property
     def sent_p(self):
@@ -302,7 +304,15 @@ class Scorer(object):
         gold_morphs_per_feat = {}
         gold_sent_starts = set()
         gold_ents = set(tags_to_entities(orig.entities))
-        for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts):
+        for id_, tag, pos, morph, head, dep, sent_start in zip(
+            orig.ids,
+            orig.tags,
+            orig.pos,
+            orig.morphs,
+            orig.heads,
+            orig.deps,
+            orig.sent_starts,
+        ):
             gold_tags.add((id_, tag))
             gold_pos.add((id_, pos))
             gold_morphs.add((id_, morph))
@@ -400,7 +410,10 @@ class Scorer(object):
         self.pos.score_set(cand_pos, gold_pos)
         self.morphs.score_set(cand_morphs, gold_morphs)
         for field in self.morphs_per_feat:
-            self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set()))
+            self.morphs_per_feat[field].score_set(
+                cand_morphs_per_feat.get(field, set()),
+                gold_morphs_per_feat.get(field, set()),
+            )
         self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
         self.labelled.score_set(cand_deps, gold_deps)
         for dep in self.labelled_per_dep:
@@ -412,7 +425,9 @@ class Scorer(object):
         )
         if (
             len(gold.cats) > 0
-            and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats)
+            and set(self.textcat_f_per_cat)
+            == set(self.textcat_auc_per_cat)
+            == set(gold.cats)
             and set(gold.cats) == set(doc.cats)
         ):
             goldcat = max(gold.cats, key=gold.cats.get)
@@ -424,10 +439,10 @@ class Scorer(object):
             )
             for label in set(gold.cats):
                 self.textcat_auc_per_cat[label].score_set(
                     doc.cats[label], gold.cats[label]
                 )
                 self.textcat_f_per_cat[label].score_set(
                     set([label]) & set([candcat]), set([label]) & set([goldcat])
                 )
         elif len(self.textcat_f_per_cat) > 0:
             model_labels = set(self.textcat_f_per_cat)
@@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(en_vocab, default_ner(), **config)
     ner.begin_training([])
     ner(doc)
@@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
 def test_ents_reset(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(en_vocab, default_ner(), **config)
     ner.begin_training([])
     ner(doc)
@@ -1,9 +1,8 @@
 import pytest
-from thinc.api import Adam, NumpyOps
+from thinc.api import Adam
 from spacy.attrs import NORM
 from spacy.gold import GoldParse
 from spacy.vocab import Vocab

 from spacy.pipeline.defaults import default_parser, default_ner
 from spacy.tokens import Doc
 from spacy.pipeline import DependencyParser, EntityRecognizer
@@ -17,7 +16,12 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(vocab, default_parser(), **config)
     return parser

@@ -58,7 +62,12 @@ def test_add_label(parser):


 def test_add_label_deserializes_correctly():
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
     ner1.add_label("C")
     ner1.add_label("B")
@@ -138,7 +138,12 @@ def test_get_oracle_actions():
         deps.append(dep)
         ents.append(ent)
     doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(doc.vocab, default_parser(), **config)
     parser.moves.add_action(0, "")
     parser.moves.add_action(1, "")
@@ -138,7 +138,12 @@ def test_accept_blocked_token():
     # 1. test normal behaviour
     nlp1 = English()
     doc1 = nlp1("I live in New York")
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
     assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
     assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -157,7 +162,12 @@ def test_accept_blocked_token():
     # 2. test blocking behaviour
     nlp2 = English()
     doc2 = nlp2("I live in New York")
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)

     # set "New York" to a blocked entity
@@ -215,7 +225,12 @@ def test_overwrite_token():
     assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]

     # Check that a new ner can overwrite O
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
     ner2.moves.add_action(5, "")
     ner2.add_label("GPE")
@@ -28,7 +28,12 @@ def tok2vec():

 @pytest.fixture
 def parser(vocab, arc_eager):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     return Parser(vocab, model=default_parser(), moves=arc_eager, **config)

@@ -94,7 +94,12 @@ def test_beam_advance_too_few_scores(beam, scores):

 def test_beam_parse():
     nlp = Language()
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
     nlp.parser.add_label("nsubj")
     nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
@@ -16,7 +16,12 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(vocab, default_parser(), **config)
     parser.cfg["token_vector_width"] = 4
     parser.cfg["hidden_width"] = 32
@ -264,11 +264,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
|
||||||
def test_overfitting_IO():
|
def test_overfitting_IO():
|
||||||
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
|
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||||
|
|
||||||
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
|
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
|
||||||
ruler = EntityRuler(nlp)
|
ruler = EntityRuler(nlp)
|
||||||
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
|
patterns = [
|
||||||
|
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
|
||||||
|
]
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
nlp.add_pipe(ruler)
|
nlp.add_pipe(ruler)
|
||||||
|
|
||||||
|
@ -285,7 +287,11 @@ def test_overfitting_IO():
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
||||||
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
|
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
|
||||||
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
|
mykb.add_alias(
|
||||||
|
alias="Russ Cochran",
|
||||||
|
entities=["Q2146908", "Q7381115"],
|
||||||
|
probabilities=[0.5, 0.5],
|
||||||
|
)
|
||||||
|
|
||||||
# Create the Entity Linker component and add it to the pipeline
|
# Create the Entity Linker component and add it to the pipeline
|
||||||
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
|
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
|
||||||
|
|
|
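Taken together, the two hunks above wire a toy KnowledgeBase into the pipeline used by test_overfitting_IO. A hedged sketch of that setup as straight-line code, using the calls that appear in the diff; the import paths and the final nlp.add_pipe(entity_linker) step are assumptions:

from spacy.kb import KnowledgeBase  # assumed import path
from spacy.lang.en import English
from spacy.pipeline import EntityRuler  # assumed import path

nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))

# Rule-based patterns so "Russ Cochran" is recognised as a PERSON entity.
ruler = EntityRuler(nlp)
ruler.add_patterns(
    [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
)
nlp.add_pipe(ruler)

# Toy knowledge base with two candidate entities behind the same alias.
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
    alias="Russ Cochran",
    entities=["Q2146908", "Q7381115"],
    probabilities=[0.5, 0.5],
)

# The entity linker then disambiguates between the two candidates via the KB.
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
nlp.add_pipe(entity_linker)  # assumed, per the comment in the diff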
@@ -15,8 +15,17 @@ def test_label_types():


 TRAIN_DATA = [
-    ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}),
-    ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}),
+    (
+        "I like green eggs",
+        {
+            "morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"],
+            "pos": ["NOUN", "VERB", "ADJ", "NOUN"],
+        },
+    ),
+    (
+        "Eat blue ham",
+        {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
+    ),
 ]


@@ -38,7 +47,12 @@ def test_overfitting_IO():
     # test the trained model
     test_text = "I like blue eggs"
     doc = nlp(test_text)
-    gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
+    gold_morphs = [
+        "Feat=N|POS=NOUN",
+        "Feat=V|POS=VERB",
+        "Feat=J|POS=ADJ",
+        "Feat=N|POS=NOUN",
+    ]
     assert gold_morphs == [t.morph_ for t in doc]

     # Also test the results are still the same after IO
@@ -1,30 +1,31 @@
 import pytest
 from collections import namedtuple

 from thinc.api import NumpyOps
 from spacy.ml._biluo import BILUO, _get_transition_table
-from spacy.pipeline.simple_ner import SimpleNER
-import spacy


-@pytest.fixture(params=[
-    ["PER", "ORG", "LOC", "MISC"],
-    ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"]
-])
+@pytest.fixture(
+    params=[
+        ["PER", "ORG", "LOC", "MISC"],
+        ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"],
+    ]
+)
 def labels(request):
     return request.param


 @pytest.fixture
 def ops():
     return NumpyOps()


 def _get_actions(labels):
     action_names = (
-        [f"B{label}" for label in labels] + \
-        [f"I{label}" for label in labels] + \
-        [f"L{label}" for label in labels] + \
-        [f"U{label}" for label in labels] + \
-        ["O"]
+        [f"B{label}" for label in labels]
+        + [f"I{label}" for label in labels]
+        + [f"L{label}" for label in labels]
+        + [f"U{label}" for label in labels]
+        + ["O"]
     )
     A = namedtuple("actions", action_names)
     return A(**{name: i for i, name in enumerate(action_names)})
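To make the reflowed concatenation in _get_actions concrete, here is a small worked example with a hypothetical two-label set; it mirrors the expression above and runs on its own:

from collections import namedtuple

labels = ["PER", "LOC"]  # hypothetical two-label set
action_names = (
    [f"B{label}" for label in labels]
    + [f"I{label}" for label in labels]
    + [f"L{label}" for label in labels]
    + [f"U{label}" for label in labels]
    + ["O"]
)
# One action name per BILUO move for each label, plus the "outside" action.
assert action_names == [
    "BPER", "BLOC", "IPER", "ILOC", "LPER", "LLOC", "UPER", "ULOC", "O"
]
A = namedtuple("actions", action_names)
actions = A(**{name: i for i, name in enumerate(action_names)})
assert actions.BPER == 0 and actions.O == 8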
@@ -228,7 +229,7 @@ def test_transition_table(ops):
     assert table[0, a.O, a.Uloc] == 1
     assert table[0, a.O, a.Uorg] == 1
     assert table[0, a.O, a.O] == 1

     # Last token, prev action was B
     assert table[1, a.Bper, a.Bper] == 0
     assert table[1, a.Bper, a.Bloc] == 0
@@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer):

 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(Vocab(), default_ner(), **config)
     example = Example(doc=None)
     example.set_token_annotation(
@@ -196,7 +196,12 @@ def test_issue3345():
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
     ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(doc.vocab, default_ner(), **config)
     # Add the OUT action. I wouldn't have thought this would be necessary...
     ner.moves.add_action(5, "")
@@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser

 def test_issue3830_no_subtok():
     """Test that the parser doesn't have subtok label if not learn_tokens"""
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(Vocab(), default_parser(), **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
@@ -16,7 +21,12 @@ def test_issue3830_no_subtok():

 def test_issue3830_with_subtok():
     """Test that the parser does have subtok label if learn_tokens=True."""
-    config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": True,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(Vocab(), default_parser(), **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
@@ -74,7 +74,12 @@ def test_issue4042_bug2():
     output_dir.mkdir()
     ner1.to_disk(output_dir)

-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner2 = EntityRecognizer(vocab, default_ner(), **config)
     ner2.from_disk(output_dir)
     assert len(ner2.labels) == 2
@@ -12,7 +12,12 @@ def test_issue4313():
     beam_width = 16
     beam_density = 0.0001
     nlp = English()
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
     ner.add_label("SOME_LABEL")
     ner.begin_training([])
@@ -1,4 +1,3 @@
-import pytest
 from spacy.language import Language


@@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer]

 @pytest.fixture
 def parser(en_vocab):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(en_vocab, default_parser(), **config)
     parser.add_label("nsubj")
     return parser
@@ -35,8 +35,10 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     assert vocab1.to_bytes() == vocab1_b
     new_vocab1 = Vocab().from_bytes(vocab1_b)
     assert new_vocab1.to_bytes() == vocab1_b
     assert len(new_vocab1.strings) == len(strings1) + 2  # adds _SP and POS=SPACE
-    assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings))
+    assert sorted([s for s in new_vocab1.strings]) == sorted(
+        strings1 + list(default_strings)
+    )


 @pytest.mark.parametrize("strings1,strings2", test_strings)
@@ -40,6 +40,7 @@ test_ner_apple = [
     ]
 ]

+
 @pytest.fixture
 def tagged_doc():
     text = "Sarah's sister flew to Silicon Valley via London."
@@ -184,7 +185,7 @@ def test_tag_score(tagged_doc):
         tagged_doc,
         tags=[t.tag_ for t in tagged_doc],
         pos=[t.pos_ for t in tagged_doc],
-        morphs=[t.morph_ for t in tagged_doc]
+        morphs=[t.morph_ for t in tagged_doc],
     )
     scorer.score((tagged_doc, gold))
     results = scorer.scores
@@ -13,7 +13,7 @@ from spacy.util import minibatch_by_words
     ([400, 400, 199, 3], [4]),
     ([400, 400, 199, 3, 200], [3, 2]),
     ([400, 400, 199, 3, 1], [5]),
     ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
     ([400, 400, 199, 3, 1, 200], [3, 3]),
     ([400, 400, 199, 3, 1, 999], [3, 3]),
     ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
@@ -28,7 +28,11 @@ def test_util_minibatch(doc_sizes, expected_batches):
     examples = [Example(doc=doc) for doc in docs]
     tol = 0.2
     batch_size = 1000
-    batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True))
+    batches = list(
+        minibatch_by_words(
+            examples=examples, size=batch_size, tolerance=tol, discard_oversize=True
+        )
+    )
     assert [len(batch) for batch in batches] == expected_batches

     max_size = batch_size + batch_size * tol
@@ -53,7 +57,9 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches):
     examples = [Example(doc=doc) for doc in docs]
     tol = 0.2
     batch_size = 1000
-    batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False))
+    batches = list(
+        minibatch_by_words(
+            examples=examples, size=batch_size, tolerance=tol, discard_oversize=False
+        )
+    )
     assert [len(batch) for batch in batches] == expected_batches


@@ -697,7 +697,9 @@ def decaying(start, stop, decay):
         curr -= decay


-def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
+def minibatch_by_words(
+    examples, size, count_words=len, tolerance=0.2, discard_oversize=False
+):
     """Create minibatches of roughly a given number of words. If any examples
     are longer than the specified batch length, they will appear in a batch by
     themselves, or be discarded if discard_oversize=True."""
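The reflowed signature and its docstring describe the contract the two minibatch_by_words tests exercise: batches target `size` words, may exceed it by up to `size * tolerance`, and oversized examples either land in a batch of their own or are dropped when `discard_oversize=True`. A hedged sketch of the call pattern; building the docs from word counts is an assumption about how the test constructs its inputs, and the batch sizes are printed rather than asserted:

from spacy.gold import Example
from spacy.tokens import Doc
from spacy.util import minibatch_by_words
from spacy.vocab import Vocab

vocab = Vocab()
# Assumed input construction: one Doc per requested word count.
doc_sizes = [400, 400, 199, 3, 200]
docs = [Doc(vocab, words=["word"] * size) for size in doc_sizes]
examples = [Example(doc=doc) for doc in docs]

batches = list(
    minibatch_by_words(
        examples=examples, size=1000, tolerance=0.2, discard_oversize=True
    )
)
# Each batch holds roughly 1000 words, with up to 1000 * 0.2 = 200 words of slack.
print([len(batch) for batch in batches])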