Merge remote-tracking branch 'upstream/develop' into fix/cli-debug

# Conflicts:
#	pyproject.toml
#	requirements.txt
#	setup.cfg

commit 6f4e46ee93
@@ -16,7 +16,7 @@ from bin.ud import conll17_ud_eval
 from spacy.tokens import Token, Doc
 from spacy.gold import Example
 from spacy.util import compounding, minibatch, minibatch_by_words
-from spacy.syntax.nonproj import projectivize
+from spacy.pipeline._parser_internals.nonproj import projectivize
 from spacy.matcher import Matcher
 from spacy import displacy
 from collections import defaultdict
@@ -13,7 +13,7 @@ import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
 from spacy.gold import Example
-from spacy.syntax.nonproj import projectivize
+from spacy.pipeline._parser_internals.nonproj import projectivize
 from collections import defaultdict
 from spacy.matcher import Matcher

setup.py (16 changed lines)

@@ -31,6 +31,7 @@ MOD_NAMES = [
     "spacy.vocab",
     "spacy.attrs",
     "spacy.kb",
+    "spacy.ml.parser_model",
     "spacy.morphology",
     "spacy.pipeline.dep_parser",
     "spacy.pipeline.morphologizer",
@@ -40,14 +41,14 @@ MOD_NAMES = [
     "spacy.pipeline.sentencizer",
     "spacy.pipeline.senter",
     "spacy.pipeline.tagger",
-    "spacy.syntax.stateclass",
-    "spacy.syntax._state",
+    "spacy.pipeline.transition_parser",
+    "spacy.pipeline._parser_internals.arc_eager",
+    "spacy.pipeline._parser_internals.ner",
+    "spacy.pipeline._parser_internals.nonproj",
+    "spacy.pipeline._parser_internals._state",
+    "spacy.pipeline._parser_internals.stateclass",
+    "spacy.pipeline._parser_internals.transition_system",
     "spacy.tokenizer",
-    "spacy.syntax.nn_parser",
-    "spacy.syntax._parser_model",
-    "spacy.syntax.nonproj",
-    "spacy.syntax.transition_system",
-    "spacy.syntax.arc_eager",
     "spacy.gold.gold_io",
     "spacy.tokens.doc",
     "spacy.tokens.span",
@@ -57,7 +58,6 @@ MOD_NAMES = [
     "spacy.matcher.matcher",
     "spacy.matcher.phrasematcher",
     "spacy.matcher.dependencymatcher",
-    "spacy.syntax.ner",
     "spacy.symbols",
     "spacy.vectors",
 ]
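Note: the module moves above change the public import path for the nonproj
helpers. A hedged sketch of the new path (toy heads/labels, assuming this
branch and the historical (heads, labels) signature of projectivize):

    from spacy.pipeline._parser_internals.nonproj import projectivize

    heads = [1, 1, 4, 4, 1]  # toy head indices for a 5-token sentence
    labels = ["nsubj", "ROOT", "det", "dobj", "punct"]
    proj_heads, deco_labels = projectivize(heads, labels)
    print(proj_heads, deco_labels)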
@@ -10,7 +10,7 @@ from thinc.api import Config
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
 from ..gold import Corpus, Example
-from ..syntax import nonproj
+from ..pipeline._parser_internals import nonproj
 from ..language import Language
 from .. import util

@@ -63,8 +63,6 @@ class Warnings:
             "have the spacy-lookups-data package installed.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
-    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
-            "previous components in the pipeline declare that they assign it.")
     W026 = ("Unable to set all sentence boundaries from dependency parses.")
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
@@ -10,7 +10,7 @@ from .align import Alignment
 from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 from .iob_utils import spans_from_biluo_tags
 from ..errors import Errors, Warnings
-from ..syntax import nonproj
+from ..pipeline._parser_internals import nonproj


 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
@@ -18,7 +18,7 @@ from timeit import default_timer as timer

 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
-from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .gold import Example
 from .scorer import Scorer
 from .util import create_default_optimizer, registry
@@ -37,8 +37,6 @@ from . import util
 from . import about


-# TODO: integrate pipeline analyis
-ENABLE_PIPELINE_ANALYSIS = False
 # This is the base config will all settings (training etc.)
 DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
 DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
@@ -522,6 +520,25 @@ class Language:
             return add_component(func)
         return add_component

+    def analyze_pipes(
+        self,
+        *,
+        keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
+        pretty: bool = False,
+    ) -> Optional[Dict[str, Any]]:
+        """Analyze the current pipeline components, print a summary of what
+        they assign or require and check that all requirements are met.
+
+        keys (List[str]): The meta values to display in the table. Corresponds
+            to values in FactoryMeta, defined by @Language.factory decorator.
+        pretty (bool): Pretty-print the results.
+        RETURNS (dict): The data.
+        """
+        analysis = analyze_pipes(self, keys=keys)
+        if pretty:
+            print_pipe_analysis(analysis, keys=keys)
+        return analysis
+
     def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
         """Get a pipeline component for a given component name.

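Note: the new method wraps the module-level helpers from pipe_analysis. A
minimal usage sketch (assuming a spaCy build from this branch):

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("tagger")
    nlp.add_pipe("parser")
    analysis = nlp.analyze_pipes(pretty=True)  # prints a table, returns the data
    print(analysis["problems"])  # unmet "requires" attributes per component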
@@ -666,8 +683,6 @@ class Language:
         pipe_index = self._get_pipe_index(before, after, first, last)
         self._pipe_meta[name] = self.get_factory_meta(factory_name)
         self.pipeline.insert(pipe_index, (name, pipe_component))
-        if ENABLE_PIPELINE_ANALYSIS:
-            analyze_pipes(self, name, pipe_index)
         return pipe_component

     def _get_pipe_index(
@@ -758,8 +773,6 @@ class Language:
             self.add_pipe(factory_name, name=name)
         else:
             self.add_pipe(factory_name, name=name, before=pipe_index)
-        if ENABLE_PIPELINE_ANALYSIS:
-            analyze_all_pipes(self)

     def rename_pipe(self, old_name: str, new_name: str) -> None:
         """Rename a pipeline component.
@@ -793,8 +806,6 @@ class Language:
         # because factory may be used for something else
         self._pipe_meta.pop(name)
         self._pipe_configs.pop(name)
-        if ENABLE_PIPELINE_ANALYSIS:
-            analyze_all_pipes(self)
         return removed

     def __call__(
@@ -1099,6 +1110,7 @@ class Language:
         batch_size: int = 256,
         scorer: Optional[Scorer] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        scorer_cfg: Optional[Dict[str, Any]] = None,
     ) -> Dict[str, Union[float, dict]]:
         """Evaluate a model's pipeline components.

@@ -1109,6 +1121,8 @@ class Language:
             will be created.
         component_cfg (dict): An optional dictionary with extra keyword
             arguments for specific components.
+        scorer_cfg (dict): An optional dictionary with extra keyword arguments
+            for the scorer.
         RETURNS (Scorer): The scorer containing the evaluation results.

         DOCS: https://spacy.io/api/language#evaluate
@@ -1126,8 +1140,10 @@ class Language:
             raise TypeError(err)
         if component_cfg is None:
             component_cfg = {}
+        if scorer_cfg is None:
+            scorer_cfg = {}
         if scorer is None:
-            kwargs = component_cfg.get("scorer", {})
+            kwargs = dict(scorer_cfg)
             kwargs.setdefault("verbose", verbose)
             kwargs.setdefault("nlp", self)
             scorer = Scorer(**kwargs)
@@ -1136,9 +1152,9 @@ class Language:
         start_time = timer()
         # tokenize the texts only for timing purposes
         if not hasattr(self.tokenizer, "pipe"):
-            _ = [self.tokenizer(text) for text in texts]
+            _ = [self.tokenizer(text) for text in texts]  # noqa: F841
         else:
-            _ = list(self.tokenizer.pipe(texts))
+            _ = list(self.tokenizer.pipe(texts))  # noqa: F841
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
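Note: with the scorer_cfg keyword added above, extra Scorer options pass
straight through evaluate(). A hedged sketch (Example.from_dict usage assumed
from this development line):

    import spacy
    from spacy.gold import Example

    nlp = spacy.blank("en")
    doc = nlp.make_doc("A short sentence.")
    examples = [Example.from_dict(doc, {"words": ["A", "short", "sentence", "."]})]
    # the contents of scorer_cfg end up in Scorer(**scorer_cfg)
    scores = nlp.evaluate(examples, scorer_cfg={"verbose": False})
    print(scores)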
@@ -1,6 +1,7 @@
 from typing import List
 from thinc.api import Model
 from thinc.types import Floats2d

 from ..tokens import Doc


@@ -15,14 +16,14 @@ def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
     )


-def init(model, X=None, Y=None):
+def init(model: Model, X=None, Y=None):
     vectors_table = model.ops.alloc3f(
         model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
     )
     model.set_param("E", vectors_table)


-def forward(model, docs, is_train):
+def forward(model: Model, docs: List[Doc], is_train: bool):
     if docs is None:
         return []
     ids = []
@@ -14,7 +14,7 @@ def IOB() -> Model[Padded, Padded]:
     )


-def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
+def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
     if X is not None and Y is not None:
         if X.data.shape != Y.data.shape:
             # TODO: Fix error
@@ -4,14 +4,14 @@ from thinc.api import Model
 from ..attrs import LOWER


-def extract_ngrams(ngram_size, attr=LOWER) -> Model:
+def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
     model = Model("extract_ngrams", forward)
     model.attrs["ngram_size"] = ngram_size
     model.attrs["attr"] = attr
     return model


-def forward(model, docs, is_train: bool):
+def forward(model: Model, docs, is_train: bool):
     batch_keys = []
     batch_vals = []
     for doc in docs:
@@ -1,5 +1,4 @@
-from pathlib import Path
+from typing import Optional

 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear

@@ -9,7 +8,7 @@ from ...vocab import Vocab


 @registry.architectures.register("spacy.EntityLinker.v1")
-def build_nel_encoder(tok2vec, nO=None):
+def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
     with Model.define_operators({">>": chain, "**": clone}):
         token_width = tok2vec.get_dim("nO")
         output_layer = Linear(nO=nO, nI=token_width)
@@ -26,7 +25,7 @@ def build_nel_encoder(tok2vec, nO=None):


 @registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(vocab_path, kb_path) -> KnowledgeBase:
+def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
     vocab = Vocab().from_disk(vocab_path)
     kb = KnowledgeBase(vocab=vocab)
     kb.load_bulk(kb_path)
@@ -1,10 +1,20 @@
+from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
 import numpy

 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array

+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from ...vocab import Vocab  # noqa: F401
+    from ...tokens import Doc  # noqa: F401

-def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
+def build_multi_task_model(
+    tok2vec: Model,
+    maxout_pieces: int,
+    token_vector_width: int,
+    nO: Optional[int] = None,
+) -> Model:
     softmax = Softmax(nO=nO, nI=token_vector_width * 2)
     model = chain(
         tok2vec,
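Note: the TYPE_CHECKING guard added above is the standard pattern for using
annotations without runtime circular imports. A minimal standalone sketch:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # only evaluated by static type checkers, never at runtime
        from spacy.vocab import Vocab

    def count_lexemes(vocab: "Vocab") -> int:
        # the string annotation keeps this valid even though Vocab
        # is not imported at runtime
        return len(vocab)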
@@ -22,7 +32,13 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
     return model


-def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None):
+def build_cloze_multi_task_model(
+    vocab: "Vocab",
+    tok2vec: Model,
+    maxout_pieces: int,
+    hidden_size: int,
+    nO: Optional[int] = None,
+) -> Model:
     # nO = vocab.vectors.data.shape[1]
     output_layer = chain(
         list2array(),
@@ -43,24 +59,24 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None):


 def build_cloze_characters_multi_task_model(
-    vocab, tok2vec, maxout_pieces, hidden_size, nr_char
-):
+    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int
+) -> Model:
     output_layer = chain(
         list2array(),
         Maxout(hidden_size, nP=maxout_pieces),
         LayerNorm(nI=hidden_size),
         MultiSoftmax([256] * nr_char, nI=hidden_size),
     )

     model = build_masked_language_model(vocab, chain(tok2vec, output_layer))
     model.set_ref("tok2vec", tok2vec)
     model.set_ref("output_layer", output_layer)
     return model


-def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
+def build_masked_language_model(
+    vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
+) -> Model:
     """Convert a model into a BERT-style masked language model"""

     random_words = _RandomWords(vocab)

     def mlm_forward(model, docs, is_train):
@@ -74,7 +90,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):

         return output, mlm_backward

-    def mlm_initialize(model, X=None, Y=None):
+    def mlm_initialize(model: Model, X=None, Y=None):
         wrapped = model.layers[0]
         wrapped.initialize(X=X, Y=Y)
         for dim in wrapped.dim_names:
@@ -90,12 +106,11 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
         dims={dim: None for dim in wrapped_model.dim_names},
     )
     mlm_model.set_ref("wrapped", wrapped_model)

     return mlm_model


 class _RandomWords:
-    def __init__(self, vocab):
+    def __init__(self, vocab: "Vocab") -> None:
         self.words = [lex.text for lex in vocab if lex.prob != 0.0]
         self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
         self.words = self.words[:10000]
@@ -104,7 +119,7 @@ class _RandomWords:
         self.probs /= self.probs.sum()
         self._cache = []

-    def next(self):
+    def next(self) -> str:
         if not self._cache:
             self._cache.extend(
                 numpy.random.choice(len(self.words), 10000, p=self.probs)
@@ -113,9 +128,11 @@ class _RandomWords:
         return self.words[index]


-def _apply_mask(docs, random_words, mask_prob=0.15):
+def _apply_mask(
+    docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15
+) -> Tuple[numpy.ndarray, List["Doc"]]:
     # This needs to be here to avoid circular imports
-    from ...tokens import Doc
+    from ...tokens import Doc  # noqa: F811

     N = sum(len(doc) for doc in docs)
     mask = numpy.random.uniform(0.0, 1.0, (N,))
@@ -141,7 +158,7 @@ def _apply_mask(docs, random_words, mask_prob=0.15):
     return mask, masked_docs


-def _replace_word(word, random_words, mask="[MASK]"):
+def _replace_word(word: str, random_words: _RandomWords, mask: str = "[MASK]") -> str:
     roll = numpy.random.random()
     if roll < 0.8:
         return mask
@@ -1,6 +1,5 @@
-from pydantic import StrictInt
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array
-from thinc.api import LayerNorm, Maxout, Mish
+from typing import Optional
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops

 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
@@ -10,16 +9,15 @@ from ..tb_framework import TransitionModel
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
     tok2vec: Model,
-    nr_feature_tokens: StrictInt,
-    hidden_width: StrictInt,
-    maxout_pieces: StrictInt,
-    use_upper=True,
-    nO=None,
-):
+    nr_feature_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    use_upper: bool = True,
+    nO: Optional[int] = None,
+) -> Model:
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
     tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
     tok2vec.set_dim("nO", hidden_width)

     lower = PrecomputableAffine(
         nO=hidden_width if use_upper else nO,
         nF=nr_feature_tokens,
|
@ -26,7 +26,6 @@ def BiluoTagger(
|
||||||
with_array(softmax_activation()),
|
with_array(softmax_activation()),
|
||||||
padded2list(),
|
padded2list(),
|
||||||
)
|
)
|
||||||
|
|
||||||
return Model(
|
return Model(
|
||||||
"biluo-tagger",
|
"biluo-tagger",
|
||||||
forward,
|
forward,
|
||||||
|
@ -52,7 +51,6 @@ def IOBTagger(
|
||||||
with_array(softmax_activation()),
|
with_array(softmax_activation()),
|
||||||
padded2list(),
|
padded2list(),
|
||||||
)
|
)
|
||||||
|
|
||||||
return Model(
|
return Model(
|
||||||
"iob-tagger",
|
"iob-tagger",
|
||||||
forward,
|
forward,
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
|
from typing import Optional
|
||||||
from thinc.api import zero_init, with_array, Softmax, chain, Model
|
from thinc.api import zero_init, with_array, Softmax, chain, Model
|
||||||
|
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Tagger.v1")
|
@registry.architectures.register("spacy.Tagger.v1")
|
||||||
def build_tagger_model(tok2vec, nO=None) -> Model:
|
def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model:
|
||||||
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
|
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
|
||||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||||
output_layer = Softmax(nO, t2v_width, init_W=zero_init)
|
output_layer = Softmax(nO, t2v_width, init_W=zero_init)
|
||||||
|
|
|
@@ -2,10 +2,9 @@ from typing import Optional
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
-from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
+from thinc.api import HashEmbed, with_array, with_cpu, uniqued
 from thinc.api import Relu, residual, expand_window, FeatureExtractor

-from ... import util
 from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
@@ -40,7 +39,12 @@ def build_simple_cnn_text_classifier(


 @registry.architectures.register("spacy.TextCatBOW.v1")
-def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):
+def build_bow_text_classifier(
+    exclusive_classes: bool,
+    ngram_size: int,
+    no_output_layer: bool,
+    nO: Optional[int] = None,
+) -> Model:
     with Model.define_operators({">>": chain}):
         sparse_linear = SparseLinear(nO)
         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
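Note: architectures registered this way can be resolved by name through the
registry. A hedged sketch (catalogue-style lookup, assuming this branch):

    from spacy.util import registry

    make_textcat = registry.architectures.get("spacy.TextCatBOW.v1")
    model = make_textcat(
        exclusive_classes=True, ngram_size=1, no_output_layer=False, nO=2
    )
    print(model.name)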
@@ -55,16 +59,16 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):

 @registry.architectures.register("spacy.TextCatEnsemble.v1")
 def build_text_classifier(
-    width,
-    embed_size,
-    pretrained_vectors,
-    exclusive_classes,
-    ngram_size,
-    window_size,
-    conv_depth,
-    dropout,
-    nO=None,
-):
+    width: int,
+    embed_size: int,
+    pretrained_vectors: Optional[bool],
+    exclusive_classes: bool,
+    ngram_size: int,
+    window_size: int,
+    conv_depth: int,
+    dropout: Optional[float],
+    nO: Optional[int] = None,
+) -> Model:
     cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
         lower = HashEmbed(
@@ -91,7 +95,6 @@ def build_text_classifier(
             dropout=dropout,
             seed=13,
         )

         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
         trained_vectors = FeatureExtractor(cols) >> with_array(
             uniqued(
@@ -100,7 +103,6 @@ def build_text_classifier(
                 column=cols.index(ORTH),
             )
         )

         if pretrained_vectors:
             static_vectors = StaticVectors(width)
             vector_layer = trained_vectors | static_vectors
@@ -152,7 +154,12 @@ def build_text_classifier(


 @registry.architectures.register("spacy.TextCatLowData.v1")
-def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None):
+def build_text_classifier_lowdata(
+    width: int,
+    pretrained_vectors: Optional[bool],
+    dropout: Optional[float],
+    nO: Optional[int] = None,
+) -> Model:
     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
     with Model.define_operators({">>": chain, "**": clone}):
         model = (
@@ -6,16 +6,15 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 from thinc.types import Floats2d

 from ...tokens import Doc
-from ... import util
 from ...util import registry
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE


 @registry.architectures.register("spacy.Tok2VecListener.v1")
-def tok2vec_listener_v1(width, upstream="*"):
+def tok2vec_listener_v1(width: int, upstream: str = "*"):
     tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec

@@ -45,10 +44,11 @@ def build_hash_embed_cnn_tok2vec(
             width=width,
             depth=depth,
             window_size=window_size,
-            maxout_pieces=maxout_pieces
-        )
+            maxout_pieces=maxout_pieces,
+        ),
     )


 @registry.architectures.register("spacy.Tok2Vec.v1")
 def build_Tok2Vec_model(
     embed: Model[List[Doc], List[Floats2d]],
@@ -68,7 +68,6 @@ def MultiHashEmbed(
     width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
 ):
     cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]

     seed = 7

     def make_hash_embed(feature):
|
||||||
chain(
|
chain(
|
||||||
FeatureExtractor([NORM]),
|
FeatureExtractor([NORM]),
|
||||||
list2ragged(),
|
list2ragged(),
|
||||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5))
|
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||||
)
|
),
|
||||||
),
|
),
|
||||||
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
|
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
|
||||||
ragged2list()
|
ragged2list(),
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
@@ -155,12 +154,7 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth:
 def MishWindowEncoder(width, window_size, depth):
     cnn = chain(
         expand_window(window_size=window_size),
-        Mish(
-            nO=width,
-            nI=width * ((window_size * 2) + 1),
-            dropout=0.0,
-            normalize=True
-        ),
+        Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
     )
     model = clone(residual(cnn), depth)
     model.set_dim("nO", width)
@@ -1,8 +1,6 @@
 from libc.string cimport memset, memcpy
-from libc.stdlib cimport calloc, free, realloc
-from ..typedefs cimport weight_t, class_t, hash_t
+from ..typedefs cimport weight_t, hash_t
+from ..pipeline._parser_internals._state cimport StateC

-from ._state cimport StateC


 cdef struct SizesC:
@@ -1,29 +1,18 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-cimport cython.parallel
 cimport numpy as np
 from libc.math cimport exp
-from libcpp.vector cimport vector
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport calloc, free, realloc
-from cymem.cymem cimport Pool
-from thinc.extra.search cimport Beam
 from thinc.backends.linalg cimport Vec, VecVec
 cimport blis.cy

 import numpy
 import numpy.random
-from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop
+from thinc.api import Model, CupyOps, NumpyOps

-from ..typedefs cimport weight_t, class_t, hash_t
-from ..tokens.doc cimport Doc
-from .stateclass cimport StateClass
-from .transition_system cimport Transition
-
-from ..compat import copy_array
-from ..errors import Errors, TempErrors
-from ..util import create_default_optimizer
 from .. import util
-from . import nonproj
+from ..typedefs cimport weight_t, class_t, hash_t
+from ..pipeline._parser_internals.stateclass cimport StateClass


 cdef WeightsC get_c_weights(model) except *:
@@ -1,5 +1,5 @@
 from thinc.api import Model, noop, use_ops, Linear
-from ..syntax._parser_model import ParserStepModel
+from .parser_model import ParserStepModel


 def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
@@ -1,9 +1,8 @@
 from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
-from wasabi import Printer
-import warnings
+from wasabi import msg

 from .tokens import Doc, Token, Span
-from .errors import Errors, Warnings
+from .errors import Errors
 from .util import dot_to_dict

 if TYPE_CHECKING:
|
||||||
from .language import Language # noqa: F401
|
from .language import Language # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
def analyze_pipes(
|
DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"]
|
||||||
nlp: "Language", name: str, index: int, warn: bool = True
|
|
||||||
) -> List[str]:
|
|
||||||
"""Analyze a pipeline component with respect to its position in the current
|
|
||||||
pipeline and the other components. Will check whether requirements are
|
|
||||||
fulfilled (e.g. if previous components assign the attributes).
|
|
||||||
|
|
||||||
nlp (Language): The current nlp object.
|
|
||||||
name (str): The name of the pipeline component to analyze.
|
|
||||||
index (int): The index of the component in the pipeline.
|
|
||||||
warn (bool): Show user warning if problem is found.
|
|
||||||
RETURNS (List[str]): The problems found for the given pipeline component.
|
|
||||||
"""
|
|
||||||
assert nlp.pipeline[index][0] == name
|
|
||||||
prev_pipes = nlp.pipeline[:index]
|
|
||||||
meta = nlp.get_pipe_meta(name)
|
|
||||||
requires = {annot: False for annot in meta.requires}
|
|
||||||
if requires:
|
|
||||||
for prev_name, prev_pipe in prev_pipes:
|
|
||||||
prev_meta = nlp.get_pipe_meta(prev_name)
|
|
||||||
for annot in prev_meta.assigns:
|
|
||||||
requires[annot] = True
|
|
||||||
problems = []
|
|
||||||
for annot, fulfilled in requires.items():
|
|
||||||
if not fulfilled:
|
|
||||||
problems.append(annot)
|
|
||||||
if warn:
|
|
||||||
warnings.warn(Warnings.W025.format(name=name, attr=annot))
|
|
||||||
return problems
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
|
|
||||||
"""Analyze all pipes in the pipeline in order.
|
|
||||||
|
|
||||||
nlp (Language): The current nlp object.
|
|
||||||
warn (bool): Show user warning if problem is found.
|
|
||||||
RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
|
|
||||||
"""
|
|
||||||
problems = {}
|
|
||||||
for i, name in enumerate(nlp.pipe_names):
|
|
||||||
problems[name] = analyze_pipes(nlp, name, i, warn=warn)
|
|
||||||
return problems
|
|
||||||
|
|
||||||
|
|
||||||
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
|
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
|
||||||
|
@@ -101,89 +59,77 @@ def validate_attrs(values: Iterable[str]) -> Iterable[str]:
     return values


-def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
-    assert feature in ["assigns", "requires"]
-    result = []
+def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
+    """Check which components in the pipeline assign or require an attribute.
+
+    nlp (Language): The current nlp object.
+    attr (str): The attribute, e.g. "doc.tensor".
+    RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires",
+        mapped to a list of component names.
+    """
+    result = {"assigns": [], "requires": []}
     for pipe_name in nlp.pipe_names:
         meta = nlp.get_pipe_meta(pipe_name)
-        pipe_assigns = getattr(meta, feature, [])
-        if attr in pipe_assigns:
-            result.append(pipe_name)
+        if attr in meta.assigns:
+            result["assigns"].append(pipe_name)
+        if attr in meta.requires:
+            result["requires"].append(pipe_name)
     return result


-def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
-    """Get all pipeline components that assign an attr, e.g. "doc.tensor".
-
-    pipeline (Language): The current nlp object.
-    attr (str): The attribute to check.
-    RETURNS (List[str]): Names of components that require the attr.
-    """
-    return _get_feature_for_attr(nlp, attr, "assigns")
-
-
-def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
-    """Get all pipeline components that require an attr, e.g. "doc.tensor".
-
-    pipeline (Language): The current nlp object.
-    attr (str): The attribute to check.
-    RETURNS (List[str]): Names of components that require the attr.
-    """
-    return _get_feature_for_attr(nlp, attr, "requires")
-
-
-def print_summary(
-    nlp: "Language", pretty: bool = True, no_print: bool = False
-) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
+def analyze_pipes(
+    nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
+) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
     """Print a formatted summary for the current nlp object's pipeline. Shows
     a table with the pipeline components and why they assign and require, as
     well as any problems if available.

     nlp (Language): The nlp object.
-    pretty (bool): Pretty-print the results (color etc).
-    no_print (bool): Don't print anything, just return the data.
-    RETURNS (dict): A dict with "overview" and "problems".
+    keys (List[str]): The meta keys to show in the table.
+    RETURNS (dict): A dict with "summary" and "problems".
     """
-    msg = Printer(pretty=pretty, no_print=no_print)
-    overview = []
-    problems = {}
+    result = {"summary": {}, "problems": {}}
+    all_attrs = set()
     for i, name in enumerate(nlp.pipe_names):
         meta = nlp.get_pipe_meta(name)
-        overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
-        problems[name] = analyze_pipes(nlp, name, i, warn=False)
+        all_attrs.update(meta.assigns)
+        all_attrs.update(meta.requires)
+        result["summary"][name] = {key: getattr(meta, key, None) for key in keys}
+        prev_pipes = nlp.pipeline[:i]
+        requires = {annot: False for annot in meta.requires}
+        if requires:
+            for prev_name, prev_pipe in prev_pipes:
+                prev_meta = nlp.get_pipe_meta(prev_name)
+                for annot in prev_meta.assigns:
+                    requires[annot] = True
+        result["problems"][name] = []
+        for annot, fulfilled in requires.items():
+            if not fulfilled:
+                result["problems"][name].append(annot)
+    result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs}
+    return result
+
+
+def print_pipe_analysis(
+    analysis: Dict[str, Union[List[str], Dict[str, List[str]]]],
+    *,
+    keys: List[str] = DEFAULT_KEYS,
+) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
+    """Print a formatted version of the pipe analysis produced by analyze_pipes.
+
+    analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
+    keys (List[str]): The meta keys to show in the table.
+    """
     msg.divider("Pipeline Overview")
-    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
-    msg.table(overview, header=header, divider=True, multiline=True)
-    n_problems = sum(len(p) for p in problems.values())
-    if any(p for p in problems.values()):
+    header = ["#", "Component", *[key.capitalize() for key in keys]]
+    summary = analysis["summary"].items()
+    body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)]
+    msg.table(body, header=header, divider=True, multiline=True)
+    n_problems = sum(len(p) for p in analysis["problems"].values())
+    if any(p for p in analysis["problems"].values()):
         msg.divider(f"Problems ({n_problems})")
-        for name, problem in problems.items():
+        for name, problem in analysis["problems"].items():
             if problem:
                 msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
     else:
         msg.good("No problems found.")
-    if no_print:
-        return {"overview": overview, "problems": problems}
-
-
-def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
-    """Count how many subsequent components require an annotation set by each
-    component in the pipeline.
-
-    nlp (Language): The current nlp object.
-    RETURNS (List[int]): The interdependency counts.
-    """
-    pipe_assigns = []
-    pipe_requires = []
-    for name in nlp.pipe_names:
-        meta = nlp.get_pipe_meta(name)
-        pipe_assigns.append(set(meta.assigns))
-        pipe_requires.append(set(meta.requires))
-    counts = []
-    for i, assigns in enumerate(pipe_assigns):
-        count = 0
-        for requires in pipe_requires[i + 1 :]:
-            if assigns.intersection(requires):
-                count += 1
-        counts.append(count)
-    return counts
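Note: a minimal usage sketch of the refactored helpers (names as in this diff,
assuming a spaCy build from this branch):

    import spacy
    from spacy.pipe_analysis import analyze_pipes, print_pipe_analysis, get_attr_info

    nlp = spacy.blank("en")
    nlp.add_pipe("tagger")
    analysis = analyze_pipes(nlp)   # {"summary": ..., "problems": ..., "attrs": ...}
    print(get_attr_info(nlp, "token.tag"))  # which components assign/require it
    print_pipe_analysis(analysis)   # wasabi table plus any unmet requirements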
@@ -1,15 +1,14 @@
-from libc.string cimport memcpy, memset, memmove
-from libc.stdlib cimport malloc, calloc, free
+from libc.string cimport memcpy, memset
+from libc.stdlib cimport calloc, free
 from libc.stdint cimport uint32_t, uint64_t
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64

-from ..vocab cimport EMPTY_LEXEME
-from ..structs cimport TokenC, SpanC
-from ..lexeme cimport Lexeme
-from ..symbols cimport punct
-from ..attrs cimport IS_SPACE
-from ..typedefs cimport attr_t
+from ...vocab cimport EMPTY_LEXEME
+from ...structs cimport TokenC, SpanC
+from ...lexeme cimport Lexeme
+from ...attrs cimport IS_SPACE
+from ...typedefs cimport attr_t


 cdef inline bint is_space_token(const TokenC* token) nogil:
@@ -1,8 +1,6 @@
-from cymem.cymem cimport Pool

 from .stateclass cimport StateClass
-from ..typedefs cimport weight_t, attr_t
-from .transition_system cimport TransitionSystem, Transition
+from ...typedefs cimport weight_t, attr_t
+from .transition_system cimport Transition, TransitionSystem


 cdef class ArcEager(TransitionSystem):
@@ -1,24 +1,17 @@
 # cython: profile=True, cdivision=True, infer_types=True
-from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool, Address
 from libc.stdint cimport int32_t

 from collections import defaultdict, Counter
-import json

-from ..typedefs cimport hash_t, attr_t
-from ..strings cimport hash_string
-from ..structs cimport TokenC
-from ..tokens.doc cimport Doc, set_children_from_heads
+from ...typedefs cimport hash_t, attr_t
+from ...strings cimport hash_string
+from ...structs cimport TokenC
+from ...tokens.doc cimport Doc, set_children_from_heads
+from ...gold.example cimport Example
+from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC
-from .transition_system cimport move_cost_func_t, label_cost_func_t
-from ..gold.example cimport Example
-
-from ..errors import Errors
-from .nonproj import is_nonproj_tree
-from . import nonproj


 # Calculate cost as gold/not gold. We don't use scalar value anyway.
 cdef int BINARY_COSTS = 1
@@ -1,6 +1,4 @@
 from .transition_system cimport TransitionSystem
-from .transition_system cimport Transition
-from ..typedefs cimport attr_t


 cdef class BiluoPushDown(TransitionSystem):
@@ -2,17 +2,14 @@ from collections import Counter
 from libc.stdint cimport int32_t
 from cymem.cymem cimport Pool

-from ..typedefs cimport weight_t
+from ...typedefs cimport weight_t, attr_t
+from ...lexeme cimport Lexeme
+from ...attrs cimport IS_SPACE
+from ...gold.example cimport Example
+from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC
-from .transition_system cimport Transition
-from .transition_system cimport do_func_t
-from ..lexeme cimport Lexeme
-from ..attrs cimport IS_SPACE
-from ..gold.iob_utils import biluo_tags_from_offsets
-from ..gold.example cimport Example
-
-from ..errors import Errors
+from .transition_system cimport Transition, do_func_t


 cdef enum:
@@ -5,9 +5,9 @@ scheme.
 """
 from copy import copy

-from ..tokens.doc cimport Doc, set_children_from_heads
+from ...tokens.doc cimport Doc, set_children_from_heads

-from ..errors import Errors
+from ...errors import Errors


 DELIMITER = '||'
@@ -1,12 +1,8 @@
-from libc.string cimport memcpy, memset

 from cymem.cymem cimport Pool
-cimport cython

-from ..structs cimport TokenC, SpanC
-from ..typedefs cimport attr_t
+from ...structs cimport TokenC, SpanC
+from ...typedefs cimport attr_t

-from ..vocab cimport EMPTY_LEXEME
 from ._state cimport StateC

@@ -1,7 +1,7 @@
 # cython: infer_types=True
 import numpy

-from ..tokens.doc cimport Doc
+from ...tokens.doc cimport Doc


 cdef class StateClass:
@@ -1,11 +1,11 @@
 from cymem.cymem cimport Pool

-from ..typedefs cimport attr_t, weight_t
-from ..structs cimport TokenC
-from ..strings cimport StringStore
+from ...typedefs cimport attr_t, weight_t
+from ...structs cimport TokenC
+from ...strings cimport StringStore
+from ...gold.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC
-from ..gold.example cimport Example


 cdef struct Transition:
@@ -1,19 +1,17 @@
 # cython: infer_types=True
 from __future__ import print_function
-from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool

 from collections import Counter
 import srsly

-from ..typedefs cimport weight_t
-from ..tokens.doc cimport Doc
-from ..structs cimport TokenC
+from ...typedefs cimport weight_t, attr_t
+from ...tokens.doc cimport Doc
+from ...structs cimport TokenC
 from .stateclass cimport StateClass
-from ..typedefs cimport attr_t

-from ..errors import Errors
-from .. import util
+from ...errors import Errors
+from ... import util


 cdef weight_t MIN_SCORE = -90000
@@ -1,13 +1,13 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Optional, Iterable
-from thinc.api import CosineDistance, to_categorical, get_array_module, Model, Config
+from thinc.api import Model, Config

-from ..syntax.nn_parser cimport Parser
-from ..syntax.arc_eager cimport ArcEager
+from .transition_parser cimport Parser
+from ._parser_internals.arc_eager cimport ArcEager

 from .functions import merge_subtokens
 from ..language import Language
-from ..syntax import nonproj
+from ._parser_internals import nonproj
 from ..scorer import Scorer

@@ -34,7 +34,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]

 @Language.factory(
     "parser",
-    assigns=["token.dep", "token.is_sent_start", "doc.sents"],
+    assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
     default_config={
         "moves": None,
         "update_with_oracle_cut_size": 100,
@@ -120,7 +120,8 @@ cdef class DependencyParser(Parser):
             return dep
         results = {}
         results.update(Scorer.score_spans(examples, "sents", **kwargs))
-        results.update(Scorer.score_deps(examples, "dep", getter=dep_getter,
-                                         ignore_labels=("p", "punct"), **kwargs))
+        kwargs.setdefault("getter", dep_getter)
+        kwargs.setdefault("ignore_labels", ("p", "punct"))
+        results.update(Scorer.score_deps(examples, "dep", **kwargs))
         del results["sents_per_type"]
         return results
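The switch to `kwargs.setdefault` makes the parser's scoring defaults overridable from user config: a caller-supplied value wins, and the component's value is only a fallback. A minimal, self-contained sketch of the pattern (values are illustrative):

```python
def score(examples, **kwargs):
    # Component defaults apply only when the caller hasn't set them.
    kwargs.setdefault("ignore_labels", ("p", "punct"))
    return kwargs

print(score([]))                            # {'ignore_labels': ('p', 'punct')}
print(score([], ignore_labels=("punct",)))  # {'ignore_labels': ('punct',)}
```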
@@ -222,9 +222,9 @@ class EntityLinker(Pipe):
         set_dropout_rate(self.model, drop)
         if not sentence_docs:
             warnings.warn(Warnings.W093.format(name="Entity Linker"))
-            return 0.0
+            return losses
         sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
-        loss, d_scores = self.get_similarity_loss(
+        loss, d_scores = self.get_loss(
             sentence_encodings=sentence_encodings, examples=examples
         )
         bp_context(d_scores)
@@ -235,7 +235,7 @@ class EntityLinker(Pipe):
         self.set_annotations(docs, predictions)
         return losses

-    def get_similarity_loss(self, examples: Iterable[Example], sentence_encodings):
+    def get_loss(self, examples: Iterable[Example], sentence_encodings):
         entity_encodings = []
         for eg in examples:
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
@@ -247,7 +247,7 @@ class EntityLinker(Pipe):
         entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
         if sentence_encodings.shape != entity_encodings.shape:
             err = Errors.E147.format(
-                method="get_similarity_loss", msg="gold entities do not match up"
+                method="get_loss", msg="gold entities do not match up"
             )
             raise RuntimeError(err)
         gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
@@ -337,13 +337,13 @@ class EntityLinker(Pipe):
                     final_kb_ids.append(candidates[0].entity_)
                 else:
                     random.shuffle(candidates)
-                    # this will set all prior probabilities to 0 if they should be excluded from the model
+                    # set all prior probabilities to 0 if incl_prior=False
                     prior_probs = xp.asarray(
                         [c.prior_prob for c in candidates]
                     )
                     if not self.cfg.get("incl_prior"):
                         prior_probs = xp.asarray(
-                            [0.0 for c in candidates]
+                            [0.0 for _ in candidates]
                         )
                     scores = prior_probs
                     # add in similarity from the context
@@ -1,7 +1,7 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Optional
 import numpy
-from thinc.api import CosineDistance, to_categorical, to_categorical, Model, Config
+from thinc.api import CosineDistance, to_categorical, Model, Config
 from thinc.api import set_dropout_rate

 from ..tokens.doc cimport Doc
@@ -9,7 +9,7 @@ from ..tokens.doc cimport Doc
 from .pipe import Pipe
 from .tagger import Tagger
 from ..language import Language
-from ..syntax import nonproj
+from ._parser_internals import nonproj
 from ..attrs import POS, ID
 from ..errors import Errors
@@ -219,3 +219,6 @@ class ClozeMultitask(Pipe):

         if losses is not None:
             losses[self.name] += loss
+
+    def add_label(self, label):
+        raise NotImplementedError
@@ -1,9 +1,9 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Optional, Iterable
-from thinc.api import CosineDistance, to_categorical, get_array_module, Model, Config
+from thinc.api import Model, Config

-from ..syntax.nn_parser cimport Parser
-from ..syntax.ner cimport BiluoPushDown
+from .transition_parser cimport Parser
+from ._parser_internals.ner cimport BiluoPushDown

 from ..language import Language
 from ..scorer import Scorer
2  spacy/pipeline/pipe.pxd  (new file)
@@ -0,0 +1,2 @@
+cdef class Pipe:
+    cdef public str name
@@ -8,7 +8,7 @@ from ..errors import Errors
 from .. import util


-class Pipe:
+cdef class Pipe:
     """This class is a base class and not instantiated directly. Trainable
     pipeline components like the EntityRecognizer or TextCategorizer inherit
     from it and it defines the interface that components should follow to
@@ -17,8 +17,6 @@ class Pipe:
     DOCS: https://spacy.io/api/pipe
     """

-    name = None
-
     def __init__(self, vocab, model, name, **cfg):
         """Initialize a pipeline component.
@@ -203,3 +203,9 @@ class Sentencizer(Pipe):
         cfg = srsly.read_json(path)
         self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
         return self
+
+    def get_loss(self, examples, scores):
+        raise NotImplementedError
+
+    def add_label(self, label):
+        raise NotImplementedError
@@ -109,7 +109,7 @@ class SentenceRecognizer(Tagger):
         for eg in examples:
             eg_truth = []
             for x in eg.get_aligned("sent_start"):
-                if x == None:
+                if x is None:
                     eg_truth.append(None)
                 elif x == 1:
                     eg_truth.append(labels[1])
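The `is None` fix is more than style: `==` dispatches to the operand's `__eq__`, which arbitrary objects can override, while `is` tests identity against the `None` singleton. A quick illustration of where the two diverge:

```python
import numpy

x = numpy.array([1, 2])
print(x == None)   # elementwise comparison: array([False, False]), not a clean truth test
print(x is None)   # False: unambiguous identity check
```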
@@ -131,8 +131,6 @@ class SimpleNER(Pipe):
         return losses

     def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]:
-        loss = 0
-        d_scores = []
         truths = []
         for eg in examples:
             tags = eg.get_aligned("TAG", as_string=True)
@@ -159,7 +157,6 @@ class SimpleNER(Pipe):
         if not hasattr(get_examples, "__call__"):
             gold_tuples = get_examples
             get_examples = lambda: gold_tuples
-        labels = _get_labels(get_examples())
         for label in _get_labels(get_examples()):
             self.add_label(label)
         labels = self.labels
@@ -238,8 +238,11 @@ class TextCategorizer(Pipe):

         DOCS: https://spacy.io/api/textcategorizer#rehearse
         """
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
         if self._rehearsal_model is None:
-            return
+            return losses
         try:
             docs = [eg.predicted for eg in examples]
         except AttributeError:
@@ -250,7 +253,7 @@ class TextCategorizer(Pipe):
             raise TypeError(err)
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
-            return
+            return losses
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update(docs)
         target = self._rehearsal_model(examples)
@@ -259,7 +262,6 @@ class TextCategorizer(Pipe):
         if sgd is not None:
             self.model.finish_update(sgd)
         if losses is not None:
-            losses.setdefault(self.name, 0.0)
             losses[self.name] += (gradient ** 2).sum()
         return losses
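Taken together, the `rehearse` edits here and the `return losses` in `EntityLinker.update` above converge on one contract: seed the component's entry in `losses` first, then hand the dict back on every exit path, so callers never receive `None` or a bare float. A sketch of the shape, with the training body elided:

```python
def rehearse(self, examples, *, drop=0.0, sgd=None, losses=None):
    # Seed the entry up front so every early return already carries the key.
    if losses is not None:
        losses.setdefault(self.name, 0.0)
    if self._rehearsal_model is None:
        return losses
    # ... compute the update and accumulate into losses[self.name] ...
    return losses
```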
@@ -199,6 +199,9 @@ class Tok2Vec(Pipe):
         docs = [Doc(self.vocab, words=["hello"])]
         self.model.initialize(X=docs)

+    def add_label(self, label):
+        raise NotImplementedError
+

 class Tok2VecListener(Model):
     """A layer that gets fed its answers from an upstream connection,
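These `NotImplementedError` stubs (here, and in `Sentencizer` and `ClozeMultitask` above) make the shared `Pipe` interface explicit: components without a label inventory or a trainable loss now fail loudly instead of silently accepting the call. A hypothetical probe of the contract, assuming `nlp` is any pipeline containing a `tok2vec` component:

```python
tok2vec = nlp.get_pipe("tok2vec")  # nlp: an existing pipeline (assumed)
try:
    tok2vec.add_label("NEW_LABEL")
except NotImplementedError:
    print("tok2vec has no label inventory")
```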
@@ -1,16 +1,15 @@
-from .stateclass cimport StateClass
-from .arc_eager cimport TransitionSystem
+from cymem.cymem cimport Pool
 from ..vocab cimport Vocab
-from ..tokens.doc cimport Doc
-from ..structs cimport TokenC
-from ._state cimport StateC
-from ._parser_model cimport WeightsC, ActivationsC, SizesC
+from .pipe cimport Pipe
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals._state cimport StateC
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC


-cdef class Parser:
+cdef class Parser(Pipe):
     cdef readonly Vocab vocab
     cdef public object model
-    cdef public str name
     cdef public object _rehearsal_model
     cdef readonly TransitionSystem moves
     cdef readonly object cfg
@@ -1,42 +1,32 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-cimport cython.parallel
+from __future__ import print_function
+from cymem.cymem cimport Pool
 cimport numpy as np
 from itertools import islice
-from cpython.ref cimport PyObject, Py_XDECREF
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
-from libc.math cimport exp
 from libcpp.vector cimport vector
-from libc.string cimport memset, memcpy
+from libc.string cimport memset
 from libc.stdlib cimport calloc, free
-from cymem.cymem cimport Pool
-from thinc.backends.linalg cimport Vec, VecVec

-from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops
-from thinc.api import get_array_module, zero_init, set_dropout_rate
-from itertools import islice
 import srsly

+from ._parser_internals.stateclass cimport StateClass
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
+
+from ..tokens.doc cimport Doc
+from ..errors import Errors, Warnings
+from .. import util
+from ..util import create_default_optimizer
+
+from thinc.api import set_dropout_rate
 import numpy.random
 import numpy
 import warnings

-from ..tokens.doc cimport Doc
-from ..typedefs cimport weight_t, class_t, hash_t
-from ._parser_model cimport alloc_activations, free_activations
-from ._parser_model cimport predict_states, arg_max_if_valid
-from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ._parser_model cimport get_c_weights, get_c_sizes
-from .stateclass cimport StateClass
-from ._state cimport StateC
-from .transition_system cimport Transition
-
-from ..util import create_default_optimizer, registry
-from ..compat import copy_array
-from ..errors import Errors, Warnings
-from .. import util
-from . import nonproj
-
-
-cdef class Parser:
+
+cdef class Parser(Pipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -107,7 +97,7 @@ cdef class Parser:

     @property
     def tok2vec(self):
-        '''Return the embedding and convolutional layer of the model.'''
+        """Return the embedding and convolutional layer of the model."""
         return self.model.get_ref("tok2vec")

     @property
@@ -138,13 +128,13 @@ cdef class Parser:
         raise NotImplementedError

     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
-        '''Setup models for secondary objectives, to benefit from multi-task
+        """Setup models for secondary objectives, to benefit from multi-task
         learning. This method is intended to be overridden by subclasses.

         For instance, the dependency parser can benefit from sharing
         an input representation with a label prediction model. These auxiliary
         models are discarded after training.
-        '''
+        """
         pass

     def use_params(self, params):
226  spacy/scorer.py
@@ -1,55 +1,61 @@
+from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING
 import numpy as np

+from .gold import Example
+from .tokens import Token, Doc
 from .errors import Errors
 from .util import get_lang_class
 from .morphology import Morphology

+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401
+
+
+DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]


 class PRFScore:
-    """
-    A precision / recall / F score
-    """
+    """A precision / recall / F score."""

-    def __init__(self):
+    def __init__(self) -> None:
         self.tp = 0
         self.fp = 0
         self.fn = 0

-    def score_set(self, cand, gold):
+    def score_set(self, cand: set, gold: set) -> None:
         self.tp += len(cand.intersection(gold))
         self.fp += len(cand - gold)
         self.fn += len(gold - cand)

     @property
-    def precision(self):
+    def precision(self) -> float:
         return self.tp / (self.tp + self.fp + 1e-100)

     @property
-    def recall(self):
+    def recall(self) -> float:
         return self.tp / (self.tp + self.fn + 1e-100)

     @property
-    def fscore(self):
+    def fscore(self) -> float:
         p = self.precision
         r = self.recall
         return 2 * ((p * r) / (p + r + 1e-100))

-    def to_dict(self):
+    def to_dict(self) -> Dict[str, float]:
         return {"p": self.precision, "r": self.recall, "f": self.fscore}


 class ROCAUCScore:
-    """
-    An AUC ROC score.
-    """
+    """An AUC ROC score."""

-    def __init__(self):
+    def __init__(self) -> None:
         self.golds = []
         self.cands = []
         self.saved_score = 0.0
         self.saved_score_at_len = 0

-    def score_set(self, cand, gold):
+    def score_set(self, cand, gold) -> None:
         self.cands.append(cand)
         self.golds.append(gold)
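For readers new to `PRFScore`: `score_set` accumulates true positives, false positives and false negatives from candidate and gold sets, and the `1e-100` terms keep the properties division-safe on empty counts. A small usage sketch, assuming this dev version exposes the class from `spacy.scorer`:

```python
from spacy.scorer import PRFScore

score = PRFScore()
score.score_set(
    cand={("PERSON", 0, 1)},                 # predicted (label, start, end) items
    gold={("PERSON", 0, 1), ("ORG", 3, 4)},  # reference items
)
print(score.precision)         # 1.0  (1 tp, 0 fp)
print(score.recall)            # 0.5  (1 tp, 1 fn)
print(round(score.fscore, 3))  # 0.667
```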
@@ -70,51 +76,52 @@ class ROCAUCScore:

 class Scorer:
     """Compute evaluation scores."""

-    def __init__(self, nlp=None, **cfg):
+    def __init__(
+        self,
+        nlp: Optional["Language"] = None,
+        default_lang: str = "xx",
+        default_pipeline=DEFAULT_PIPELINE,
+        **cfg,
+    ) -> None:
         """Initialize the Scorer.

         DOCS: https://spacy.io/api/scorer#init
         """
         self.nlp = nlp
         self.cfg = cfg

         if not nlp:
-            # create a default pipeline
-            nlp = get_lang_class("xx")()
-            nlp.add_pipe("senter")
-            nlp.add_pipe("tagger")
-            nlp.add_pipe("morphologizer")
-            nlp.add_pipe("parser")
-            nlp.add_pipe("ner")
-            nlp.add_pipe("textcat")
+            nlp = get_lang_class(default_lang)()
+            for pipe in default_pipeline:
+                nlp.add_pipe(pipe)
             self.nlp = nlp
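The hard-coded chain of `add_pipe` calls becomes data: the fallback language and component list are now constructor arguments, with the old behaviour preserved as defaults. A sketch of what that buys (names as in the diff; constructing the components still builds real pipeline objects in practice):

```python
from spacy.scorer import Scorer

scorer = Scorer()  # blank "xx" pipeline with the DEFAULT_PIPELINE components
custom = Scorer(default_lang="en", default_pipeline=["tagger", "parser"])
```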
-    def score(self, examples):
+    def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
         """Evaluate a list of Examples.

         examples (Iterable[Example]): The predicted annotations + correct annotations.
         RETURNS (Dict): A dictionary of scores.

         DOCS: https://spacy.io/api/scorer#score
         """
         scores = {}

         if hasattr(self.nlp.tokenizer, "score"):
             scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
         for name, component in self.nlp.pipeline:
             if hasattr(component, "score"):
                 scores.update(component.score(examples, **self.cfg))

         return scores

     @staticmethod
-    def score_tokenization(examples, **cfg):
+    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
         """Returns accuracy and PRF scores for tokenization.

         * token_acc: # correct tokens / # gold tokens
         * token_p/r/f: PRF for token character spans

         examples (Iterable[Example]): Examples to score
-        RETURNS (dict): A dictionary containing the scores token_acc/p/r/f.
+        RETURNS (Dict[str, float]): A dictionary containing the scores
+            token_acc/p/r/f.
+
+        DOCS: https://spacy.io/api/scorer#score_tokenization
         """
         acc_score = PRFScore()
         prf_score = PRFScore()

@@ -145,16 +152,24 @@ class Scorer:
         }

     @staticmethod
-    def score_token_attr(examples, attr, getter=getattr, **cfg):
+    def score_token_attr(
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        **cfg,
+    ) -> Dict[str, float]:
         """Returns an accuracy score for a token-level attribute.

         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (dict): A dictionary containing the accuracy score under the
-            key attr_acc.
+        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
+            under the key attr_acc.
+
+        DOCS: https://spacy.io/api/scorer#score_token_attr
         """
         tag_score = PRFScore()
         for example in examples:
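Note the bare `*` in the new signature: everything after `attr` is keyword-only, so a custom getter has to be passed by name. A minimal sketch of the call shape (the empty example list is only there to keep the snippet self-contained; the getter is hypothetical):

```python
from spacy.scorer import Scorer

def lowercase_getter(token, attr):
    # Hypothetical getter: compare the attribute case-insensitively.
    return getattr(token, attr).lower()

examples = []  # an Iterable[Example] in real use
scores = Scorer.score_token_attr(examples, "tag_", getter=lowercase_getter)
print(scores)  # {'tag__acc': 0.0} -- note the f"{attr}_acc" result key
```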
@@ -172,17 +187,21 @@ class Scorer:
             gold_i = align.x2y[token.i].dataXd[0, 0]
             pred_tags.add((gold_i, getter(token, attr)))
         tag_score.score_set(pred_tags, gold_tags)
-        return {
-            attr + "_acc": tag_score.fscore,
-        }
+        return {f"{attr}_acc": tag_score.fscore}

     @staticmethod
-    def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg):
+    def score_token_attr_per_feat(
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        **cfg,
+    ):
         """Return PRF scores per feat for a token attribute in UFEATS format.

         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
         RETURNS (dict): A dictionary containing the per-feat PRF scores under
@@ -223,20 +242,26 @@ class Scorer:
             per_feat[field].score_set(
                 pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
             )
-        return {
-            attr + "_per_feat": per_feat,
-        }
+        return {f"{attr}_per_feat": per_feat}

     @staticmethod
-    def score_spans(examples, attr, getter=getattr, **cfg):
+    def score_spans(
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Doc, str], Any] = getattr,
+        **cfg,
+    ) -> Dict[str, Any]:
         """Returns PRF scores for labeled spans.

         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
             getter(doc, attr) should return the spans for the individual doc.
-        RETURNS (dict): A dictionary containing the PRF scores under the
-            keys attr_p/r/f and the per-type PRF scores under attr_per_type.
+        RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
+            the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
+
+        DOCS: https://spacy.io/api/scorer#score_spans
         """
         score = PRFScore()
         score_per_type = dict()
@@ -256,14 +281,12 @@ class Scorer:
             # Find all predicate labels, for all and per type
             gold_spans = set()
             pred_spans = set()

             # Special case for ents:
             # If we have missing values in the gold, we can't easily tell
             # whether our NER predictions are true.
             # It seems bad but it's what we've always done.
             if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
                 continue

             for span in getter(gold_doc, attr):
                 gold_span = (span.label_, span.start, span.end - 1)
                 gold_spans.add(gold_span)
@@ -279,38 +302,39 @@ class Scorer:
         # Score for all labels
         score.score_set(pred_spans, gold_spans)
         results = {
-            attr + "_p": score.precision,
-            attr + "_r": score.recall,
-            attr + "_f": score.fscore,
-            attr + "_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+            f"{attr}_p": score.precision,
+            f"{attr}_r": score.recall,
+            f"{attr}_f": score.fscore,
+            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
         }
         return results

     @staticmethod
     def score_cats(
-        examples,
-        attr,
-        getter=getattr,
-        labels=[],
-        multi_label=True,
-        positive_label=None,
-        **cfg
-    ):
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Doc, str], Any] = getattr,
+        labels: Iterable[str] = tuple(),
+        multi_label: bool = True,
+        positive_label: Optional[str] = None,
+        **cfg,
+    ) -> Dict[str, Any]:
         """Returns PRF and ROC AUC scores for a doc-level attribute with a
         dict with scores for each label like Doc.cats. The reported overall
         score depends on the scorer settings.

         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
             getter(doc, attr) should return the values for the individual doc.
         labels (Iterable[str]): The set of possible labels. Defaults to [].
         multi_label (bool): Whether the attribute allows multiple labels.
             Defaults to True.
         positive_label (str): The positive label for a binary task with
             exclusive classes. Defaults to None.
-        RETURNS (dict): A dictionary containing the scores, with inapplicable
-            scores as None:
+        RETURNS (Dict[str, Any]): A dictionary containing the scores, with
+            inapplicable scores as None:
             for all:
                 attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
                 attr_score_desc (text description of the overall score),
@@ -319,6 +343,8 @@ class Scorer:
             for binary exclusive with positive label: attr_p/r/f
             for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
             for multilabel, macro-averaged AUC: attr_macro_auc
+
+        DOCS: https://spacy.io/api/scorer#score_cats
         """
         score = PRFScore()
         f_per_type = dict()
@@ -367,64 +393,67 @@ class Scorer:
             )
         )
         results = {
-            attr + "_score": None,
-            attr + "_score_desc": None,
-            attr + "_p": None,
-            attr + "_r": None,
-            attr + "_f": None,
-            attr + "_macro_f": None,
-            attr + "_macro_auc": None,
-            attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-            attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+            f"{attr}_score": None,
+            f"{attr}_score_desc": None,
+            f"{attr}_p": None,
+            f"{attr}_r": None,
+            f"{attr}_f": None,
+            f"{attr}_macro_f": None,
+            f"{attr}_macro_auc": None,
+            f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
+            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
-            results[attr + "_p"] = score.precision
-            results[attr + "_r"] = score.recall
-            results[attr + "_f"] = score.fscore
-            results[attr + "_score"] = results[attr + "_f"]
-            results[attr + "_score_desc"] = "F (" + positive_label + ")"
+            results[f"{attr}_p"] = score.precision
+            results[f"{attr}_r"] = score.recall
+            results[f"{attr}_f"] = score.fscore
+            results[f"{attr}_score"] = results[f"{attr}_f"]
+            results[f"{attr}_score_desc"] = f"F ({positive_label})"
         elif not multi_label:
-            results[attr + "_macro_f"] = sum(
+            results[f"{attr}_macro_f"] = sum(
                 [score.fscore for label, score in f_per_type.items()]
             ) / (len(f_per_type) + 1e-100)
-            results[attr + "_score"] = results[attr + "_macro_f"]
-            results[attr + "_score_desc"] = "macro F"
+            results[f"{attr}_score"] = results[f"{attr}_macro_f"]
+            results[f"{attr}_score_desc"] = "macro F"
         else:
-            results[attr + "_macro_auc"] = max(
+            results[f"{attr}_macro_auc"] = max(
                 sum([score.score for label, score in auc_per_type.items()])
                 / (len(auc_per_type) + 1e-100),
                 -1,
             )
-            results[attr + "_score"] = results[attr + "_macro_auc"]
-            results[attr + "_score_desc"] = "macro AUC"
+            results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
+            results[f"{attr}_score_desc"] = "macro AUC"
         return results

     @staticmethod
     def score_deps(
-        examples,
-        attr,
-        getter=getattr,
-        head_attr="head",
-        head_getter=getattr,
-        ignore_labels=tuple(),
-        **cfg
-    ):
+        examples: Iterable[Example],
+        attr: str,
+        *,
+        getter: Callable[[Token, str], Any] = getattr,
+        head_attr: str = "head",
+        head_getter: Callable[[Token, str], Any] = getattr,
+        ignore_labels: Tuple[str] = tuple(),
+        **cfg,
+    ) -> Dict[str, Any]:
         """Returns the UAS, LAS, and LAS per type scores for dependency
         parses.

         examples (Iterable[Example]): Examples to score
         attr (str): The attribute containing the dependency label.
-        getter (callable): Defaults to getattr. If provided,
+        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
         head_attr (str): The attribute containing the head token. Defaults to
             'head'.
-        head_getter (callable): Defaults to getattr. If provided,
+        head_getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             head_getter(token, attr) should return the value of the head for an
             individual token.
         ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
-        RETURNS (dict): A dictionary containing the scores:
+        RETURNS (Dict[str, Any]): A dictionary containing the scores:
             attr_uas, attr_las, and attr_las_per_type.
+
+        DOCS: https://spacy.io/api/scorer#score_deps
         """
         unlabelled = PRFScore()
         labelled = PRFScore()
@@ -482,10 +511,11 @@ class Scorer:
             set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
         )
         return {
-            attr + "_uas": unlabelled.fscore,
-            attr + "_las": labelled.fscore,
-            attr
-            + "_las_per_type": {k: v.to_dict() for k, v in labelled_per_dep.items()},
+            f"{attr}_uas": unlabelled.fscore,
+            f"{attr}_las": labelled.fscore,
+            f"{attr}_las_per_type": {
+                k: v.to_dict() for k, v in labelled_per_dep.items()
+            },
         }
@@ -4,8 +4,8 @@ from spacy import registry
 from spacy.gold import Example
 from spacy.pipeline import DependencyParser
 from spacy.tokens import Doc
-from spacy.syntax.nonproj import projectivize
-from spacy.syntax.arc_eager import ArcEager
+from spacy.pipeline._parser_internals.nonproj import projectivize
+from spacy.pipeline._parser_internals.arc_eager import ArcEager
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
@@ -5,7 +5,7 @@ from spacy.lang.en import English

 from spacy.language import Language
 from spacy.lookups import Lookups
-from spacy.syntax.ner import BiluoPushDown
+from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.gold import Example
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
@@ -3,8 +3,8 @@ import pytest
 from spacy import registry
 from spacy.gold import Example
 from spacy.vocab import Vocab
-from spacy.syntax.arc_eager import ArcEager
-from spacy.syntax.nn_parser import Parser
+from spacy.pipeline._parser_internals.arc_eager import ArcEager
+from spacy.pipeline.transition_parser import Parser
 from spacy.tokens.doc import Doc
 from thinc.api import Model
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@@ -1,7 +1,7 @@
 import pytest
-from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc
-from spacy.syntax.nonproj import is_nonproj_tree
-from spacy.syntax import nonproj
+from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle, is_nonproj_arc
+from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree
+from spacy.pipeline._parser_internals import nonproj

 from ..util import get_doc
@@ -1,15 +1,10 @@
-import spacy.language
 from spacy.language import Language
-from spacy.pipe_analysis import print_summary, validate_attrs
-from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
-from spacy.pipe_analysis import count_pipeline_interdependencies
+from spacy.pipe_analysis import get_attr_info, validate_attrs
 from mock import Mock
 import pytest


 def test_component_decorator_assigns():
-    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
-
     @Language.component("c1", assigns=["token.tag", "doc.tensor"])
     def test_component1(doc):
         return doc
@@ -32,10 +27,11 @@ def test_component_decorator_assigns():

     nlp = Language()
     nlp.add_pipe("c1")
-    with pytest.warns(UserWarning):
-        nlp.add_pipe("c2")
+    nlp.add_pipe("c2")
+    problems = nlp.analyze_pipes()["problems"]
+    assert problems["c2"] == ["token.pos"]
     nlp.add_pipe("c3")
-    assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2"]
+    assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2"]
     nlp.add_pipe("c1", name="c4")
     test_component4_meta = nlp.get_pipe_meta("c1")
     assert test_component4_meta.factory == "c1"
@@ -43,9 +39,8 @@ def test_component_decorator_assigns():
     assert not Language.has_factory("c4")
     assert nlp.pipe_factories["c1"] == "c1"
     assert nlp.pipe_factories["c4"] == "c1"
-    assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2", "c4"]
-    assert get_requires_for_attr(nlp, "token.pos") == ["c2"]
-    assert print_summary(nlp, no_print=True)
+    assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2", "c4"]
+    assert get_attr_info(nlp, "token.pos")["requires"] == ["c2"]
     assert nlp("hello world")
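These test updates track an API shift: pipeline analysis no longer hinges on the `ENABLE_PIPELINE_ANALYSIS` global and emitted warnings; you request it explicitly with `nlp.analyze_pipes()` and inspect the returned data (replacing the removed `print_summary` helper). A sketch of the new flow, with a hypothetical component name:

```python
from spacy.language import Language

@Language.component("needs_pos", requires=["token.pos"])
def needs_pos(doc):
    return doc

nlp = Language()
nlp.add_pipe("needs_pos")
analysis = nlp.analyze_pipes()
print(analysis["problems"])  # {'needs_pos': ['token.pos']} -- unmet requirement
```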
@@ -100,7 +95,6 @@ def test_analysis_validate_attrs_invalid(attr):

 def test_analysis_validate_attrs_remove_pipe():
     """Test that attributes are validated correctly on remove."""
-    spacy.language.ENABLE_PIPELINE_ANALYSIS = True

     @Language.component("pipe_analysis_c6", assigns=["token.tag"])
     def c1(doc):
@@ -112,26 +106,9 @@ def test_analysis_validate_attrs_remove_pipe():

     nlp = Language()
     nlp.add_pipe("pipe_analysis_c6")
-    with pytest.warns(UserWarning):
-        nlp.add_pipe("pipe_analysis_c7")
-    with pytest.warns(None) as record:
-        nlp.remove_pipe("pipe_analysis_c7")
-    assert not record.list
-
-
-def test_pipe_interdependencies():
-    prefix = "test_pipe_interdependencies"
-
-    @Language.component(f"{prefix}.fancifier", assigns=("doc._.fancy",))
-    def fancifier(doc):
-        return doc
-
-    @Language.component(f"{prefix}.needer", requires=("doc._.fancy",))
-    def needer(doc):
-        return doc
-
-    nlp = Language()
-    nlp.add_pipe(f"{prefix}.fancifier")
-    nlp.add_pipe(f"{prefix}.needer")
-    counts = count_pipeline_interdependencies(nlp)
-    assert counts == [1, 0]
+    nlp.add_pipe("pipe_analysis_c7")
+    problems = nlp.analyze_pipes()["problems"]
+    assert problems["pipe_analysis_c7"] == ["token.pos"]
+    nlp.remove_pipe("pipe_analysis_c7")
+    problems = nlp.analyze_pipes()["problems"]
+    assert all(p == [] for p in problems.values())
@@ -118,7 +118,7 @@ def test_overfitting_IO():

     # Test scoring
     scores = nlp.evaluate(
-        train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}}
+        train_examples, scorer_cfg={"positive_label": "POSITIVE"}
     )
     assert scores["cats_f"] == 1.0
     assert scores["cats_score"] == 1.0
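Scorer settings get their own channel: instead of smuggling them through `component_cfg` under a `"scorer"` key, `nlp.evaluate` now takes a dedicated `scorer_cfg` that is forwarded to the `Scorer`. A sketch of the call, where `nlp` and `dev_examples` stand in for a trained pipeline and its evaluation data (both assumed):

```python
# dev_examples: Iterable[Example]; nlp: a trained pipeline (both assumed).
scores = nlp.evaluate(dev_examples, scorer_cfg={"positive_label": "POSITIVE"})
print(scores["cats_score"], scores["cats_score_desc"])
```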
@@ -7,7 +7,7 @@ import importlib.util
 import re
 from pathlib import Path
 import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model
+from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
 import functools
 import itertools
 import numpy.random
@@ -24,8 +24,6 @@ import tempfile
 import shutil
 import shlex
 import inspect
-from thinc.types import Unserializable


 try:
     import cupy.random
@@ -6,6 +6,7 @@ menu:
   - ['Tok2Vec', 'tok2vec']
   - ['Transformers', 'transformers']
   - ['Parser & NER', 'parser']
+  - ['Tagging', 'tagger']
   - ['Text Classification', 'textcat']
   - ['Entity Linking', 'entitylinker']
 ---
@@ -18,6 +19,30 @@ TODO: intro and how architectures work, link to

 ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}

+<!-- TODO: intro -->
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.HashEmbedCNN.v1"
+> # TODO: ...
+>
+> [model.tok2vec]
+> # ...
+> ```
+
+| Name                 | Type  | Description |
+| -------------------- | ----- | ----------- |
+| `width`              | int   |             |
+| `depth`              | int   |             |
+| `embed_size`         | int   |             |
+| `window_size`        | int   |             |
+| `maxout_pieces`      | int   |             |
+| `subword_features`   | bool  |             |
+| `dropout`            | float |             |
+| `pretrained_vectors` | bool  |             |
+
 ### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN}

 ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
@@ -99,6 +124,28 @@ architectures into your training config.
 | `use_upper` | bool |  |
 | `nO`        | int  |  |

+## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
+
+### spacy.Tagger.v1 {#Tagger}
+
+<!-- TODO: intro -->
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.Tagger.v1"
+> nO = null
+>
+> [model.tok2vec]
+> # ...
+> ```
+
+| Name      | Type                                       | Description |
+| --------- | ------------------------------------------ | ----------- |
+| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) |             |
+| `nO`      | int                                        |             |
+
 ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}

 ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
@@ -112,3 +159,21 @@ architectures into your training config.
 ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}

 ### spacy.EntityLinker.v1 {#EntityLinker}
+
+<!-- TODO: intro -->
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.EntityLinker.v1"
+> nO = null
+>
+> [model.tok2vec]
+> # ...
+> ```
+
+| Name      | Type                                       | Description |
+| --------- | ------------------------------------------ | ----------- |
+| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) |             |
+| `nO`      | int                                        |             |
@@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("parser", config=config)
 > ```

+<!-- TODO: finish API docs -->
+
 | Setting | Type                                       | Description       | Default                                                           |
 | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
-| `moves` | list                                       | <!-- TODO: -->    | `None`                                                            |
+| `moves` | list                                       |                   | `None`                                                            |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |

 ```python
@@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

+<!-- TODO: finish API docs -->
+
 | Name                          | Type                                       | Description                                                                                  |
 | ----------------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------- |
 | `vocab`                       | `Vocab`                                    | The shared vocabulary.                                                                       |
 | `model`                       | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.              |
 | `name`                        | str                                        | String name of the component instance. Used to add entries to the `losses` during training. |
-| `moves`                       | list                                       | <!-- TODO: -->                                                                               |
+| `moves`                       | list                                       |                                                                                              |
 | _keyword-only_                |                                            |                                                                                              |
-| `update_with_oracle_cut_size` | int                                        | <!-- TODO: -->                                                                               |
-| `multitasks`                  | `Iterable`                                 | <!-- TODO: -->                                                                               |
-| `learn_tokens`                | bool                                       | <!-- TODO: -->                                                                               |
-| `min_action_freq`             | int                                        | <!-- TODO: -->                                                                               |
+| `update_with_oracle_cut_size` | int                                        |                                                                                              |
+| `multitasks`                  | `Iterable`                                 |                                                                                              |
+| `learn_tokens`                | bool                                       |                                                                                              |
+| `min_action_freq`             | int                                        |                                                                                              |

 ## DependencyParser.\_\_call\_\_ {#call tag="method"}
@@ -32,12 +32,14 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```

+<!-- TODO: finish API docs -->
+
 | Setting          | Type                                       | Description       | Default                                         |
 | ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
-| `kb`             | `KnowledgeBase`                            | <!-- TODO: -->    | `None`                                          |
-| `labels_discard` | `Iterable[str]`                            | <!-- TODO: -->    | `[]`                                            |
-| `incl_prior`     | bool                                       | <!-- TODO: -->    | `True`                                          |
-| `incl_context`   | bool                                       | <!-- TODO: -->    | `True`                                          |
+| `kb`             | `KnowledgeBase`                            |                   | `None`                                          |
+| `labels_discard` | `Iterable[str]`                            |                   | `[]`                                            |
+| `incl_prior`     | bool                                       |                   | `True`                                          |
+| `incl_context`   | bool                                       |                   | `True`                                          |
 | `model`          | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |

 ```python
@@ -65,16 +67,18 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

+<!-- TODO: finish API docs -->
+
 | Name             | Type            | Description                                                                                  |
 | ---------------- | --------------- | -------------------------------------------------------------------------------------------- |
 | `vocab`          | `Vocab`         | The shared vocabulary.                                                                       |
 | `model`          | `Model`         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.              |
 | `name`           | str             | String name of the component instance. Used to add entries to the `losses` during training. |
 | _keyword-only_   |                 |                                                                                              |
-| `kb`             | `KnowledgeBase` | <!-- TODO: -->                                                                               |
-| `labels_discard` | `Iterable[str]` | <!-- TODO: -->                                                                               |
-| `incl_prior`     | bool            | <!-- TODO: -->                                                                               |
-| `incl_context`   | bool            | <!-- TODO: -->                                                                               |
+| `kb`             | `KnowledgeBase` |                                                                                              |
+| `labels_discard` | `Iterable[str]` |                                                                                              |
+| `incl_prior`     | bool            |                                                                                              |
+| `incl_context`   | bool            |                                                                                              |

 ## EntityLinker.\_\_call\_\_ {#call tag="method"}
@@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("ner", config=config)
 > ```

+<!-- TODO: finish API docs -->
+
 | Setting | Type                                       | Description       | Default                                                           |
 | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
-| `moves` | list                                       | <!-- TODO: -->    | `None`                                                            |
+| `moves` | list                                       |                   | `None`                                                            |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |

 ```python
@@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).

<!-- TODO: finish API docs -->

| Name | Type | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `moves` | list | A list of transition names. Inferred from the data if set to `None`. |
| _keyword-only_ | | |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. |
| `multitasks` | `Iterable` | Additional multi-task objectives to train alongside the component. |
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. |
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. |

## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
@@ -8,9 +8,8 @@ new: 3.0

An `Example` holds the information for one training instance. It stores two
`Doc` objects: one for holding the gold-standard reference data, and one for
holding the predictions of the pipeline. An `Alignment` object stores the
alignment between these two documents, as they can differ in tokenization.
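For example, here's a minimal sketch of creating an `Example` from a predicted
`Doc` and a dictionary of gold-standard annotations (the text and entity
offsets are made up for illustration):

```python
import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
predicted = nlp("Apple is looking at buying U.K. startup")
# Gold-standard entity annotations as (start, end, label) character offsets
reference = {"entities": [(0, 5, "ORG"), (27, 31, "GPE")]}
example = Example.from_dict(predicted, reference)
```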
## Example.\_\_init\_\_ {#init tag="method"}
@@ -98,10 +98,10 @@ decorator. For more details and examples, see the
| ----------------------- | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | The name of the component factory. |
| _keyword-only_ | | |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
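As a minimal sketch (the component name is made up), a component can declare
`requires` via the decorator so the pipe analysis knows it reads entities set
by an earlier component:

```python
import spacy
from spacy.language import Language

@Language.component("ent_printer", requires=["doc.ents"])
def ent_printer(doc):
    # Read entities assigned by a previous component such as "ner"
    print([(ent.text, ent.label_) for ent in doc.ents])
    return doc

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("ent_printer", last=True)
```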
@@ -146,10 +146,10 @@ examples, see the
| `name` | str | The name of the component factory. |
| _keyword-only_ | | |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
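A minimal sketch of a factory using these arguments (the factory name, config
setting and component class are made up for illustration):

```python
from spacy.language import Language

class LemmaLowercaser:
    def __init__(self, lowercase_lemmas: bool):
        self.lowercase_lemmas = lowercase_lemmas

    def __call__(self, doc):
        if self.lowercase_lemmas:
            for token in doc:
                token.lemma_ = token.lemma_.lower()
        return doc

@Language.factory(
    "lemma_lowercaser",
    default_config={"lowercase_lemmas": True},
    assigns=["token.lemma"],
)
def create_lemma_lowercaser(nlp, name, lowercase_lemmas: bool):
    # Factory functions receive the current nlp object and the instance name
    return LemmaLowercaser(lowercase_lemmas)
```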
@@ -302,6 +302,7 @@ Evaluate a model's pipeline components.
| `batch_size` | int | The batch size to use. |
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
| `scorer_cfg` | `Dict[str, Any]` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. |
| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. |
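A minimal sketch of calling `evaluate`, assuming `examples` is a list of
[`Example`](/api/example) objects and the pipeline contains a component that
reports an `ents_f` score:

```python
scores = nlp.evaluate(examples)
print(scores["ents_f"])
```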
## Language.use_params {#use_params tag="contextmanager, method"}
@@ -597,6 +598,97 @@ contains the information about the component and its default provided by the
| `name` | str | The pipeline component name. |
| **RETURNS** | [`FactoryMeta`](#factorymeta) | The factory meta. |

## Language.analyze_pipes {#analyze_pipes tag="method" new="3"}

Analyze the current pipeline components and show a summary of the attributes
they assign and require, and the scores they set. The data is based on the
information provided in the [`@Language.component`](/api/language#component) and
[`@Language.factory`](/api/language#factory) decorators. If requirements aren't
met, e.g. if a component specifies a required property that is not set by a
previous component, a warning is shown.

<Infobox variant="warning" title="Important note">

The pipeline analysis is static and does **not actually run the components**.
This means that it relies on the information provided by the components
themselves. If a custom component declares that it assigns an attribute but it
doesn't, the pipeline analysis won't catch that.

</Infobox>

> #### Example
>
> ```python
> nlp = spacy.blank("en")
> nlp.add_pipe("tagger")
> nlp.add_pipe("entity_linker")
> analysis = nlp.analyze_pipes()
> ```

<Accordion title="Example output" spaced>

```json
### Structured
{
  "summary": {
    "tagger": {
      "assigns": ["token.tag"],
      "requires": [],
      "scores": ["tag_acc", "pos_acc", "lemma_acc"],
      "retokenizes": false
    },
    "entity_linker": {
      "assigns": ["token.ent_kb_id"],
      "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
      "scores": [],
      "retokenizes": false
    }
  },
  "problems": {
    "tagger": [],
    "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
  },
  "attrs": {
    "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
    "doc.ents": { "assigns": [], "requires": ["entity_linker"] },
    "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
    "doc.sents": { "assigns": [], "requires": ["entity_linker"] },
    "token.tag": { "assigns": ["tagger"], "requires": [] },
    "token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
  }
}
```

```
### Pretty
============================= Pipeline Overview =============================

#   Component       Assigns           Requires         Scores      Retokenizes
-   -------------   ---------------   --------------   ---------   -----------
0   tagger          token.tag                          tag_acc     False
                                                       pos_acc
                                                       lemma_acc

1   entity_linker   token.ent_kb_id   doc.ents                     False
                                      doc.sents
                                      token.ent_iob
                                      token.ent_type


================================ Problems (4) ================================
⚠ 'entity_linker' requirements not met: doc.ents, doc.sents,
token.ent_iob, token.ent_type
```

</Accordion>

| Name | Type | Description |
| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| _keyword-only_ | | |
| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. |
| `pretty` | bool | Pretty-print the results as a table. Defaults to `False`. |
| **RETURNS** | dict | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). |

## Language.meta {#meta tag="property"}

Custom meta data for the Language class. If a model is loaded, contains meta
@@ -832,8 +924,8 @@ instance and factory instance.
| ----------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `factory` | str | The name of the registered component factory. |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
@@ -63,14 +63,16 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).

<!-- TODO: finish API docs -->

| Name | Type | Description |
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `labels_morph` | dict | |
| `labels_pos` | dict | |

## Morphologizer.\_\_call\_\_ {#call tag="method"}
@@ -6,10 +6,9 @@ source: spacy/scorer.py
---

The `Scorer` computes evaluation scores. It's typically created by
[`Language.evaluate`](/api/language#evaluate). In addition, the `Scorer`
provides a number of evaluation methods for evaluating [`Token`](/api/token) and
[`Doc`](/api/doc) attributes.

## Scorer.\_\_init\_\_ {#init tag="method"}
@@ -20,10 +19,10 @@ Create a new `Scorer`.
> ```python
> from spacy.scorer import Scorer
>
> # Default scoring pipeline
> scorer = Scorer()
>
> # Provided scoring pipeline
> nlp = spacy.load("en_core_web_sm")
> scorer = Scorer(nlp)
> ```
@@ -40,16 +39,20 @@ scoring methods provided by the components in the pipeline.
The returned `Dict` contains the scores provided by the individual pipeline
components. For the scoring methods provided by the `Scorer` and used by the
core pipeline components, the individual score names start with the `Token` or
`Doc` attribute being scored:

- `token_acc`, `token_p`, `token_r`, `token_f`
- `sents_p`, `sents_r`, `sents_f`
- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`
- `dep_uas`, `dep_las`, `dep_las_per_type`
- `ents_p`, `ents_r`, `ents_f`, `ents_per_type`
- `textcat_macro_auc`, `textcat_macro_f`

> #### Example
>
> ```python
> scorer = Scorer()
> scores = scorer.score(examples)
> ```
@@ -57,78 +60,148 @@ attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`,
| Name | Type | Description |
| ----------- | ------------------- | ----------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| **RETURNS** | `Dict` | A dictionary of scores. |

## Scorer.score_tokenization {#score_tokenization tag="staticmethod" new="3"}

Scores the tokenization:

- `token_acc`: number of correct tokens / number of gold tokens
- `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
  character spans

> #### Example
>
> ```python
> scores = Scorer.score_tokenization(examples)
> ```

| Name | Type | Description |
| ----------- | ------------------- | ----------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. |

## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}

Scores a single token attribute.

> #### Example
>
> ```python
> scores = Scorer.score_token_attr(examples, "pos")
> print(scores["pos_acc"])
> ```

| Name | Type | Description |
| -------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute to score. |
| _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
| **RETURNS** | `Dict[str, float]` | A dictionary containing the score `{attr}_acc`. |

## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}

Scores a single token attribute per feature for a token attribute in
[UFEATS](https://universaldependencies.org/format.html#morphological-annotation)
format.

> #### Example
>
> ```python
> scores = Scorer.score_token_attr_per_feat(examples, "morph")
> print(scores["morph_per_feat"])
> ```

| Name | Type | Description |
| -------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute to score. |
| _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. |

## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}

Returns PRF scores for labeled or unlabeled spans.

> #### Example
>
> ```python
> scores = Scorer.score_spans(examples, "ents")
> print(scores["ents_f"])
> ```

| Name | Type | Description |
| -------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute to score. |
| _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. |
| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. |

## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}

Calculate the UAS, LAS, and LAS per type scores for dependency parses.

> #### Example
>
> ```python
> def dep_getter(token, attr):
>     dep = getattr(token, attr)
>     dep = token.vocab.strings.as_string(dep).lower()
>     return dep
>
> scores = Scorer.score_deps(
>     examples,
>     "dep",
>     getter=dep_getter,
>     ignore_labels=("p", "punct")
> )
> print(scores["dep_uas"], scores["dep_las"])
> ```

| Name | Type | Description |
| --------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute containing the dependency label. |
| _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
| `head_attr` | `str` | The attribute containing the head token. |
| `head_getter` | `Callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. |
| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). |
| **RETURNS** | `Dict` | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. |

## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}

Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
containing scores for each label like `Doc.cats`. The reported overall score
depends on the scorer settings:

1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
   `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
   score), `{attr}_f_per_type`, `{attr}_auc_per_type`
2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`
4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`

> #### Example
>
> ```python
> labels = ["LABEL_A", "LABEL_B", "LABEL_C"]
> scores = Scorer.score_cats(
>     examples,
>     "cats",
>     labels=labels
> )
> print(scores["cats_macro_auc"])
> ```

| Name | Type | Description |
| ---------------- | ------------------- | ----------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
| `attr` | `str` | The attribute to score. |
| _keyword-only_ | | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. |
| `labels` | `Iterable[str]` | The set of possible labels. Defaults to `[]`. |
| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. |
| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. |
| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`. |
@@ -290,6 +290,8 @@ factories.
> return Model("custom", forward, dims={"nO": nO})
> ```

<!-- TODO: finish table -->

| Registry name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
| `assets` | |
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
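Building on the snippet above, a minimal sketch of registering a custom
architecture under a made-up name so it can be referenced from the config (the
trivial forward pass is only there to make the example self-contained):

```python
import spacy
from thinc.api import Model

def forward(model, X, is_train):
    # Identity forward pass, for illustration only
    return X, lambda dY: dY

@spacy.registry.architectures.register("my_org.CustomModel.v1")
def create_custom_model(nO: int) -> Model:
    return Model("custom", forward, dims={"nO": nO})
```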
@@ -347,50 +347,52 @@ serialization by passing in the string names via the `exclude` argument.

Transformer tokens and outputs for one `Doc` object.

| Name | Type | Description |
| --------- | -------------------------------------------------- | ----------------------------------------------------------- |
| `tokens` | `Dict` | The output of the Hugging Face tokenizer. |
| `tensors` | `List[FloatsXd]` | The activations of the transformer model. |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment between spaCy's tokens and the wordpiece tokens. |
| `width` | int | The width of the tensors (available as a property). |
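As a sketch of how this data is typically accessed, assuming a pipeline whose
`transformer` component sets the `Doc._.trf_data` extension attribute (the
pipeline package name here is hypothetical):

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("This is a sentence.")
trf_data = doc._.trf_data
# Inspect the width and the number of output tensors
print(trf_data.width, len(trf_data.tensors))
```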
### TransformerData.empty {#transformerdata-empty tag="classmethod"}

<!-- TODO: finish API docs -->

| Name | Type | Description |
| ----------- | ----------------- | ----------------------------------- |
| **RETURNS** | `TransformerData` | An empty `TransformerData` object. |
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}

Holds a batch of input and output objects for a transformer model.

| Name | Type | Description |
| ---------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------ |
| `spans` | `List[List[Span]]` | The batch of input spans, one list per `Doc`. |
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | The output of the Hugging Face tokenizer. |
| `tensors` | `List[torch.Tensor]` | The output tensors of the transformer model. |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment between the spaCy tokens and the wordpiece tokens. |
| `doc_data` | `List[TransformerData]` | The outputs, split per `Doc` (available as a property). |
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}

<!-- TODO: write -->

| Name | Type | Description |
| ----------- | ---------------------- | ----------- |
| `arrays` | `List[List[Floats3d]]` | |
| **RETURNS** | `FullTransformerBatch` | |
### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}

Split a `TransformerData` object that represents a batch into a list with one
`TransformerData` per `Doc`.

| Name | Type | Description |
| ----------- | ----------------------- | --------------------------------------------------- |
| **RETURNS** | `List[TransformerData]` | The split batch, one `TransformerData` per `Doc`. |

## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
@@ -421,11 +423,13 @@ getters using the `@registry.span_getters` decorator.

The following built-in functions are available:

<!-- TODO: finish API docs -->

| Name | Description |
| ------------------ | --------------------------------------------------------------------------------------------------------- |
| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
| `strided_spans.v1` | Create spans for strided, potentially overlapping windows, so long texts can be processed in pieces. |
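A minimal sketch of a custom span getter registered under a made-up name,
assuming the `@registry.span_getters` decorator mentioned above is exposed by
`spacy_transformers`. Registered functions return the getter, so settings can
be passed in via the config:

```python
import spacy_transformers

@spacy_transformers.registry.span_getters("whole_doc_spans.v1")
def configure_whole_doc_spans():
    def get_whole_doc_spans(docs):
        # One span per doc, covering the full text
        return [[doc[:]] for doc in docs]
    return get_whole_doc_spans
```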
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
@@ -231,10 +231,10 @@ available pipeline components and component functions.
| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. |
| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. |
| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. |
| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |

<!-- TODO: finish and update with more components -->

<!-- TODO: explain default config and factories -->
@@ -311,6 +311,99 @@ nlp.rename_pipe("ner", "entityrecognizer")
nlp.replace_pipe("tagger", my_custom_tagger)
```

### Analyzing pipeline components {#analysis new="3"}

The [`nlp.analyze_pipes`](/api/language#analyze_pipes) method analyzes the
components in the current pipeline and outputs information about them, like the
attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether
they retokenize the `Doc` and which scores they produce during training. It will
also show warnings if components require values that aren't set by a previous
component – for instance, if the entity linker is used but no component that
runs before it sets named entities. Setting `pretty=True` will pretty-print a
table instead of only returning the structured data.

> #### ✏️ Things to try
>
> 1. Add the components `"ner"` and `"sentencizer"` _before_ the entity linker.
>    The analysis should now show no problems, because requirements are met.

```python
### {executable="true"}
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
# This is a problem because it needs entities and sentence boundaries
nlp.add_pipe("entity_linker")
analysis = nlp.analyze_pipes(pretty=True)
```

<Accordion title="Example output">

```json
### Structured
{
  "summary": {
    "tagger": {
      "assigns": ["token.tag"],
      "requires": [],
      "scores": ["tag_acc", "pos_acc", "lemma_acc"],
      "retokenizes": false
    },
    "entity_linker": {
      "assigns": ["token.ent_kb_id"],
      "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
      "scores": [],
      "retokenizes": false
    }
  },
  "problems": {
    "tagger": [],
    "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
  },
  "attrs": {
    "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
    "doc.ents": { "assigns": [], "requires": ["entity_linker"] },
    "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
    "doc.sents": { "assigns": [], "requires": ["entity_linker"] },
    "token.tag": { "assigns": ["tagger"], "requires": [] },
    "token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
  }
}
```

```
### Pretty
============================= Pipeline Overview =============================

#   Component       Assigns           Requires         Scores      Retokenizes
-   -------------   ---------------   --------------   ---------   -----------
0   tagger          token.tag                          tag_acc     False
                                                       pos_acc
                                                       lemma_acc

1   entity_linker   token.ent_kb_id   doc.ents                     False
                                      doc.sents
                                      token.ent_iob
                                      token.ent_type


================================ Problems (4) ================================
⚠ 'entity_linker' requirements not met: doc.ents, doc.sents,
token.ent_iob, token.ent_type
```

</Accordion>

<Infobox variant="warning" title="Important note">

The pipeline analysis is static and does **not actually run the components**.
This means that it relies on the information provided by the components
themselves. If a custom component declares that it assigns an attribute but it
doesn't, the pipeline analysis won't catch that.

</Infobox>

## Creating custom pipeline components {#custom-components}

A pipeline component is a function that receives a `Doc` object, modifies it and
@@ -489,6 +582,8 @@ All other settings can be passed in by the user via the `config` argument on
[`@Language.factory`](/api/language#factory) decorator also lets you define a
`default_config` that's used as a fallback.

<!-- TODO: add example of passing in a custom Python object via the config based on a registered function -->
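For instance, a minimal sketch of that pattern (the registry name, factory and
data are made up): a function registered in `spacy.registry.misc` builds the
object, and the config references it via `@misc`, so the component receives the
resolved dictionary rather than a plain config value:

```python
import spacy
from spacy.language import Language

@spacy.registry.misc("load_acronyms.v1")
def load_acronyms():
    # Could also load this data from a file or database
    return {"lol": "laughing out loud", "brb": "be right back"}

class AcronymComponent:
    def __init__(self, data):
        self.data = data

    def __call__(self, doc):
        return doc

@Language.factory("acronyms", default_config={"data": {"@misc": "load_acronyms.v1"}})
def create_acronyms(nlp, name, data):
    # "data" arrives as the dictionary returned by load_acronyms
    return AcronymComponent(data)
```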
@@ -15,8 +15,6 @@ import Serialization101 from 'usage/101/\_serialization.md'

### Serializing the pipeline {#pipeline}

When serializing the pipeline, keep in mind that this will only save out the
**binary data for the individual components** to allow spaCy to restore them –
not the entire objects. This is a good thing, because it makes serialization
@@ -3,7 +3,8 @@ title: Training Models
next: /usage/projects
menu:
  - ['Introduction', 'basics']
  - ['Quickstart', 'quickstart']
  - ['Config System', 'config']
  - ['Transfer Learning', 'transfer-learning']
  - ['Custom Models', 'custom-models']
  - ['Parallel Training', 'parallel-training']
@@ -29,12 +30,13 @@ ready-to-use spaCy models.

</Infobox>

### Training CLI & config {#cli-config}

<!-- TODO: intro describing the new v3 training philosophy -->

The recommended way to train your spaCy models is via the
[`spacy train`](/api/cli#train) command on the command line. You can pass in the
following data and information:

1. The **training and evaluation data** in spaCy's
   [binary `.spacy` format](/api/data-formats#binary-training) created using
@ -68,38 +70,22 @@ workflows, from data preprocessing to training and packaging your model.

 </Project>

-<Accordion title="Understanding the training output">
+## Quickstart {#quickstart}

-When you train a model using the [`spacy train`](/api/cli#train) command, you'll
-see a table showing metrics after each pass over the data. Here's what those
-metrics means:
+> #### Instructions
+>
+> 1. Select your requirements and settings. The quickstart widget will
+>    auto-generate a recommended starter config for you.
+> 2. Use the buttons at the bottom to save the result to your clipboard or a
+>    file `config.cfg`.
+> 3. TODO: recommended approach for filling config
+> 4. Run [`spacy train`](/api/cli#train) with your config and data.

-<!-- TODO: update table below and include note about scores in config -->
+import QuickstartTraining from 'widgets/quickstart-training.js'

-| Name       | Description                                                                                        |
-| ---------- | -------------------------------------------------------------------------------------------------- |
-| `Dep Loss` | Training loss for dependency parser. Should decrease, but usually not to 0.                         |
-| `NER Loss` | Training loss for named entity recognizer. Should decrease, but usually not to 0.                   |
-| `UAS`      | Unlabeled attachment score for parser. The percentage of unlabeled correct arcs. Should increase.   |
-| `NER P.`   | NER precision on development data. Should increase.                                                 |
-| `NER R.`   | NER recall on development data. Should increase.                                                    |
-| `NER F.`   | NER F-score on development data. Should increase.                                                   |
-| `Tag %`    | Fine-grained part-of-speech tag accuracy on development data. Should increase.                      |
-| `Token %`  | Tokenization accuracy on development data.                                                          |
-| `CPU WPS`  | Prediction speed on CPU in words per second, if available. Should stay stable.                      |
-| `GPU WPS`  | Prediction speed on GPU in words per second, if available. Should stay stable.                      |
+<QuickstartTraining />

-Note that if the development data has raw text, some of the gold-standard
-entities might not align to the predicted tokenization. These tokenization
-errors are **excluded from the NER evaluation**. If your tokenization makes it
-impossible for the model to predict 50% of your entities, your NER F-score might
-still look good.
-
-</Accordion>
-
----
-
-### Training config files {#config}
+## Training config {#config}

 > #### Migration from spaCy v2.x
 >
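The `config.cfg` saved in step 2 can be checked programmatically before training. A minimal sketch using thinc's `Config` (the `[nlp]` keys shown are assumptions based on the widget output added later in this commit):

```python
from thinc.api import Config

# Hedged sketch: load the saved quickstart config and inspect it.
config = Config().from_disk("config.cfg")
print(config["nlp"]["lang"])      # e.g. "en", assuming an [nlp] block
print(config["nlp"]["pipeline"])  # e.g. ["tagger", "parser"]
```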
@ -237,7 +223,70 @@ compound = 1.001

 <!-- TODO: refer to architectures API: /api/architectures. This should document the architectures in spacy/ml/models -->

-<!-- TODO: how do we document the default configs? -->
+### Metrics, training output and weighted scores {#metrics}
+
+When you train a model using the [`spacy train`](/api/cli#train) command, you'll
+see a table showing the metrics after each pass over the data. The available
+metrics **depend on the pipeline components**. Pipeline components also define
+which scores are shown and how they should be **weighted in the final score**
+that decides about the best model.
+
+The `training.score_weights` setting in your `config.cfg` lets you customize the
+scores shown in the table and how they should be weighted. In this example, the
+labeled dependency accuracy and NER F-score count towards the final score with
+40% each and the tagging accuracy makes up the remaining 20%. The tokenization
+accuracy and speed are both shown in the table, but not counted towards the
+score.
+
+> #### Why do I need score weights?
+>
+> At the end of your training process, you typically want to select the **best
+> model** – but what "best" means depends on the available components and your
+> specific use case. For instance, you may prefer a model with higher NER and
+> lower POS tagging accuracy over a model with lower NER and higher POS
+> accuracy. You can express this preference in the score weights, e.g. by
+> assigning `ents_f` (NER F-score) a higher weight.
+```ini
+[training.score_weights]
+dep_las = 0.4
+ents_f = 0.4
+tag_acc = 0.2
+token_acc = 0.0
+speed = 0.0
+```
+
+The `score_weights` don't _have to_ sum to `1.0` – but it's recommended. When
+you generate a config for a given pipeline, the score weights are generated by
+combining and normalizing the default score weights of the pipeline components.
+The default score weights are defined by each pipeline component via the
+`default_score_weights` setting on the
+[`@Language.component`](/api/language#component) or
+[`@Language.factory`](/api/language#factory). By default, all pipeline
+components are weighted equally.
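To make the weighting concrete, here is a minimal sketch of declaring `default_score_weights` on a factory and combining per-component scores into a final score as a weighted sum. The component name, score names and values are made up for illustration, and the exact normalization spaCy applies may differ:

```python
from spacy.language import Language

# Hypothetical component declaring how its (made-up) score "my_score"
# should count towards the final score.
@Language.factory("my_component", default_score_weights={"my_score": 1.0})
def create_my_component(nlp, name):
    def my_component(doc):
        return doc
    return my_component

# Weighted final score, using the weights from the ini example above.
score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
scores = {"dep_las": 0.82, "ents_f": 0.79, "tag_acc": 0.94}  # example values
final_score = sum(scores[name] * weight for name, weight in score_weights.items())
print(round(final_score, 3))  # 0.832
```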
+<Accordion title="Understanding the training output and score types" spaced>
+
+<!-- TODO: come up with good short explanation of precision and recall -->
+
+| Name                        | Description                                                                                                              |
+| --------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| **Loss**                    | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`.       |
+| **Precision** (P)           | Should increase.                                                                                                          |
+| **Recall** (R)              | Should increase.                                                                                                          |
+| **F-Score** (F)             | The weighted average of precision and recall. Should increase.                                                            |
+| **UAS** / **LAS**           | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase.  |
+| **Words per second** (WPS)  | Prediction speed in words per second. Should stay stable.                                                                 |
+
+<!-- TODO: is this still relevant? -->
+
+Note that if the development data has raw text, some of the gold-standard
+entities might not align to the predicted tokenization. These tokenization
+errors are **excluded from the NER evaluation**. If your tokenization makes it
+impossible for the model to predict 50% of your entities, your NER F-score might
+still look good.
+
+</Accordion>
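As a quick reference for the score types in the table above, these are the standard definitions of precision, recall and F-score in terms of true/false positives and false negatives (standard formulas, not spaCy-specific code):

```python
def precision_recall_f(tp, fp, fn):
    # P = tp / (tp + fp), R = tp / (tp + fn), F = 2 * P * R / (P + R)
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f

# E.g. 80 correctly predicted entities, 20 spurious, 20 missed:
print(precision_recall_f(80, 20, 20))  # (0.8, 0.8, 0.8)
```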
 ## Transfer learning {#transfer-learning}

@ -88,7 +88,8 @@ The recommended workflow for training is to use spaCy's

 [`spacy train`](/api/cli#train) command. The training config defines all
 component settings and hyperparameters in one place and lets you describe a tree
 of objects by referring to creation functions, including functions you register
-yourself.
+yourself. For details on how to get started with training your own model, check
+out the [training quickstart](/usage/training#quickstart).

 <Project id="en_core_bert">
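To illustrate the "functions you register yourself" mentioned above, here is a hedged sketch using thinc's function registry; the registry category, function name and config block are assumptions for illustration, not taken from this diff:

```python
from thinc.api import Config, registry

# Hypothetical custom schedule registered under an assumed name.
@registry.schedules("my_schedule.v1")
def my_schedule(start: float = 0.001):
    value = start
    while True:
        yield value
        value *= 1.001

CONFIG = """
[training.batch_size]
@schedules = "my_schedule.v1"
start = 0.001
"""
# The @-syntax tells the config system which registered function creates
# the object described by this block.
config = Config().from_str(CONFIG)
print(config["training"]["batch_size"])
```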
@ -3,21 +3,23 @@ import React, { useState, useRef } from 'react'
 import Icon from './icon'
 import classes from '../styles/copy.module.sass'

+export function copyToClipboard(ref, callback) {
+    const isClient = typeof window !== 'undefined'
+    if (ref.current && isClient) {
+        ref.current.select()
+        document.execCommand('copy')
+        callback(true)
+        ref.current.blur()
+        setTimeout(() => callback(false), 1000)
+    }
+}
+
 const CopyInput = ({ text, prefix }) => {
     const isClient = typeof window !== 'undefined'
     const supportsCopy = isClient && document.queryCommandSupported('copy')
     const textareaRef = useRef()
     const [copySuccess, setCopySuccess] = useState(false)
+    const onClick = () => copyToClipboard(textareaRef, setCopySuccess)
-    function copyToClipboard() {
-        if (textareaRef.current && isClient) {
-            textareaRef.current.select()
-            document.execCommand('copy')
-            setCopySuccess(true)
-            textareaRef.current.blur()
-            setTimeout(() => setCopySuccess(false), 1000)
-        }
-    }

     function selectText() {
         if (textareaRef.current && isClient) {

@ -37,7 +39,7 @@ const CopyInput = ({ text, prefix }) => {
             onClick={selectText}
         />
         {supportsCopy && (
-            <button title="Copy to clipboard" onClick={copyToClipboard}>
+            <button title="Copy to clipboard" onClick={onClick}>
                 <Icon width={16} name={copySuccess ? 'accept' : 'clipboard'} />
             </button>
         )}
@ -22,6 +22,7 @@ import { ReactComponent as SearchIcon } from '../images/icons/search.svg'
 import { ReactComponent as MoonIcon } from '../images/icons/moon.svg'
 import { ReactComponent as ClipboardIcon } from '../images/icons/clipboard.svg'
 import { ReactComponent as NetworkIcon } from '../images/icons/network.svg'
+import { ReactComponent as DownloadIcon } from '../images/icons/download.svg'

 import classes from '../styles/icon.module.sass'

@ -46,7 +47,8 @@ const icons = {
     search: SearchIcon,
     moon: MoonIcon,
     clipboard: ClipboardIcon,
-    network: NetworkIcon
+    network: NetworkIcon,
+    download: DownloadIcon,
 }

 const Icon = ({ name, width, height, inline, variant, className }) => {
@ -1,4 +1,4 @@
-import React, { Fragment, useState, useEffect } from 'react'
+import React, { Fragment, useState, useEffect, useRef } from 'react'
 import PropTypes from 'prop-types'
 import classNames from 'classnames'
 import { window } from 'browser-monads'

@ -6,6 +6,7 @@ import { window } from 'browser-monads'
 import Section from './section'
 import Icon from './icon'
 import { H2 } from './typography'
+import { copyToClipboard } from './copy'
 import classes from '../styles/quickstart.module.sass'

 function getNewChecked(optionId, checkedForId, multiple) {

@ -14,10 +15,41 @@ function getNewChecked(optionId, checkedForId, multiple) {
     return [...checkedForId, optionId]
 }

-const Quickstart = ({ data, title, description, id, children }) => {
+function getRawContent(ref) {
+    if (ref.current && ref.current.childNodes) {
+        // Select all currently visible nodes (spans and text nodes)
+        const result = [...ref.current.childNodes].filter(el => el.offsetParent !== null)
+        return result.map(el => el.textContent).join('\n')
+    }
+    return ''
+}
+
+const Quickstart = ({
+    data,
+    title,
+    description,
+    copy,
+    download,
+    id,
+    setters = {},
+    hidePrompts,
+    children,
+}) => {
+    const contentRef = useRef()
+    const copyAreaRef = useRef()
+    const isClient = typeof window !== 'undefined'
+    const supportsCopy = isClient && document.queryCommandSupported('copy')
+    const showCopy = supportsCopy && copy
     const [styles, setStyles] = useState({})
     const [checked, setChecked] = useState({})
     const [initialized, setInitialized] = useState(false)
+    const [copySuccess, setCopySuccess] = useState(false)
+    const [otherState, setOtherState] = useState({})
+    const setOther = (id, value) => setOtherState({ ...otherState, [id]: value })
+    const onClickCopy = () => {
+        copyAreaRef.current.value = getRawContent(contentRef)
+        copyToClipboard(copyAreaRef, setCopySuccess)
+    }

     const getCss = (id, checkedOptions) => {
         const checkedForId = checkedOptions[id] || []

@ -32,7 +64,7 @@ const Quickstart = ({ data, title, description, id, children }) => {
         if (!initialized) {
             const initialChecked = Object.assign(
                 {},
-                ...data.map(({ id, options }) => ({
+                ...data.map(({ id, options = [] }) => ({
                     [id]: options.filter(option => option.checked).map(({ id }) => id),
                 }))
             )

@ -48,7 +80,7 @@ const Quickstart = ({ data, title, description, id, children }) => {

     return !data.length ? null : (
         <Section id={id}>
-            <div className={classes.root}>
+            <div className={classNames(classes.root, { [classes.hidePrompts]: !!hidePrompts })}>
                 {title && (
                     <H2 className={classes.title} name={id}>
                         <a href={`#${id}`}>{title}</a>

@ -57,82 +89,154 @@ const Quickstart = ({ data, title, description, id, children }) => {

                 {description && <p className={classes.description}>{description}</p>}

-                {data.map(({ id, title, options = [], multiple, help }) => (
-                    <div key={id} data-quickstart-group={id} className={classes.group}>
-                        <style data-quickstart-style={id}>
-                            {styles[id] ||
-                                `[data-quickstart-results]>[data-quickstart-${id}] { display: none }`}
-                        </style>
-                        <div className={classes.legend}>
-                            {title}
-                            {help && (
-                                <span data-tooltip={help} className={classes.help}>
-                                    {' '}
-                                    <Icon name="help" width={16} spaced />
-                                </span>
-                            )}
-                        </div>
-                        <div className={classes.fields}>
-                            {options.map(option => {
-                                const optionType = multiple ? 'checkbox' : 'radio'
-                                const checkedForId = checked[id] || []
-                                return (
-                                    <Fragment key={option.id}>
-                                        <input
-                                            onChange={() => {
-                                                const newChecked = {
-                                                    ...checked,
-                                                    [id]: getNewChecked(
-                                                        option.id,
-                                                        checkedForId,
-                                                        multiple
-                                                    ),
-                                                }
-                                                setChecked(newChecked)
-                                                setStyles({
-                                                    ...styles,
-                                                    [id]: getCss(id, newChecked),
-                                                })
-                                            }}
-                                            type={optionType}
-                                            className={classNames(
-                                                classes.input,
-                                                classes[optionType]
-                                            )}
-                                            name={id}
-                                            id={`quickstart-${option.id}`}
-                                            value={option.id}
-                                            checked={checkedForId.includes(option.id)}
-                                        />
-                                        <label
-                                            className={classes.label}
-                                            htmlFor={`quickstart-${option.id}`}
-                                        >
-                                            {option.title}
-                                            {option.meta && (
-                                                <span className={classes.meta}>{option.meta}</span>
-                                            )}
-                                            {option.help && (
-                                                <span
-                                                    data-tooltip={option.help}
-                                                    className={classes.help}
-                                                >
-                                                    {' '}
-                                                    <Icon name="help" width={16} spaced />
-                                                </span>
-                                            )}
-                                        </label>
-                                    </Fragment>
-                                )
-                            })}
-                        </div>
-                    </div>
-                ))}
+                {data.map(
+                    ({
+                        id,
+                        title,
+                        options = [],
+                        dropdown = [],
+                        defaultValue,
+                        multiple,
+                        other,
+                        help,
+                    }) => {
+                        // Optional function that's called with the value
+                        const setterFunc = setters[id] || (() => {})
+                        return (
+                            <div key={id} data-quickstart-group={id} className={classes.group}>
+                                <style data-quickstart-style={id} scoped>
+                                    {styles[id] ||
+                                        `[data-quickstart-results]>[data-quickstart-${id}] { display: none }`}
+                                </style>
+                                <div className={classes.legend}>
+                                    {title}
+                                    {help && (
+                                        <span data-tooltip={help} className={classes.help}>
+                                            {' '}
+                                            <Icon name="help" width={16} spaced />
+                                        </span>
+                                    )}
+                                </div>
+                                <div className={classes.fields}>
+                                    {!!dropdown.length && (
+                                        <select
+                                            defaultValue={defaultValue}
+                                            className={classes.select}
+                                            onChange={({ target }) => {
+                                                const value = target.value
+                                                if (value != other) {
+                                                    setterFunc(value)
+                                                    setOther(id, false)
+                                                } else {
+                                                    setterFunc('')
+                                                    setOther(id, true)
+                                                }
+                                            }}
+                                        >
+                                            {dropdown.map(({ id, title }) => (
+                                                <option key={id} value={id}>
+                                                    {title}
+                                                </option>
+                                            ))}
+                                            {other && <option value={other}>{other}</option>}
+                                        </select>
+                                    )}
+                                    {other && otherState[id] && (
+                                        <input
+                                            type="text"
+                                            className={classes.textInput}
+                                            placeholder="Type here..."
+                                            onChange={({ target }) => setterFunc(target.value)}
+                                        />
+                                    )}
+                                    {options.map(option => {
+                                        const optionType = multiple ? 'checkbox' : 'radio'
+                                        const checkedForId = checked[id] || []
+                                        return (
+                                            <Fragment key={option.id}>
+                                                <input
+                                                    onChange={() => {
+                                                        const newChecked = {
+                                                            ...checked,
+                                                            [id]: getNewChecked(
+                                                                option.id,
+                                                                checkedForId,
+                                                                multiple
+                                                            ),
+                                                        }
+                                                        setChecked(newChecked)
+                                                        setStyles({
+                                                            ...styles,
+                                                            [id]: getCss(id, newChecked),
+                                                        })
+                                                        setterFunc(newChecked[id])
+                                                    }}
+                                                    type={optionType}
+                                                    className={classNames(
+                                                        classes.input,
+                                                        classes[optionType]
+                                                    )}
+                                                    name={id}
+                                                    id={`quickstart-${option.id}`}
+                                                    value={option.id}
+                                                    checked={checkedForId.includes(option.id)}
+                                                />
+                                                <label
+                                                    className={classes.label}
+                                                    htmlFor={`quickstart-${option.id}`}
+                                                >
+                                                    {option.title}
+                                                    {option.meta && (
+                                                        <span className={classes.meta}>
+                                                            {option.meta}
+                                                        </span>
+                                                    )}
+                                                    {option.help && (
+                                                        <span
+                                                            data-tooltip={option.help}
+                                                            className={classes.help}
+                                                        >
+                                                            {' '}
+                                                            <Icon name="help" width={16} spaced />
+                                                        </span>
+                                                    )}
+                                                </label>
+                                            </Fragment>
+                                        )
+                                    })}
+                                </div>
+                            </div>
+                        )
+                    }
+                )}
                 <pre className={classes.code}>
-                    <code className={classes.results} data-quickstart-results="">
+                    <code className={classes.results} data-quickstart-results="" ref={contentRef}>
                         {children}
                     </code>
+
+                    <menu className={classes.menu}>
+                        {showCopy && (
+                            <button
+                                title="Copy to clipboard"
+                                onClick={onClickCopy}
+                                className={classes.iconButton}
+                            >
+                                <Icon width={18} name={copySuccess ? 'accept' : 'clipboard'} />
+                            </button>
+                        )}
+                        {download && (
+                            <a
+                                href={`data:application/octet-stream,${getRawContent(contentRef)}`}
+                                title="Download file"
+                                download={download}
+                                className={classes.iconButton}
+                            >
+                                <Icon width={18} name="download" />
+                            </a>
+                        )}
+                    </menu>
                 </pre>
+                {showCopy && <textarea ref={copyAreaRef} className={classes.copyArea} rows={1} />}
             </div>
         </Section>
     )

@ -141,6 +245,7 @@ const Quickstart = ({ data, title, description, id, children }) => {
 Quickstart.defaultProps = {
     data: [],
     id: 'quickstart',
+    copy: true,
 }

 Quickstart.propTypes = {

@ -164,12 +269,13 @@ Quickstart.propTypes = {
     ),
 }

-const QS = ({ children, prompt = 'bash', divider = false, ...props }) => {
+const QS = ({ children, prompt = 'bash', divider = false, comment = false, ...props }) => {
     const qsClassNames = classNames({
         [classes.prompt]: !!prompt && !divider,
         [classes.bash]: prompt === 'bash' && !divider,
         [classes.python]: prompt === 'python' && !divider,
         [classes.divider]: !!divider,
+        [classes.comment]: !!comment,
     })
     const attrs = Object.assign(
         {},
website/src/images/icons/download.svg (new file, 4 lines)
@ -0,0 +1,4 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
+    <path d="M16.707 7.404c-0.189-0.188-0.448-0.283-0.707-0.283s-0.518 0.095-0.707 0.283l-2.293 2.293v-6.697c0-0.552-0.448-1-1-1s-1 0.448-1 1v6.697l-2.293-2.293c-0.189-0.188-0.44-0.293-0.707-0.293s-0.518 0.105-0.707 0.293c-0.39 0.39-0.39 1.024 0 1.414l4.707 4.682 4.709-4.684c0.388-0.387 0.388-1.022-0.002-1.412z"></path>
+    <path d="M20.987 16c0-0.105-0.004-0.211-0.039-0.316l-2-6c-0.136-0.409-0.517-0.684-0.948-0.684h-0.219c-0.094 0.188-0.21 0.368-0.367 0.525l-1.482 1.475h1.348l1.667 5h-13.893l1.667-5h1.348l-1.483-1.475c-0.157-0.157-0.274-0.337-0.367-0.525h-0.219c-0.431 0-0.812 0.275-0.948 0.684l-2 6c-0.035 0.105-0.039 0.211-0.039 0.316-0.013 0-0.013 5-0.013 5 0 0.553 0.447 1 1 1h16c0.553 0 1-0.447 1-1 0 0 0-5-0.013-5z"></path>
+</svg>
@ -24,7 +24,7 @@
 .code,
 .juniper-input pre
     display: block
-    padding: 1.75em 2em
+    padding: 1.75em 1.5em

 .code
     &[data-prompt]:before,

@ -370,9 +370,9 @@ body [id]:target
     background-color: var(--color-dark-secondary)
     border-left: 0.35em solid var(--color-theme)
     display: block
-    margin-right: -2em
-    margin-left: -2em
-    padding-right: 2em
+    margin-right: -1.5em
+    margin-left: -1.5em
+    padding-right: 1.5em
     padding-left: 1.65em

     &:empty:before

@ -83,6 +83,24 @@
 .fields
     flex: 100%

+.select
+    cursor: pointer
+    border: 1px solid var(--color-subtle)
+    border-radius: var(--border-radius)
+    display: inline-block
+    padding: 0.35rem 1.25rem
+    margin: 0 1rem 0.75rem 0
+    font-size: var(--font-size-sm)
+    background: var(--color-back)
+
+.text-input
+    border: 1px solid var(--color-subtle)
+    border-radius: var(--border-radius)
+    display: inline-block
+    padding: 0.35rem 0.75rem
+    font-size: var(--font-size-sm)
+    background: var(--color-back)
+
 .code
     background: var(--color-front)
     color: var(--color-back)

@ -95,6 +113,7 @@
     border-bottom-right-radius: var(--border-radius)
     -webkit-font-smoothing: subpixel-antialiased
     -moz-osx-font-smoothing: auto
+    position: relative

 .results
     display: block

@ -105,6 +124,9 @@
     & > span
         display: block

+.hide-prompts .prompt:before
+    content: initial !important
+
 .prompt:before
     color: var(--color-theme)
     margin-right: 1em

@ -115,6 +137,9 @@
 .python:before
     content: ">>>"

+.comment
+    color: var(--syntax-comment)
+
 .divider
     padding: 1.5rem 0

@ -123,3 +148,29 @@

     .input:checked + .label &
         color: inherit
+
+.copy-area
+    width: 1px
+    height: 1px
+    opacity: 0
+    position: absolute
+
+.menu
+    color: var(--color-subtle)
+    padding-right: 1.5rem
+    display: inline-block
+    position: absolute
+    bottom: var(--spacing-xs)
+    right: 0
+
+.icon-button
+    display: inline-block
+    color: inherit
+    cursor: pointer
+    transition: transform 0.05s ease
+
+    &:not(:last-child)
+        margin-right: 1.5rem
+
+    &:hover
+        transform: scale(1.1)
@ -92,7 +92,7 @@ const QuickstartInstall = ({ id, title }) => (
     </QS>
     <QS package="source">pip install -r requirements.txt</QS>
     <QS addition="transformers" package="pip">
-        pip install -U spacy-lookups-transformers
+        pip install -U spacy-transformers
     </QS>
     <QS addition="transformers" package="source">
         pip install -U spacy-transformers
website/src/widgets/quickstart-training.js (new file, 118 lines)
@ -0,0 +1,118 @@
+import React, { useState } from 'react'
+import { StaticQuery, graphql } from 'gatsby'
+
+import { Quickstart, QS } from '../components/quickstart'
+
+const DEFAULT_LANG = 'en'
+const MODELS_SMALL = { en: 'roberta-base-small' }
+const MODELS_LARGE = { en: 'roberta-base' }
+
+const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
+const COMMENT = `# This is an auto-generated partial config for training a model.
+# TODO: instructions for how to fill and use it`
+const DATA = [
+    {
+        id: 'lang',
+        title: 'Language',
+        defaultValue: DEFAULT_LANG,
+    },
+    {
+        id: 'components',
+        title: 'Components',
+        help: 'Pipeline components to train. Requires training data for those annotations.',
+        options: COMPONENTS.map(id => ({ id, title: id })),
+        multiple: true,
+    },
+    {
+        id: 'hardware',
+        title: 'Hardware',
+        options: [
+            { id: 'cpu-only', title: 'CPU only' },
+            { id: 'cpu', title: 'CPU preferred' },
+            { id: 'gpu', title: 'GPU', checked: true },
+        ],
+    },
+    {
+        id: 'optimize',
+        title: 'Optimize for',
+        help: '...',
+        options: [
+            { id: 'efficiency', title: 'efficiency', checked: true },
+            { id: 'accuracy', title: 'accuracy' },
+        ],
+    },
+    {
+        id: 'config',
+        title: 'Configuration',
+        options: [
+            {
+                id: 'independent',
+                title: 'independent components',
+                help: "Make components independent and don't share weights",
+            },
+        ],
+        multiple: true,
+    },
+]
+
+const QuickstartTraining = ({ id, title, download = 'config.cfg' }) => {
+    const [lang, setLang] = useState(DEFAULT_LANG)
+    const [pipeline, setPipeline] = useState([])
+    const setters = { lang: setLang, components: setPipeline }
+    return (
+        <StaticQuery
+            query={query}
+            render={({ site }) => {
+                const langs = site.siteMetadata.languages
+                DATA[0].dropdown = langs.map(({ name, code }) => ({
+                    id: code,
+                    title: name,
+                }))
+                return (
+                    <Quickstart
+                        download={download}
+                        data={DATA}
+                        title={title}
+                        id={id}
+                        setters={setters}
+                        hidePrompts
+                    >
+                        <QS comment>{COMMENT}</QS>
+                        <span>[nlp]</span>
+                        <span>lang = "{lang}"</span>
+                        <span>pipeline = {JSON.stringify(pipeline).replace(/,/g, ', ')}</span>
+                        <br />
+                        <span>[components]</span>
+                        <br />
+                        <span>[components.transformer]</span>
+                        <QS optimize="efficiency">name = "{MODELS_SMALL[lang]}"</QS>
+                        <QS optimize="accuracy">name = "{MODELS_LARGE[lang]}"</QS>
+                        {!!pipeline.length && <br />}
+                        {pipeline.map((pipe, i) => (
+                            <>
+                                {i !== 0 && <br />}
+                                <span>[components.{pipe}]</span>
+                                <span>factory = "{pipe}"</span>
+                            </>
+                        ))}
+                    </Quickstart>
+                )
+            }}
+        />
+    )
+}
+
+const query = graphql`
+    query QuickstartTrainingQuery {
+        site {
+            siteMetadata {
+                languages {
+                    code
+                    name
+                }
+            }
+        }
+    }
+`
+
+export default QuickstartTraining