mirror of https://github.com/explosion/spaCy.git (synced 2025-07-10 16:22:29 +03:00)

Merge branch 'develop' into nightly.spacy.io

Commit 32011780a2

@@ -104,9 +104,11 @@ For detailed installation instructions, see the
 ### pip
 
 Using pip, spaCy releases are available as source packages and binary wheels (as
-of `v2.0.13`).
+of `v2.0.13`). Before you install spaCy and its dependencies, make sure that
+your `pip`, `setuptools` and `wheel` are up to date.
+
 ```bash
+pip install -U pip setuptools wheel
 pip install spacy
 ```

@@ -18,7 +18,7 @@ pydantic>=1.5.0,<2.0.0
 pytokenizations
 # Official Python utilities
 setuptools
-packaging
+packaging>=20.0
 importlib_metadata>=0.20; python_version < "3.8"
 typing_extensions>=3.7.4; python_version < "3.8"
 # Development dependencies

@@ -55,7 +55,7 @@ install_requires =
     pytokenizations
     # Official Python utilities
     setuptools
-    packaging
+    packaging>=20.0
     importlib_metadata>=0.20; python_version < "3.8"
     typing_extensions>=3.7.4; python_version < "3.8"

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a34"
+__version__ = "3.0.0a35"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

@@ -278,7 +278,7 @@ def show_validation_error(
             "fill-config' command to fill in all the defaults, if possible:",
             spaced=True,
         )
-        print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
+        print(f"{COMMAND} init fill-config {config_path} {config_path} \n")
         sys.exit(1)
     except InterpolationError as e:
         msg.fail("Config validation error", e, exits=1)

@@ -1091,10 +1091,11 @@ class Language:
         for name, proc in self.pipeline:
             if (
                 name not in exclude
-                and hasattr(proc, "model")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
                 and proc.model not in (True, False, None)
             ):
-                proc.model.finish_update(sgd)
+                proc.finish_update(sgd)
         return losses
 
     def rehearse(

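This hunk is the heart of the commit: the training loop now asks each component to finish its own update via `finish_update()`, guarded by a duck-typed `is_trainable()` check, instead of reaching into `proc.model` directly. A minimal, self-contained sketch of the pattern — stand-in classes, not spaCy's real ones:

```python
# Illustrative stand-ins, not spaCy's real API.
class DummyModel:
    def finish_update(self, sgd):
        print("optimizer step applied")

class TrainableComponent:
    def __init__(self, model):
        self.model = model

    def is_trainable(self):
        return self.model is not None

    def finish_update(self, sgd):
        # Default behaviour: forward to the model. A component that owns
        # several models (or none) can now override this in one place.
        self.model.finish_update(sgd)

pipeline = [("tagger", TrainableComponent(DummyModel()))]
for name, proc in pipeline:
    if hasattr(proc, "is_trainable") and proc.is_trainable():
        proc.finish_update(sgd=None)  # sgd would be a thinc Optimizer
```
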
@@ -1297,7 +1298,9 @@ class Language:
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
-            if not hasattr(pipe, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if not hasattr(pipe, "pipe") or not hasattr(pipe, "is_trainable") or not pipe.is_trainable():
                 docs = _pipe(docs, pipe, kwargs)
             else:
                 docs = pipe.pipe(docs, **kwargs)

@@ -1407,7 +1410,9 @@ class Language:
             kwargs = component_cfg.get(name, {})
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
-            if hasattr(proc, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
                 f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc

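Both `Language` hunks apply the same duck-typed dispatch: batch through `pipe()` only when the component is genuinely trainable, otherwise call it per doc. A hedged sketch of that dispatch as a standalone helper (illustrative names only):

```python
# Illustrative helper mirroring the condition used in the two hunks above.
def apply_component(docs, proc, **kwargs):
    if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
        # Trainable components batch their predictions efficiently.
        yield from proc.pipe(docs, **kwargs)
    else:
        # Non-trainable components may only define __call__, or may expose a
        # pipe() that wraps dummy predict/set_annotations methods.
        for doc in docs:
            yield proc(doc)
```
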
@@ -34,7 +34,7 @@ def StaticVectors(
 def forward(
     model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
 ) -> Tuple[Ragged, Callable]:
-    if not len(docs):
+    if not sum(len(doc) for doc in docs):
         return _handle_empty(model.ops, model.get_dim("nO"))
     key_attr = model.attrs["key_attr"]
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))

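The `StaticVectors` fix matters because a batch can be non-empty while every doc in it is empty: counting docs misses that case, counting tokens catches both. A tiny illustration, with plain lists standing in for `Doc` objects:

```python
# Plain lists stand in for Doc objects here.
docs = [[], [], []]                     # three "docs", zero tokens overall
assert len(docs) == 3                   # old guard: batch looks non-empty
assert sum(len(d) for d in docs) == 0   # new guard: nothing to embed
```
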
@@ -238,7 +238,7 @@ class EntityLinker(Pipe):
             )
             bp_context(d_scores)
             if sgd is not None:
-                self.model.finish_update(sgd)
+                self.finish_update(sgd)
             losses[self.name] += loss
         if set_annotations:
             self.set_annotations(docs, predictions)

@@ -1,8 +1,10 @@
-from typing import Optional, Union, List, Dict, Tuple, Iterable, Any
+from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
 from collections import defaultdict
 from pathlib import Path
 import srsly
 
+from .pipe import Pipe
+from ..training import Example
 from ..language import Language
 from ..errors import Errors
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList

@@ -50,7 +52,7 @@ def make_entity_ruler(
     )
 
 
-class EntityRuler:
+class EntityRuler(Pipe):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
     `EntityRecognizer` to boost accuracy, or used on its own to implement a

@@ -183,6 +185,26 @@ class EntityRuler:
                 all_labels.add(l)
         return tuple(all_labels)
 
+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language] = None,
+        patterns: Optional[Sequence[PatternType]] = None,
+    ):
+        """Initialize the pipe for training.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        patterns Optional[Iterable[PatternType]]: The list of patterns.
+
+        DOCS: https://nightly.spacy.io/api/entityruler#initialize
+        """
+        if patterns:
+            self.add_patterns(patterns)
+
+
     @property
     def ent_ids(self) -> Tuple[str, ...]:
         """All entity ids present in the match patterns `id` properties

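A hedged usage sketch of the new `initialize` method, following the example added to the `EntityRuler` docs later in this commit (the pattern itself is illustrative):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Explosion"}]  # illustrative pattern
# No training examples are needed; the ruler only consumes the patterns.
ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
assert "ORG" in ruler.labels
```
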
@@ -320,6 +342,12 @@ class EntityRuler:
         validate_examples(examples, "EntityRuler.score")
         return Scorer.score_spans(examples, "ents", **kwargs)
 
+    def predict(self, docs):
+        pass
+
+    def set_annotations(self, docs, scores):
+        pass
+
     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityRuler":

@@ -209,7 +209,7 @@ class ClozeMultitask(Pipe):
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += loss
         return losses

@@ -132,7 +132,7 @@ cdef class Pipe:
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]

@@ -228,6 +228,9 @@ cdef class Pipe:
     def is_resizable(self):
        return hasattr(self, "model") and "resize_output" in self.model.attrs
 
+    def is_trainable(self):
+        return hasattr(self, "model") and isinstance(self.model, Model)
+
     def set_output(self, nO):
         if self.is_resizable():
             self.model.attrs["resize_output"](self.model, nO)

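In miniature, the contract these two capability checks define — stand-in classes below; the real `is_trainable` additionally requires the model to be a thinc `Model` instance:

```python
# Stand-ins illustrating the capability checks added to Pipe above.
class DummyModel:
    attrs = {"resize_output": lambda model, nO: model}

class MyPipe:
    def __init__(self, model=None):
        if model is not None:
            self.model = model

    def is_trainable(self):
        # Simplified: the real check also verifies isinstance(self.model, Model).
        return hasattr(self, "model")

    def is_resizable(self):
        return hasattr(self, "model") and "resize_output" in self.model.attrs

print(MyPipe(DummyModel()).is_trainable())  # True
print(MyPipe().is_trainable())              # False: treated as rule-based
```
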
@@ -245,6 +248,17 @@ cdef class Pipe:
         with self.model.use_params(params):
             yield
 
+    def finish_update(self, sgd):
+        """Update parameters using the current parameter gradients.
+        The Optimizer instance contains the functionality to perform
+        the stochastic gradient descent.
+
+        sgd (thinc.api.Optimizer): The optimizer.
+
+        DOCS: https://nightly.spacy.io/api/pipe#finish_update
+        """
+        self.model.finish_update(sgd)
+
     def score(self, examples, **kwargs):
         """Score a batch of examples.
 

@@ -203,7 +203,7 @@ class Tagger(Pipe):
         loss, d_tag_scores = self.get_loss(examples, tag_scores)
         bp_tag_scores(d_tag_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
 
         losses[self.name] += loss
         if set_annotations:

@@ -238,7 +238,7 @@ class Tagger(Pipe):
         target = self._rehearsal_model(examples)
         gradient = guesses - target
         backprop(gradient)
-        self.model.finish_update(sgd)
+        self.finish_update(sgd)
         if losses is not None:
             losses.setdefault(self.name, 0.0)
             losses[self.name] += (gradient**2).sum()

@@ -212,7 +212,7 @@ class TextCategorizer(Pipe):
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]

@@ -256,7 +256,7 @@ class TextCategorizer(Pipe):
         gradient = scores - target
         bp_scores(gradient)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += (gradient ** 2).sum()
         return losses

@@ -188,7 +188,7 @@ class Tok2Vec(Pipe):
             accumulate_gradient(one_d_tokvecs)
             d_docs = bp_tokvecs(d_tokvecs)
             if sgd is not None:
-                self.model.finish_update(sgd)
+                self.finish_update(sgd)
             return d_docs
 
         batch_id = Tok2VecListener.get_batch_id(docs)

@@ -315,7 +315,7 @@ cdef class Parser(Pipe):
 
         backprop_tok2vec(golds)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, all_states)

@@ -367,7 +367,7 @@ cdef class Parser(Pipe):
         # Do the backprop
         backprop_tok2vec(docs)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss / n_scores
         del backprop
         del backprop_tok2vec

@@ -437,7 +437,9 @@ cdef class Parser(Pipe):
         for name, component in nlp.pipeline:
             if component is self:
                 break
-            if hasattr(component, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
                 doc_sample = list(component.pipe(doc_sample, batch_size=8))
             else:
                 doc_sample = [component(doc) for doc in doc_sample]

@@ -119,7 +119,7 @@ def validate_init_settings(
     if types don't match or required values are missing.
 
     func (Callable): The initialize method of a given component etc.
-    settings (Dict[str, Any]): The settings from the repsective [initialize] block.
+    settings (Dict[str, Any]): The settings from the respective [initialize] block.
     section (str): Initialize section, for error message.
     name (str): Name of the block in the section.
     exclude (Iterable[str]): Parameter names to exclude from schema.

@@ -10,12 +10,14 @@ def test_build_dependencies():
         "mock",
         "flake8",
     ]
+    # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
         "fugashi",
         "natto-py",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",
+        "spacy-pkuseg",
    ]
 
     # check requirements.txt

@@ -121,7 +121,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
-    # initialize with patterns from asset
+    # initialize with patterns from misc registry
     nlp.config["initialize"]["components"]["attribute_ruler"] = {
         "patterns": {"@misc": "attribute_ruler_patterns"}
     }

@@ -1,4 +1,6 @@
 import pytest
+
+from spacy import registry
 from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler

@@ -11,6 +13,7 @@ def nlp():
 
 
 @pytest.fixture
+@registry.misc("entity_ruler_patterns")
 def patterns():
     return [
         {"label": "HELLO", "pattern": "hello world"},

@@ -42,6 +45,29 @@ def test_entity_ruler_init(nlp, patterns):
     assert doc.ents[1].label_ == "BYE"
 
 
+def test_entity_ruler_init_patterns(nlp, patterns):
+    # initialize with patterns
+    ruler = nlp.add_pipe("entity_ruler")
+    assert len(ruler.labels) == 0
+    ruler.initialize(lambda: [], patterns=patterns)
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world bye bye")
+    assert doc.ents[0].label_ == "HELLO"
+    assert doc.ents[1].label_ == "BYE"
+    nlp.remove_pipe("entity_ruler")
+    # initialize with patterns from misc registry
+    nlp.config["initialize"]["components"]["entity_ruler"] = {
+        "patterns": {"@misc": "entity_ruler_patterns"}
+    }
+    ruler = nlp.add_pipe("entity_ruler")
+    assert len(ruler.labels) == 0
+    nlp.initialize()
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world bye bye")
+    assert doc.ents[0].label_ == "HELLO"
+    assert doc.ents[1].label_ == "BYE"
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)

@@ -7,6 +7,7 @@ import numpy
 
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
+from spacy.ml.staticvectors import StaticVectors
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES

@@ -185,3 +186,22 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     model1 = get_updated_model()
     model2 = get_updated_model()
     assert_array_equal(get_all_params(model1), get_all_params(model2))
+
+
+@pytest.mark.parametrize(
+    "model_func,kwargs",
+    [
+        (StaticVectors, {"nO": 128, "nM": 300}),
+    ]
+)
+def test_empty_docs(model_func, kwargs):
+    nlp = English()
+    model = model_func(**kwargs).initialize()
+    # Test the layer can be called successfully with 0, 1 and 2 empty docs.
+    for n_docs in range(3):
+        docs = [nlp("") for _ in range(n_docs)]
+        # Test predict
+        _ = model.predict(docs)
+        # Test backprop
+        output, backprop = model.begin_update(docs)
+        _ = backprop(output)

@@ -49,7 +49,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
         nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    logger.info("Initialized pipeline components")
+    logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
     return nlp

@@ -17,8 +17,12 @@ def console_logger(progress_bar: bool = False):
         nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
         msg = Printer(no_print=True)
-        # we assume here that only components are enabled that should be trained & logged
-        logged_pipes = nlp.pipe_names
+        # ensure that only trainable components are logged
+        logged_pipes = [
+            name
+            for name, proc in nlp.pipeline
+            if hasattr(proc, "is_trainable") and proc.is_trainable()
+        ]
         eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]

@@ -41,19 +45,10 @@ def console_logger(progress_bar: bool = False):
             if progress is not None:
                 progress.update(1)
             return
-        try:
-            losses = [
-                "{0:.2f}".format(float(info["losses"][pipe_name]))
-                for pipe_name in logged_pipes
-            ]
-        except KeyError as e:
-            raise KeyError(
-                Errors.E983.format(
-                    dict="scores (losses)",
-                    key=str(e),
-                    keys=list(info["losses"].keys()),
-                )
-            ) from None
+        losses = [
+            "{0:.2f}".format(float(info["losses"][pipe_name]))
+            for pipe_name in logged_pipes
+        ]
 
         scores = []
         for col in score_cols:

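The effect of the logger change in miniature: components that don't report `is_trainable()` simply drop out of the loss columns, which is why the `KeyError` handling above becomes unnecessary. Stand-in objects for illustration:

```python
class Trainable:
    def is_trainable(self):
        return True

class RuleBased:  # e.g. an entity_ruler: no loss to log
    pass

pipeline = [("tagger", Trainable()), ("entity_ruler", RuleBased())]
logged_pipes = [
    name
    for name, proc in pipeline
    if hasattr(proc, "is_trainable") and proc.is_trainable()
]
print(logged_pipes)  # ['tagger']
```
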
@@ -187,10 +187,11 @@ def train_while_improving(
             for name, proc in nlp.pipeline:
                 if (
                     name not in exclude
-                    and hasattr(proc, "model")
+                    and hasattr(proc, "is_trainable")
+                    and proc.is_trainable()
                     and proc.model not in (True, False, None)
                 ):
-                    proc.model.finish_update(optimizer)
+                    proc.finish_update(optimizer)
             optimizer.step_schedules()
             if not (step % eval_frequency):
                 if optimizer.averages:

@@ -293,6 +294,7 @@ def update_meta(
         if metric is not None:
             nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
     for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+        if pipe_name in info["losses"]:
+            nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

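What the added guard changes: components without an entry in `info["losses"]` (e.g. frozen or rule-based ones) no longer raise a `KeyError` when the meta is updated. Illustrative values:

```python
info = {"losses": {"tagger": 12.5, "parser": 88.1}}
meta = {"performance": {"tag_acc": 0.95}}
for pipe_name in ["tagger", "parser", "entity_ruler"]:
    if pipe_name in info["losses"]:  # entity_ruler is skipped, no KeyError
        meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
print(meta["performance"])
# {'tag_acc': 0.95, 'tagger_loss': 12.5, 'parser_loss': 88.1}
```
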
@@ -128,8 +128,8 @@ Get all patterns that have been added to the attribute ruler in the
 
 ## AttributeRuler.initialize {#initialize tag="method"}
 
-Initialize the component with data. Typically called before training to load in
-rules from a file. This method is typically called by
+Initialize the component with data and used before training to load in rules
+from a file. This method is typically called by
 [`Language.initialize`](/api/language#initialize) and lets you customize
 arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the

@@ -300,17 +300,16 @@ $ python -m spacy debug config [config_path] [--code] [--show-functions] [--show
 
 ```
 ✘ Config validation error
-dropout     field required
-optimizer   field required
-optimize    extra fields not permitted
 
-{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
+training -> dropout     field required
+training -> optimizer   field required
+training -> optimize    extra fields not permitted
+
+{'seed': 0, 'accumulate_gradient': 1, 'dev_corpus': 'corpora.dev', 'train_corpus': 'corpora.train', 'gpu_allocator': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'before_to_disk': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'logger': {'@loggers': 'spacy.ConsoleLogger.v1', 'progress_bar': False}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
 
 If your config contains missing values, you can run the 'init fill-config'
 command to fill in all the defaults, if possible:
 
-python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/starter-config_invalid.cfg
+python -m spacy init fill-config tmp/starter-config_invalid.cfg tmp/starter-config_invalid.cfg
 ```
 
 </Accordion>

@@ -181,20 +181,20 @@ This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).
 
 | Name                  | Description |
-| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `before_to_disk`      | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
 | `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
-| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
+| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
 | `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
 | `logger`              | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
 | `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
 | `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
 | `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
 | `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
 | `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
 | `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
 | `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
 | `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |

@@ -206,16 +206,16 @@ This section is optional and defines settings and controls for
 used when you run [`spacy pretrain`](/api/cli#pretrain).
 
 | Name           | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------ |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `max_epochs`   | Maximum number of epochs. Defaults to `1000`. ~~int~~ |
 | `dropout`      | The dropout rate. Defaults to `0.2`. ~~float~~ |
 | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
 | `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
-| `optimizer`    | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
-| `corpus`       | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ |
-| `batcher`      | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
-| `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
-| `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ |
+| `optimizer`    | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
+| `corpus`       | Dot notation of the config location defining the corpus with raw text. Defaults to `corpora.pretrain`. ~~str~~ |
+| `batcher`      | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
+| `component`    | Component name to identify the layer with the model to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
+| `layer`        | The specific layer of the model to pretrain. If empty, the whole model will be used. ~~str~~ |
 
 ### initialize {#config-initialize tag="section"}

@@ -224,6 +224,9 @@ It's used by [`Language.initialize`](/api/language#initialize) and typically
 called right before training (but not at runtime). The section allows you to
 specify local file paths or custom functions to load data resources from,
 without requiring them at runtime when you load the trained pipeline back in.
+Also see the usage guides on the
+[config lifecycle](/usage/training#config-lifecycle) and
+[custom initialization](/usage/training#initialization).
 
 > #### Example
 >

@@ -74,6 +74,38 @@ be a token pattern (list) or a phrase pattern (string). For example:
 | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ |
 | `patterns`   | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
 
+## EntityRuler.initialize {#initialize tag="method" new="3"}
+
+Initialize the component with data and used before training to load in rules
+from a file. This method is typically called by
+[`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
+
+> #### Example
+>
+> ```python
+> entity_ruler = nlp.add_pipe("entity_ruler")
+> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.entity_ruler]
+>
+> [initialize.components.entity_ruler.patterns]
+> @readers = "srsly.read_jsonl.v1"
+> path = "corpus/entity_ruler_patterns.jsonl"
+> ```
+
+| Name           | Description |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ |             |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ |
+
 ## EntityRuler.\_\_len\_\_ {#len tag="method"}
 
 The number of all patterns added to the entity ruler.

@@ -177,7 +209,7 @@ only the patterns are saved as JSONL. If a directory name is provided, a
 
 ## EntityRuler.from_disk {#from_disk tag="method"}
 
-Load the entity ruler from a file. Expects either a file containing
+Load the entity ruler from a path. Expects either a file containing
 newline-delimited JSON (JSONL) with one entry per line, or a directory
 containing a `patterns.jsonl` file and a `cfg` file with the component
 configuration.

@@ -294,6 +294,24 @@ context, the original parameters are restored.
 | -------- | -------------------------------------------------- |
 | `params` | The parameter values to use in the model. ~~dict~~ |
 
+## Pipe.finish_update {#finish_update tag="method"}
+
+Update parameters using the current parameter gradients. Defaults to calling
+[`self.model.finish_update`](https://thinc.ai/docs/api-model#finish_update).
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = nlp.initialize()
+> losses = pipe.update(examples, sgd=None)
+> pipe.finish_update(sgd)
+> ```
+
+| Name  | Description                           |
+| ----- | ------------------------------------- |
+| `sgd` | An optimizer. ~~Optional[Optimizer]~~ |
+
 ## Pipe.add_label {#add_label tag="method"}
 
 > #### Example

[Image diff suppressed because one or more lines are too long. Before: 50 KiB.]

website/docs/images/trainable_component.svg (new file, 55 lines)
[Image diff suppressed because one or more lines are too long. After: 76 KiB.]

@@ -646,7 +646,9 @@ get_candidates = model.attrs["get_candidates"]
 
 To use our new relation extraction model as part of a custom
 [trainable component](/usage/processing-pipelines#trainable-components), we
-create a subclass of [`Pipe`](/api/pipe) that holds the model:
+create a subclass of [`Pipe`](/api/pipe) that holds the model.
+
+![Illustration of Pipe methods](../images/trainable_component.svg)
 
 ```python
 ### Pipeline component skeleton

@@ -826,7 +828,7 @@ def __call__(self, Doc doc):
 
 Once our `Pipe` subclass is fully implemented, we can
 [register](/usage/processing-pipelines#custom-components-factories) the
-component with the [`@Language.factory`](/api/lnguage#factory) decorator. This
+component with the [`@Language.factory`](/api/language#factory) decorator. This
 assigns it a name and lets you create the component with
 [`nlp.add_pipe`](/api/language#add_pipe) and via the
 [config](/usage/training#config).

@@ -98,10 +98,10 @@ The Chinese language class supports three word segmentation options, `char`,
 > # Jieba
 > cfg = {"segmenter": "jieba"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
-> # PKUSeg with "default" model provided by pkuseg
+> # PKUSeg with "mixed" model provided by pkuseg
 > cfg = {"segmenter": "pkuseg"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
-> nlp.tokenizer.initialize(pkuseg_model="default")
+> nlp.tokenizer.initialize(pkuseg_model="mixed")
 > ```
 
 ```ini

@@ -115,7 +115,7 @@ segmenter = "char"
 | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `char`    | **Character segmentation:** Character segmentation is the default segmentation option. It's enabled when you create a new `Chinese` language class or call `spacy.blank("zh")`. |
 | `jieba`   | **Jieba:** to use [Jieba](https://github.com/fxsjy/jieba) for word segmentation, you can set the option `segmenter` to `"jieba"`. |
-| `pkuseg`  | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |
+| `pkuseg`  | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/explosion/spacy-pkuseg) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |
 
 <Infobox title="Changed in v3.0" variant="warning">

@@ -134,9 +134,9 @@ The `initialize` method for the Chinese tokenizer class supports the following
 config settings for loading `pkuseg` models:
 
 | Name               | Description |
-| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `pkuseg_model`     | Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ |
-| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`. ~~str~~ |
+| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `pkuseg_model`     | Name of a model provided by `spacy-pkuseg` or the path to a local model directory. ~~str~~ |
+| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`, the default provided dictionary. ~~str~~ |
 
 The initialization settings are typically provided in the
 [training config](/usage/training#config) and the data is loaded in before

@@ -164,14 +164,17 @@ You can also initialize the tokenizer for a blank language class by calling its
 cfg = {"segmenter": "pkuseg"}
 nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
 
-# Load "default" model
-nlp.tokenizer.initialize(pkuseg_model="default")
+# Load spaCy's OntoNotes model
+nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes")
+
+# Load pkuseg's "news" model
+nlp.tokenizer.initialize(pkuseg_model="news")
 
 # Load local model
 nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")
 
 # Override the user directory
-nlp.tokenizer.initialize(pkuseg_model="default", pkuseg_user_dict="/path/to/user_dict")
+nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes", pkuseg_user_dict="/path/to/user_dict")
 ```
 
 You can also modify the user dictionary on-the-fly:

@@ -195,13 +198,13 @@ The [Chinese pipelines](/models/zh) provided by spaCy include a custom `pkuseg`
 model trained only on
 [Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the
 models provided by `pkuseg` include data restricted to research use. For
-research use, `pkuseg` provides models for several different domains
-(`"default"`, `"news"` `"web"`, `"medicine"`, `"tourism"`) and for other uses,
-`pkuseg` provides a simple
-[training API](https://github.com/lancopku/pkuseg-python/blob/master/readme/readme_english.md#usage):
+research use, `pkuseg` provides models for several different domains (`"mixed"`
+(equivalent to `"default"` from `pkuseg` packages), `"news"` `"web"`,
+`"medicine"`, `"tourism"`) and for other uses, `pkuseg` provides a simple
+[training API](https://github.com/explosion/spacy-pkuseg/blob/master/readme/readme_english.md#usage):
 
 ```python
-import pkuseg
+import spacy_pkuseg as pkuseg
 from spacy.lang.zh import Chinese
 
 # Train pkuseg model

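Putting the renamed package together with the tokenizer initialization shown earlier on this page — a hedged sketch only: the training call is assumed to keep upstream `pkuseg`'s signature, and the file paths are placeholders:

```python
import spacy_pkuseg as pkuseg
from spacy.lang.zh import Chinese

# Train a pkuseg-compatible model (assumed signature: train file, test file,
# output directory, as in upstream pkuseg).
pkuseg.train("train.utf8", "test.utf8", "/tmp/pkuseg_model")

# Point the Chinese tokenizer at the freshly trained model directory.
cfg = {"segmenter": "pkuseg"}
nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
nlp.tokenizer.initialize(pkuseg_model="/tmp/pkuseg_model")
```
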
@@ -1172,13 +1172,15 @@ doc = nlp("This is a text...")
 spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
 components that have their own model instance, make predictions over `Doc`
 objects and can be updated using [`spacy train`](/api/cli#train). This lets you
-plug fully custom machine learning components into your pipeline. You'll need
-the following:
+plug fully custom machine learning components into your pipeline.
+
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
+You'll need the following:
 
 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-   can be a model implemented in
-   [Thinc](/usage/layers-architectures#thinc), or a
-   [wrapped model](/usage/layers-architectures#frameworks) implemented in
+   can be a model implemented in [Thinc](/usage/layers-architectures#thinc), or
+   a [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a
    list of [`Doc`](/api/doc) objects as input and can have any type of output.
 2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least

@@ -1283,7 +1285,7 @@ loss is calculated and to add evaluation scores to the training output.
 For more details on how to implement your own trainable components and model
 architectures, and plug existing models implemented in PyTorch or TensorFlow
 into your spaCy pipeline, see the usage guide on
-[layers and model architectures](/usage/layers-architectures).
+[layers and model architectures](/usage/layers-architectures#components).
 
 </Infobox>

@@ -404,8 +404,73 @@ import Training101 from 'usage/101/\_training.md'
 <Infobox title="Training pipelines and models" emoji="📖">
 
 To learn more about **training and updating** pipelines, how to create training
-data and how to improve spaCy's named entity recognition models, see the usage
-guides on [training](/usage/training).
+data and how to improve spaCy's named models, see the usage guides on
+[training](/usage/training).
 
 </Infobox>
 
+### Training config and lifecycle {#training-config}
+
+Training config files include all **settings and hyperparameters** for training
+your pipeline. Instead of providing lots of arguments on the command line, you
+only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
+This also makes it easy to integrate custom models and architectures, written in
+your framework of choice. A pipeline's `config.cfg` is considered the "single
+source of truth", both at **training** and **runtime**.
+
+> ```ini
+> ### config.cfg (excerpt)
+> [training]
+> accumulate_gradient = 3
+>
+> [training.optimizer]
+> @optimizers = "Adam.v1"
+>
+> [training.optimizer.learn_rate]
+> @schedules = "warmup_linear.v1"
+> warmup_steps = 250
+> total_steps = 20000
+> initial_rate = 0.01
+> ```
+
+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
+<Infobox title="Training configuration system" emoji="📖">
+
+For more details on spaCy's **configuration system** and how to use it to
+customize your pipeline components, component models, training settings and
+hyperparameters, see the [training config](/usage/training#config) usage guide.
+
+</Infobox>
+
+### Trainable components {#training-components}
+
+spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
+components that have their own model instance, make predictions over `Doc`
+objects and can be updated using [`spacy train`](/api/cli#train). This lets you
+plug fully custom machine learning components into your pipeline that can be
+configured via a single training config.
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [components.my_component]
+> factory = "my_component"
+>
+> [components.my_component.model]
+> @architectures = "my_model.v1"
+> width = 128
+> ```
+
+![Illustration of a trainable pipeline component](../images/trainable_component.svg)
+
+<Infobox title="Custom trainable components" emoji="📖">
+
+To learn more about how to implement your own **model architectures** and use
+them to power custom **trainable components**, see the usage guides on the
+[trainable component API](/usage/processing-pipelines#trainable-components) and
+implementing [layers and architectures](/usage/layers-architectures#components)
+for trainable components.
+
+</Infobox>

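For the `@architectures = "my_model.v1"` reference in the excerpt above to resolve, a function has to be registered under that name. A hedged sketch of the registration mechanics — the trivial layer is a stand-in for a real architecture:

```python
from spacy import registry
from thinc.api import Linear, Model

@registry.architectures("my_model.v1")
def build_my_model(width: int) -> Model:
    # A real architecture would map List[Doc] to predictions; a bare linear
    # layer just illustrates how the config resolves the registered name.
    return Linear(nO=width)
```
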
@@ -378,7 +378,7 @@ weights and [resume training](/api/language#resume_training).
 If you don't want a component to be updated, you can **freeze** it by adding it
 to the `frozen_components` list in the `[training]` block. Frozen components are
 **not updated** during training and are included in the final trained pipeline
-as-is.
+as-is. They are also excluded when calling [`nlp.initialize`](/api/language#initialize).
 
 > #### Note on frozen components
 >

@@ -168,9 +168,13 @@ follow the same unified [`Model`](https://thinc.ai/docs/api-model) API and each
 `Model` can also be used as a sublayer of a larger network, allowing you to
 freely combine implementations from different frameworks into a single model.
 
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
 <Infobox title="Details & Documentation" emoji="📖" list>
 
-- **Usage: ** [Layers and architectures](/usage/layers-architectures)
+- **Usage: ** [Layers and architectures](/usage/layers-architectures),
+  [Trainable component API](/usage/processing-pipelines#trainable-components),
+  [Trainable components and models](/usage/layers-architectures#components)
 - **Thinc: **
   [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks),
   [`Model` API](https://thinc.ai/docs/api-model)

@@ -503,36 +507,27 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 - Pipeline package symlinks, the `link` command and shortcut names are now
   deprecated. There can be many [different trained pipelines](/models) and not
   just one "English model", so you should always use the full package name like
-  [`en_core_web_sm`](/models/en) explicitly.
-- A pipeline's [`meta.json`](/api/data-formats#meta) is now only used to provide
-  meta information like the package name, author, license and labels. It's
-  **not** used to construct the processing pipeline anymore. This is all defined
-  in the [`config.cfg`](/api/data-formats#config), which also includes all
-  settings used to train the pipeline.
-- The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now
-  only take a `config.cfg` file containing the full
-  [training config](/usage/training#config).
+  `en_core_web_sm` explicitly.
+- A pipeline's `meta.json` is now only used to provide meta information like the
+  package name, author, license and labels. It's **not** used to construct the
+  processing pipeline anymore. This is all defined in the
+  [`config.cfg`](/api/data-formats#config), which also includes all settings
+  used to train the pipeline.
+- The `train`, `pretrain` and `debug data` commands now only take a
+  `config.cfg`.
 - [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
   the component factory instead of the component function.
 - **Custom pipeline components** now need to be decorated with the
   [`@Language.component`](/api/language#component) or
   [`@Language.factory`](/api/language#factory) decorator.
 - [`Language.update`](/api/language#update) now takes a batch of
   [`Example`](/api/example) objects instead of raw texts and annotations, or
   `Doc` and `GoldParse` objects.
 - The `Language.disable_pipes` context manager has been replaced by
   [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
   disable or enable components.
-- The [`Language.update`](/api/language#update),
-  [`Language.evaluate`](/api/language#evaluate) and
-  [`Pipe.update`](/api/pipe#update) methods now all take batches of
-  [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
-  raw text and a dictionary of annotations.
-  [`Language.initialize`](/api/language#initialize) and
-  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
-  sequence of `Example` objects to initialize the model instead of a list of
-  tuples.
-- The `begin_training` methods have been renamed to `initialize`.
+- The `begin_training` methods have been renamed to `initialize` and now take a
+  function that returns a sequence of `Example` objects to initialize the model
+  instead of a list of tuples.
 - [`Matcher.add`](/api/matcher#add) and
   [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
   patterns as the second argument (instead of a variable number of arguments).

@@ -557,7 +552,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 
 | Removed                                                                                        | Replacement |
 | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `Language.disable_pipes`                                                                      | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
+| `Language.disable_pipes`                                                                      | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) |
 | `Language.begin_training`, `Pipe.begin_training`, ...                                         | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
 | `Doc.is_tagged`, `Doc.is_parsed`, ...                                                         | [`Doc.has_annotation`](/api/doc#has_annotation) |
 | `GoldParse`                                                                                   | [`Example`](/api/example) |
