mirror of https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00

Merge branch 'develop' into nightly.spacy.io

This commit is contained in: commit 32011780a2
@@ -104,9 +104,11 @@ For detailed installation instructions, see the
 ### pip
 
 Using pip, spaCy releases are available as source packages and binary wheels (as
-of `v2.0.13`).
+of `v2.0.13`). Before you install spaCy and its dependencies, make sure that
+your `pip`, `setuptools` and `wheel` are up to date.
 
 ```bash
+pip install -U pip setuptools wheel
 pip install spacy
 ```
 
@@ -18,7 +18,7 @@ pydantic>=1.5.0,<2.0.0
 pytokenizations
 # Official Python utilities
 setuptools
-packaging
+packaging>=20.0
 importlib_metadata>=0.20; python_version < "3.8"
 typing_extensions>=3.7.4; python_version < "3.8"
 # Development dependencies
@@ -55,7 +55,7 @@ install_requires =
     pytokenizations
     # Official Python utilities
     setuptools
-    packaging
+    packaging>=20.0
    importlib_metadata>=0.20; python_version < "3.8"
    typing_extensions>=3.7.4; python_version < "3.8"
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a34"
+__version__ = "3.0.0a35"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -278,7 +278,7 @@ def show_validation_error(
             "fill-config' command to fill in all the defaults, if possible:",
             spaced=True,
         )
-        print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
+        print(f"{COMMAND} init fill-config {config_path} {config_path} \n")
         sys.exit(1)
     except InterpolationError as e:
         msg.fail("Config validation error", e, exits=1)
@@ -1091,10 +1091,11 @@ class Language:
         for name, proc in self.pipeline:
             if (
                 name not in exclude
-                and hasattr(proc, "model")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
                 and proc.model not in (True, False, None)
             ):
-                proc.model.finish_update(sgd)
+                proc.finish_update(sgd)
         return losses

     def rehearse(
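The motivation for this change, sketched outside the diff: a bare `hasattr(proc, "model")` check can't distinguish a component that merely *has* a `model` attribute from one whose model is actually a trainable Thinc `Model`. A minimal illustration with stand-in classes (not spaCy's actual pipeline components):

```python
# Minimal sketch of the old vs. new trainability check, using stand-in
# components rather than spaCy's actual pipeline classes.
from thinc.api import Linear, Model

class RuleBased:
    """No `model` attribute at all."""

class Stub:
    model = None  # has a `model` attribute, but nothing trainable behind it

class Trainable:
    def __init__(self):
        self.model = Linear(nO=2, nI=2).initialize()

    def is_trainable(self):
        return hasattr(self, "model") and isinstance(self.model, Model)

for proc in (RuleBased(), Stub(), Trainable()):
    old = hasattr(proc, "model")                                 # True for Stub, too
    new = hasattr(proc, "is_trainable") and proc.is_trainable()  # True only for Trainable
    print(type(proc).__name__, old, new)
```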
@@ -1297,7 +1298,9 @@ class Language:
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
-            if not hasattr(pipe, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if not hasattr(pipe, "pipe") or not hasattr(pipe, "is_trainable") or not pipe.is_trainable():
                 docs = _pipe(docs, pipe, kwargs)
             else:
                 docs = pipe.pipe(docs, **kwargs)
@@ -1407,7 +1410,9 @@ class Language:
             kwargs = component_cfg.get(name, {})
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
-            if hasattr(proc, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
                 f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc
@@ -34,7 +34,7 @@ def StaticVectors(
 def forward(
     model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
 ) -> Tuple[Ragged, Callable]:
-    if not len(docs):
+    if not sum(len(doc) for doc in docs):
         return _handle_empty(model.ops, model.get_dim("nO"))
     key_attr = model.attrs["key_attr"]
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
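The fix matters because `len(docs)` is only zero for an empty batch, while a batch of empty `Doc`s is non-empty as a list yet contains no tokens to embed. A plain-Python sketch of the two checks, with lists of token strings standing in for `Doc` objects:

```python
# Lists of strings stand in for Doc objects; len(doc) is the token count.
batches = {
    "no docs":        [],
    "two empty docs": [[], []],
    "one real doc":   [["hello", "world"]],
}
for name, docs in batches.items():
    old = not len(docs)                      # misses the all-empty-docs case
    new = not sum(len(doc) for doc in docs)  # catches both empty cases
    print(f"{name}: old={old} new={new}")
```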
@@ -238,7 +238,7 @@ class EntityLinker(Pipe):
         )
         bp_context(d_scores)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             self.set_annotations(docs, predictions)
@@ -1,8 +1,10 @@
-from typing import Optional, Union, List, Dict, Tuple, Iterable, Any
+from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
 from collections import defaultdict
 from pathlib import Path
 import srsly

+from .pipe import Pipe
+from ..training import Example
 from ..language import Language
 from ..errors import Errors
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
@@ -50,7 +52,7 @@ def make_entity_ruler(
     )


-class EntityRuler:
+class EntityRuler(Pipe):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
     `EntityRecognizer` to boost accuracy, or used on its own to implement a
@@ -183,6 +185,26 @@ class EntityRuler:
                 all_labels.add(l)
         return tuple(all_labels)

+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language] = None,
+        patterns: Optional[Sequence[PatternType]] = None,
+    ):
+        """Initialize the pipe for training.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        patterns Optional[Iterable[PatternType]]: The list of patterns.
+
+        DOCS: https://nightly.spacy.io/api/entityruler#initialize
+        """
+        if patterns:
+            self.add_patterns(patterns)
+
     @property
     def ent_ids(self) -> Tuple[str, ...]:
         """All entity ids present in the match patterns `id` properties
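Based on the test added further down in this commit, the new method can be exercised roughly like this (a sketch against the spacy-nightly v3 API this commit targets; the pattern content is a placeholder):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
# get_examples is not used by the EntityRuler, so an empty callable suffices
ruler.initialize(lambda: [], nlp=nlp, patterns=[{"label": "ORG", "pattern": "Explosion"}])
doc = nlp("spaCy is made by Explosion")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Explosion', 'ORG')]
```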
@@ -320,6 +342,12 @@ class EntityRuler:
         validate_examples(examples, "EntityRuler.score")
         return Scorer.score_spans(examples, "ents", **kwargs)

+    def predict(self, docs):
+        pass
+
+    def set_annotations(self, docs, scores):
+        pass
+
     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityRuler":
@@ -209,7 +209,7 @@ class ClozeMultitask(Pipe):
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += loss
         return losses
@@ -132,7 +132,7 @@ cdef class Pipe:
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
@@ -228,6 +228,9 @@ cdef class Pipe:
     def is_resizable(self):
         return hasattr(self, "model") and "resize_output" in self.model.attrs

+    def is_trainable(self):
+        return hasattr(self, "model") and isinstance(self.model, Model)
+
     def set_output(self, nO):
         if self.is_resizable():
             self.model.attrs["resize_output"](self.model, nO)
@@ -245,6 +248,17 @@ cdef class Pipe:
         with self.model.use_params(params):
             yield

+    def finish_update(self, sgd):
+        """Update parameters using the current parameter gradients.
+        The Optimizer instance contains the functionality to perform
+        the stochastic gradient descent.
+
+        sgd (thinc.api.Optimizer): The optimizer.
+
+        DOCS: https://nightly.spacy.io/api/pipe#finish_update
+        """
+        self.model.finish_update(sgd)
+
     def score(self, examples, **kwargs):
         """Score a batch of examples.
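Routing optimizer steps through `self.finish_update(sgd)` gives subclasses a single hook to override and separates backprop from the weight update. A rough sketch of the accumulation pattern this enables, assuming the spacy-nightly v3 API this commit targets:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
example = Example.from_dict(nlp.make_doc("I like trees"), {"tags": ["NN", "NN", "NN"]})
optimizer = nlp.initialize(lambda: [example])

# Backprop two batches without touching the weights ...
tagger.update([example], sgd=None)
tagger.update([example], sgd=None)
# ... then apply the accumulated gradients in a single optimizer step.
tagger.finish_update(optimizer)
```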
@@ -203,7 +203,7 @@ class Tagger(Pipe):
         loss, d_tag_scores = self.get_loss(examples, tag_scores)
         bp_tag_scores(d_tag_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)

         losses[self.name] += loss
         if set_annotations:
@@ -238,7 +238,7 @@ class Tagger(Pipe):
         target = self._rehearsal_model(examples)
         gradient = guesses - target
         backprop(gradient)
-        self.model.finish_update(sgd)
+        self.finish_update(sgd)
         if losses is not None:
             losses.setdefault(self.name, 0.0)
             losses[self.name] += (gradient**2).sum()
@@ -212,7 +212,7 @@ class TextCategorizer(Pipe):
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
@@ -256,7 +256,7 @@ class TextCategorizer(Pipe):
         gradient = scores - target
         bp_scores(gradient)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += (gradient ** 2).sum()
         return losses
@@ -188,7 +188,7 @@ class Tok2Vec(Pipe):
             accumulate_gradient(one_d_tokvecs)
             d_docs = bp_tokvecs(d_tokvecs)
             if sgd is not None:
-                self.model.finish_update(sgd)
+                self.finish_update(sgd)
             return d_docs

         batch_id = Tok2VecListener.get_batch_id(docs)
@@ -315,7 +315,7 @@ cdef class Parser(Pipe):

         backprop_tok2vec(golds)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, all_states)
@@ -367,7 +367,7 @@ cdef class Parser(Pipe):
         # Do the backprop
         backprop_tok2vec(docs)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss / n_scores
         del backprop
         del backprop_tok2vec
@@ -437,7 +437,9 @@ cdef class Parser(Pipe):
         for name, component in nlp.pipeline:
             if component is self:
                 break
-            if hasattr(component, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
                 doc_sample = list(component.pipe(doc_sample, batch_size=8))
             else:
                 doc_sample = [component(doc) for doc in doc_sample]
@@ -119,7 +119,7 @@ def validate_init_settings(
     if types don't match or required values are missing.

    func (Callable): The initialize method of a given component etc.
-    settings (Dict[str, Any]): The settings from the repsective [initialize] block.
+    settings (Dict[str, Any]): The settings from the respective [initialize] block.
    section (str): Initialize section, for error message.
    name (str): Name of the block in the section.
    exclude (Iterable[str]): Parameter names to exclude from schema.
@@ -10,12 +10,14 @@ def test_build_dependencies():
         "mock",
         "flake8",
     ]
+    # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
         "fugashi",
         "natto-py",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",
+        "spacy-pkuseg",
     ]

     # check requirements.txt
@@ -121,7 +121,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
-    # initialize with patterns from asset
+    # initialize with patterns from misc registry
     nlp.config["initialize"]["components"]["attribute_ruler"] = {
         "patterns": {"@misc": "attribute_ruler_patterns"}
     }
@@ -1,4 +1,6 @@
 import pytest
+
+from spacy import registry
 from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler
@@ -11,6 +13,7 @@ def nlp():


 @pytest.fixture
+@registry.misc("entity_ruler_patterns")
 def patterns():
     return [
         {"label": "HELLO", "pattern": "hello world"},
|
@ -42,6 +45,29 @@ def test_entity_ruler_init(nlp, patterns):
|
||||||
assert doc.ents[1].label_ == "BYE"
|
assert doc.ents[1].label_ == "BYE"
|
||||||
|
|
||||||
|
|
||||||
|
def test_entity_ruler_init_patterns(nlp, patterns):
|
||||||
|
# initialize with patterns
|
||||||
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
|
assert len(ruler.labels) == 0
|
||||||
|
ruler.initialize(lambda: [], patterns=patterns)
|
||||||
|
assert len(ruler.labels) == 4
|
||||||
|
doc = nlp("hello world bye bye")
|
||||||
|
assert doc.ents[0].label_ == "HELLO"
|
||||||
|
assert doc.ents[1].label_ == "BYE"
|
||||||
|
nlp.remove_pipe("entity_ruler")
|
||||||
|
# initialize with patterns from misc registry
|
||||||
|
nlp.config["initialize"]["components"]["entity_ruler"] = {
|
||||||
|
"patterns": {"@misc": "entity_ruler_patterns"}
|
||||||
|
}
|
||||||
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
|
assert len(ruler.labels) == 0
|
||||||
|
nlp.initialize()
|
||||||
|
assert len(ruler.labels) == 4
|
||||||
|
doc = nlp("hello world bye bye")
|
||||||
|
assert doc.ents[0].label_ == "HELLO"
|
||||||
|
assert doc.ents[1].label_ == "BYE"
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing(nlp, patterns):
|
def test_entity_ruler_existing(nlp, patterns):
|
||||||
ruler = nlp.add_pipe("entity_ruler")
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
|
|
|
@@ -7,6 +7,7 @@ import numpy

 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
+from spacy.ml.staticvectors import StaticVectors
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES
@@ -185,3 +186,22 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     model1 = get_updated_model()
     model2 = get_updated_model()
     assert_array_equal(get_all_params(model1), get_all_params(model2))
+
+
+@pytest.mark.parametrize(
+    "model_func,kwargs",
+    [
+        (StaticVectors, {"nO": 128, "nM": 300}),
+    ]
+)
+def test_empty_docs(model_func, kwargs):
+    nlp = English()
+    model = model_func(**kwargs).initialize()
+    # Test the layer can be called successfully with 0, 1 and 2 empty docs.
+    for n_docs in range(3):
+        docs = [nlp("") for _ in range(n_docs)]
+        # Test predict
+        _ = model.predict(docs)
+        # Test backprop
+        output, backprop = model.begin_update(docs)
+        _ = backprop(output)
@@ -49,7 +49,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
         nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    logger.info("Initialized pipeline components")
+    logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
     return nlp
@@ -17,8 +17,12 @@ def console_logger(progress_bar: bool = False):
         nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
         msg = Printer(no_print=True)
-        # we assume here that only components are enabled that should be trained & logged
-        logged_pipes = nlp.pipe_names
+        # ensure that only trainable components are logged
+        logged_pipes = [
+            name
+            for name, proc in nlp.pipeline
+            if hasattr(proc, "is_trainable") and proc.is_trainable()
+        ]
         eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]
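The effect of the new filter, sketched with a small pipeline: a rule-based `entity_ruler` still appears in `nlp.pipe_names`, but since it owns no Thinc model it no longer gets a loss column in the console output (a sketch, assuming the spacy-nightly v3 API):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("entity_ruler")  # rule-based, no Thinc model
nlp.add_pipe("ner")           # trainable

# The same filter the logger now applies:
logged_pipes = [
    name
    for name, proc in nlp.pipeline
    if hasattr(proc, "is_trainable") and proc.is_trainable()
]
print(nlp.pipe_names)  # ['entity_ruler', 'ner']
print(logged_pipes)    # ['ner']
```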
@@ -41,19 +45,10 @@ def console_logger(progress_bar: bool = False):
             if progress is not None:
                 progress.update(1)
             return
-        try:
-            losses = [
-                "{0:.2f}".format(float(info["losses"][pipe_name]))
-                for pipe_name in logged_pipes
-            ]
-        except KeyError as e:
-            raise KeyError(
-                Errors.E983.format(
-                    dict="scores (losses)",
-                    key=str(e),
-                    keys=list(info["losses"].keys()),
-                )
-            ) from None
+        losses = [
+            "{0:.2f}".format(float(info["losses"][pipe_name]))
+            for pipe_name in logged_pipes
+        ]

         scores = []
         for col in score_cols:
@@ -187,10 +187,11 @@ def train_while_improving(
         for name, proc in nlp.pipeline:
             if (
                 name not in exclude
-                and hasattr(proc, "model")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
                 and proc.model not in (True, False, None)
             ):
-                proc.model.finish_update(optimizer)
+                proc.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):
             if optimizer.averages:
@@ -293,7 +294,8 @@ def update_meta(
         if metric is not None:
             nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
     for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+        if pipe_name in info["losses"]:
+            nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]


 def create_before_to_disk_callback(
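Why the guard is needed, in isolation: frozen or rule-based pipes are listed in `nlp.pipe_names` but never receive an entry in `info["losses"]`, so the unconditional lookup raised a `KeyError`. A self-contained sketch with plain dicts:

```python
pipe_names = ["entity_ruler", "tagger"]  # stand-in for nlp.pipe_names
info = {"losses": {"tagger": 1.23}}      # no entry for the rule-based pipe
performance = {}
for pipe_name in pipe_names:
    if pipe_name in info["losses"]:      # the new guard
        performance[f"{pipe_name}_loss"] = info["losses"][pipe_name]
print(performance)  # {'tagger_loss': 1.23}
```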
@@ -128,8 +128,8 @@ Get all patterns that have been added to the attribute ruler in the

 ## AttributeRuler.initialize {#initialize tag="method"}

-Initialize the component with data. Typically called before training to load in
-rules from a file. This method is typically called by
+Initialize the component with data and used before training to load in rules
+from a file. This method is typically called by
 [`Language.initialize`](/api/language#initialize) and lets you customize
 arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
@@ -300,17 +300,16 @@ $ python -m spacy debug config [config_path] [--code] [--show-functions] [--show

 ```
 ✘ Config validation error
-
-training -> dropout  field required
-training -> optimizer  field required
-training -> optimize  extra fields not permitted
+dropout  field required
+optimizer  field required
+optimize  extra fields not permitted

-{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
+{'seed': 0, 'accumulate_gradient': 1, 'dev_corpus': 'corpora.dev', 'train_corpus': 'corpora.train', 'gpu_allocator': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'before_to_disk': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'logger': {'@loggers': 'spacy.ConsoleLogger.v1', 'progress_bar': False}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}

 If your config contains missing values, you can run the 'init fill-config'
 command to fill in all the defaults, if possible:

-python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/starter-config_invalid.cfg
+python -m spacy init fill-config tmp/starter-config_invalid.cfg tmp/starter-config_invalid.cfg
 ```

 </Accordion>
@@ -180,24 +180,24 @@ single corpus once and then divide it up into `train` and `dev` partitions.
 This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).

 | Name | Description |
-| --------------------- | --------------------------------------------------------------- |
+| --------------------- | ----------------------------------------------------------------------------------- |
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
 | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
-| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
+| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
 | `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
+| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
 | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
 | `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
 | `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
 | `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
-| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
 | `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
 | `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
 | `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |

 ### pretraining {#config-pretraining tag="section,optional"}

@@ -205,17 +205,17 @@ This section is optional and defines settings and controls for
 [language model pretraining](/usage/embeddings-transformers#pretraining). It's
 used when you run [`spacy pretrain`](/api/cli#pretrain).

 | Name | Description |
-| -------------- | ------------------------------------------------------------ |
+| -------------- | ------------------------------------------------------------------------------- |
 | `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ |
 | `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
 | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
 | `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
-| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
-| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ |
-| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
-| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
-| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ |
+| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
+| `corpus` | Dot notation of the config location defining the corpus with raw text. Defaults to `corpora.pretrain`. ~~str~~ |
+| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
+| `component` | Component name to identify the layer with the model to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
+| `layer` | The specific layer of the model to pretrain. If empty, the whole model will be used. ~~str~~ |

 ### initialize {#config-initialize tag="section"}

@@ -224,6 +224,9 @@ It's used by [`Language.initialize`](/api/language#initialize) and typically
 called right before training (but not at runtime). The section allows you to
 specify local file paths or custom functions to load data resources from,
 without requiring them at runtime when you load the trained pipeline back in.
+Also see the usage guides on the
+[config lifecycle](/usage/training#config-lifecycle) and
+[custom initialization](/usage/training#initialization).

 > #### Example
 >
@@ -74,6 +74,38 @@ be a token pattern (list) or a phrase pattern (string). For example:
 | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ |
 | `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |

+## EntityRuler.initialize {#initialize tag="method" new="3"}
+
+Initialize the component with data and used before training to load in rules
+from a file. This method is typically called by
+[`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
+
+> #### Example
+>
+> ```python
+> entity_ruler = nlp.add_pipe("entity_ruler")
+> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.entity_ruler]
+>
+> [initialize.components.entity_ruler.patterns]
+> @readers = "srsly.read_jsonl.v1"
+> path = "corpus/entity_ruler_patterns.jsonl"
+> ```
+
+| Name | Description |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ | |
+| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `patterns` | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ |
+
 ## EntityRuler.\_\_len\_\_ {#len tag="method"}

 The number of all patterns added to the entity ruler.
@@ -177,7 +209,7 @@ only the patterns are saved as JSONL. If a directory name is provided, a

 ## EntityRuler.from_disk {#from_disk tag="method"}

-Load the entity ruler from a file. Expects either a file containing
+Load the entity ruler from a path. Expects either a file containing
 newline-delimited JSON (JSONL) with one entry per line, or a directory
 containing a `patterns.jsonl` file and a `cfg` file with the component
 configuration.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
|
| `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
|
||||||
| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ |
|
| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ |
|
||||||
| `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
|
| `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
|
||||||
| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |
|
| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |
|
||||||
|
|
|
@@ -294,6 +294,24 @@ context, the original parameters are restored.
 | -------- | -------------------------------------------------- |
 | `params` | The parameter values to use in the model. ~~dict~~ |

+## Pipe.finish_update {#finish_update tag="method"}
+
+Update parameters using the current parameter gradients. Defaults to calling
+[`self.model.finish_update`](https://thinc.ai/docs/api-model#finish_update).
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = nlp.initialize()
+> losses = pipe.update(examples, sgd=None)
+> pipe.finish_update(sgd)
+> ```
+
+| Name | Description |
+| ----- | ------------------------------------- |
+| `sgd` | An optimizer. ~~Optional[Optimizer]~~ |
+
 ## Pipe.add_label {#add_label tag="method"}

 > #### Example
[Image diffs suppressed because one or more lines are too long: an SVG image (before: 50 KiB), and website/docs/images/trainable_component.svg, a new file of 55 lines (after: 76 KiB).]
@@ -646,7 +646,9 @@ get_candidates = model.attrs["get_candidates"]

 To use our new relation extraction model as part of a custom
 [trainable component](/usage/processing-pipelines#trainable-components), we
-create a subclass of [`Pipe`](/api/pipe) that holds the model:
+create a subclass of [`Pipe`](/api/pipe) that holds the model.
+
+![](../images/trainable_component.svg)

 ```python
 ### Pipeline component skeleton
@@ -826,7 +828,7 @@ def __call__(self, Doc doc):

 Once our `Pipe` subclass is fully implemented, we can
 [register](/usage/processing-pipelines#custom-components-factories) the
-component with the [`@Language.factory`](/api/lnguage#factory) decorator. This
+component with the [`@Language.factory`](/api/language#factory) decorator. This
 assigns it a name and lets you create the component with
 [`nlp.add_pipe`](/api/language#add_pipe) and via the
 [config](/usage/training#config).
@@ -98,10 +98,10 @@ The Chinese language class supports three word segmentation options, `char`,
 > # Jieba
 > cfg = {"segmenter": "jieba"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
-> # PKUSeg with "default" model provided by pkuseg
+> # PKUSeg with "mixed" model provided by pkuseg
 > cfg = {"segmenter": "pkuseg"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
-> nlp.tokenizer.initialize(pkuseg_model="default")
+> nlp.tokenizer.initialize(pkuseg_model="mixed")
 > ```

 ```ini
@@ -115,7 +115,7 @@ segmenter = "char"
 | --------- | ------------------------------------------------------------------------------------------ |
 | `char` | **Character segmentation:** Character segmentation is the default segmentation option. It's enabled when you create a new `Chinese` language class or call `spacy.blank("zh")`. |
 | `jieba` | **Jieba:** to use [Jieba](https://github.com/fxsjy/jieba) for word segmentation, you can set the option `segmenter` to `"jieba"`. |
-| `pkuseg` | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |
+| `pkuseg` | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/explosion/spacy-pkuseg) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |

 <Infobox title="Changed in v3.0" variant="warning">

@@ -133,10 +133,10 @@ runtime.
 The `initialize` method for the Chinese tokenizer class supports the following
 config settings for loading `pkuseg` models:

 | Name | Description |
-| ------------------ | ------------------------------------------------------------------------------------- |
-| `pkuseg_model` | Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ |
-| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`. ~~str~~ |
+| ------------------ | -------------------------------------------------------------------------------------------------------- |
+| `pkuseg_model` | Name of a model provided by `spacy-pkuseg` or the path to a local model directory. ~~str~~ |
+| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`, the default provided dictionary. ~~str~~ |

 The initialization settings are typically provided in the
 [training config](/usage/training#config) and the data is loaded in before
@@ -164,14 +164,17 @@ You can also initialize the tokenizer for a blank language class by calling its
 cfg = {"segmenter": "pkuseg"}
 nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})

-# Load "default" model
-nlp.tokenizer.initialize(pkuseg_model="default")
+# Load spaCy's OntoNotes model
+nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes")
+
+# Load pkuseg's "news" model
+nlp.tokenizer.initialize(pkuseg_model="news")

 # Load local model
 nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")

 # Override the user directory
-nlp.tokenizer.initialize(pkuseg_model="default", pkuseg_user_dict="/path/to/user_dict")
+nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes", pkuseg_user_dict="/path/to/user_dict")
 ```

 You can also modify the user dictionary on-the-fly:
@@ -195,13 +198,13 @@ The [Chinese pipelines](/models/zh) provided by spaCy include a custom `pkuseg`
 model trained only on
 [Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the
 models provided by `pkuseg` include data restricted to research use. For
-research use, `pkuseg` provides models for several different domains
-(`"default"`, `"news"` `"web"`, `"medicine"`, `"tourism"`) and for other uses,
-`pkuseg` provides a simple
-[training API](https://github.com/lancopku/pkuseg-python/blob/master/readme/readme_english.md#usage):
+research use, `pkuseg` provides models for several different domains (`"mixed"`
+(equivalent to `"default"` from `pkuseg` packages), `"news"` `"web"`,
+`"medicine"`, `"tourism"`) and for other uses, `pkuseg` provides a simple
+[training API](https://github.com/explosion/spacy-pkuseg/blob/master/readme/readme_english.md#usage):

 ```python
-import pkuseg
+import spacy_pkuseg as pkuseg
 from spacy.lang.zh import Chinese

 # Train pkuseg model
@@ -1172,13 +1172,15 @@ doc = nlp("This is a text...")
 spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
 components that have their own model instance, make predictions over `Doc`
 objects and can be updated using [`spacy train`](/api/cli#train). This lets you
-plug fully custom machine learning components into your pipeline. You'll need
-the following:
+plug fully custom machine learning components into your pipeline.
+
+![](../images/trainable_component.svg)
+
+You'll need the following:

 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-   can be a model implemented in
-   [Thinc](/usage/layers-architectures#thinc), or a
-   [wrapped model](/usage/layers-architectures#frameworks) implemented in
+   can be a model implemented in [Thinc](/usage/layers-architectures#thinc), or
+   a [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a
    list of [`Doc`](/api/doc) objects as input and can have any type of output.
 2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least
@@ -1283,7 +1285,7 @@ loss is calculated and to add evaluation scores to the training output.
 For more details on how to implement your own trainable components and model
 architectures, and plug existing models implemented in PyTorch or TensorFlow
 into your spaCy pipeline, see the usage guide on
-[layers and model architectures](/usage/layers-architectures).
+[layers and model architectures](/usage/layers-architectures#components).

 </Infobox>

@@ -404,8 +404,73 @@ import Training101 from 'usage/101/\_training.md'

 <Infobox title="Training pipelines and models" emoji="📖">

 To learn more about **training and updating** pipelines, how to create training
 data and how to improve spaCy's named entity recognition models, see the usage
 guides on [training](/usage/training).

 </Infobox>
+
+### Training config and lifecycle {#training-config}
+
+Training config files include all **settings and hyperparameters** for training
+your pipeline. Instead of providing lots of arguments on the command line, you
+only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
+This also makes it easy to integrate custom models and architectures, written in
+your framework of choice. A pipeline's `config.cfg` is considered the "single
+source of truth", both at **training** and **runtime**.
+
+> ```ini
+> ### config.cfg (excerpt)
+> [training]
+> accumulate_gradient = 3
+>
+> [training.optimizer]
+> @optimizers = "Adam.v1"
+>
+> [training.optimizer.learn_rate]
+> @schedules = "warmup_linear.v1"
+> warmup_steps = 250
+> total_steps = 20000
+> initial_rate = 0.01
+> ```
+
+![Illustration of the training process](../images/training.svg)
+
+<Infobox title="Training configuration system" emoji="📖">
+
+For more details on spaCy's **configuration system** and how to use it to
+customize your pipeline components, component models, training settings and
+hyperparameters, see the [training config](/usage/training#config) usage guide.
+
+</Infobox>
+
+### Trainable components {#training-components}
+
+spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
+components that have their own model instance, make predictions over `Doc`
+objects and can be updated using [`spacy train`](/api/cli#train). This lets you
+plug fully custom machine learning components into your pipeline that can be
+configured via a single training config.
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [components.my_component]
+> factory = "my_component"
+>
+> [components.my_component.model]
+> @architectures = "my_model.v1"
+> width = 128
+> ```
+
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
+<Infobox title="Custom trainable components" emoji="📖">
+
+To learn more about how to implement your own **model architectures** and use
+them to power custom **trainable components**, see the usage guides on the
+[trainable component API](/usage/processing-pipelines#trainable-components) and
+implementing [layers and architectures](/usage/layers-architectures#components)
+for trainable components.
+
+</Infobox>

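The second excerpt above only resolves if an architecture named `"my_model.v1"` is actually registered. A minimal sketch of what such a registration could look like, assuming spaCy's `registry.architectures`; the zero-vector forward pass is a stand-in for a real network, not code from the diff:

```python
from typing import List
import numpy
import spacy
from spacy.tokens import Doc
from thinc.api import Model

@spacy.registry.architectures("my_model.v1")
def build_my_model(width: int) -> Model:
    # Stand-in architecture: maps each Doc to a zero vector of size
    # `width`. A real model would compose trainable Thinc layers or
    # wrap a PyTorch/TensorFlow network here instead.
    def forward(model: Model, docs: List[Doc], is_train: bool):
        outputs = [numpy.zeros((width,), dtype="f") for _ in docs]
        def backprop(d_outputs):
            return []
        return outputs, backprop

    return Model("my_model", forward)
```

With this in place, the `width = 128` setting from the config excerpt is passed straight into `build_my_model` when the pipeline is constructed.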
@@ -378,7 +378,7 @@ weights and [resume training](/api/language#resume_training).
 If you don't want a component to be updated, you can **freeze** it by adding it
 to the `frozen_components` list in the `[training]` block. Frozen components are
 **not updated** during training and are included in the final trained pipeline
-as-is.
+as-is. They are also excluded when calling [`nlp.initialize`](/api/language#initialize).

 > #### Note on frozen components
 >

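As a sketch of the workflow this describes, assuming `spacy.util.load_config` and placeholder config paths and component names:

```python
from spacy.util import load_config

# Load an existing training config and freeze two pretrained
# components, so `spacy train` leaves them untouched
config = load_config("config.cfg")
config["training"]["frozen_components"] = ["parser", "ner"]
config.to_disk("config_frozen.cfg")
```
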
@@ -168,9 +168,13 @@ follow the same unified [`Model`](https://thinc.ai/docs/api-model) API and each
 `Model` can also be used as a sublayer of a larger network, allowing you to
 freely combine implementations from different frameworks into a single model.

+![Illustration of Pipe methods](../images/trainable_component.svg)
+
 <Infobox title="Details & Documentation" emoji="📖" list>

-- **Usage:** [Layers and architectures](/usage/layers-architectures)
+- **Usage:** [Layers and architectures](/usage/layers-architectures),
+  [Trainable component API](/usage/processing-pipelines#trainable-components),
+  [Trainable components and models](/usage/layers-architectures#components)
 - **Thinc:**
   [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks),
   [`Model` API](https://thinc.ai/docs/api-model)

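A minimal sketch of the framework wrapping the Infobox links to, with arbitrary layer sizes chosen for the example:

```python
import torch.nn
from thinc.api import Linear, PyTorchWrapper, chain

# Wrap an arbitrary PyTorch module as a Thinc Model...
wrapped = PyTorchWrapper(torch.nn.Linear(16, 8))
# ...and use it as a sublayer of a larger Thinc network
model = chain(wrapped, Linear(nO=4, nI=8))

# The combined model follows the unified Thinc Model API
model.initialize()
```
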
@@ -503,36 +507,27 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 - Pipeline package symlinks, the `link` command and shortcut names are now
   deprecated. There can be many [different trained pipelines](/models) and not
   just one "English model", so you should always use the full package name like
-  [`en_core_web_sm`](/models/en) explicitly.
+  `en_core_web_sm` explicitly.
-- A pipeline's [`meta.json`](/api/data-formats#meta) is now only used to provide
-  meta information like the package name, author, license and labels. It's
-  **not** used to construct the processing pipeline anymore. This is all defined
-  in the [`config.cfg`](/api/data-formats#config), which also includes all
-  settings used to train the pipeline.
+- A pipeline's `meta.json` is now only used to provide meta information like the
+  package name, author, license and labels. It's **not** used to construct the
+  processing pipeline anymore. This is all defined in the
+  [`config.cfg`](/api/data-formats#config), which also includes all settings
+  used to train the pipeline.
-- The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now
-  only take a `config.cfg` file containing the full
-  [training config](/usage/training#config).
+- The `train`, `pretrain` and `debug data` commands now only take a
+  `config.cfg`.
 - [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
   the component factory instead of the component function.
 - **Custom pipeline components** now need to be decorated with the
   [`@Language.component`](/api/language#component) or
   [`@Language.factory`](/api/language#factory) decorator.
-- [`Language.update`](/api/language#update) now takes a batch of
-  [`Example`](/api/example) objects instead of raw texts and annotations, or
-  `Doc` and `GoldParse` objects.
-- The `Language.disable_pipes` context manager has been replaced by
-  [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
-  disable or enable components.
 - The [`Language.update`](/api/language#update),
   [`Language.evaluate`](/api/language#evaluate) and
   [`Pipe.update`](/api/pipe#update) methods now all take batches of
   [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
   raw text and a dictionary of annotations.
-- [`Language.initialize`](/api/language#initialize) and
-  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
-  sequence of `Example` objects to initialize the model instead of a list of
-  tuples.
-- The `begin_training` methods have been renamed to `initialize`.
+- The `begin_training` methods have been renamed to `initialize` and now take a
+  function that returns a sequence of `Example` objects to initialize the model
+  instead of a list of tuples.
 - [`Matcher.add`](/api/matcher#add) and
   [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
   patterns as the second argument (instead of a variable number of arguments).

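To illustrate the renamed `initialize` and the `Example`-based `update` in one place, a minimal sketch of the v3 entry points; the component choice and annotations are invented for the example:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("ORG")

# v3: begin_training has been renamed to initialize
optimizer = nlp.initialize()

# v3: update takes a batch of Example objects instead of raw text
# plus a GoldParse or annotation dict
doc = nlp.make_doc("Apple is looking at buying a startup")
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})
losses = nlp.update([example], sgd=optimizer)
```
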
@@ -557,7 +552,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.

 | Removed | Replacement |
 | ------- | ----------- |
-| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
+| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) |
 | `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
 | `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
 | `GoldParse` | [`Example`](/api/example) |

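A short sketch of the first replacement row in practice, assuming the `en_core_web_sm` package is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Replacement for the removed Language.disable_pipes context manager:
# the disabled components are restored when the block exits
with nlp.select_pipes(disable=["parser", "ner"]):
    doc = nlp("Only the remaining components run on this text.")

# Components can also be toggled individually and permanently
nlp.disable_pipe("parser")
nlp.enable_pipe("parser")
```
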