mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Merge branch 'develop' into feature/component-scores
This commit is contained in:
commit
894e20c466
|
@ -19,6 +19,7 @@ def package_cli(
|
||||||
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
||||||
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
||||||
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
||||||
|
no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"),
|
||||||
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
|
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
|
@ -37,6 +38,7 @@ def package_cli(
|
||||||
meta_path=meta_path,
|
meta_path=meta_path,
|
||||||
version=version,
|
version=version,
|
||||||
create_meta=create_meta,
|
create_meta=create_meta,
|
||||||
|
create_sdist=not no_sdist,
|
||||||
force=force,
|
force=force,
|
||||||
silent=False,
|
silent=False,
|
||||||
)
|
)
|
||||||
|
@ -48,6 +50,7 @@ def package(
|
||||||
meta_path: Optional[Path] = None,
|
meta_path: Optional[Path] = None,
|
||||||
version: Optional[str] = None,
|
version: Optional[str] = None,
|
||||||
create_meta: bool = False,
|
create_meta: bool = False,
|
||||||
|
create_sdist: bool = True,
|
||||||
force: bool = False,
|
force: bool = False,
|
||||||
silent: bool = True,
|
silent: bool = True,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
@ -61,7 +64,6 @@ def package(
|
||||||
msg.fail("Output directory not found", output_path, exits=1)
|
msg.fail("Output directory not found", output_path, exits=1)
|
||||||
if meta_path and not meta_path.exists():
|
if meta_path and not meta_path.exists():
|
||||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||||
|
|
||||||
meta_path = meta_path or input_dir / "meta.json"
|
meta_path = meta_path or input_dir / "meta.json"
|
||||||
if not meta_path.exists() or not meta_path.is_file():
|
if not meta_path.exists() or not meta_path.is_file():
|
||||||
msg.fail("Can't load model meta.json", meta_path, exits=1)
|
msg.fail("Can't load model meta.json", meta_path, exits=1)
|
||||||
|
@ -80,7 +82,6 @@ def package(
|
||||||
model_name_v = model_name + "-" + meta["version"]
|
model_name_v = model_name + "-" + meta["version"]
|
||||||
main_path = output_dir / model_name_v
|
main_path = output_dir / model_name_v
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
|
||||||
if package_path.exists():
|
if package_path.exists():
|
||||||
if force:
|
if force:
|
||||||
shutil.rmtree(str(package_path))
|
shutil.rmtree(str(package_path))
|
||||||
|
@ -98,10 +99,11 @@ def package(
|
||||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||||
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
||||||
with util.working_dir(main_path):
|
if create_sdist:
|
||||||
util.run_command([sys.executable, "setup.py", "sdist"])
|
with util.working_dir(main_path):
|
||||||
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
|
util.run_command([sys.executable, "setup.py", "sdist"])
|
||||||
msg.good(f"Successfully created zipped Python package", zip_file)
|
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
|
||||||
|
msg.good(f"Successfully created zipped Python package", zip_file)
|
||||||
|
|
||||||
|
|
||||||
def create_file(file_path: Path, contents: str) -> None:
|
def create_file(file_path: Path, contents: str) -> None:
|
||||||
|
|
|
@ -39,18 +39,13 @@ score_weights = {}
|
||||||
# These settings are invalid for the transformer models.
|
# These settings are invalid for the transformer models.
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
discard_oversize = false
|
discard_oversize = false
|
||||||
batch_by = "sequences"
|
|
||||||
raw_text = null
|
raw_text = null
|
||||||
tag_map = null
|
tag_map = null
|
||||||
morph_rules = null
|
morph_rules = null
|
||||||
base_model = null
|
base_model = null
|
||||||
vectors = null
|
vectors = null
|
||||||
|
batch_by = "words"
|
||||||
[training.batch_size]
|
batch_size = 1000
|
||||||
@schedules = "compounding.v1"
|
|
||||||
start = 1000
|
|
||||||
stop = 1000
|
|
||||||
compound = 1.001
|
|
||||||
|
|
||||||
[training.optimizer]
|
[training.optimizer]
|
||||||
@optimizers = "Adam.v1"
|
@optimizers = "Adam.v1"
|
||||||
|
|
|
@ -110,6 +110,7 @@ class Language:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab: Union[Vocab, bool] = True,
|
vocab: Union[Vocab, bool] = True,
|
||||||
|
*,
|
||||||
max_length: int = 10 ** 6,
|
max_length: int = 10 ** 6,
|
||||||
meta: Dict[str, Any] = {},
|
meta: Dict[str, Any] = {},
|
||||||
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
||||||
|
@ -549,6 +550,7 @@ class Language:
|
||||||
resolved, filled = registry.resolve(cfg, validate=validate, overrides=overrides)
|
resolved, filled = registry.resolve(cfg, validate=validate, overrides=overrides)
|
||||||
filled = filled[factory_name]
|
filled = filled[factory_name]
|
||||||
filled["factory"] = factory_name
|
filled["factory"] = factory_name
|
||||||
|
filled.pop("@factories", None)
|
||||||
self._pipe_configs[name] = filled
|
self._pipe_configs[name] = filled
|
||||||
return resolved[factory_name]
|
return resolved[factory_name]
|
||||||
|
|
||||||
|
@ -1284,6 +1286,7 @@ class Language:
|
||||||
def from_config(
|
def from_config(
|
||||||
cls,
|
cls,
|
||||||
config: Union[Dict[str, Any], Config] = {},
|
config: Union[Dict[str, Any], Config] = {},
|
||||||
|
*,
|
||||||
disable: Iterable[str] = tuple(),
|
disable: Iterable[str] = tuple(),
|
||||||
overrides: Dict[str, Any] = {},
|
overrides: Dict[str, Any] = {},
|
||||||
auto_fill: bool = True,
|
auto_fill: bool = True,
|
||||||
|
|
|
@ -53,7 +53,7 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TextCat.v1")
|
@registry.architectures.register("spacy.TextCatEnsemble.v1")
|
||||||
def build_text_classifier(
|
def build_text_classifier(
|
||||||
width,
|
width,
|
||||||
embed_size,
|
embed_size,
|
||||||
|
|
|
@ -73,7 +73,6 @@ cdef class DependencyParser(Parser):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/dependencyparser
|
DOCS: https://spacy.io/api/dependencyparser
|
||||||
"""
|
"""
|
||||||
# cdef classes can't have decorators, so we're defining this here
|
|
||||||
TransitionSystem = ArcEager
|
TransitionSystem = ArcEager
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -107,6 +106,14 @@ cdef class DependencyParser(Parser):
|
||||||
return tuple(sorted(labels))
|
return tuple(sorted(labels))
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
def score(self, examples, **kwargs):
|
||||||
|
"""Score a batch of examples.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The examples to score.
|
||||||
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
|
||||||
|
and Scorer.score_deps.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/dependencyparser#score
|
||||||
|
"""
|
||||||
def dep_getter(token, attr):
|
def dep_getter(token, attr):
|
||||||
dep = getattr(token, attr)
|
dep = getattr(token, attr)
|
||||||
dep = token.vocab.strings.as_string(dep).lower()
|
dep = token.vocab.strings.as_string(dep).lower()
|
||||||
|
|
|
@ -86,6 +86,19 @@ class EntityLinker(Pipe):
|
||||||
incl_prior: bool,
|
incl_prior: bool,
|
||||||
incl_context: bool,
|
incl_context: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Initialize an entity linker.
|
||||||
|
|
||||||
|
vocab (Vocab): The shared vocabulary.
|
||||||
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
|
name (str): The component instance name, used to add entries to the
|
||||||
|
losses during training.
|
||||||
|
kb (KnowledgeBase): TODO:
|
||||||
|
labels_discard (Iterable[str]): TODO:
|
||||||
|
incl_prior (bool): TODO:
|
||||||
|
incl_context (bool): TODO:
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entitylinker#init
|
||||||
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -119,6 +132,19 @@ class EntityLinker(Pipe):
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
) -> Optimizer:
|
) -> Optimizer:
|
||||||
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
|
returns gold-standard Example objects.
|
||||||
|
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||||
|
components that this component is part of. Corresponds to
|
||||||
|
nlp.pipeline.
|
||||||
|
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||||
|
create_optimizer if it doesn't exist.
|
||||||
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entitylinker#begin_training
|
||||||
|
"""
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
nO = self.kb.entity_vector_length
|
nO = self.kb.entity_vector_length
|
||||||
self.set_output(nO)
|
self.set_output(nO)
|
||||||
|
@ -136,6 +162,20 @@ class EntityLinker(Pipe):
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
losses: Optional[Dict[str, float]] = None,
|
losses: Optional[Dict[str, float]] = None,
|
||||||
) -> Dict[str, float]:
|
) -> Dict[str, float]:
|
||||||
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
|
updating the pipe's model. Delegates to predict and get_loss.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): A batch of Example objects.
|
||||||
|
drop (float): The dropout rate.
|
||||||
|
set_annotations (bool): Whether or not to update the Example objects
|
||||||
|
with the predictions.
|
||||||
|
sgd (thinc.api.Optimizer): The optimizer.
|
||||||
|
losses (Dict[str, float]): Optional record of the loss during training.
|
||||||
|
Updated using the component name as the key.
|
||||||
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entitylinker#update
|
||||||
|
"""
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
|
@ -215,18 +255,43 @@ class EntityLinker(Pipe):
|
||||||
return loss, gradients
|
return loss, gradients
|
||||||
|
|
||||||
def __call__(self, doc: Doc) -> Doc:
|
def __call__(self, doc: Doc) -> Doc:
|
||||||
|
"""Apply the pipe to a Doc.
|
||||||
|
|
||||||
|
doc (Doc): The document to process.
|
||||||
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entitylinker#call
|
||||||
|
"""
|
||||||
kb_ids = self.predict([doc])
|
kb_ids = self.predict([doc])
|
||||||
self.set_annotations([doc], kb_ids)
|
self.set_annotations([doc], kb_ids)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||||
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
|
the hood when the nlp object is called on a text and all components are
|
||||||
|
applied to the Doc.
|
||||||
|
|
||||||
|
stream (Iterable[Doc]): A stream of documents.
|
||||||
|
batch_size (int): The number of documents to buffer.
|
||||||
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entitylinker#pipe
|
||||||
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
kb_ids = self.predict(docs)
|
kb_ids = self.predict(docs)
|
||||||
self.set_annotations(docs, kb_ids)
|
self.set_annotations(docs, kb_ids)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs: Iterable[Doc]) -> List[str]:
|
||||||
""" Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
|
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
|
Returns the KB IDs for each entity in each doc, including NIL if there is
|
||||||
|
no prediction.
|
||||||
|
|
||||||
|
docs (Iterable[Doc]): The documents to predict.
|
||||||
|
RETURNS (List[str]): The model's prediction for each document.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entitylinker#predict
|
||||||
|
"""
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
entity_count = 0
|
entity_count = 0
|
||||||
final_kb_ids = []
|
final_kb_ids = []
|
||||||
|
@ -315,7 +380,14 @@ class EntityLinker(Pipe):
|
||||||
raise RuntimeError(err)
|
raise RuntimeError(err)
|
||||||
return final_kb_ids
|
return final_kb_ids
|
||||||
|
|
||||||
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[int]) -> None:
|
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
|
||||||
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
||||||
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
|
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entitylinker#set_annotations
|
||||||
|
"""
|
||||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||||
if count_ents != len(kb_ids):
|
if count_ents != len(kb_ids):
|
||||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||||
|
@ -328,6 +400,13 @@ class EntityLinker(Pipe):
|
||||||
token.ent_kb_id_ = kb_id
|
token.ent_kb_id_ = kb_id
|
||||||
|
|
||||||
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None:
|
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None:
|
||||||
|
"""Serialize the pipe to disk.
|
||||||
|
|
||||||
|
path (str / Path): Path to a directory.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entitylinker#to_disk
|
||||||
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
self.cfg["entity_width"] = self.kb.entity_vector_length
|
self.cfg["entity_width"] = self.kb.entity_vector_length
|
||||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
|
@ -339,6 +418,15 @@ class EntityLinker(Pipe):
|
||||||
def from_disk(
|
def from_disk(
|
||||||
self, path: Union[str, Path], exclude: Iterable[str] = tuple()
|
self, path: Union[str, Path], exclude: Iterable[str] = tuple()
|
||||||
) -> "EntityLinker":
|
) -> "EntityLinker":
|
||||||
|
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
path (str / Path): Path to a directory.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (EntityLinker): The modified EntityLinker object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entitylinker#from_disk
|
||||||
|
"""
|
||||||
|
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(p.open("rb").read())
|
self.model.from_bytes(p.open("rb").read())
|
||||||
|
@ -359,7 +447,7 @@ class EntityLinker(Pipe):
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def rehearse(self, examples, sgd=None, losses=None, **config):
|
def rehearse(self, examples, *, sgd=None, losses=None, **config):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
|
|
|
@ -74,6 +74,10 @@ class EntityRuler:
|
||||||
|
|
||||||
nlp (Language): The shared nlp object to pass the vocab to the matchers
|
nlp (Language): The shared nlp object to pass the vocab to the matchers
|
||||||
and process phrase patterns.
|
and process phrase patterns.
|
||||||
|
name (str): Instance name of the current pipeline component. Typically
|
||||||
|
passed in automatically from the factory when the component is
|
||||||
|
added. Used to disable the current entity ruler while creating
|
||||||
|
phrase patterns with the nlp object.
|
||||||
phrase_matcher_attr (int / str): Token attribute to match on, passed
|
phrase_matcher_attr (int / str): Token attribute to match on, passed
|
||||||
to the internal PhraseMatcher as `attr`
|
to the internal PhraseMatcher as `attr`
|
||||||
validate (bool): Whether patterns should be validated, passed to
|
validate (bool): Whether patterns should be validated, passed to
|
||||||
|
|
|
@ -63,6 +63,17 @@ class Morphologizer(Tagger):
|
||||||
labels_morph: Optional[dict] = None,
|
labels_morph: Optional[dict] = None,
|
||||||
labels_pos: Optional[dict] = None,
|
labels_pos: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
|
"""Initialize a morphologizer.
|
||||||
|
|
||||||
|
vocab (Vocab): The shared vocabulary.
|
||||||
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
|
name (str): The component instance name, used to add entries to the
|
||||||
|
losses during training.
|
||||||
|
labels_morph (dict): TODO:
|
||||||
|
labels_pos (dict): TODO:
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#init
|
||||||
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -79,9 +90,17 @@ class Morphologizer(Tagger):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
"""RETURNS (Tuple[str]): The labels currently added to the component."""
|
||||||
return tuple(self.cfg["labels_morph"].keys())
|
return tuple(self.cfg["labels_morph"].keys())
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
|
"""Add a new label to the pipe.
|
||||||
|
|
||||||
|
label (str): The label to add.
|
||||||
|
RETURNS (int): 1
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#add_label
|
||||||
|
"""
|
||||||
if not isinstance(label, str):
|
if not isinstance(label, str):
|
||||||
raise ValueError(Errors.E187)
|
raise ValueError(Errors.E187)
|
||||||
if label in self.labels:
|
if label in self.labels:
|
||||||
|
@ -101,7 +120,20 @@ class Morphologizer(Tagger):
|
||||||
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
|
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
|
||||||
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
|
returns gold-standard Example objects.
|
||||||
|
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||||
|
components that this component is part of. Corresponds to
|
||||||
|
nlp.pipeline.
|
||||||
|
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||||
|
create_optimizer if it doesn't exist.
|
||||||
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#begin_training
|
||||||
|
"""
|
||||||
for example in get_examples():
|
for example in get_examples():
|
||||||
for i, token in enumerate(example.reference):
|
for i, token in enumerate(example.reference):
|
||||||
pos = token.pos_
|
pos = token.pos_
|
||||||
|
@ -123,6 +155,13 @@ class Morphologizer(Tagger):
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
||||||
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
|
batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#set_annotations
|
||||||
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
|
@ -139,6 +178,15 @@ class Morphologizer(Tagger):
|
||||||
doc.is_morphed = True
|
doc.is_morphed = True
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
|
"""Find the loss and gradient of loss for the batch of documents and
|
||||||
|
their predicted scores.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The batch of examples.
|
||||||
|
scores: Scores representing the model's predictions.
|
||||||
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#get_loss
|
||||||
|
"""
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
truths = []
|
truths = []
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
|
@ -166,6 +214,15 @@ class Morphologizer(Tagger):
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
def score(self, examples, **kwargs):
|
||||||
|
"""Score a batch of examples.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The examples to score.
|
||||||
|
RETURNS (Dict[str, Any]): The scores, produced by
|
||||||
|
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||||
|
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#score
|
||||||
|
"""
|
||||||
results = {}
|
results = {}
|
||||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||||
results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
|
results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
|
||||||
|
@ -174,6 +231,13 @@ class Morphologizer(Tagger):
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple()):
|
def to_bytes(self, exclude=tuple()):
|
||||||
|
"""Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#to_bytes
|
||||||
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
|
@ -181,6 +245,14 @@ class Morphologizer(Tagger):
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple()):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
|
"""Load the pipe from a bytestring.
|
||||||
|
|
||||||
|
bytes_data (bytes): The serialized pipe.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (Morphologizer): The loaded Morphologizer.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#from_bytes
|
||||||
|
"""
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
@ -196,6 +268,13 @@ class Morphologizer(Tagger):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple()):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
|
"""Serialize the pipe to disk.
|
||||||
|
|
||||||
|
path (str / Path): Path to a directory.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#to_disk
|
||||||
|
"""
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
||||||
|
@ -204,6 +283,14 @@ class Morphologizer(Tagger):
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple()):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
|
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
path (str / Path): Path to a directory.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (Morphologizer): The modified Morphologizer object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/morphologizer#from_disk
|
||||||
|
"""
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -94,4 +94,11 @@ cdef class EntityRecognizer(Parser):
|
||||||
return tuple(sorted(labels))
|
return tuple(sorted(labels))
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
def score(self, examples, **kwargs):
|
||||||
|
"""Score a batch of examples.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The examples to score.
|
||||||
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/entityrecognizer#score
|
||||||
|
"""
|
||||||
return Scorer.score_spans(examples, "ents", **kwargs)
|
return Scorer.score_spans(examples, "ents", **kwargs)
|
||||||
|
|
|
@ -23,7 +23,7 @@ class Pipe:
|
||||||
|
|
||||||
name = None
|
name = None
|
||||||
|
|
||||||
def __init__(self, vocab, model, **cfg):
|
def __init__(self, vocab, model, name, **cfg):
|
||||||
"""Create a new pipe instance."""
|
"""Create a new pipe instance."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -79,7 +79,7 @@ class Pipe:
|
||||||
def create_optimizer(self):
|
def create_optimizer(self):
|
||||||
return create_default_optimizer()
|
return create_default_optimizer()
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
|
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
If no model has been initialized yet, the model is added."""
|
If no model has been initialized yet, the model is added."""
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
|
|
|
@ -43,7 +43,7 @@ class Sentencizer(Pipe):
|
||||||
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
||||||
'。', '。']
|
'。', '。']
|
||||||
|
|
||||||
def __init__(self, name="sentencizer", *, punct_chars):
|
def __init__(self, name="sentencizer", *, punct_chars=None):
|
||||||
"""Initialize the sentencizer.
|
"""Initialize the sentencizer.
|
||||||
|
|
||||||
punct_chars (list): Punctuation characters to split on. Will be
|
punct_chars (list): Punctuation characters to split on. Will be
|
||||||
|
@ -64,8 +64,8 @@ class Sentencizer(Pipe):
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
||||||
|
|
||||||
example (Doc or Example): The document to process.
|
doc (Doc): The document to process.
|
||||||
RETURNS (Doc or Example): The processed Doc or Example.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#call
|
DOCS: https://spacy.io/api/sentencizer#call
|
||||||
"""
|
"""
|
||||||
|
@ -85,14 +85,26 @@ class Sentencizer(Pipe):
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128):
|
def pipe(self, stream, batch_size=128):
|
||||||
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
|
the hood when the nlp object is called on a text and all components are
|
||||||
|
applied to the Doc.
|
||||||
|
|
||||||
|
stream (Iterable[Doc]): A stream of documents.
|
||||||
|
batch_size (int): The number of documents to buffer.
|
||||||
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencizer#pipe
|
||||||
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
predictions = self.predict(docs)
|
predictions = self.predict(docs)
|
||||||
self.set_annotations(docs, predictions)
|
self.set_annotations(docs, predictions)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
"""Apply the pipeline's model to a batch of docs, without
|
"""Apply the pipe to a batch of docs, without modifying them.
|
||||||
modifying them.
|
|
||||||
|
docs (Iterable[Doc]): The documents to predict.
|
||||||
|
RETURNS: The predictions for each document.
|
||||||
"""
|
"""
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
|
@ -119,6 +131,11 @@ class Sentencizer(Pipe):
|
||||||
return guesses
|
return guesses
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
||||||
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
|
batch_tag_ids: The tag IDs produced by Sentencizer.predict.
|
||||||
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
|
@ -134,6 +151,13 @@ class Sentencizer(Pipe):
|
||||||
doc.c[j].sent_start = -1
|
doc.c[j].sent_start = -1
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
def score(self, examples, **kwargs):
|
||||||
|
"""Score a batch of examples.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The examples to score.
|
||||||
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencizer#score
|
||||||
|
"""
|
||||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||||
del results["sents_per_type"]
|
del results["sents_per_type"]
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -47,6 +47,15 @@ class SentenceRecognizer(Tagger):
|
||||||
DOCS: https://spacy.io/api/sentencerecognizer
|
DOCS: https://spacy.io/api/sentencerecognizer
|
||||||
"""
|
"""
|
||||||
def __init__(self, vocab, model, name="senter"):
|
def __init__(self, vocab, model, name="senter"):
|
||||||
|
"""Initialize a sentence recognizer.
|
||||||
|
|
||||||
|
vocab (Vocab): The shared vocabulary.
|
||||||
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
|
name (str): The component instance name, used to add entries to the
|
||||||
|
losses during training.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencerecognizer#init
|
||||||
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -55,12 +64,20 @@ class SentenceRecognizer(Tagger):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
"""RETURNS (Tuple[str]): The labels."""
|
||||||
# labels are numbered by index internally, so this matches GoldParse
|
# labels are numbered by index internally, so this matches GoldParse
|
||||||
# and Example where the sentence-initial tag is 1 and other positions
|
# and Example where the sentence-initial tag is 1 and other positions
|
||||||
# are 0
|
# are 0
|
||||||
return tuple(["I", "S"])
|
return tuple(["I", "S"])
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
||||||
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
|
batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
|
||||||
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
|
@ -77,6 +94,15 @@ class SentenceRecognizer(Tagger):
|
||||||
doc.c[j].sent_start = -1
|
doc.c[j].sent_start = -1
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
|
"""Find the loss and gradient of loss for the batch of documents and
|
||||||
|
their predicted scores.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The batch of examples.
|
||||||
|
scores: Scores representing the model's predictions.
|
||||||
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencerecognizer#get_loss
|
||||||
|
"""
|
||||||
labels = self.labels
|
labels = self.labels
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
|
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
|
||||||
truths = []
|
truths = []
|
||||||
|
@ -96,7 +122,20 @@ class SentenceRecognizer(Tagger):
|
||||||
raise ValueError("nan value when computing loss")
|
raise ValueError("nan value when computing loss")
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
|
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
|
||||||
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
|
returns gold-standard Example objects.
|
||||||
|
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||||
|
components that this component is part of. Corresponds to
|
||||||
|
nlp.pipeline.
|
||||||
|
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||||
|
create_optimizer if it doesn't exist.
|
||||||
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencerecognizer#begin_training
|
||||||
|
"""
|
||||||
self.set_output(len(self.labels))
|
self.set_output(len(self.labels))
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
util.link_vectors_to_models(self.vocab)
|
util.link_vectors_to_models(self.vocab)
|
||||||
|
@ -108,11 +147,24 @@ class SentenceRecognizer(Tagger):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
def score(self, examples, **kwargs):
|
||||||
|
"""Score a batch of examples.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The examples to score.
|
||||||
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||||
|
DOCS: https://spacy.io/api/sentencerecognizer#score
|
||||||
|
"""
|
||||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||||
del results["sents_per_type"]
|
del results["sents_per_type"]
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple()):
|
def to_bytes(self, exclude=tuple()):
|
||||||
|
"""Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencerecognizer#to_bytes
|
||||||
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
|
@ -120,6 +172,14 @@ class SentenceRecognizer(Tagger):
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple()):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
|
"""Load the pipe from a bytestring.
|
||||||
|
|
||||||
|
bytes_data (bytes): The serialized pipe.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (SentenceRecognizer): The loaded SentenceRecognizer.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencerecognizer#from_bytes
|
||||||
|
"""
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
@ -135,6 +195,13 @@ class SentenceRecognizer(Tagger):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple()):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
|
"""Serialize the pipe to disk.
|
||||||
|
|
||||||
|
path (str / Path): Path to a directory.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencerecognizer#to_disk
|
||||||
|
"""
|
||||||
serialize = {
|
serialize = {
|
||||||
"vocab": lambda p: self.vocab.to_disk(p),
|
"vocab": lambda p: self.vocab.to_disk(p),
|
||||||
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
||||||
|
@ -143,6 +210,14 @@ class SentenceRecognizer(Tagger):
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple()):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
|
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
path (str / Path): Path to a directory.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (SentenceRecognizer): The modified SentenceRecognizer object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/sentencerecognizer#from_disk
|
||||||
|
"""
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -53,6 +53,16 @@ class Tagger(Pipe):
|
||||||
DOCS: https://spacy.io/api/tagger
|
DOCS: https://spacy.io/api/tagger
|
||||||
"""
|
"""
|
||||||
def __init__(self, vocab, model, name="tagger", *, set_morphology=False):
|
def __init__(self, vocab, model, name="tagger", *, set_morphology=False):
|
||||||
|
"""Initialize a part-of-speech tagger.
|
||||||
|
|
||||||
|
vocab (Vocab): The shared vocabulary.
|
||||||
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
|
name (str): The component instance name, used to add entries to the
|
||||||
|
losses during training.
|
||||||
|
set_morphology (bool): Whether to set morphological features.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#init
|
||||||
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -62,20 +72,52 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
"""The labels currently added to the component. Note that even for a
|
||||||
|
blank component, this will always include the built-in coarse-grained
|
||||||
|
part-of-speech tags by default.
|
||||||
|
|
||||||
|
RETURNS (Tuple[str]): The labels.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#labels
|
||||||
|
"""
|
||||||
return tuple(self.vocab.morphology.tag_names)
|
return tuple(self.vocab.morphology.tag_names)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
|
"""Apply the pipe to a Doc.
|
||||||
|
|
||||||
|
doc (Doc): The document to process.
|
||||||
|
RETURNS (Doc): The processed Doc.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#call
|
||||||
|
"""
|
||||||
tags = self.predict([doc])
|
tags = self.predict([doc])
|
||||||
self.set_annotations([doc], tags)
|
self.set_annotations([doc], tags)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128):
|
def pipe(self, stream, *, batch_size=128):
|
||||||
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
|
the hood when the nlp object is called on a text and all components are
|
||||||
|
applied to the Doc.
|
||||||
|
|
||||||
|
stream (Iterable[Doc]): A stream of documents.
|
||||||
|
batch_size (int): The number of documents to buffer.
|
||||||
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#pipe
|
||||||
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
tag_ids = self.predict(docs)
|
tag_ids = self.predict(docs)
|
||||||
self.set_annotations(docs, tag_ids)
|
self.set_annotations(docs, tag_ids)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
|
|
||||||
|
docs (Iterable[Doc]): The documents to predict.
|
||||||
|
RETURNS: The model's prediction for each document.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#predict
|
||||||
|
"""
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
n_labels = len(self.labels)
|
n_labels = len(self.labels)
|
||||||
|
@ -98,6 +140,13 @@ class Tagger(Pipe):
|
||||||
return guesses
|
return guesses
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
||||||
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
|
batch_tag_ids: The IDs to set, produced by Tagger.predict.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#set_annotations
|
||||||
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
|
@ -123,10 +172,23 @@ class Tagger(Pipe):
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
|
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
|
||||||
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
|
updating the pipe's model. Delegates to predict and get_loss.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): A batch of Example objects.
|
||||||
|
drop (float): The dropout rate.
|
||||||
|
set_annotations (bool): Whether or not to update the Example objects
|
||||||
|
with the predictions.
|
||||||
|
sgd (thinc.api.Optimizer): The optimizer.
|
||||||
|
losses (Dict[str, float]): Optional record of the loss during training.
|
||||||
|
Updated using the component name as the key.
|
||||||
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#update
|
||||||
|
"""
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
|
@ -151,9 +213,20 @@ class Tagger(Pipe):
|
||||||
self.set_annotations(docs, self._scores2guesses(tag_scores))
|
self.set_annotations(docs, self._scores2guesses(tag_scores))
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
def rehearse(self, examples, *, drop=0., sgd=None, losses=None):
|
||||||
"""Perform a 'rehearsal' update, where we try to match the output of
|
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
||||||
an initial model.
|
teach the current model to make predictions similar to an initial model,
|
||||||
|
to try to address the "catastrophic forgetting" problem. This feature is
|
||||||
|
experimental.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): A batch of Example objects.
|
||||||
|
drop (float): The dropout rate.
|
||||||
|
sgd (thinc.api.Optimizer): The optimizer.
|
||||||
|
losses (Dict[str, float]): Optional record of the loss during training.
|
||||||
|
Updated using the component name as the key.
|
||||||
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#rehearse
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
|
@ -176,6 +249,15 @@ class Tagger(Pipe):
|
||||||
losses[self.name] += (gradient**2).sum()
|
losses[self.name] += (gradient**2).sum()
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
|
"""Find the loss and gradient of loss for the batch of documents and
|
||||||
|
their predicted scores.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The batch of examples.
|
||||||
|
scores: Scores representing the model's predictions.
|
||||||
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#get_loss
|
||||||
|
"""
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
||||||
d_scores, loss = loss_func(scores, truths)
|
d_scores, loss = loss_func(scores, truths)
|
||||||
|
@ -183,7 +265,20 @@ class Tagger(Pipe):
|
||||||
raise ValueError("nan value when computing loss")
|
raise ValueError("nan value when computing loss")
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
|
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
|
||||||
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
|
returns gold-standard Example objects.
|
||||||
|
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||||
|
components that this component is part of. Corresponds to
|
||||||
|
nlp.pipeline.
|
||||||
|
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||||
|
create_optimizer if it doesn't exist.
|
||||||
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#begin_training
|
||||||
|
"""
|
||||||
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
||||||
if not any(table in self.vocab.lookups for table in lemma_tables):
|
if not any(table in self.vocab.lookups for table in lemma_tables):
|
||||||
warnings.warn(Warnings.W022)
|
warnings.warn(Warnings.W022)
|
||||||
|
@ -229,6 +324,15 @@ class Tagger(Pipe):
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
def add_label(self, label, values=None):
|
def add_label(self, label, values=None):
|
||||||
|
"""Add a new label to the pipe.
|
||||||
|
|
||||||
|
label (str): The label to add.
|
||||||
|
values (Dict[int, str]): Optional values to map to the label, e.g. a
|
||||||
|
tag map dictionary.
|
||||||
|
RETURNS (int): 1
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#add_label
|
||||||
|
"""
|
||||||
if not isinstance(label, str):
|
if not isinstance(label, str):
|
||||||
raise ValueError(Errors.E187)
|
raise ValueError(Errors.E187)
|
||||||
if label in self.labels:
|
if label in self.labels:
|
||||||
|
@ -256,6 +360,14 @@ class Tagger(Pipe):
|
||||||
yield
|
yield
|
||||||
|
|
||||||
def score(self, examples, **kwargs):
|
def score(self, examples, **kwargs):
|
||||||
|
"""Score a batch of examples.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The examples to score.
|
||||||
|
RETURNS (Dict[str, Any]): The scores, produced by
|
||||||
|
Scorer.score_token_attr for the attributes "tag", "pos" and "lemma".
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#score
|
||||||
|
"""
|
||||||
scores = {}
|
scores = {}
|
||||||
scores.update(Scorer.score_token_attr(examples, "tag", **kwargs))
|
scores.update(Scorer.score_token_attr(examples, "tag", **kwargs))
|
||||||
scores.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
scores.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||||
|
@ -263,6 +375,13 @@ class Tagger(Pipe):
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple()):
|
def to_bytes(self, exclude=tuple()):
|
||||||
|
"""Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (bytes): The serialized object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#to_bytes
|
||||||
|
"""
|
||||||
serialize = {}
|
serialize = {}
|
||||||
serialize["model"] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize["vocab"] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
|
@ -274,6 +393,14 @@ class Tagger(Pipe):
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple()):
|
def from_bytes(self, bytes_data, exclude=tuple()):
|
||||||
|
"""Load the pipe from a bytestring.
|
||||||
|
|
||||||
|
bytes_data (bytes): The serialized pipe.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (Tagger): The loaded Tagger.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#from_bytes
|
||||||
|
"""
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
try:
|
try:
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
@ -302,6 +429,13 @@ class Tagger(Pipe):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple()):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
|
"""Serialize the pipe to disk.
|
||||||
|
|
||||||
|
path (str / Path): Path to a directory.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#to_disk
|
||||||
|
"""
|
||||||
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
|
||||||
morph_rules = dict(self.vocab.morphology.exc)
|
morph_rules = dict(self.vocab.morphology.exc)
|
||||||
serialize = {
|
serialize = {
|
||||||
|
@ -314,6 +448,14 @@ class Tagger(Pipe):
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple()):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
|
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
path (str / Path): Path to a directory.
|
||||||
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||||
|
RETURNS (Tagger): The modified Tagger object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/tagger#from_disk
|
||||||
|
"""
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
with p.open("rb") as file_:
|
with p.open("rb") as file_:
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -14,7 +14,7 @@ from ..vocab import Vocab
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
@architectures = "spacy.TextCat.v1"
|
@architectures = "spacy.TextCatEnsemble.v1"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
pretrained_vectors = null
|
pretrained_vectors = null
|
||||||
width = 64
|
width = 64
|
||||||
|
@ -79,6 +79,16 @@ class TextCategorizer(Pipe):
|
||||||
*,
|
*,
|
||||||
labels: Iterable[str],
|
labels: Iterable[str],
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Initialize a text categorizer.
|
||||||
|
|
||||||
|
vocab (Vocab): The shared vocabulary.
|
||||||
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
|
name (str): The component instance name, used to add entries to the
|
||||||
|
losses during training.
|
||||||
|
labels (Iterable[str]): The labels to use.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#init
|
||||||
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -88,6 +98,10 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self) -> Tuple[str]:
|
def labels(self) -> Tuple[str]:
|
||||||
|
"""RETURNS (Tuple[str]): The labels currently added to the component.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#labels
|
||||||
|
"""
|
||||||
return tuple(self.cfg.setdefault("labels", []))
|
return tuple(self.cfg.setdefault("labels", []))
|
||||||
|
|
||||||
def require_labels(self) -> None:
|
def require_labels(self) -> None:
|
||||||
|
@ -99,13 +113,30 @@ class TextCategorizer(Pipe):
|
||||||
def labels(self, value: Iterable[str]) -> None:
|
def labels(self, value: Iterable[str]) -> None:
|
||||||
self.cfg["labels"] = tuple(value)
|
self.cfg["labels"] = tuple(value)
|
||||||
|
|
||||||
def pipe(self, stream: Iterator[str], batch_size: int = 128) -> Iterator[Doc]:
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||||
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
|
the hood when the nlp object is called on a text and all components are
|
||||||
|
applied to the Doc.
|
||||||
|
|
||||||
|
stream (Iterable[Doc]): A stream of documents.
|
||||||
|
batch_size (int): The number of documents to buffer.
|
||||||
|
YIELDS (Doc): Processed documents in order.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#pipe
|
||||||
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
scores = self.predict(docs)
|
scores = self.predict(docs)
|
||||||
self.set_annotations(docs, scores)
|
self.set_annotations(docs, scores)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs: Iterable[Doc]):
|
def predict(self, docs: Iterable[Doc]):
|
||||||
|
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
|
|
||||||
|
docs (Iterable[Doc]): The documents to predict.
|
||||||
|
RETURNS: The model's prediction for each document.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#predict
|
||||||
|
"""
|
||||||
tensors = [doc.tensor for doc in docs]
|
tensors = [doc.tensor for doc in docs]
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
|
@ -117,6 +148,13 @@ class TextCategorizer(Pipe):
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
def set_annotations(self, docs: Iterable[Doc], scores) -> None:
|
def set_annotations(self, docs: Iterable[Doc], scores) -> None:
|
||||||
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
||||||
|
docs (Iterable[Doc]): The documents to modify.
|
||||||
|
scores: The scores to set, produced by TextCategorizer.predict.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#set_annotations
|
||||||
|
"""
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
for j, label in enumerate(self.labels):
|
for j, label in enumerate(self.labels):
|
||||||
doc.cats[label] = float(scores[i, j])
|
doc.cats[label] = float(scores[i, j])
|
||||||
|
@ -130,6 +168,20 @@ class TextCategorizer(Pipe):
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
losses: Optional[Dict[str, float]] = None,
|
losses: Optional[Dict[str, float]] = None,
|
||||||
) -> Dict[str, float]:
|
) -> Dict[str, float]:
|
||||||
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
|
updating the pipe's model. Delegates to predict and get_loss.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): A batch of Example objects.
|
||||||
|
drop (float): The dropout rate.
|
||||||
|
set_annotations (bool): Whether or not to update the Example objects
|
||||||
|
with the predictions.
|
||||||
|
sgd (thinc.api.Optimizer): The optimizer.
|
||||||
|
losses (Dict[str, float]): Optional record of the loss during training.
|
||||||
|
Updated using the component name as the key.
|
||||||
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#update
|
||||||
|
"""
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
|
@ -157,10 +209,25 @@ class TextCategorizer(Pipe):
|
||||||
def rehearse(
|
def rehearse(
|
||||||
self,
|
self,
|
||||||
examples: Iterable[Example],
|
examples: Iterable[Example],
|
||||||
|
*,
|
||||||
drop: float = 0.0,
|
drop: float = 0.0,
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
losses: Optional[Dict[str, float]] = None,
|
losses: Optional[Dict[str, float]] = None,
|
||||||
) -> None:
|
) -> Dict[str, float]:
|
||||||
|
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
||||||
|
teach the current model to make predictions similar to an initial model,
|
||||||
|
to try to address the "catastrophic forgetting" problem. This feature is
|
||||||
|
experimental.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): A batch of Example objects.
|
||||||
|
drop (float): The dropout rate.
|
||||||
|
sgd (thinc.api.Optimizer): The optimizer.
|
||||||
|
losses (Dict[str, float]): Optional record of the loss during training.
|
||||||
|
Updated using the component name as the key.
|
||||||
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#rehearse
|
||||||
|
"""
|
||||||
if self._rehearsal_model is None:
|
if self._rehearsal_model is None:
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
|
@ -184,6 +251,7 @@ class TextCategorizer(Pipe):
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
losses[self.name] += (gradient ** 2).sum()
|
losses[self.name] += (gradient ** 2).sum()
|
||||||
|
return losses
|
||||||
|
|
||||||
def _examples_to_truth(
|
def _examples_to_truth(
|
||||||
self, examples: List[Example]
|
self, examples: List[Example]
|
||||||
|
@ -200,6 +268,15 @@ class TextCategorizer(Pipe):
|
||||||
return truths, not_missing
|
return truths, not_missing
|
||||||
|
|
||||||
def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
|
def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
|
||||||
|
"""Find the loss and gradient of loss for the batch of documents and
|
||||||
|
their predicted scores.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The batch of examples.
|
||||||
|
scores: Scores representing the model's predictions.
|
||||||
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#get_loss
|
||||||
|
"""
|
||||||
truths, not_missing = self._examples_to_truth(examples)
|
truths, not_missing = self._examples_to_truth(examples)
|
||||||
not_missing = self.model.ops.asarray(not_missing)
|
not_missing = self.model.ops.asarray(not_missing)
|
||||||
d_scores = (scores - truths) / scores.shape[0]
|
d_scores = (scores - truths) / scores.shape[0]
|
||||||
|
@ -208,6 +285,13 @@ class TextCategorizer(Pipe):
|
||||||
return float(mean_square_error), d_scores
|
return float(mean_square_error), d_scores
|
||||||
|
|
||||||
def add_label(self, label: str) -> int:
|
def add_label(self, label: str) -> int:
|
||||||
|
"""Add a new label to the pipe.
|
||||||
|
|
||||||
|
label (str): The label to add.
|
||||||
|
RETURNS (int): 1.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#add_label
|
||||||
|
"""
|
||||||
if not isinstance(label, str):
|
if not isinstance(label, str):
|
||||||
raise ValueError(Errors.E187)
|
raise ValueError(Errors.E187)
|
||||||
if label in self.labels:
|
if label in self.labels:
|
||||||
|
@ -228,10 +312,24 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
def begin_training(
|
def begin_training(
|
||||||
self,
|
self,
|
||||||
get_examples: Callable = lambda: [],
|
get_examples: Callable[[], Iterable[Example]] = lambda: [],
|
||||||
|
*,
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
sgd: Optional[Optimizer] = None,
|
||||||
) -> Optimizer:
|
) -> Optimizer:
|
||||||
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
|
|
||||||
|
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||||
|
returns gold-standard Example objects.
|
||||||
|
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||||
|
components that this component is part of. Corresponds to
|
||||||
|
nlp.pipeline.
|
||||||
|
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||||
|
create_optimizer if it doesn't exist.
|
||||||
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#begin_training
|
||||||
|
"""
|
||||||
# TODO: begin_training is not guaranteed to see all data / labels ?
|
# TODO: begin_training is not guaranteed to see all data / labels ?
|
||||||
examples = list(get_examples())
|
examples = list(get_examples())
|
||||||
for example in examples:
|
for example in examples:
|
||||||
|
@ -257,9 +355,18 @@ class TextCategorizer(Pipe):
|
||||||
def score(
|
def score(
|
||||||
self,
|
self,
|
||||||
examples: Iterable[Example],
|
examples: Iterable[Example],
|
||||||
|
*,
|
||||||
positive_label: Optional[str] = None,
|
positive_label: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
|
"""Score a batch of examples.
|
||||||
|
|
||||||
|
examples (Iterable[Example]): The examples to score.
|
||||||
|
positive_label (str): Optional positive label.
|
||||||
|
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/textcategorizer#score
|
||||||
|
"""
|
||||||
return Scorer.score_cats(
|
return Scorer.score_cats(
|
||||||
examples,
|
examples,
|
||||||
"cats",
|
"cats",
|
||||||
|
|
|
@ -160,7 +160,7 @@ cdef class Parser:
|
||||||
self.set_annotations([doc], states)
|
self.set_annotations([doc], states)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, docs, int batch_size=256):
|
def pipe(self, docs, *, int batch_size=256):
|
||||||
"""Process a stream of documents.
|
"""Process a stream of documents.
|
||||||
|
|
||||||
stream: The sequence of documents to process.
|
docs: The sequence of documents to process.
|
||||||
|
|
|
@ -155,7 +155,7 @@ def test_pipe_class_component_model():
|
||||||
name = "test_class_component_model"
|
name = "test_class_component_model"
|
||||||
default_config = {
|
default_config = {
|
||||||
"model": {
|
"model": {
|
||||||
"@architectures": "spacy.TextCat.v1",
|
"@architectures": "spacy.TextCatEnsemble.v1",
|
||||||
"exclusive_classes": False,
|
"exclusive_classes": False,
|
||||||
"pretrained_vectors": None,
|
"pretrained_vectors": None,
|
||||||
"width": 64,
|
"width": 64,
|
||||||
|
|
|
@ -133,9 +133,9 @@ def test_overfitting_IO():
|
||||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
|
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
|
||||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
|
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
|
||||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
|
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
|
||||||
{"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
|
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
|
||||||
{"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
|
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
|
||||||
{"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
|
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
|
||||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
|
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
|
||||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
|
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
|
||||||
],
|
],
|
||||||
|
|
|
@ -384,7 +384,7 @@ original file is shown at the top of the widget.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_component_countries_api.py
|
https://github.com/explosion/spaCy/tree/master/spacy/language.py
|
||||||
```
|
```
|
||||||
|
|
||||||
### Infobox
|
### Infobox
|
||||||
|
|
|
@ -535,13 +535,14 @@ then create a `.tar.gz` archive file that you can distribute and install with
|
||||||
<Infobox title="New in v3.0" variant="warning">
|
<Infobox title="New in v3.0" variant="warning">
|
||||||
|
|
||||||
The `spacy package` command now also builds the `.tar.gz` archive automatically,
|
The `spacy package` command now also builds the `.tar.gz` archive automatically,
|
||||||
so you don't have to run `python setup.py sdist` separately anymore.
|
so you don't have to run `python setup.py sdist` separately anymore. To disable
|
||||||
|
this, you can set the `--no-sdist` flag.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
||||||
[--version] [--force]
|
[--no-sdist] [--version] [--force]
|
||||||
```
|
```
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
@ -557,7 +558,8 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
||||||
| `input_dir` | positional | Path to directory containing model data. |
|
| `input_dir` | positional | Path to directory containing model data. |
|
||||||
| `output_dir` | positional | Directory to create package folder in. |
|
| `output_dir` | positional | Directory to create package folder in. |
|
||||||
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Path to `meta.json` file (optional). |
|
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Path to `meta.json` file (optional). |
|
||||||
| `--create-meta`, `-c` <Tag variant="new">2</Tag> | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. |
|
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. |
|
||||||
|
| `--no-sdist`, `-NS` | flag | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. |
|
||||||
| `--version`, `-v` <Tag variant="new">3</Tag> | option | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. |
|
| `--version`, `-v` <Tag variant="new">3</Tag> | option | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. |
|
||||||
| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. |
|
| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. |
|
||||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
|
8
website/docs/api/dependencymatcher.md
Normal file
8
website/docs/api/dependencymatcher.md
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
---
|
||||||
|
title: DependencyMatcher
|
||||||
|
teaser: Match sequences of tokens, based on the dependency parse
|
||||||
|
tag: class
|
||||||
|
source: spacy/matcher/dependencymatcher.pyx
|
||||||
|
---
|
||||||
|
|
||||||
|
TODO: write
|
|
@ -1,23 +1,41 @@
|
||||||
---
|
---
|
||||||
title: DependencyParser
|
title: DependencyParser
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/pipes.pyx
|
source: spacy/pipeline/dep_parser.pyx
|
||||||
|
teaser: 'Pipeline component for syntactic dependency parsing'
|
||||||
|
api_base_class: /api/pipe
|
||||||
|
api_string_name: parser
|
||||||
|
api_trainable: true
|
||||||
---
|
---
|
||||||
|
|
||||||
This class is a subclass of `Pipe` and follows the same API. The pipeline
|
## Config and implementation {#config}
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
|
||||||
via the ID `"parser"`.
|
|
||||||
|
|
||||||
## Default config {#config}
|
The default config is defined by the pipeline component factory and describes
|
||||||
|
how the component should be configured. You can override its settings via the
|
||||||
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
|
[`config.cfg` for training](/usage/training#config). See the
|
||||||
|
[model architectures](/api/architectures) documentation for details on the
|
||||||
|
architectures and their arguments and hyperparameters.
|
||||||
|
|
||||||
This is the default configuration used to initialize the model powering the
|
> #### Example
|
||||||
pipeline component. See the [model architectures](/api/architectures)
|
>
|
||||||
documentation for details on the architectures and their arguments and
|
> ```python
|
||||||
hyperparameters. To learn more about how to customize the config and train
|
> from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||||
custom models, check out the [training config](/usage/training#config) docs.
|
> config = {
|
||||||
|
> "moves": None,
|
||||||
|
> # TODO: rest
|
||||||
|
> "model": DEFAULT_PARSER_MODEL,
|
||||||
|
> }
|
||||||
|
> nlp.add_pipe("parser", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
|
||||||
|
| `moves` | list | <!-- TODO: --> | `None` |
|
||||||
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/parser_defaults.cfg
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx
|
||||||
```
|
```
|
||||||
|
|
||||||
## DependencyParser.\_\_init\_\_ {#init tag="method"}
|
## DependencyParser.\_\_init\_\_ {#init tag="method"}
|
||||||
|
@ -25,29 +43,33 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/parser_d
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe with default model
|
> # Construction via add_pipe with default model
|
||||||
> parser = nlp.create_pipe("parser")
|
> parser = nlp.add_pipe("parser")
|
||||||
>
|
>
|
||||||
> # Construction via create_pipe with custom model
|
> # Construction via add_pipe with custom model
|
||||||
> config = {"model": {"@architectures": "my_parser"}}
|
> config = {"model": {"@architectures": "my_parser"}}
|
||||||
> parser = nlp.create_pipe("parser", config)
|
> parser = nlp.add_pipe("parser", config=config)
|
||||||
>
|
>
|
||||||
> # Construction from class with custom model from file
|
> # Construction from class
|
||||||
> from spacy.pipeline import DependencyParser
|
> from spacy.pipeline import DependencyParser
|
||||||
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
|
||||||
> parser = DependencyParser(nlp.vocab, model)
|
> parser = DependencyParser(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------ | ------------------------------------------------------------------------------- |
|
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `**cfg` | - | Configuration parameters. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
| **RETURNS** | `DependencyParser` | The newly constructed object. |
|
| `moves` | list | <!-- TODO: --> |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
|
||||||
|
| `multitasks` | `Iterable` | <!-- TODO: --> |
|
||||||
|
| `learn_tokens` | bool | <!-- TODO: --> |
|
||||||
|
| `min_action_freq` | int | <!-- TODO: --> |
|
||||||
|
|
||||||
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -62,8 +84,8 @@ and all pipeline components are applied to the `Doc` in order. Both
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
|
||||||
> doc = nlp("This is a sentence.")
|
> doc = nlp("This is a sentence.")
|
||||||
|
> parser = nlp.add_pipe("parser")
|
||||||
> # This usually happens under the hood
|
> # This usually happens under the hood
|
||||||
> processed = parser(doc)
|
> processed = parser(doc)
|
||||||
> ```
|
> ```
|
||||||
|
@ -85,16 +107,37 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> for doc in parser.pipe(docs, batch_size=50):
|
> for doc in parser.pipe(docs, batch_size=50):
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | --------------- | ------------------------------------------------------ |
|
| -------------- | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| _keyword-only_ | | |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `256`. |
|
||||||
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
|
## DependencyParser.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> parser = nlp.add_pipe("parser")
|
||||||
|
> optimizer = parser.begin_training(pipeline=nlp.pipeline)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. |
|
||||||
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## DependencyParser.predict {#predict tag="method"}
|
## DependencyParser.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -103,7 +146,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> scores = parser.predict([doc1, doc2])
|
> scores = parser.predict([doc1, doc2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -119,7 +162,7 @@ Modify a batch of documents, using pre-computed scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> scores = parser.predict([doc1, doc2])
|
> scores = parser.predict([doc1, doc2])
|
||||||
> parser.set_annotations([doc1, doc2], scores)
|
> parser.set_annotations([doc1, doc2], scores)
|
||||||
> ```
|
> ```
|
||||||
|
@ -138,7 +181,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab, parser_model)
|
> parser = nlp.add_pipe("parser")
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> losses = parser.update(examples, sgd=optimizer)
|
> losses = parser.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
@ -150,7 +193,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). |
|
||||||
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## DependencyParser.get_loss {#get_loss tag="method"}
|
## DependencyParser.get_loss {#get_loss tag="method"}
|
||||||
|
@ -161,36 +204,31 @@ predicted scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> scores = parser.predict([eg.predicted for eg in examples])
|
> scores = parser.predict([eg.predicted for eg in examples])
|
||||||
> loss, d_loss = parser.get_loss(examples, scores)
|
> loss, d_loss = parser.get_loss(examples, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------- | --------------------------------------------------- |
|
| ----------- | --------------------- | --------------------------------------------------- |
|
||||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
| `scores` | `syntax.StateClass` | Scores representing the model's predictions. |
|
| `scores` | `syntax.StateClass` | Scores representing the model's predictions. |
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
|
|
||||||
## DependencyParser.begin_training {#begin_training tag="method"}
|
## DependencyParser.score {#score tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Return an
|
Score a batch of examples.
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> scores = parser.score(examples)
|
||||||
> nlp.pipeline.append(parser)
|
|
||||||
> optimizer = parser.begin_training(pipeline=nlp.pipeline)
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||||
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). |
|
||||||
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. |
|
|
||||||
| **RETURNS** | `Optimizer` | An optimizer. |
|
|
||||||
|
|
||||||
## DependencyParser.create_optimizer {#create_optimizer tag="method"}
|
## DependencyParser.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -200,13 +238,13 @@ component.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> optimizer = parser.create_optimizer()
|
> optimizer = parser.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------- | --------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------- | -------------- |
|
||||||
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## DependencyParser.use_params {#use_params tag="method, contextmanager"}
|
## DependencyParser.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
@ -231,7 +269,7 @@ Add a new label to the pipe.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> parser.add_label("MY_LABEL")
|
> parser.add_label("MY_LABEL")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -246,14 +284,14 @@ Serialize the pipe to disk.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> parser.to_disk("/path/to/parser")
|
> parser.to_disk("/path/to/parser")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## DependencyParser.from_disk {#from_disk tag="method"}
|
## DependencyParser.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -262,14 +300,14 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> parser.from_disk("/path/to/parser")
|
> parser.from_disk("/path/to/parser")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
|
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
|
||||||
|
|
||||||
## DependencyParser.to_bytes {#to_bytes tag="method"}
|
## DependencyParser.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -277,16 +315,16 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> parser_bytes = parser.to_bytes()
|
> parser_bytes = parser.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
| ----------- | --------------- | ------------------------------------------------------------------------- |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. |
|
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. |
|
||||||
|
|
||||||
## DependencyParser.from_bytes {#from_bytes tag="method"}
|
## DependencyParser.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -296,14 +334,14 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser_bytes = parser.to_bytes()
|
> parser_bytes = parser.to_bytes()
|
||||||
> parser = DependencyParser(nlp.vocab)
|
> parser = nlp.add_pipe("parser")
|
||||||
> parser.from_bytes(parser_bytes)
|
> parser.from_bytes(parser_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ------------------ | ------------------------------------------------------------------------- |
|
| ------------ | ------------------ | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. |
|
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. |
|
||||||
|
|
||||||
## DependencyParser.labels {#labels tag="property"}
|
## DependencyParser.labels {#labels tag="property"}
|
||||||
|
|
|
@ -1,27 +1,47 @@
|
||||||
---
|
---
|
||||||
title: EntityLinker
|
title: EntityLinker
|
||||||
teaser:
|
|
||||||
Functionality to disambiguate a named entity in text to a unique knowledge
|
|
||||||
base identifier.
|
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/pipes.pyx
|
source: spacy/pipeline/entity_linker.py
|
||||||
new: 2.2
|
new: 2.2
|
||||||
|
teaser: 'Pipeline component for named entity linking and disambiguation'
|
||||||
|
api_base_class: /api/pipe
|
||||||
|
api_string_name: entity_linker
|
||||||
|
api_trainable: true
|
||||||
---
|
---
|
||||||
|
|
||||||
This class is a subclass of `Pipe` and follows the same API. The pipeline
|
## Config and implementation {#config}
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
|
||||||
via the ID `"entity_linker"`.
|
|
||||||
|
|
||||||
## Default config {#config}
|
The default config is defined by the pipeline component factory and describes
|
||||||
|
how the component should be configured. You can override its settings via the
|
||||||
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
|
[`config.cfg` for training](/usage/training#config). See the
|
||||||
|
[model architectures](/api/architectures) documentation for details on the
|
||||||
|
architectures and their arguments and hyperparameters.
|
||||||
|
|
||||||
This is the default configuration used to initialize the model powering the
|
> #### Example
|
||||||
pipeline component. See the [model architectures](/api/architectures)
|
>
|
||||||
documentation for details on the architectures and their arguments and
|
> ```python
|
||||||
hyperparameters. To learn more about how to customize the config and train
|
> from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
|
||||||
custom models, check out the [training config](/usage/training#config) docs.
|
> config = {
|
||||||
|
> "kb": None,
|
||||||
|
> "labels_discard": [],
|
||||||
|
> "incl_prior": True,
|
||||||
|
> "incl_context": True,
|
||||||
|
> "model": DEFAULT_NEL_MODEL,
|
||||||
|
> }
|
||||||
|
> nlp.add_pipe("entity_linker", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
|
||||||
|
| `kb` | `KnowledgeBase` | <!-- TODO: --> | `None` |
|
||||||
|
| `labels_discard` | `Iterable[str]` | <!-- TODO: --> | `[]` |
|
||||||
|
| `incl_prior` | bool | <!-- TODO: --> | `True` |
|
||||||
|
| `incl_context` | bool | <!-- TODO: --> | `True` |
|
||||||
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/entity_linker_defaults.cfg
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
|
||||||
```
|
```
|
||||||
|
|
||||||
## EntityLinker.\_\_init\_\_ {#init tag="method"}
|
## EntityLinker.\_\_init\_\_ {#init tag="method"}
|
||||||
|
@ -29,30 +49,32 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/entity_l
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe with default model
|
> # Construction via add_pipe with default model
|
||||||
> entity_linker = nlp.create_pipe("entity_linker")
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
>
|
>
|
||||||
> # Construction via create_pipe with custom model
|
> # Construction via add_pipe with custom model
|
||||||
> config = {"model": {"@architectures": "my_el"}}
|
> config = {"model": {"@architectures": "my_el"}}
|
||||||
> entity_linker = nlp.create_pipe("entity_linker", config)
|
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||||
>
|
>
|
||||||
> # Construction from class with custom model from file
|
> # Construction from class
|
||||||
> from spacy.pipeline import EntityLinker
|
> from spacy.pipeline import EntityLinker
|
||||||
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
|
||||||
> entity_linker = EntityLinker(nlp.vocab, model)
|
> entity_linker = EntityLinker(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------- | ------- | ------------------------------------------------------------------------------- |
|
| ---------------- | --------------- | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `**cfg` | - | Configuration parameters. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
| **RETURNS** | `EntityLinker` | The newly constructed object. |
|
| `kb` | `KnowledgeBase` | <!-- TODO: --> |
|
||||||
|
| `labels_discard` | `Iterable[str]` | <!-- TODO: --> |
|
||||||
|
| `incl_prior` | bool | <!-- TODO: --> |
|
||||||
|
| `incl_context` | bool | <!-- TODO: --> |
|
||||||
|
|
||||||
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -66,8 +88,8 @@ delegate to the [`predict`](/api/entitylinker#predict) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
|
||||||
> doc = nlp("This is a sentence.")
|
> doc = nlp("This is a sentence.")
|
||||||
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
> # This usually happens under the hood
|
> # This usually happens under the hood
|
||||||
> processed = entity_linker(doc)
|
> processed = entity_linker(doc)
|
||||||
> ```
|
> ```
|
||||||
|
@ -89,91 +111,17 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
> for doc in entity_linker.pipe(docs, batch_size=50):
|
> for doc in entity_linker.pipe(docs, batch_size=50):
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | --------------- | ------------------------------------------------------ |
|
| -------------- | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| _keyword-only_ | | |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
## EntityLinker.predict {#predict tag="method"}
|
|
||||||
|
|
||||||
Apply the pipeline's model to a batch of docs, without modifying them.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
|
||||||
> kb_ids = entity_linker.predict([doc1, doc2])
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| ----------- | --------------- | ------------------------------------------------------------ |
|
|
||||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
|
||||||
| **RETURNS** | `Iterable[str]` | The predicted KB identifiers for the entities in the `docs`. |
|
|
||||||
|
|
||||||
## EntityLinker.set_annotations {#set_annotations tag="method"}
|
|
||||||
|
|
||||||
Modify a batch of documents, using pre-computed entity IDs for a list of named
|
|
||||||
entities.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
|
||||||
> kb_ids = entity_linker.predict([doc1, doc2])
|
|
||||||
> entity_linker.set_annotations([doc1, doc2], kb_ids)
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| -------- | --------------- | ------------------------------------------------------------------------------------------------- |
|
|
||||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
|
||||||
| `kb_ids` | `Iterable[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
|
|
||||||
|
|
||||||
## EntityLinker.update {#update tag="method"}
|
|
||||||
|
|
||||||
Learn from a batch of [`Example`](/api/example) objects, updating both the
|
|
||||||
pipe's entity linking model and context encoder. Delegates to
|
|
||||||
[`predict`](/api/entitylinker#predict) and
|
|
||||||
[`get_loss`](/api/entitylinker#get_loss).
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> entity_linker = EntityLinker(nlp.vocab, nel_model)
|
|
||||||
> optimizer = nlp.begin_training()
|
|
||||||
> losses = entity_linker.update(examples, sgd=optimizer)
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
|
||||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
|
||||||
| _keyword-only_ | | |
|
|
||||||
| `drop` | float | The dropout rate. |
|
|
||||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entitylinker#set_annotations). |
|
|
||||||
| `sgd` | `Optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
|
||||||
|
|
||||||
## EntityLinker.set_kb {#set_kb tag="method"}
|
|
||||||
|
|
||||||
Define the knowledge base (KB) used for disambiguating named entities to KB
|
|
||||||
identifiers.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
|
||||||
> entity_linker.set_kb(kb)
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| ---- | --------------- | ------------------------------- |
|
|
||||||
| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). |
|
|
||||||
|
|
||||||
## EntityLinker.begin_training {#begin_training tag="method"}
|
## EntityLinker.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
|
@ -185,18 +133,94 @@ method, a knowledge base should have been defined with
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
> entity_linker = nlp.add_pipe("entity_linker", last=True)
|
||||||
> entity_linker.set_kb(kb)
|
> entity_linker.set_kb(kb)
|
||||||
> nlp.add_pipe(entity_linker, last=True)
|
|
||||||
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
|
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
| _keyword-only_ | | |
|
||||||
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set. |
|
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||||
| **RETURNS** | `Optimizer` | An optimizer. | |
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set. |
|
||||||
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
|
## EntityLinker.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
Apply the pipeline's model to a batch of docs, without modifying them. Returns
|
||||||
|
the KB IDs for each entity in each doc, including `NIL` if there is no
|
||||||
|
prediction.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
|
> kb_ids = entity_linker.predict([doc1, doc2])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | --------------- | ------------------------------------------------------------ |
|
||||||
|
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||||
|
| **RETURNS** | `List[str]` | The predicted KB identifiers for the entities in the `docs`. |
|
||||||
|
|
||||||
|
## EntityLinker.set_annotations {#set_annotations tag="method"}
|
||||||
|
|
||||||
|
Modify a batch of documents, using pre-computed entity IDs for a list of named
|
||||||
|
entities.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
|
> kb_ids = entity_linker.predict([doc1, doc2])
|
||||||
|
> entity_linker.set_annotations([doc1, doc2], kb_ids)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------- | --------------- | ------------------------------------------------------------------------------------------------- |
|
||||||
|
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||||
|
| `kb_ids` | `List[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
|
||||||
|
|
||||||
|
## EntityLinker.update {#update tag="method"}
|
||||||
|
|
||||||
|
Learn from a batch of [`Example`](/api/example) objects, updating both the
|
||||||
|
pipe's entity linking model and context encoder. Delegates to
|
||||||
|
[`predict`](/api/entitylinker#predict).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
|
> optimizer = nlp.begin_training()
|
||||||
|
> losses = entity_linker.update(examples, sgd=optimizer)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `drop` | float | The dropout rate. |
|
||||||
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entitylinker#set_annotations). |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
|
## EntityLinker.set_kb {#set_kb tag="method"}
|
||||||
|
|
||||||
|
Define the knowledge base (KB) used for disambiguating named entities to KB
|
||||||
|
identifiers.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
|
> entity_linker.set_kb(kb)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ---- | --------------- | ------------------------------- |
|
||||||
|
| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). |
|
||||||
|
|
||||||
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
|
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -205,13 +229,13 @@ Create an optimizer for the pipeline component.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
> optimizer = entity_linker.create_optimizer()
|
> optimizer = entity_linker.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------- | --------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------- | -------------- |
|
||||||
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## EntityLinker.use_params {#use_params tag="method, contextmanager"}
|
## EntityLinker.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
@ -220,7 +244,7 @@ Modify the pipe's EL model, to use the given parameter values.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
> with entity_linker.use_params(optimizer.averages):
|
> with entity_linker.use_params(optimizer.averages):
|
||||||
> entity_linker.to_disk("/best_model")
|
> entity_linker.to_disk("/best_model")
|
||||||
> ```
|
> ```
|
||||||
|
@ -236,14 +260,14 @@ Serialize the pipe to disk.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
> entity_linker.to_disk("/path/to/entity_linker")
|
> entity_linker.to_disk("/path/to/entity_linker")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## EntityLinker.from_disk {#from_disk tag="method"}
|
## EntityLinker.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -252,15 +276,15 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = EntityLinker(nlp.vocab)
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
> entity_linker.from_disk("/path/to/entity_linker")
|
> entity_linker.from_disk("/path/to/entity_linker")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------------- | -------------------------------------------------------------------------- |
|
| ----------- | --------------- | -------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
|
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
|
||||||
|
|
||||||
## Serialization fields {#serialization-fields}
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
|
|
@ -1,23 +1,41 @@
|
||||||
---
|
---
|
||||||
title: EntityRecognizer
|
title: EntityRecognizer
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/pipes.pyx
|
source: spacy/pipeline/ner.pyx
|
||||||
|
teaser: 'Pipeline component for named entity recognition'
|
||||||
|
api_base_class: /api/pipe
|
||||||
|
api_string_name: ner
|
||||||
|
api_trainable: true
|
||||||
---
|
---
|
||||||
|
|
||||||
This class is a subclass of `Pipe` and follows the same API. The pipeline
|
## Config and implementation {#config}
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
|
||||||
via the ID `"ner"`.
|
|
||||||
|
|
||||||
## Default config {#config}
|
The default config is defined by the pipeline component factory and describes
|
||||||
|
how the component should be configured. You can override its settings via the
|
||||||
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
|
[`config.cfg` for training](/usage/training#config). See the
|
||||||
|
[model architectures](/api/architectures) documentation for details on the
|
||||||
|
architectures and their arguments and hyperparameters.
|
||||||
|
|
||||||
This is the default configuration used to initialize the model powering the
|
> #### Example
|
||||||
pipeline component. See the [model architectures](/api/architectures)
|
>
|
||||||
documentation for details on the architectures and their arguments and
|
> ```python
|
||||||
hyperparameters. To learn more about how to customize the config and train
|
> from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||||
custom models, check out the [training config](/usage/training#config) docs.
|
> config = {
|
||||||
|
> "moves": None,
|
||||||
|
> # TODO: rest
|
||||||
|
> "model": DEFAULT_NER_MODEL,
|
||||||
|
> }
|
||||||
|
> nlp.add_pipe("ner", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
|
||||||
|
| `moves` | list | <!-- TODO: --> | `None` |
|
||||||
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/ner_defaults.cfg
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx
|
||||||
```
|
```
|
||||||
|
|
||||||
## EntityRecognizer.\_\_init\_\_ {#init tag="method"}
|
## EntityRecognizer.\_\_init\_\_ {#init tag="method"}
|
||||||
|
@ -25,29 +43,33 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/ner_defa
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via add_pipe with default model
|
||||||
> ner = nlp.create_pipe("ner")
|
> ner = nlp.add_pipe("ner")
|
||||||
>
|
>
|
||||||
> # Construction via create_pipe with custom model
|
> # Construction via add_pipe with custom model
|
||||||
> config = {"model": {"@architectures": "my_ner"}}
|
> config = {"model": {"@architectures": "my_ner"}}
|
||||||
> parser = nlp.create_pipe("ner", config)
|
> ner = nlp.add_pipe("ner", config=config)
|
||||||
>
|
>
|
||||||
> # Construction from class with custom model from file
|
> # Construction from class
|
||||||
> from spacy.pipeline import EntityRecognizer
|
> from spacy.pipeline import EntityRecognizer
|
||||||
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
|
||||||
> ner = EntityRecognizer(nlp.vocab, model)
|
> ner = EntityRecognizer(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------ | ------------------------------------------------------------------------------- |
|
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `**cfg` | - | Configuration parameters. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
| **RETURNS** | `EntityRecognizer` | The newly constructed object. |
|
| `moves` | list | <!-- TODO: --> |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
|
||||||
|
| `multitasks` | `Iterable` | <!-- TODO: --> |
|
||||||
|
| `learn_tokens` | bool | <!-- TODO: --> |
|
||||||
|
| `min_action_freq` | int | <!-- TODO: --> |
|
||||||
|
|
||||||
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -62,8 +84,8 @@ and all pipeline components are applied to the `Doc` in order. Both
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
|
||||||
> doc = nlp("This is a sentence.")
|
> doc = nlp("This is a sentence.")
|
||||||
|
> ner = nlp.add_pipe("ner")
|
||||||
> # This usually happens under the hood
|
> # This usually happens under the hood
|
||||||
> processed = ner(doc)
|
> processed = ner(doc)
|
||||||
> ```
|
> ```
|
||||||
|
@ -85,16 +107,37 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> for doc in ner.pipe(docs, batch_size=50):
|
> for doc in ner.pipe(docs, batch_size=50):
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | --------------- | ------------------------------------------------------ |
|
| -------------- | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
| `docs` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| _keyword-only_ | | |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
|
## EntityRecognizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> ner = nlp.add_pipe("ner")
|
||||||
|
> optimizer = ner.begin_training(pipeline=nlp.pipeline)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. |
|
||||||
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## EntityRecognizer.predict {#predict tag="method"}
|
## EntityRecognizer.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -103,7 +146,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> scores = ner.predict([doc1, doc2])
|
> scores = ner.predict([doc1, doc2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -119,7 +162,7 @@ Modify a batch of documents, using pre-computed scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> scores = ner.predict([doc1, doc2])
|
> scores = ner.predict([doc1, doc2])
|
||||||
> ner.set_annotations([doc1, doc2], scores)
|
> ner.set_annotations([doc1, doc2], scores)
|
||||||
> ```
|
> ```
|
||||||
|
@ -138,20 +181,20 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab, ner_model)
|
> ner = nlp.add_pipe("ner")
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> losses = ner.update(examples, sgd=optimizer)
|
> losses = ner.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). |
|
||||||
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## EntityRecognizer.get_loss {#get_loss tag="method"}
|
## EntityRecognizer.get_loss {#get_loss tag="method"}
|
||||||
|
|
||||||
|
@ -161,36 +204,31 @@ predicted scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> scores = ner.predict([eg.predicted for eg in examples])
|
> scores = ner.predict([eg.predicted for eg in examples])
|
||||||
> loss, d_loss = ner.get_loss(examples, scores)
|
> loss, d_loss = ner.get_loss(examples, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------- | --------------------------------------------------- |
|
| ----------- | --------------------- | --------------------------------------------------- |
|
||||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
| `scores` | `List[StateClass]` | Scores representing the model's predictions. |
|
| `scores` | `List[StateClass]` | Scores representing the model's predictions. |
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
|
|
||||||
## EntityRecognizer.begin_training {#begin_training tag="method"}
|
## EntityRecognizer.score {#score tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Return an
|
Score a batch of examples.
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> scores = ner.score(examples)
|
||||||
> nlp.pipeline.append(ner)
|
|
||||||
> optimizer = ner.begin_training(pipeline=nlp.pipeline)
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | ------------------- | ------------------------------------------------------------------------ |
|
||||||
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||||
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). |
|
||||||
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. |
|
|
||||||
| **RETURNS** | `Optimizer` | An optimizer. |
|
|
||||||
|
|
||||||
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
|
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -199,13 +237,13 @@ Create an optimizer for the pipeline component.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> optimizer = ner.create_optimizer()
|
> optimizer = ner.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------- | --------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------- | -------------- |
|
||||||
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}
|
## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
@ -230,7 +268,7 @@ Add a new label to the pipe.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> ner.add_label("MY_LABEL")
|
> ner.add_label("MY_LABEL")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -245,14 +283,14 @@ Serialize the pipe to disk.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> ner.to_disk("/path/to/ner")
|
> ner.to_disk("/path/to/ner")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## EntityRecognizer.from_disk {#from_disk tag="method"}
|
## EntityRecognizer.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -261,14 +299,14 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> ner.from_disk("/path/to/ner")
|
> ner.from_disk("/path/to/ner")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
|
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
|
||||||
|
|
||||||
## EntityRecognizer.to_bytes {#to_bytes tag="method"}
|
## EntityRecognizer.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -276,16 +314,16 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> ner_bytes = ner.to_bytes()
|
> ner_bytes = ner.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
| ----------- | --------------- | ------------------------------------------------------------------------- |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. |
|
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. |
|
||||||
|
|
||||||
## EntityRecognizer.from_bytes {#from_bytes tag="method"}
|
## EntityRecognizer.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -295,14 +333,14 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner_bytes = ner.to_bytes()
|
> ner_bytes = ner.to_bytes()
|
||||||
> ner = EntityRecognizer(nlp.vocab)
|
> ner = nlp.add_pipe("ner")
|
||||||
> ner.from_bytes(ner_bytes)
|
> ner.from_bytes(ner_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ------------------ | ------------------------------------------------------------------------- |
|
| ------------ | ------------------ | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. |
|
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. |
|
||||||
|
|
||||||
## EntityRecognizer.labels {#labels tag="property"}
|
## EntityRecognizer.labels {#labels tag="property"}
|
||||||
|
|
|
@ -3,44 +3,76 @@ title: EntityRuler
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/entityruler.py
|
source: spacy/pipeline/entityruler.py
|
||||||
new: 2.1
|
new: 2.1
|
||||||
|
teaser: 'Pipeline component for rule-based named entity recognition'
|
||||||
|
api_string_name: entity_ruler
|
||||||
|
api_trainable: false
|
||||||
---
|
---
|
||||||
|
|
||||||
The EntityRuler lets you add spans to the [`Doc.ents`](/api/doc#ents) using
|
The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using
|
||||||
token-based rules or exact phrase matches. It can be combined with the
|
token-based rules or exact phrase matches. It can be combined with the
|
||||||
statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
|
statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
|
||||||
used on its own to implement a purely rule-based entity recognition system.
|
used on its own to implement a purely rule-based entity recognition system. For
|
||||||
After initialization, the component is typically added to the processing
|
usage examples, see the docs on
|
||||||
pipeline using [`nlp.add_pipe`](/api/language#add_pipe). For usage examples, see
|
|
||||||
the docs on
|
|
||||||
[rule-based entity recognition](/usage/rule-based-matching#entityruler).
|
[rule-based entity recognition](/usage/rule-based-matching#entityruler).
|
||||||
|
|
||||||
|
## Config and implementation {#config}
|
||||||
|
|
||||||
|
The default config is defined by the pipeline component factory and describes
|
||||||
|
how the component should be configured. You can override its settings via the
|
||||||
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
|
[`config.cfg` for training](/usage/training#config).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> config = {
|
||||||
|
> "phrase_matcher_attr": None,
|
||||||
|
> "validation": True,
|
||||||
|
> "overwrite_ents": False,
|
||||||
|
> "ent_id_sep": "||",
|
||||||
|
> }
|
||||||
|
> nlp.add_pipe("entity_ruler", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| --------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
|
||||||
|
| `phrase_matcher_attr` | str  | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. | `None`  |
|
||||||
|
| `validation` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. | `False` |
|
||||||
|
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. | `False` |
|
||||||
|
| `ent_id_sep`          | str  | Separator used internally for entity IDs.                                                                                                   | `"\|\|"` |
|
||||||
|
|
||||||
|
```python
|
||||||
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entityruler.py
|
||||||
|
```
|
||||||
|
|
||||||
## EntityRuler.\_\_init\_\_ {#init tag="method"}
|
## EntityRuler.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Initialize the entity ruler. If patterns are supplied here, they need to be a
|
Initialize the entity ruler. If patterns are supplied here, they need to be a
|
||||||
list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
|
list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
|
||||||
be a token pattern (list) or a phrase pattern (string). For example:
|
be a token pattern (list) or a phrase pattern (string). For example:
|
||||||
`{'label': 'ORG', 'pattern': 'Apple'}`.
|
`{"label": "ORG", "pattern": "Apple"}`.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via add_pipe
|
||||||
> ruler = nlp.create_pipe("entity_ruler")
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
>
|
>
|
||||||
> # Construction from class
|
> # Construction from class
|
||||||
> from spacy.pipeline import EntityRuler
|
> from spacy.pipeline import EntityRuler
|
||||||
> ruler = EntityRuler(nlp, overwrite_ents=True)
|
> ruler = EntityRuler(nlp, overwrite_ents=True)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------------------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
|
| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
|
||||||
| `patterns` | iterable | Optional patterns to load in. |
|
| `name` <Tag variant="new">3</Tag> | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. |
|
||||||
| `phrase_matcher_attr` | int / str | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). defaults to `None` |
|
| _keyword-only_ | | |
|
||||||
| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. |
|
| `phrase_matcher_attr`             | int / str  | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`.                                                           |
|
||||||
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. |
|
| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. |
|
||||||
| `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. |
|
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. |
|
||||||
| **RETURNS** | `EntityRuler` | The newly constructed object. |
|
| `ent_id_sep`                      | str        | Separator used internally for entity IDs. Defaults to `"\|\|"`.                                                                                                                                                               |
|
||||||
|
| `patterns` | iterable | Optional patterns to load in on initialization. |
|
||||||
|
|
||||||
## EntityRuler.\_\_len\_\_ {#len tag="method"}
|
## EntityRuler.\_\_len\_\_ {#len tag="method"}
|
||||||
|
|
||||||
|
@ -49,7 +81,7 @@ The number of all patterns added to the entity ruler.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ruler = EntityRuler(nlp)
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
> assert len(ruler) == 0
|
> assert len(ruler) == 0
|
||||||
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
||||||
> assert len(ruler) == 1
|
> assert len(ruler) == 1
|
||||||
|
@ -66,7 +98,7 @@ Whether a label is present in the patterns.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ruler = EntityRuler(nlp)
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
||||||
> assert "ORG" in ruler
|
> assert "ORG" in ruler
|
||||||
> assert not "PERSON" in ruler
|
> assert not "PERSON" in ruler
|
||||||
|
@ -90,9 +122,8 @@ is chosen.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ruler = EntityRuler(nlp)
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
||||||
> nlp.add_pipe(ruler)
|
|
||||||
>
|
>
|
||||||
> doc = nlp("A text about Apple.")
|
> doc = nlp("A text about Apple.")
|
||||||
> ents = [(ent.text, ent.label_) for ent in doc.ents]
|
> ents = [(ent.text, ent.label_) for ent in doc.ents]
|
||||||
|
@ -117,7 +148,7 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on
|
||||||
> {"label": "ORG", "pattern": "Apple"},
|
> {"label": "ORG", "pattern": "Apple"},
|
||||||
> {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
|
> {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
|
||||||
> ]
|
> ]
|
||||||
> ruler = EntityRuler(nlp)
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
> ruler.add_patterns(patterns)
|
> ruler.add_patterns(patterns)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -135,7 +166,7 @@ only the patterns are saved as JSONL. If a directory name is provided, a
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ruler = EntityRuler(nlp)
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
> ruler.to_disk("/path/to/patterns.jsonl") # saves patterns only
|
> ruler.to_disk("/path/to/patterns.jsonl") # saves patterns only
|
||||||
> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config
|
> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config
|
||||||
> ```
|
> ```
|
||||||
|
@ -154,7 +185,7 @@ configuration.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ruler = EntityRuler(nlp)
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
> ruler.from_disk("/path/to/patterns.jsonl") # loads patterns only
|
> ruler.from_disk("/path/to/patterns.jsonl") # loads patterns only
|
||||||
> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config
|
> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config
|
||||||
> ```
|
> ```
|
||||||
|
@ -171,7 +202,7 @@ Serialize the entity ruler patterns to a bytestring.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ruler = EntityRuler(nlp)
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
> ruler_bytes = ruler.to_bytes()
|
> ruler_bytes = ruler.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -187,14 +218,14 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ruler_bytes = ruler.to_bytes()
|
> ruler_bytes = ruler.to_bytes()
|
||||||
> ruler = EntityRuler(nlp)
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
> ruler.from_bytes(ruler_bytes)
|
> ruler.from_bytes(ruler_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ---------------- | ------------- | ---------------------------------- |
|
| ------------ | ------------- | ---------------------------------- |
|
||||||
| `patterns_bytes` | bytes | The bytestring to load. |
|
| `bytes_data` | bytes | The bytestring to load. |
|
||||||
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
|
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
|
||||||
|
|
||||||
## EntityRuler.labels {#labels tag="property"}
|
## EntityRuler.labels {#labels tag="property"}
|
||||||
|
|
||||||
|
|
|
@ -223,7 +223,7 @@ in `example.predicted`.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> nlp.add_pipe(my_ner)
|
> nlp.add_pipe("my_ner")
|
||||||
> doc = nlp("Mr and Mrs Smith flew to New York")
|
> doc = nlp("Mr and Mrs Smith flew to New York")
|
||||||
> tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "New York"]
|
> tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "New York"]
|
||||||
> example = Example.from_dict(doc, {"words": tokens_ref})
|
> example = Example.from_dict(doc, {"words": tokens_ref})
|
||||||
|
|
|
@ -15,6 +15,88 @@ the tagger or parser that are called on a document in order. You can also add
|
||||||
your own processing pipeline components that take a `Doc` object, modify it and
|
your own processing pipeline components that take a `Doc` object, modify it and
|
||||||
return it.
|
return it.
|
||||||
|
|
||||||
|
## Language.component {#component tag="classmethod" new="3"}
|
||||||
|
|
||||||
|
Register a custom pipeline component under a given name. This allows
|
||||||
|
initializing the component by name using
|
||||||
|
[`Language.add_pipe`](/api/language#add_pipe) and referring to it in
|
||||||
|
[config files](/usage/training#config). This classmethod and decorator is
|
||||||
|
intended for **simple stateless functions** that take a `Doc` and return it. For
|
||||||
|
more complex stateful components that allow settings and need access to the
|
||||||
|
shared `nlp` object, use the [`Language.factory`](/api/language#factory)
|
||||||
|
decorator. For more details and examples, see the
|
||||||
|
[usage documentation](/usage/processing-pipelines#custom-components).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.language import Language
|
||||||
|
>
|
||||||
|
> # Usage as a decorator
|
||||||
|
> @Language.component("my_component")
|
||||||
|
> def my_component(doc):
|
||||||
|
> # Do something to the doc
|
||||||
|
> return doc
|
||||||
|
>
|
||||||
|
> # Usage as a function
|
||||||
|
> Language.component("my_component2", func=my_component)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `name` | str | The name of the component factory. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
||||||
|
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
||||||
|
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
|
||||||
|
| `func`         | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                 |
|
||||||
|
|
||||||
|
## Language.factory {#factory tag="classmethod"}
|
||||||
|
|
||||||
|
Register a custom pipeline component factory under a given name. This allows
|
||||||
|
initializing the component by name using
|
||||||
|
[`Language.add_pipe`](/api/language#add_pipe) and referring to it in
|
||||||
|
[config files](/usage/training#config). The registered factory function needs to
|
||||||
|
take at least two **named arguments** which spaCy fills in automatically: `nlp`
|
||||||
|
for the current `nlp` object and `name` for the component instance name. This
|
||||||
|
can be useful to distinguish multiple instances of the same component and allows
|
||||||
|
trainable components to add custom losses using the component instance name. The
|
||||||
|
`default_config` defines the default values of the remaining factory arguments.
|
||||||
|
It's merged into the [`nlp.config`](/api/language#config). For more details and
|
||||||
|
examples, see the
|
||||||
|
[usage documentation](/usage/processing-pipelines#custom-components).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.language import Language
|
||||||
|
>
|
||||||
|
> # Usage as a decorator
|
||||||
|
> @Language.factory(
|
||||||
|
> "my_component",
|
||||||
|
> default_config={"some_setting": True},
|
||||||
|
> )
|
||||||
|
> def create_my_component(nlp, name, some_setting):
|
||||||
|
> return MyComponent(some_setting)
|
||||||
|
>
|
||||||
|
> # Usage as function
|
||||||
|
> Language.factory(
|
||||||
|
> "my_component",
|
||||||
|
> default_config={"some_setting": True},
|
||||||
|
> func=create_my_component
|
||||||
|
> )
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ---------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `name` | str | The name of the component factory. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `default_config` | `Dict[str, Any]`     | The default config, describing the default values of the factory arguments.                                                                   |
|
||||||
|
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
||||||
|
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
||||||
|
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
|
||||||
|
| `func`           | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                 |
|
||||||
|
|
||||||
## Language.\_\_init\_\_ {#init tag="method"}
|
## Language.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Initialize a `Language` object.
|
Initialize a `Language` object.
|
||||||
|
@ -30,12 +112,41 @@ Initialize a `Language` object.
|
||||||
> nlp = English()
|
> nlp = English()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------- | ------------------------------------------------------------------------------------------ |
|
| ------------------ | ----------- | ------------------------------------------------------------------------------------------ |
|
||||||
| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created via `Language.Defaults.create_vocab`. |
|
| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. |
|
||||||
| `make_doc` | callable | A function that takes text and returns a `Doc` object. Usually a `Tokenizer`. |
|
| _keyword-only_ | | |
|
||||||
| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. |
|
| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. |
|
||||||
| **RETURNS** | `Language` | The newly constructed object. |
|
| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. |
|
||||||
|
| `create_tokenizer` | `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. |
|
||||||
|
| **RETURNS** | `Language` | The newly constructed object. |
|
||||||
|
|
||||||
|
## Language.from_config {#from_config tag="classmethod"}
|
||||||
|
|
||||||
|
Create a `Language` object from a loaded config. Will set up the tokenizer and
|
||||||
|
language data, add pipeline components based on the pipeline and components
|
||||||
|
defined in the config and validate the results. If no config is provided, the
|
||||||
|
default config of the given language is used. This is also how spaCy loads a
|
||||||
|
model under the hood based on its [`config.cfg`](/api/data-formats#config).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from thinc.api import Config
|
||||||
|
> from spacy.language import Language
|
||||||
|
>
|
||||||
|
> config = Config().from_disk("./config.cfg")
|
||||||
|
> nlp = Language.from_config(config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `disable` | `Iterable[str]` | List of pipeline component names to disable. |
|
||||||
|
| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. |
|
||||||
|
| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
|
||||||
|
| **RETURNS** | `Language` | The initialized object. |
|
||||||
|
|
||||||
## Language.\_\_call\_\_ {#call tag="method"}
|
## Language.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -162,43 +273,99 @@ their original weights after the block.
|
||||||
|
|
||||||
Create a pipeline component from a factory.
|
Create a pipeline component from a factory.
|
||||||
|
|
||||||
|
<Infobox title="Changed in v3.0" variant="warning">
|
||||||
|
|
||||||
|
As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method also takes
|
||||||
|
the string name of the factory, creates the component, adds it to the pipeline
|
||||||
|
and returns it. The `Language.create_pipe` method is now mostly used internally.
|
||||||
|
To create a component and add it to the pipeline, you should always use
|
||||||
|
`Language.add_pipe`.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = nlp.create_pipe("parser")
|
> parser = nlp.create_pipe("parser")
|
||||||
> nlp.add_pipe(parser)
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ---------------------------------------------------------------------------------- |
|
| ------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `name` | str | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
|
| `factory_name` | str | Name of the registered component factory. |
|
||||||
| `config` | dict | Configuration parameters to initialize component. |
|
| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. |
|
||||||
| **RETURNS** | callable | The pipeline component. |
|
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. |
|
||||||
|
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
|
||||||
|
| **RETURNS** | callable | The pipeline component. |
|
||||||
|
|
||||||
## Language.add_pipe {#add_pipe tag="method" new="2"}
|
## Language.add_pipe {#add_pipe tag="method" new="2"}
|
||||||
|
|
||||||
Add a component to the processing pipeline. Valid components are callables that
|
Add a component to the processing pipeline. Expects a name that maps to a
|
||||||
take a `Doc` object, modify it and return it. Only one of `before`, `after`,
|
component factory registered using
|
||||||
`first` or `last` can be set. Default behavior is `last=True`.
|
[`@Language.component`](/api/language#component) or
|
||||||
|
[`@Language.factory`](/api/language#factory). Components should be callables
|
||||||
|
that take a `Doc` object, modify it and return it. Only one of `before`,
|
||||||
|
`after`, `first` or `last` can be set. Default behavior is `last=True`.
|
||||||
|
|
||||||
|
<Infobox title="Changed in v3.0" variant="warning">
|
||||||
|
|
||||||
|
As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method doesn't
|
||||||
|
take callables anymore and instead expects the name of a component factory
|
||||||
|
registered using [`@Language.component`](/api/language#component) or
|
||||||
|
[`@Language.factory`](/api/language#factory). It now takes care of creating the
|
||||||
|
component, adds it to the pipeline and returns it.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> def component(doc):
|
> @Language.component("component")
|
||||||
|
> def component_func(doc):
|
||||||
> # modify Doc and return it return doc
|
> # modify Doc and return it
> return doc
|
||||||
>
|
>
|
||||||
> nlp.add_pipe(component, before="ner")
|
> nlp.add_pipe("component", before="ner")
|
||||||
> nlp.add_pipe(component, name="custom_name", last=True)
|
> component = nlp.add_pipe("component", name="custom_name", last=True)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| -------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `component` | callable | The pipeline component. |
|
| `factory_name` | str | Name of the registered component factory. |
|
||||||
| `name` | str | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
|
| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. |
|
||||||
| `before` | str | Component name to insert component directly before. |
|
| _keyword-only_ | | |
|
||||||
| `after` | str | Component name to insert component directly after: |
|
| `before` | str / int | Component name or index to insert component directly before. |
|
||||||
| `first` | bool | Insert component first / not first in the pipeline. |
|
| `after` | str / int | Component name or index to insert component directly after. |
|
||||||
| `last` | bool | Insert component last / not last in the pipeline. |
|
| `first` | bool | Insert component first / not first in the pipeline. |
|
||||||
|
| `last` | bool | Insert component last / not last in the pipeline. |
|
||||||
|
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. |
|
||||||
|
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
|
||||||
|
| **RETURNS** <Tag variant="new">3</Tag> | callable | The pipeline component. |
|
||||||
|
|
||||||
|
## Language.has_factory {#has_factory tag="classmethod" new="3"}
|
||||||
|
|
||||||
|
Check whether a factory name is registered on the `Language` class or subclass.
|
||||||
|
Will check for
|
||||||
|
[language-specific factories](/usage/processing-pipelines#factories-language)
|
||||||
|
registered on the subclass, as well as general-purpose factories registered on
|
||||||
|
the `Language` base class, available to all subclasses.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.language import Language
|
||||||
|
> from spacy.lang.en import English
|
||||||
|
>
|
||||||
|
> @English.component("component")
|
||||||
|
> def component(doc):
|
||||||
|
> return doc
|
||||||
|
>
|
||||||
|
> assert English.has_factory("component")
|
||||||
|
> assert not Language.has_factory("component")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ---- | ---------------------------------------------------------- |
|
||||||
|
| `name` | str | Name of the pipeline factory to check. |
|
||||||
|
| **RETURNS** | bool | Whether a factory of that name is registered on the class. |
|
||||||
|
|
||||||
## Language.has_pipe {#has_pipe tag="method" new="2"}
|
## Language.has_pipe {#has_pipe tag="method" new="2"}
|
||||||
|
|
||||||
|
@ -208,9 +375,13 @@ Check whether a component is present in the pipeline. Equivalent to
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> nlp.add_pipe(lambda doc: doc, name="component")
|
> @Language.component("component")
|
||||||
> assert "component" in nlp.pipe_names
|
> def component(doc):
|
||||||
> assert nlp.has_pipe("component")
|
> return doc
|
||||||
|
>
|
||||||
|
> nlp.add_pipe("component", name="my_component")
|
||||||
|
> assert "my_component" in nlp.pipe_names
|
||||||
|
> assert nlp.has_pipe("my_component")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
@ -324,6 +495,88 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
|
||||||
| `enable` | str / list | Name(s) of pipeline components that will not be disabled. |
|
| `enable` | str / list | Name(s) of pipeline components that will not be disabled. |
|
||||||
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
|
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
|
||||||
|
|
||||||
|
## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
|
||||||
|
|
||||||
|
Get the factory meta information for a given pipeline component name. Expects
|
||||||
|
the name of the component **factory**. The factory meta is an instance of the
|
||||||
|
[`FactoryMeta`](/api/language#factorymeta) dataclass and contains the
|
||||||
|
information about the component and its defaults provided by the
|
||||||
|
[`@Language.component`](/api/language#component) or
|
||||||
|
[`@Language.factory`](/api/language#factory) decorator.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> factory_meta = Language.get_factory_meta("ner")
|
||||||
|
> assert factory_meta.factory == "ner"
|
||||||
|
> print(factory_meta.default_config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----------------------------- | ------------------ |
|
||||||
|
| `name` | str | The factory name. |
|
||||||
|
| **RETURNS** | [`FactoryMeta`](#factorymeta) | The factory meta. |
|
||||||
|
|
||||||
|
## Language.get_pipe_meta {#get_pipe_meta tag="method" new="3"}
|
||||||
|
|
||||||
|
Get the factory meta information for a given pipeline component name. Expects
|
||||||
|
the name of the component **instance** in the pipeline. The factory meta is an
|
||||||
|
instance of the [`FactoryMeta`](/api/language#factorymeta) dataclass and
|
||||||
|
contains the information about the component and its defaults provided by the
|
||||||
|
[`@Language.component`](/api/language#component) or
|
||||||
|
[`@Language.factory`](/api/language#factory) decorator.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> nlp.add_pipe("ner", name="entity_recognizer")
|
||||||
|
> factory_meta = nlp.get_pipe_meta("entity_recognizer")
|
||||||
|
> assert factory_meta.factory == "ner"
|
||||||
|
> print(factory_meta.default_config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----------------------------- | ---------------------------- |
|
||||||
|
| `name` | str | The pipeline component name. |
|
||||||
|
| **RETURNS** | [`FactoryMeta`](#factorymeta) | The factory meta. |
|
||||||
|
|
||||||
|
## Language.meta {#meta tag="property"}
|
||||||
|
|
||||||
|
Custom meta data for the Language class. If a model is loaded, contains meta
|
||||||
|
data of the model. The `Language.meta` is also what's serialized as the
|
||||||
|
`meta.json` when you save an `nlp` object to disk.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> print(nlp.meta)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ---- | -------------- |
|
||||||
|
| **RETURNS** | dict | The meta data. |
|
||||||
|
|
||||||
|
## Language.config {#config tag="property" new="3"}
|
||||||
|
|
||||||
|
Export a trainable [`config.cfg`](/api/data-formats#config) for the current
|
||||||
|
`nlp` object. Includes the current pipeline, all configs used to create the
|
||||||
|
currently active pipeline components, as well as the default training config
|
||||||
|
that can be used with [`spacy train`](/api/cli#train). `Language.config` returns
|
||||||
|
a [Thinc `Config` object](https://thinc.ai/docs/api-config#config), which is a
|
||||||
|
subclass of the built-in `dict`. It supports the additional methods `to_disk`
|
||||||
|
(serialize the config to a file) and `to_str` (output the config as a string).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> nlp.config.to_disk("./config.cfg")
|
||||||
|
> print(nlp.config.to_str())
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | --------------------------------------------------- | ----------- |
|
||||||
|
| **RETURNS** | [`Config`](https://thinc.ai/docs/api-config#config) | The config. |
|
||||||
|
|
||||||
## Language.to_disk {#to_disk tag="method" new="2"}
|
## Language.to_disk {#to_disk tag="method" new="2"}
|
||||||
|
|
||||||
Save the current state to a directory. If a model is loaded, this will **include
|
Save the current state to a directory. If a model is loaded, this will **include
|
||||||
|
@ -405,23 +658,26 @@ available to the loaded object.
|
||||||
|
|
||||||
## Attributes {#attributes}
|
## Attributes {#attributes}
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------------------------------------ | ----------- | ----------------------------------------------------------------------------------------------- |
|
| --------------------------------------------- | ---------------------- | ---------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | A container for the lexical types. |
|
| `vocab` | `Vocab` | A container for the lexical types. |
|
||||||
| `tokenizer` | `Tokenizer` | The tokenizer. |
|
| `tokenizer` | `Tokenizer` | The tokenizer. |
|
||||||
| `make_doc` | `callable` | Callable that takes a string and returns a `Doc`. |
|
| `make_doc` | `Callable` | Callable that takes a string and returns a `Doc`. |
|
||||||
| `pipeline` | list | List of `(name, component)` tuples describing the current processing pipeline, in order. |
|
| `pipeline` | `List[Tuple[str, Callable]]` | List of `(name, component)` tuples describing the current processing pipeline, in order. |
|
||||||
| `pipe_names` <Tag variant="new">2</Tag> | list | List of pipeline component names, in order. |
|
| `pipe_names` <Tag variant="new">2</Tag> | `List[str]` | List of pipeline component names, in order. |
|
||||||
| `pipe_labels` <Tag variant="new">2.2</Tag> | dict | List of labels set by the pipeline components, if available, keyed by component name. |
|
| `pipe_labels` <Tag variant="new">2.2</Tag> | `Dict[str, List[str]]` | List of labels set by the pipeline components, if available, keyed by component name. |
|
||||||
| `meta` | dict | Custom meta data for the Language class. If a model is loaded, contains meta data of the model. |
|
| `pipe_factories` <Tag variant="new">2.2</Tag> | `Dict[str, str]` | Dictionary of pipeline component names, mapped to their factory names. |
|
||||||
| `path` <Tag variant="new">2</Tag> | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. |
|
| `factories` | `Dict[str, Callable]` | All available factory functions, keyed by name. |
|
||||||
|
| `factory_names` <Tag variant="new">3</Tag> | `List[str]` | List of all available factory names. |
|
||||||
|
| `path` <Tag variant="new">2</Tag> | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. |
|
||||||
|
|
||||||
## Class attributes {#class-attributes}
|
## Class attributes {#class-attributes}
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ---------- | ----- | ----------------------------------------------------------------------------------------------- |
|
| ---------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
|
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
|
||||||
| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
|
| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
|
||||||
|
| `default_config` | dict | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](https://github.com/explosion/spaCy/tree/develop/spacy/default_config.cfg). |
|
||||||
|
|
||||||
## Defaults {#defaults}
|
## Defaults {#defaults}
|
||||||
|
|
||||||
|
@ -502,3 +758,19 @@ serialization by passing in the string names via the `exclude` argument.
|
||||||
| `tokenizer` | Tokenization rules and exceptions. |
|
| `tokenizer` | Tokenization rules and exceptions. |
|
||||||
| `meta` | The meta data, available as `Language.meta`. |
|
| `meta` | The meta data, available as `Language.meta`. |
|
||||||
| ... | String names of pipeline components, e.g. `"ner"`. |
|
| ... | String names of pipeline components, e.g. `"ner"`. |
|
||||||
|
|
||||||
|
## FactoryMeta {#factorymeta new="3" tag="dataclass"}
|
||||||
|
|
||||||
|
The `FactoryMeta` contains the information about the component and its defaults
|
||||||
|
provided by the [`@Language.component`](/api/language#component) or
|
||||||
|
[`@Language.factory`](/api/language#factory) decorator. It's created whenever a
|
||||||
|
component is added to the pipeline and stored on the `Language` class for each
|
||||||
|
component instance and factory instance.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ---------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `factory` | str | The name of the registered component factory. |
|
||||||
|
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
||||||
|
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
||||||
|
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
||||||
|
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
|
||||||
|
|
|
@ -5,6 +5,8 @@ tag: class
|
||||||
source: spacy/lemmatizer.py
|
source: spacy/lemmatizer.py
|
||||||
---
|
---
|
||||||
|
|
||||||
|
<!-- TODO: rewrite once it's converted to pipe -->
|
||||||
|
|
||||||
The `Lemmatizer` supports simple part-of-speech-sensitive suffix rules and
|
The `Lemmatizer` supports simple part-of-speech-sensitive suffix rules and
|
||||||
lookup tables.
|
lookup tables.
|
||||||
|
|
||||||
|
|
|
@ -142,11 +142,12 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||||
| `match_id` | str | An ID for the thing you're matching. |
|
| `match_id` | str | An ID for the thing you're matching. |
|
||||||
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
| _keyword-only_ | | |
|
||||||
|
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||||
|
|
||||||
## Matcher.remove {#remove tag="method" new="2"}
|
## Matcher.remove {#remove tag="method" new="2"}
|
||||||
|
|
||||||
|
|
|
@ -3,27 +3,41 @@ title: Morphologizer
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/morphologizer.pyx
|
source: spacy/pipeline/morphologizer.pyx
|
||||||
new: 3
|
new: 3
|
||||||
|
teaser: 'Pipeline component for predicting morphological features'
|
||||||
|
api_base_class: /api/tagger
|
||||||
|
api_string_name: morphologizer
|
||||||
|
api_trainable: true
|
||||||
---
|
---
|
||||||
|
|
||||||
A trainable pipeline component to predict morphological features and
|
A trainable pipeline component to predict morphological features and
|
||||||
coarse-grained POS tags following the Universal Dependencies
|
coarse-grained POS tags following the Universal Dependencies
|
||||||
[UPOS](https://universaldependencies.org/u/pos/index.html) and
|
[UPOS](https://universaldependencies.org/u/pos/index.html) and
|
||||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||||
annotation guidelines. This class is a subclass of `Pipe` and follows the same
|
annotation guidelines.
|
||||||
API. The component is also available via the string name `"morphologizer"`.
|
|
||||||
After initialization, it is typically added to the processing pipeline using
|
|
||||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
|
||||||
|
|
||||||
## Default config {#config}
|
## Config and implementation {#config}
|
||||||
|
|
||||||
This is the default configuration used to initialize the model powering the
|
The default config is defined by the pipeline component factory and describes
|
||||||
pipeline component. See the [model architectures](/api/architectures)
|
how the component should be configured. You can override its settings via the
|
||||||
documentation for details on the architectures and their arguments and
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
hyperparameters. To learn more about how to customize the config and train
|
[`config.cfg` for training](/usage/training#config). See the
|
||||||
custom models, check out the [training config](/usage/training#config) docs.
|
[model architectures](/api/architectures) documentation for details on the
|
||||||
|
architectures and their arguments and hyperparameters.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.pipeline.morphologizer import DEFAULT_MORPH_MODEL
|
||||||
|
> config = {"model": DEFAULT_MORPH_MODEL}
|
||||||
|
> nlp.add_pipe("morphologizer", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| ------- | ------------------------------------------ | ----------------- | ----------------------------------- |
|
||||||
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/morphologizer_defaults.cfg
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx
|
||||||
```
|
```
|
||||||
|
|
||||||
## Morphologizer.\_\_init\_\_ {#init tag="method"}
|
## Morphologizer.\_\_init\_\_ {#init tag="method"}
|
||||||
|
@ -33,40 +47,45 @@ Initialize the morphologizer.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via add_pipe with default model
|
||||||
> morphologizer = nlp.create_pipe("morphologizer")
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
|
>
|
||||||
|
> # Construction via add_pipe with custom model
|
||||||
|
> config = {"model": {"@architectures": "my_morphologizer"}}
|
||||||
|
> morphologizer = nlp.add_pipe("morphologizer", config=config)
|
||||||
>
|
>
|
||||||
> # Construction from class
|
> # Construction from class
|
||||||
> from spacy.pipeline import Morphologizer
|
> from spacy.pipeline import Morphologizer
|
||||||
> morphologizer = Morphologizer()
|
> morphologizer = Morphologizer(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ------------------------------------------------------------------------------- |
|
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `**cfg` | - | Configuration parameters. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
| **RETURNS** | `Morphologizer` | The newly constructed object. |
|
| _keyword-only_ | | |
|
||||||
|
| `labels_morph` | dict | <!-- TODO: --> |
|
||||||
|
| `labels_pos` | dict | <!-- TODO: --> |
|
||||||
|
|
||||||
## Morphologizer.\_\_call\_\_ {#call tag="method"}
|
## Morphologizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
Apply the pipe to one document. The document is modified in place, and returned.
|
Apply the pipe to one document. The document is modified in place, and returned.
|
||||||
This usually happens under the hood when the `nlp` object is called on a text
|
This usually happens under the hood when the `nlp` object is called on a text
|
||||||
and all pipeline components are applied to the `Doc` in order. Both
|
and all pipeline components are applied to the `Doc` in order. Both
|
||||||
[`__call__`](/api/morphologizer#call) and [`pipe`](/api/morphologizer#pipe) delegate to the
|
[`__call__`](/api/morphologizer#call) and [`pipe`](/api/morphologizer#pipe)
|
||||||
[`predict`](/api/morphologizer#predict) and
|
delegate to the [`predict`](/api/morphologizer#predict) and
|
||||||
[`set_annotations`](/api/morphologizer#set_annotations) methods.
|
[`set_annotations`](/api/morphologizer#set_annotations) methods.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
|
||||||
> doc = nlp("This is a sentence.")
|
> doc = nlp("This is a sentence.")
|
||||||
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> # This usually happens under the hood
|
> # This usually happens under the hood
|
||||||
> processed = morphologizer(doc)
|
> processed = morphologizer(doc)
|
||||||
> ```
|
> ```
|
||||||
|
@ -81,22 +100,45 @@ and all pipeline components are applied to the `Doc` in order. Both
|
||||||
Apply the pipe to a stream of documents. This usually happens under the hood
|
Apply the pipe to a stream of documents. This usually happens under the hood
|
||||||
when the `nlp` object is called on a text and all pipeline components are
|
when the `nlp` object is called on a text and all pipeline components are
|
||||||
applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
|
applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
|
||||||
[`pipe`](/api/morphologizer#pipe) delegate to the [`predict`](/api/morphologizer#predict) and
|
[`pipe`](/api/morphologizer#pipe) delegate to the
|
||||||
|
[`predict`](/api/morphologizer#predict) and
|
||||||
[`set_annotations`](/api/morphologizer#set_annotations) methods.
|
[`set_annotations`](/api/morphologizer#set_annotations) methods.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> for doc in morphologizer.pipe(docs, batch_size=50):
|
> for doc in morphologizer.pipe(docs, batch_size=50):
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | --------------- | ------------------------------------------------------ |
|
| -------------- | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| _keyword-only_ | | |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
|
## Morphologizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
|
> nlp.pipeline.append(morphologizer)
|
||||||
|
> optimizer = morphologizer.begin_training(pipeline=nlp.pipeline)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/morphologizer#create_optimizer) if not set. |
|
||||||
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## Morphologizer.predict {#predict tag="method"}
|
## Morphologizer.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -105,7 +147,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> scores = morphologizer.predict([doc1, doc2])
|
> scores = morphologizer.predict([doc1, doc2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -121,14 +163,14 @@ Modify a batch of documents, using pre-computed scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> scores = morphologizer.predict([doc1, doc2])
|
> scores = morphologizer.predict([doc1, doc2])
|
||||||
> morphologizer.set_annotations([doc1, doc2], scores)
|
> morphologizer.set_annotations([doc1, doc2], scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------- | --------------- | ------------------------------------------------ |
|
| -------- | --------------- | ------------------------------------------------------- |
|
||||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||||
| `scores` | - | The scores to set, produced by `Morphologizer.predict`. |
|
| `scores` | - | The scores to set, produced by `Morphologizer.predict`. |
|
||||||
|
|
||||||
## Morphologizer.update {#update tag="method"}
|
## Morphologizer.update {#update tag="method"}
|
||||||
|
@ -140,20 +182,20 @@ pipe's model. Delegates to [`predict`](/api/morphologizer#predict) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab, morphologizer_model)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> losses = morphologizer.update(examples, sgd=optimizer)
|
> losses = morphologizer.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/morphologizer#set_annotations). |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/morphologizer#set_annotations). |
|
||||||
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## Morphologizer.get_loss {#get_loss tag="method"}
|
## Morphologizer.get_loss {#get_loss tag="method"}
|
||||||
|
|
||||||
|
@ -163,36 +205,16 @@ predicted scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> scores = morphologizer.predict([eg.predicted for eg in examples])
|
> scores = morphologizer.predict([eg.predicted for eg in examples])
|
||||||
> loss, d_loss = morphologizer.get_loss(examples, scores)
|
> loss, d_loss = morphologizer.get_loss(examples, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------- | --------------------------------------------------- |
|
| ----------- | --------------------- | --------------------------------------------------- |
|
||||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
| `scores` | - | Scores representing the model's predictions. |
|
| `scores` | - | Scores representing the model's predictions. |
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
|
|
||||||
## Morphologizer.begin_training {#begin_training tag="method"}
|
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Return an
|
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
|
||||||
> nlp.pipeline.append(morphologizer)
|
|
||||||
> optimizer = morphologizer.begin_training(pipeline=nlp.pipeline)
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
|
||||||
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
|
||||||
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/morphologizer#create_optimizer) if not set. |
|
|
||||||
| **RETURNS** | `Optimizer` | An optimizer. |
|
|
||||||
|
|
||||||
## Morphologizer.create_optimizer {#create_optimizer tag="method"}
|
## Morphologizer.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -201,13 +223,13 @@ Create an optimizer for the pipeline component.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> optimizer = morphologizer.create_optimizer()
|
> optimizer = morphologizer.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------- | --------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------- | -------------- |
|
||||||
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## Morphologizer.use_params {#use_params tag="method, contextmanager"}
|
## Morphologizer.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
@ -216,7 +238,7 @@ Modify the pipe's model, to use the given parameter values.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> with morphologizer.use_params():
|
> with morphologizer.use_params():
|
||||||
> morphologizer.to_disk("/best_model")
|
> morphologizer.to_disk("/best_model")
|
||||||
> ```
|
> ```
|
||||||
|
@ -233,13 +255,13 @@ both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> morphologizer.add_label("Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin")
|
> morphologizer.add_label("Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------- | ---- | --------------------------------------------------------------- |
|
| ------- | ---- | ----------------- |
|
||||||
| `label` | str | The label to add. |
|
| `label` | str | The label to add. |
|
||||||
|
|
||||||
## Morphologizer.to_disk {#to_disk tag="method"}
|
## Morphologizer.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
@ -248,14 +270,14 @@ Serialize the pipe to disk.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> morphologizer.to_disk("/path/to/morphologizer")
|
> morphologizer.to_disk("/path/to/morphologizer")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## Morphologizer.from_disk {#from_disk tag="method"}
|
## Morphologizer.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -264,31 +286,31 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> morphologizer.from_disk("/path/to/morphologizer")
|
> morphologizer.from_disk("/path/to/morphologizer")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
| ----------- | --------------- | -------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. |
|
| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. |
|
||||||
|
|
||||||
## Morphologizer.to_bytes {#to_bytes tag="method"}
|
## Morphologizer.to_bytes {#to_bytes tag="method"}
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> morphologizer_bytes = morphologizer.to_bytes()
|
> morphologizer_bytes = morphologizer.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
| ----------- | --------------- | ------------------------------------------------------------------------- |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. |
|
| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. |
|
||||||
|
|
||||||
## Morphologizer.from_bytes {#from_bytes tag="method"}
|
## Morphologizer.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -298,20 +320,20 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer_bytes = morphologizer.to_bytes()
|
> morphologizer_bytes = morphologizer.to_bytes()
|
||||||
> morphologizer = Morphologizer(nlp.vocab)
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> morphologizer.from_bytes(morphologizer_bytes)
|
> morphologizer.from_bytes(morphologizer_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | -------- | ------------------------------------------------------------------------- |
|
| ------------ | --------------- | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Morphologizer` | The `Morphologizer` object. |
|
| **RETURNS** | `Morphologizer` | The `Morphologizer` object. |
|
||||||
|
|
||||||
## Morphologizer.labels {#labels tag="property"}
|
## Morphologizer.labels {#labels tag="property"}
|
||||||
|
|
||||||
The labels currently added to the component in Universal Dependencies [FEATS
|
The labels currently added to the component in Universal Dependencies
|
||||||
format](https://universaldependencies.org/format.html#morphological-annotation).
|
[FEATS format](https://universaldependencies.org/format.html#morphological-annotation).
|
||||||
Note that even for a blank component, this will always include the internal
|
Note that even for a blank component, this will always include the internal
|
||||||
empty label `_`. If POS features are used, the labels will include the
|
empty label `_`. If POS features are used, the labels will include the
|
||||||
coarse-grained POS as the feature `POS`.
|
coarse-grained POS as the feature `POS`.
|
||||||
|
@ -339,8 +361,8 @@ serialization by passing in the string names via the `exclude` argument.
|
||||||
> data = morphologizer.to_disk("/path", exclude=["vocab"])
|
> data = morphologizer.to_disk("/path", exclude=["vocab"])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------- | ------------------------------------------------------------------------------------------ |
|
| ------- | -------------------------------------------------------------- |
|
||||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
| `cfg` | The config file. You usually don't want to exclude this. |
|
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||||
| `model` | The binary model data. You usually don't want to exclude this. |
|
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||||
|
|
|
@ -165,11 +165,12 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")]
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||||
| `match_id` | str | An ID for the thing you're matching. |
|
| `match_id` | str | An ID for the thing you're matching. |
|
||||||
| `docs` | list | `Doc` objects of the phrases to match. |
|
| `docs` | list | `Doc` objects of the phrases to match. |
|
||||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
| _keyword-only_ | | |
|
||||||
|
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||||
|
|
||||||
## PhraseMatcher.remove {#remove tag="method" new="2.2"}
|
## PhraseMatcher.remove {#remove tag="method" new="2.2"}
|
||||||
|
|
||||||
|
|
6
website/docs/api/pipe.md
Normal file
6
website/docs/api/pipe.md
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
---
|
||||||
|
title: Pipe
|
||||||
|
tag: class
|
||||||
|
---
|
||||||
|
|
||||||
|
TODO: write
|
|
@ -11,8 +11,7 @@ menu:
|
||||||
## merge_noun_chunks {#merge_noun_chunks tag="function"}
|
## merge_noun_chunks {#merge_noun_chunks tag="function"}
|
||||||
|
|
||||||
Merge noun chunks into a single token. Also available via the string name
|
Merge noun chunks into a single token. Also available via the string name
|
||||||
`"merge_noun_chunks"`. After initialization, the component is typically added to
|
`"merge_noun_chunks"`.
|
||||||
the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -20,9 +19,7 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
> texts = [t.text for t in nlp("I have a blue car")]
|
> texts = [t.text for t in nlp("I have a blue car")]
|
||||||
> assert texts == ["I", "have", "a", "blue", "car"]
|
> assert texts == ["I", "have", "a", "blue", "car"]
|
||||||
>
|
>
|
||||||
> merge_nps = nlp.create_pipe("merge_noun_chunks")
|
> nlp.add_pipe("merge_noun_chunks")
|
||||||
> nlp.add_pipe(merge_nps)
|
|
||||||
>
|
|
||||||
> texts = [t.text for t in nlp("I have a blue car")]
|
> texts = [t.text for t in nlp("I have a blue car")]
|
||||||
> assert texts == ["I", "have", "a blue car"]
|
> assert texts == ["I", "have", "a blue car"]
|
||||||
> ```
|
> ```
|
||||||
|
@ -44,8 +41,7 @@ all other components.
|
||||||
## merge_entities {#merge_entities tag="function"}
|
## merge_entities {#merge_entities tag="function"}
|
||||||
|
|
||||||
Merge named entities into a single token. Also available via the string name
|
Merge named entities into a single token. Also available via the string name
|
||||||
`"merge_entities"`. After initialization, the component is typically added to
|
`"merge_entities"`.
|
||||||
the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -53,8 +49,7 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
> texts = [t.text for t in nlp("I like David Bowie")]
|
> texts = [t.text for t in nlp("I like David Bowie")]
|
||||||
> assert texts == ["I", "like", "David", "Bowie"]
|
> assert texts == ["I", "like", "David", "Bowie"]
|
||||||
>
|
>
|
||||||
> merge_ents = nlp.create_pipe("merge_entities")
|
> nlp.add_pipe("merge_entities")
|
||||||
> nlp.add_pipe(merge_ents)
|
|
||||||
>
|
>
|
||||||
> texts = [t.text for t in nlp("I like David Bowie")]
|
> texts = [t.text for t in nlp("I like David Bowie")]
|
||||||
> assert texts == ["I", "like", "David Bowie"]
|
> assert texts == ["I", "like", "David Bowie"]
|
||||||
|
@ -76,12 +71,9 @@ components to the end of the pipeline and after all other components.
|
||||||
## merge_subtokens {#merge_subtokens tag="function" new="2.1"}
|
## merge_subtokens {#merge_subtokens tag="function" new="2.1"}
|
||||||
|
|
||||||
Merge subtokens into a single token. Also available via the string name
|
Merge subtokens into a single token. Also available via the string name
|
||||||
`"merge_subtokens"`. After initialization, the component is typically added to
|
`"merge_subtokens"`. As of v2.1, the parser is able to predict "subtokens" that
|
||||||
the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
should be merged into one single token later on. This is especially relevant for
|
||||||
|
languages like Chinese, Japanese or Korean, where a "word" isn't defined as a
|
||||||
As of v2.1, the parser is able to predict "subtokens" that should be merged into
|
|
||||||
one single token later on. This is especially relevant for languages like
|
|
||||||
Chinese, Japanese or Korean, where a "word" isn't defined as a
|
|
||||||
whitespace-delimited sequence of characters. Under the hood, this component uses
|
whitespace-delimited sequence of characters. Under the hood, this component uses
|
||||||
the [`Matcher`](/api/matcher) to find sequences of tokens with the dependency
|
the [`Matcher`](/api/matcher) to find sequences of tokens with the dependency
|
||||||
label `"subtok"` and then merges them into a single token.
|
label `"subtok"` and then merges them into a single token.
|
||||||
|
@ -96,9 +88,7 @@ label `"subtok"` and then merges them into a single token.
|
||||||
> print([(token.text, token.dep_) for token in doc])
|
> print([(token.text, token.dep_) for token in doc])
|
||||||
> # [('拜', 'subtok'), ('托', 'subtok')]
|
> # [('拜', 'subtok'), ('托', 'subtok')]
|
||||||
>
|
>
|
||||||
> merge_subtok = nlp.create_pipe("merge_subtokens")
|
> nlp.add_pipe("merge_subtokens")
|
||||||
> nlp.add_pipe(merge_subtok)
|
|
||||||
>
|
|
||||||
> doc = nlp("拜托")
|
> doc = nlp("拜托")
|
||||||
> print([token.text for token in doc])
|
> print([token.text for token in doc])
|
||||||
> # ['拜托']
|
> # ['拜托']
|
||||||
|
|
|
@ -1,26 +1,40 @@
|
||||||
---
|
---
|
||||||
title: SentenceRecognizer
|
title: SentenceRecognizer
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/pipes.pyx
|
source: spacy/pipeline/senter.pyx
|
||||||
new: 3
|
new: 3
|
||||||
|
teaser: 'Pipeline component for sentence segmentation'
|
||||||
|
api_base_class: /api/tagger
|
||||||
|
api_string_name: senter
|
||||||
|
api_trainable: true
|
||||||
---
|
---
|
||||||
|
|
||||||
A trainable pipeline component for sentence segmentation. For a simpler,
|
A trainable pipeline component for sentence segmentation. For a simpler,
|
||||||
rule-based strategy, see the [`Sentencizer`](/api/sentencizer). This class is a
|
rule-based strategy, see the [`Sentencizer`](/api/sentencizer).
|
||||||
subclass of `Pipe` and follows the same API. The component is also available via
|
|
||||||
the string name `"senter"`. After initialization, it is typically added to the
|
|
||||||
processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
|
||||||
|
|
||||||
## Default config {#config}
|
## Config and implementation {#config}
|
||||||
|
|
||||||
This is the default configuration used to initialize the model powering the
|
The default config is defined by the pipeline component factory and describes
|
||||||
pipeline component. See the [model architectures](/api/architectures)
|
how the component should be configured. You can override its settings via the
|
||||||
documentation for details on the architectures and their arguments and
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
hyperparameters. To learn more about how to customize the config and train
|
[`config.cfg` for training](/usage/training#config). See the
|
||||||
custom models, check out the [training config](/usage/training#config) docs.
|
[model architectures](/api/architectures) documentation for details on the
|
||||||
|
architectures and their arguments and hyperparameters.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
|
||||||
|
> config = {"model": DEFAULT_SENTER_MODEL,}
|
||||||
|
> nlp.add_pipe("senter", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| ------- | ------------------------------------------ | ----------------- | ----------------------------------- |
|
||||||
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/senter_defaults.cfg
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/senter.pyx
|
||||||
```
|
```
|
||||||
|
|
||||||
## SentenceRecognizer.\_\_init\_\_ {#init tag="method"}
|
## SentenceRecognizer.\_\_init\_\_ {#init tag="method"}
|
||||||
|
@ -30,12 +44,322 @@ Initialize the sentence recognizer.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via add_pipe with default model
|
||||||
> senter = nlp.create_pipe("senter")
|
> senter = nlp.add_pipe("senter")
|
||||||
|
>
|
||||||
|
> # Construction via add_pipe with custom model
|
||||||
|
> config = {"model": {"@architectures": "my_senter"}}
|
||||||
|
> senter = nlp.add_pipe("senter", config=config)
|
||||||
>
|
>
|
||||||
> # Construction from class
|
> # Construction from class
|
||||||
> from spacy.pipeline import SentenceRecognizer
|
> from spacy.pipeline import SentenceRecognizer
|
||||||
> senter = SentenceRecognizer()
|
> senter = SentenceRecognizer(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
<!-- TODO: document, similar to other trainable pipeline components -->
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
|
shortcut for this and instantiate the component using its string name and
|
||||||
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||||
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
Apply the pipe to one document. The document is modified in place, and returned.
|
||||||
|
This usually happens under the hood when the `nlp` object is called on a text
|
||||||
|
and all pipeline components are applied to the `Doc` in order. Both
|
||||||
|
[`__call__`](/api/sentencerecognizer#call) and
|
||||||
|
[`pipe`](/api/sentencerecognizer#pipe) delegate to the
|
||||||
|
[`predict`](/api/sentencerecognizer#predict) and
|
||||||
|
[`set_annotations`](/api/sentencerecognizer#set_annotations) methods.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("This is a sentence.")
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> # This usually happens under the hood
|
||||||
|
> processed = senter(doc)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----- | ------------------------ |
|
||||||
|
| `doc` | `Doc` | The document to process. |
|
||||||
|
| **RETURNS** | `Doc` | The processed document. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.pipe {#pipe tag="method"}
|
||||||
|
|
||||||
|
Apply the pipe to a stream of documents. This usually happens under the hood
|
||||||
|
when the `nlp` object is called on a text and all pipeline components are
|
||||||
|
applied to the `Doc` in order. Both [`__call__`](/api/sentencerecognizer#call)
|
||||||
|
and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
|
||||||
|
[`predict`](/api/sentencerecognizer#predict) and
|
||||||
|
[`set_annotations`](/api/sentencerecognizer#set_annotations) methods.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> for doc in senter.pipe(docs, batch_size=50):
|
||||||
|
> pass
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------- | ------------------------------------------------------ |
|
||||||
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> optimizer = senter.begin_training(pipeline=nlp.pipeline)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/sentencerecognizer#create_optimizer) if not set. |
|
||||||
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> scores = senter.predict([doc1, doc2])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | --------------- | ----------------------------------------- |
|
||||||
|
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||||
|
| **RETURNS** | - | The model's prediction for each document. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.set_annotations {#set_annotations tag="method"}
|
||||||
|
|
||||||
|
Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> scores = senter.predict([doc1, doc2])
|
||||||
|
> senter.set_annotations([doc1, doc2], scores)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------- | --------------- | ------------------------------------------------------------ |
|
||||||
|
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||||
|
| `scores` | - | The scores to set, produced by `SentenceRecognizer.predict`. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.update {#update tag="method"}
|
||||||
|
|
||||||
|
Learn from a batch of documents and gold-standard information, updating the
|
||||||
|
pipe's model. Delegates to [`predict`](/api/sentencerecognizer#predict) and
|
||||||
|
[`get_loss`](/api/sentencerecognizer#get_loss).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> optimizer = nlp.begin_training()
|
||||||
|
> losses = senter.update(examples, sgd=optimizer)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `drop` | float | The dropout rate. |
|
||||||
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/sentencerecognizer#set_annotations). |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.rehearse {#rehearse tag="method,experimental"}
|
||||||
|
|
||||||
|
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||||
|
current model to make predictions similar to an initial model, to try to address
|
||||||
|
the "catastrophic forgetting" problem. This feature is experimental.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> optimizer = nlp.begin_training()
|
||||||
|
> losses = senter.rehearse(examples, sgd=optimizer)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
|
||||||
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `drop` | float | The dropout rate. |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.get_loss {#get_loss tag="method"}
|
||||||
|
|
||||||
|
Find the loss and gradient of loss for the batch of documents and their
|
||||||
|
predicted scores.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> scores = senter.predict([eg.predicted for eg in examples])
|
||||||
|
> loss, d_loss = senter.get_loss(examples, scores)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | --------------------- | --------------------------------------------------- |
|
||||||
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
|
| `scores` | - | Scores representing the model's predictions. |
|
||||||
|
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.score {#score tag="method" new="3"}
|
||||||
|
|
||||||
|
Score a batch of examples.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> scores = senter.score(examples)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ------------------- | ------------------------------------------------------------------------ |
|
||||||
|
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||||
|
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). |
|
||||||
|
|
||||||
|
## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
Create an optimizer for the pipeline component.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> optimizer = senter.create_optimizer()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | --------------------------------------------------- | -------------- |
|
||||||
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
Modify the pipe's model, to use the given parameter values.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> with senter.use_params(optimizer.averages):
|
||||||
|
> senter.to_disk("/best_model")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
Serialize the pipe to disk.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> senter.to_disk("/path/to/senter")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> senter.from_disk("/path/to/senter")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | -------------------- | -------------------------------------------------------------------------- |
|
||||||
|
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.to_bytes {#to_bytes tag="method"}
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> senter_bytes = senter.to_bytes()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | --------------- | ------------------------------------------------------------------------- |
|
||||||
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. |
|
||||||
|
|
||||||
|
## SentenceRecognizer.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> senter_bytes = senter.to_bytes()
|
||||||
|
> senter = nlp.add_pipe("senter")
|
||||||
|
> senter.from_bytes(senter_bytes)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ------------ | -------------------- | ------------------------------------------------------------------------- |
|
||||||
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. |
|
||||||
|
|
||||||
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
During serialization, spaCy will export several data fields used to restore
|
||||||
|
different aspects of the object. If needed, you can exclude them from
|
||||||
|
serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> data = senter.to_disk("/path", exclude=["vocab"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------- | -------------------------------------------------------------- |
|
||||||
|
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||||
|
| `cfg` | The config file. You usually don't want to exclude this. |
|
||||||
|
| `model` | The binary model data. You usually don't want to exclude this. |
|
||||||
|
|
|
@ -1,16 +1,40 @@
|
||||||
---
|
---
|
||||||
title: Sentencizer
|
title: Sentencizer
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/pipes.pyx
|
source: spacy/pipeline/sentencizer.pyx
|
||||||
|
teaser: 'Pipeline component for rule-based sentence boundary detection'
|
||||||
|
api_base_class: /api/pipe
|
||||||
|
api_string_name: sentencizer
|
||||||
|
api_trainable: false
|
||||||
---
|
---
|
||||||
|
|
||||||
A simple pipeline component, to allow custom sentence boundary detection logic
|
A simple pipeline component, to allow custom sentence boundary detection logic
|
||||||
that doesn't require the dependency parse. By default, sentence segmentation is
|
that doesn't require the dependency parse. By default, sentence segmentation is
|
||||||
performed by the [`DependencyParser`](/api/dependencyparser), so the
|
performed by the [`DependencyParser`](/api/dependencyparser), so the
|
||||||
`Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
|
`Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
|
||||||
require a statistical model to be loaded. The component is also available via
|
require a statistical model to be loaded.
|
||||||
the string name `"sentencizer"`. After initialization, it is typically added to
|
|
||||||
the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
## Config and implementation {#config}
|
||||||
|
|
||||||
|
The default config is defined by the pipeline component factory and describes
|
||||||
|
how the component should be configured. You can override its settings via the
|
||||||
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
|
[`config.cfg` for training](/usage/training#config).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> config = {"punct_chars": None}
|
||||||
|
> nlp.add_pipe("sentencizer", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| ------------- | ----------- | ---------------------------------------------------------------------------------------------------------- | ------- |
|
||||||
|
| `punct_chars` | `List[str]` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. | `None` |
|
||||||
|
|
||||||
|
```python
|
||||||
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/sentencizer.pyx
|
||||||
|
```
|
||||||
|
|
||||||
## Sentencizer.\_\_init\_\_ {#init tag="method"}
|
## Sentencizer.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
@ -19,18 +43,18 @@ Initialize the sentencizer.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via add_pipe
|
||||||
> sentencizer = nlp.create_pipe("sentencizer")
|
> sentencizer = nlp.add_pipe("sentencizer")
|
||||||
>
|
>
|
||||||
> # Construction from class
|
> # Construction from class
|
||||||
> from spacy.pipeline import Sentencizer
|
> from spacy.pipeline import Sentencizer
|
||||||
> sentencizer = Sentencizer()
|
> sentencizer = Sentencizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------- | ------------- | ----------------------------------------------------------------------------------------------- |
|
| -------------- | ----------- | ----------------------------------------------------------------------------------------------- |
|
||||||
| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. |
|
| _keyword-only_ | | |
|
||||||
| **RETURNS** | `Sentencizer` | The newly constructed object. |
|
| `punct_chars` | `List[str]` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### punct_chars defaults
|
### punct_chars defaults
|
||||||
|
@ -58,8 +82,7 @@ the component has been added to the pipeline using
|
||||||
> from spacy.lang.en import English
|
> from spacy.lang.en import English
|
||||||
>
|
>
|
||||||
> nlp = English()
|
> nlp = English()
|
||||||
> sentencizer = nlp.create_pipe("sentencizer")
|
> nlp.add_pipe("sentencizer")
|
||||||
> nlp.add_pipe(sentencizer)
|
|
||||||
> doc = nlp("This is a sentence. This is another sentence.")
|
> doc = nlp("This is a sentence. This is another sentence.")
|
||||||
> assert len(list(doc.sents)) == 2
|
> assert len(list(doc.sents)) == 2
|
||||||
> ```
|
> ```
|
||||||
|
@ -69,6 +92,42 @@ the component has been added to the pipeline using
|
||||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||||
| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. |
|
| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. |
|
||||||
|
|
||||||
|
## Sentencizer.pipe {#pipe tag="method"}
|
||||||
|
|
||||||
|
Apply the pipe to a stream of documents. This usually happens under the hood
|
||||||
|
when the `nlp` object is called on a text and all pipeline components are
|
||||||
|
applied to the `Doc` in order.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> sentencizer = nlp.add_pipe("sentencizer")
|
||||||
|
> for doc in sentencizer.pipe(docs, batch_size=50):
|
||||||
|
> pass
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------- | ----------------------------------------------------- |
|
||||||
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
|
||||||
|
| **YIELDS** | `Doc` | The processed documents in order. |
|
||||||
|
|
||||||
|
## Sentencizer.score {#score tag="method" new="3"}
|
||||||
|
|
||||||
|
Score a batch of examples.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> scores = sentencizer.score(examples)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ------------------- | ------------------------------------------------------------------------ |
|
||||||
|
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||||
|
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). |
|
||||||
|
|
||||||
## Sentencizer.to_disk {#to_disk tag="method"}
|
## Sentencizer.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
Save the sentencizer settings (punctuation characters) to a directory. Will create
|
Save the sentencizer settings (punctuation characters) to a directory. Will create
|
||||||
|
@ -78,13 +137,14 @@ a file `sentencizer.json`. This also happens automatically when you save an
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"])
|
> config = {"punct_chars": [".", "?", "!", "。"]}
|
||||||
> sentencizer.to_disk("/path/to/sentencizer.jsonl")
|
> sentencizer = nlp.add_pipe("sentencizer", config=config)
|
||||||
|
> sentencizer.to_disk("/path/to/sentencizer.json")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- |
|
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a JSON file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
|
|
||||||
## Sentencizer.from_disk {#from_disk tag="method"}
|
## Sentencizer.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -95,7 +155,7 @@ added to its pipeline.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> sentencizer = Sentencizer()
|
> sentencizer = nlp.add_pipe("sentencizer")
|
||||||
> sentencizer.from_disk("/path/to/sentencizer.json")
|
> sentencizer.from_disk("/path/to/sentencizer.json")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -111,7 +171,8 @@ Serialize the sentencizer settings to a bytestring.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"])
|
> config = {"punct_chars": [".", "?", "!", "。"]}
|
||||||
|
> sentencizer = nlp.add_pipe("sentencizer", config=config)
|
||||||
> sentencizer_bytes = sentencizer.to_bytes()
|
> sentencizer_bytes = sentencizer.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -127,7 +188,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> sentencizer_bytes = sentencizer.to_bytes()
|
> sentencizer_bytes = sentencizer.to_bytes()
|
||||||
> sentencizer = Sentencizer()
|
> sentencizer = nlp.add_pipe("sentencizer")
|
||||||
> sentencizer.from_bytes(sentencizer_bytes)
|
> sentencizer.from_bytes(sentencizer_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
|
|
@ -1,41 +1,70 @@
|
||||||
---
|
---
|
||||||
title: Tagger
|
title: Tagger
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/pipes.pyx
|
source: spacy/pipeline/tagger.pyx
|
||||||
|
teaser: 'Pipeline component for part-of-speech tagging'
|
||||||
|
api_base_class: /api/pipe
|
||||||
|
api_string_name: tagger
|
||||||
|
api_trainable: true
|
||||||
---
|
---
|
||||||
|
|
||||||
This class is a subclass of `Pipe` and follows the same API. The pipeline
|
## Config and implementation {#config}
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
|
||||||
via the ID `"tagger"`.
|
The default config is defined by the pipeline component factory and describes
|
||||||
|
how the component should be configured. You can override its settings via the
|
||||||
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
|
[`config.cfg` for training](/usage/training#config). See the
|
||||||
|
[model architectures](/api/architectures) documentation for details on the
|
||||||
|
architectures and their arguments and hyperparameters.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
|
||||||
|
> config = {
|
||||||
|
> "set_morphology": False,
|
||||||
|
> "model": DEFAULT_TAGGER_MODEL,
|
||||||
|
> }
|
||||||
|
> nlp.add_pipe("tagger", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| ---------------- | ------------------------------------------ | -------------------------------------- | ----------------------------------- |
|
||||||
|
| `set_morphology` | bool | Whether to set morphological features. | `False` |
|
||||||
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) |
|
||||||
|
|
||||||
|
```python
|
||||||
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
|
||||||
|
```
|
||||||
|
|
||||||
## Tagger.\_\_init\_\_ {#init tag="method"}
|
## Tagger.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via add_pipe with default model
|
||||||
> tagger = nlp.create_pipe("tagger")
|
> tagger = nlp.add_pipe("tagger")
|
||||||
>
|
>
|
||||||
> # Construction via create_pipe with custom model
|
> # Construction via add_pipe with custom model
|
||||||
> config = {"model": {"@architectures": "my_tagger"}}
|
> config = {"model": {"@architectures": "my_tagger"}}
|
||||||
> parser = nlp.create_pipe("tagger", config)
|
> tagger = nlp.add_pipe("tagger", config=config)
|
||||||
>
|
>
|
||||||
> # Construction from class with custom model from file
|
> # Construction from class
|
||||||
> from spacy.pipeline import Tagger
|
> from spacy.pipeline import Tagger
|
||||||
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
|
||||||
> tagger = Tagger(nlp.vocab, model)
|
> tagger = Tagger(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | -------- | ------------------------------------------------------------------------------- |
|
| ---------------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `**cfg` | - | Configuration parameters. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
| **RETURNS** | `Tagger` | The newly constructed object. |
|
| _keyword-only_ | | |
|
||||||
|
| `set_morphology` | bool | Whether to set morphological features. |
|
||||||
|
|
||||||
## Tagger.\_\_call\_\_ {#call tag="method"}
|
## Tagger.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -49,8 +78,8 @@ and all pipeline components are applied to the `Doc` in order. Both
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
|
||||||
> doc = nlp("This is a sentence.")
|
> doc = nlp("This is a sentence.")
|
||||||
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> # This usually happens under the hood
|
> # This usually happens under the hood
|
||||||
> processed = tagger(doc)
|
> processed = tagger(doc)
|
||||||
> ```
|
> ```
|
||||||
|
@ -71,16 +100,37 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> for doc in tagger.pipe(docs, batch_size=50):
|
> for doc in tagger.pipe(docs, batch_size=50):
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | --------------- | ------------------------------------------------------ |
|
| -------------- | --------------- | ------------------------------------------------------ |
|
||||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| _keyword-only_ | | |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||||
|
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||||
|
|
||||||
|
## Tagger.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> tagger = nlp.add_pipe("tagger")
|
||||||
|
> optimizer = tagger.begin_training(pipeline=nlp.pipeline)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/tagger#create_optimizer) if not set. |
|
||||||
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## Tagger.predict {#predict tag="method"}
|
## Tagger.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -89,7 +139,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> scores = tagger.predict([doc1, doc2])
|
> scores = tagger.predict([doc1, doc2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -105,7 +155,7 @@ Modify a batch of documents, using pre-computed scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> scores = tagger.predict([doc1, doc2])
|
> scores = tagger.predict([doc1, doc2])
|
||||||
> tagger.set_annotations([doc1, doc2], scores)
|
> tagger.set_annotations([doc1, doc2], scores)
|
||||||
> ```
|
> ```
|
||||||
|
@ -124,20 +174,43 @@ pipe's model. Delegates to [`predict`](/api/tagger#predict) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab, tagger_model)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> losses = tagger.update(examples, sgd=optimizer)
|
> losses = tagger.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tagger#set_annotations). |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tagger#set_annotations). |
|
||||||
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
|
## Tagger.rehearse {#rehearse tag="method,experimental"}
|
||||||
|
|
||||||
|
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||||
|
current model to make predictions similar to an initial model, to try to address
|
||||||
|
the "catastrophic forgetting" problem. This feature is experimental.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> tagger = nlp.add_pipe("tagger")
|
||||||
|
> optimizer = nlp.begin_training()
|
||||||
|
> losses = tagger.rehearse(examples, sgd=optimizer)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
|
||||||
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `drop` | float | The dropout rate. |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## Tagger.get_loss {#get_loss tag="method"}
|
## Tagger.get_loss {#get_loss tag="method"}
|
||||||
|
|
||||||
|
@ -147,36 +220,31 @@ predicted scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> scores = tagger.predict([eg.predicted for eg in examples])
|
> scores = tagger.predict([eg.predicted for eg in examples])
|
||||||
> loss, d_loss = tagger.get_loss(examples, scores)
|
> loss, d_loss = tagger.get_loss(examples, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------- | --------------------------------------------------- |
|
| ----------- | --------------------- | --------------------------------------------------- |
|
||||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
| `scores` | - | Scores representing the model's predictions. |
|
| `scores` | - | Scores representing the model's predictions. |
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
|
|
||||||
## Tagger.begin_training {#begin_training tag="method"}
|
## Tagger.score {#score tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Return an
|
Score a batch of examples.
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> scores = tagger.score(examples)
|
||||||
> nlp.pipeline.append(tagger)
|
|
||||||
> optimizer = tagger.begin_training(pipeline=nlp.pipeline)
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||||
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. |
|
||||||
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/tagger#create_optimizer) if not set. |
|
|
||||||
| **RETURNS** | `Optimizer` | An optimizer. |
|
|
||||||
|
|
||||||
## Tagger.create_optimizer {#create_optimizer tag="method"}
|
## Tagger.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -185,13 +253,13 @@ Create an optimizer for the pipeline component.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> optimizer = tagger.create_optimizer()
|
> optimizer = tagger.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------- | --------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------- | -------------- |
|
||||||
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## Tagger.use_params {#use_params tag="method, contextmanager"}
|
## Tagger.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
@ -200,7 +268,7 @@ Modify the pipe's model, to use the given parameter values.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> with tagger.use_params():
|
> with tagger.use_params():
|
||||||
> tagger.to_disk("/best_model")
|
> tagger.to_disk("/best_model")
|
||||||
> ```
|
> ```
|
||||||
|
@ -217,14 +285,14 @@ Add a new label to the pipe.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.symbols import POS
|
> from spacy.symbols import POS
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> tagger.add_label("MY_LABEL", {POS: 'NOUN'})
|
> tagger.add_label("MY_LABEL", {POS: "NOUN"})
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------- | ---- | --------------------------------------------------------------- |
|
| -------- | ---------------- | --------------------------------------------------------------- |
|
||||||
| `label` | str | The label to add. |
|
| `label` | str | The label to add. |
|
||||||
| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |
|
| `values` | `Dict[int, str]` | Optional values to map to the label, e.g. a tag map dictionary. |
|
||||||
|
|
||||||
## Tagger.to_disk {#to_disk tag="method"}
|
## Tagger.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
@ -233,14 +301,14 @@ Serialize the pipe to disk.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> tagger.to_disk("/path/to/tagger")
|
> tagger.to_disk("/path/to/tagger")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## Tagger.from_disk {#from_disk tag="method"}
|
## Tagger.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -249,31 +317,31 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> tagger.from_disk("/path/to/tagger")
|
> tagger.from_disk("/path/to/tagger")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
| ----------- | --------------- | -------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Tagger` | The modified `Tagger` object. |
|
| **RETURNS** | `Tagger` | The modified `Tagger` object. |
|
||||||
|
|
||||||
## Tagger.to_bytes {#to_bytes tag="method"}
|
## Tagger.to_bytes {#to_bytes tag="method"}
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> tagger_bytes = tagger.to_bytes()
|
> tagger_bytes = tagger.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
| ----------- | --------------- | ------------------------------------------------------------------------- |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `Tagger` object. |
|
| **RETURNS** | bytes | The serialized form of the `Tagger` object. |
|
||||||
|
|
||||||
## Tagger.from_bytes {#from_bytes tag="method"}
|
## Tagger.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -283,15 +351,15 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger_bytes = tagger.to_bytes()
|
> tagger_bytes = tagger.to_bytes()
|
||||||
> tagger = Tagger(nlp.vocab)
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> tagger.from_bytes(tagger_bytes)
|
> tagger.from_bytes(tagger_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | -------- | ------------------------------------------------------------------------- |
|
| ------------ | --------------- | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Tagger` | The `Tagger` object. |
|
| **RETURNS** | `Tagger` | The `Tagger` object. |
|
||||||
|
|
||||||
## Tagger.labels {#labels tag="property"}
|
## Tagger.labels {#labels tag="property"}
|
||||||
|
|
||||||
|
@ -306,9 +374,9 @@ tags by default, e.g. `VERB`, `NOUN` and so on.
|
||||||
> assert "MY_LABEL" in tagger.labels
|
> assert "MY_LABEL" in tagger.labels
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ---------------------------------- |
|
| ----------- | ------------ | ---------------------------------- |
|
||||||
| **RETURNS** | tuple | The labels added to the component. |
|
| **RETURNS** | `Tuple[str]` | The labels added to the component. |
|
||||||
|
|
||||||
## Serialization fields {#serialization-fields}
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
|
|
@ -1,56 +1,71 @@
|
||||||
---
|
---
|
||||||
title: TextCategorizer
|
title: TextCategorizer
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/pipes.pyx
|
source: spacy/pipeline/textcat.py
|
||||||
new: 2
|
new: 2
|
||||||
|
teaser: 'Pipeline component for text classification'
|
||||||
|
api_base_class: /api/pipe
|
||||||
|
api_string_name: textcat
|
||||||
|
api_trainable: true
|
||||||
---
|
---
|
||||||
|
|
||||||
This class is a subclass of `Pipe` and follows the same API. The pipeline
|
## Config and implementation {#config}
|
||||||
component is available in the [processing pipeline](/usage/processing-pipelines)
|
|
||||||
via the ID `"textcat"`.
|
|
||||||
|
|
||||||
## Default config {#config}
|
The default config is defined by the pipeline component factory and describes
|
||||||
|
how the component should be configured. You can override its settings via the
|
||||||
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
|
[`config.cfg` for training](/usage/training#config). See the
|
||||||
|
[model architectures](/api/architectures) documentation for details on the
|
||||||
|
architectures and their arguments and hyperparameters.
|
||||||
|
|
||||||
This is the default configuration used to initialize the model powering the
|
> #### Example
|
||||||
pipeline component. See the [model architectures](/api/architectures)
|
>
|
||||||
documentation for details on the architectures and their arguments and
|
> ```python
|
||||||
hyperparameters. To learn more about how to customize the config and train
|
> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
|
||||||
custom models, check out the [training config](/usage/training#config) docs.
|
> config = {
|
||||||
|
> "labels": [],
|
||||||
|
> "model": DEFAULT_TEXTCAT_MODEL,
|
||||||
|
> }
|
||||||
|
> nlp.add_pipe("textcat", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Type | Description | Default |
|
||||||
|
| -------- | ------------------------------------------ | ------------------ | ----------------------------------------------------- |
|
||||||
|
| `labels` | `Iterable[str]` | The labels to use. | `[]` |
|
||||||
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/textcat_defaults.cfg
|
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py
|
||||||
```
|
```
|
||||||
|
|
||||||
<!-- TODO: do we also need to document the other defaults here? -->
|
|
||||||
|
|
||||||
## TextCategorizer.\_\_init\_\_ {#init tag="method"}
|
## TextCategorizer.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via create_pipe
|
> # Construction via add_pipe with default model
|
||||||
> textcat = nlp.create_pipe("textcat")
|
> textcat = nlp.add_pipe("textcat")
|
||||||
>
|
>
|
||||||
> # Construction via create_pipe with custom model
|
> # Construction via add_pipe with custom model
|
||||||
> config = {"model": {"@architectures": "my_textcat"}}
|
> config = {"model": {"@architectures": "my_textcat"}}
|
||||||
> parser = nlp.create_pipe("textcat", config)
|
> textcat = nlp.add_pipe("textcat", config=config)
|
||||||
>
|
>
|
||||||
> # Construction from class with custom model from file
|
> # Construction from class
|
||||||
> from spacy.pipeline import TextCategorizer
|
> from spacy.pipeline import TextCategorizer
|
||||||
> model = util.load_config("model.cfg", create_objects=True)["model"]
|
|
||||||
> textcat = TextCategorizer(nlp.vocab, model)
|
> textcat = TextCategorizer(nlp.vocab, model)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Create a new pipeline instance. In your application, you would normally use a
|
Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------------- | ------------------------------------------------------------------------------- |
|
| -------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `**cfg` | - | Configuration parameters. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
| **RETURNS** | `TextCategorizer` | The newly constructed object. |
|
| _keyword-only_ | | |
|
||||||
|
| `labels` | `Iterable[str]` | The labels to use. |
|
||||||
|
|
||||||
<!-- TODO move to config page
|
<!-- TODO move to config page
|
||||||
### Architectures {#architectures new="2.1"}
|
### Architectures {#architectures new="2.1"}
|
||||||
|
@ -81,8 +96,8 @@ delegate to the [`predict`](/api/textcategorizer#predict) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
|
||||||
> doc = nlp("This is a sentence.")
|
> doc = nlp("This is a sentence.")
|
||||||
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> # This usually happens under the hood
|
> # This usually happens under the hood
|
||||||
> processed = textcat(doc)
|
> processed = textcat(doc)
|
||||||
> ```
|
> ```
|
||||||
|
@ -104,16 +119,37 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> for doc in textcat.pipe(docs, batch_size=50):
|
> for doc in textcat.pipe(docs, batch_size=50):
|
||||||
> pass
|
> pass
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | --------------- | ------------------------------------------------------ |
|
| -------------- | --------------- | ----------------------------------------------------- |
|
||||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
| _keyword-only_ | | |
|
||||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
|
||||||
|
| **YIELDS** | `Doc` | The processed documents in order. |
|
||||||
|
|
||||||
|
## TextCategorizer.begin_training {#begin_training tag="method"}
|
||||||
|
|
||||||
|
Initialize the pipe for training, using data examples if available. Return an
|
||||||
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> textcat = nlp.add_pipe("textcat")
|
||||||
|
> optimizer = textcat.begin_training(pipeline=nlp.pipeline)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/textcategorizer#create_optimizer) if not set. |
|
||||||
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## TextCategorizer.predict {#predict tag="method"}
|
## TextCategorizer.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
@ -122,7 +158,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> scores = textcat.predict([doc1, doc2])
|
> scores = textcat.predict([doc1, doc2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -138,7 +174,7 @@ Modify a batch of documents, using pre-computed scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> scores = textcat.predict(docs)
|
> scores = textcat.predict(docs)
|
||||||
> textcat.set_annotations(docs, scores)
|
> textcat.set_annotations(docs, scores)
|
||||||
> ```
|
> ```
|
||||||
|
@ -157,20 +193,43 @@ pipe's model. Delegates to [`predict`](/api/textcategorizer#predict) and
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab, textcat_model)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> optimizer = nlp.begin_training()
|
> optimizer = nlp.begin_training()
|
||||||
> losses = textcat.update(examples, sgd=optimizer)
|
> losses = textcat.update(examples, sgd=optimizer)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `drop` | float | The dropout rate. |
|
| `drop` | float | The dropout rate. |
|
||||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). |
|
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). |
|
||||||
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
|
## TextCategorizer.rehearse {#rehearse tag="method,experimental"}
|
||||||
|
|
||||||
|
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||||
|
current model to make predictions similar to an initial model, to try to address
|
||||||
|
the "catastrophic forgetting" problem. This feature is experimental.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> textcat = nlp.add_pipe("textcat")
|
||||||
|
> optimizer = nlp.begin_training()
|
||||||
|
> losses = textcat.rehearse(examples, sgd=optimizer)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
|
||||||
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
|
| `drop` | float | The dropout rate. |
|
||||||
|
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||||
|
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||||
|
|
||||||
## TextCategorizer.get_loss {#get_loss tag="method"}
|
## TextCategorizer.get_loss {#get_loss tag="method"}
|
||||||
|
|
||||||
|
@ -180,36 +239,32 @@ predicted scores.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> scores = textcat.predict([eg.predicted for eg in examples])
|
> scores = textcat.predict([eg.predicted for eg in examples])
|
||||||
> loss, d_loss = textcat.get_loss(examples, scores)
|
> loss, d_loss = textcat.get_loss(examples, scores)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------- | --------------------------------------------------- |
|
| ----------- | --------------------- | --------------------------------------------------- |
|
||||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||||
| `scores` | - | Scores representing the model's predictions. |
|
| `scores` | - | Scores representing the model's predictions. |
|
||||||
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
|
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||||
|
|
||||||
## TextCategorizer.begin_training {#begin_training tag="method"}
|
## TextCategorizer.score {#score tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the pipe for training, using data examples if available. Return an
|
Score a batch of examples.
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> scores = textcat.score(examples)
|
||||||
> nlp.pipeline.append(textcat)
|
|
||||||
> optimizer = textcat.begin_training(pipeline=nlp.pipeline)
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------- | ------------------- | ---------------------------------------------------------------------- |
|
||||||
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
| `examples` | `Iterable[Example]` | The examples to score. |
| _keyword-only_ | | |
|
||||||
| `pipeline` | `List[(str, callable)]` | Optional list of pipeline components that this component is part of. |
|
| `positive_label` | str | Optional positive label. |
|
||||||
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/textcategorizer#create_optimizer) if not set. |
|
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). |
|
||||||
| **RETURNS** | `Optimizer` | An optimizer. |
|
|
||||||
|
|
||||||
## TextCategorizer.create_optimizer {#create_optimizer tag="method"}
|
## TextCategorizer.create_optimizer {#create_optimizer tag="method"}
|
||||||
|
|
||||||
|
@ -218,29 +273,13 @@ Create an optimizer for the pipeline component.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> optimizer = textcat.create_optimizer()
|
> optimizer = textcat.create_optimizer()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------- | --------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------- | -------------- |
|
||||||
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||||
|
|
||||||
## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
|
|
||||||
|
|
||||||
Modify the pipe's model, to use the given parameter values.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
|
||||||
> with textcat.use_params(optimizer.averages):
|
|
||||||
> textcat.to_disk("/best_model")
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
|
|
||||||
|
|
||||||
## TextCategorizer.add_label {#add_label tag="method"}
|
## TextCategorizer.add_label {#add_label tag="method"}
|
||||||
|
|
||||||
|
@ -249,7 +288,7 @@ Add a new label to the pipe.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> textcat.add_label("MY_LABEL")
|
> textcat.add_label("MY_LABEL")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -257,6 +296,22 @@ Add a new label to the pipe.
|
||||||
| ------- | ---- | ----------------- |
|
| ------- | ---- | ----------------- |
|
||||||
| `label` | str | The label to add. |
|
| `label` | str | The label to add. |
|
||||||
|
|
||||||
|
## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
|
||||||
|
|
||||||
|
Modify the pipe's model, to use the given parameter values.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> textcat = nlp.add_pipe("textcat")
|
||||||
|
> with textcat.use_params(optimizer.averages):
|
||||||
|
>     textcat.to_disk("/best_model")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
|
||||||
|
|
||||||
## TextCategorizer.to_disk {#to_disk tag="method"}
|
## TextCategorizer.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
Serialize the pipe to disk.
|
Serialize the pipe to disk.
|
||||||
|
@ -264,14 +319,14 @@ Serialize the pipe to disk.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> textcat.to_disk("/path/to/textcat")
|
> textcat.to_disk("/path/to/textcat")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## TextCategorizer.from_disk {#from_disk tag="method"}
|
## TextCategorizer.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -280,14 +335,14 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> textcat.from_disk("/path/to/textcat")
|
> textcat.from_disk("/path/to/textcat")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------------- | -------------------------------------------------------------------------- |
|
| ----------- | ----------------- | -------------------------------------------------------------------------- |
|
||||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
|
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
|
||||||
|
|
||||||
## TextCategorizer.to_bytes {#to_bytes tag="method"}
|
## TextCategorizer.to_bytes {#to_bytes tag="method"}
|
||||||
|
@ -295,16 +350,16 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> textcat_bytes = textcat.to_bytes()
|
> textcat_bytes = textcat.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Serialize the pipe to a bytestring.
|
Serialize the pipe to a bytestring.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----- | ------------------------------------------------------------------------- |
|
| ----------- | --------------- | ------------------------------------------------------------------------- |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. |
|
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. |
|
||||||
|
|
||||||
## TextCategorizer.from_bytes {#from_bytes tag="method"}
|
## TextCategorizer.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
@ -314,14 +369,14 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat_bytes = textcat.to_bytes()
|
> textcat_bytes = textcat.to_bytes()
|
||||||
> textcat = TextCategorizer(nlp.vocab)
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> textcat.from_bytes(textcat_bytes)
|
> textcat.from_bytes(textcat_bytes)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | ----------------- | ------------------------------------------------------------------------- |
|
| ------------ | ----------------- | ------------------------------------------------------------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. |
|
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. |
|
||||||
|
|
||||||
## TextCategorizer.labels {#labels tag="property"}
|
## TextCategorizer.labels {#labels tag="property"}
|
||||||
|
|
|
@ -2,18 +2,10 @@
|
||||||
title: Tok2Vec
|
title: Tok2Vec
|
||||||
source: spacy/pipeline/tok2vec.py
|
source: spacy/pipeline/tok2vec.py
|
||||||
new: 3
|
new: 3
|
||||||
|
teaser: null
|
||||||
|
api_base_class: /api/pipe
|
||||||
|
api_string_name: tok2vec
|
||||||
|
api_trainable: true
|
||||||
---
|
---
|
||||||
|
|
||||||
TODO: document
|
TODO:
|
||||||
|
|
||||||
## Default config {#config}
|
|
||||||
|
|
||||||
This is the default configuration used to initialize the model powering the
|
|
||||||
pipeline component. See the [model architectures](/api/architectures)
|
|
||||||
documentation for details on the architectures and their arguments and
|
|
||||||
hyperparameters. To learn more about how to customize the config and train
|
|
||||||
custom models, check out the [training config](/usage/training#config) docs.
|
|
||||||
|
|
||||||
```python
|
|
||||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/tok2vec_defaults.cfg
|
|
||||||
```
|
|
||||||
|
|
|
@ -31,7 +31,7 @@ the
|
||||||
> nlp = English()
|
> nlp = English()
|
||||||
> # Create a Tokenizer with the default settings for English
|
> # Create a Tokenizer with the default settings for English
|
||||||
> # including punctuation rules and exceptions
|
> # including punctuation rules and exceptions
|
||||||
> tokenizer = nlp.Defaults.create_tokenizer(nlp)
|
> tokenizer = nlp.tokenizer
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
|
|
@ -31,11 +31,12 @@ loaded in via [`Language.from_disk`](/api/language#from_disk).
|
||||||
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
|
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------ | --------------------------------------------------------------------------------- |
|
| ------------------------------------------ | ----------------- | --------------------------------------------------------------------------------- |
|
||||||
| `name` | str / `Path` | Model to load, i.e. package name or path. |
|
| `name` | str / `Path` | Model to load, i.e. package name or path. |
|
||||||
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||||
| **RETURNS** | `Language` | A `Language` object with the loaded model. |
|
| `component_cfg` <Tag variant="new">3</Tag> | `Dict[str, dict]` | Optional config overrides for pipeline components, keyed by component names. |
|
||||||
|
| **RETURNS** | `Language` | A `Language` object with the loaded model. |
|
||||||
|
|
||||||
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
|
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
|
||||||
and pipeline components from a model's `meta.json`, initializes the `Language`
|
and pipeline components from a model's `meta.json`, initializes the `Language`
|
||||||
|
@ -43,9 +44,10 @@ class, loads in the model data and returns it.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Abstract example
|
### Abstract example
|
||||||
cls = util.get_lang_class(lang) # get language for ID, e.g. 'en'
|
cls = util.get_lang_class(lang) # get language for ID, e.g. "en"
|
||||||
nlp = cls() # initialise the language
|
nlp = cls() # initialize the language
|
||||||
for name in pipeline: component = nlp.create_pipe(name) # create each pipeline component nlp.add_pipe(component) # add component to pipeline
|
for name in pipeline:
|
||||||
|
    nlp.add_pipe(name)  # add component to pipeline
|
||||||
nlp.from_disk(model_data_path) # load in model data
|
nlp.from_disk(model_data_path) # load in model data
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -57,15 +59,14 @@ Create a blank model of a given language class. This function is the twin of
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> nlp_en = spacy.blank("en")
|
> nlp_en = spacy.blank("en") # equivalent to English()
|
||||||
> nlp_de = spacy.blank("de")
|
> nlp_de = spacy.blank("de") # equivalent to German()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------- | ------------------------------------------------------------------------------------------------ |
|
| ----------- | ---------- | ------------------------------------------------------------------------------------------------ |
|
||||||
| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
|
| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
|
||||||
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. |
|
||||||
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. |
|
|
||||||
|
|
||||||
### spacy.info {#spacy.info tag="function"}
|
### spacy.info {#spacy.info tag="function"}
|
||||||
|
|
||||||
|
@ -79,13 +80,14 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
|
||||||
> ```python
|
> ```python
|
||||||
> spacy.info()
|
> spacy.info()
|
||||||
> spacy.info("en_core_web_sm")
|
> spacy.info("en_core_web_sm")
|
||||||
> spacy.info(markdown=True)
|
> markdown = spacy.info(markdown=True, silent=True)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ---------- | ---- | ------------------------------------------------ |
|
| ---------- | ---- | ------------------------------------------------ |
|
||||||
| `model` | str | A model, i.e. a package name or path (optional). |
|
| `model` | str | A model, i.e. a package name or path (optional). |
|
||||||
| `markdown` | bool | Print information as Markdown. |
|
| `markdown` | bool | Print information as Markdown. |
|
||||||
|
| `silent` | bool | Don't print anything, just return. |
|
||||||
|
|
||||||
### spacy.explain {#spacy.explain tag="function"}
|
### spacy.explain {#spacy.explain tag="function"}
|
||||||
|
|
||||||
|
@ -479,7 +481,6 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.
|
||||||
> for lang_id in ["en", "de"]:
|
> for lang_id in ["en", "de"]:
|
||||||
> lang_class = util.get_lang_class(lang_id)
|
> lang_class = util.get_lang_class(lang_id)
|
||||||
> lang = lang_class()
|
> lang = lang_class()
|
||||||
> tokenizer = lang.Defaults.create_tokenizer()
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
|
|
@ -30,13 +30,14 @@ you can add vectors to later.
|
||||||
> vectors = Vectors(data=data, keys=keys)
|
> vectors = Vectors(data=data, keys=keys)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| -------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
|
| _keyword-only_ | | |
|
||||||
| `keys` | iterable | A sequence of keys aligned with the data. |
|
| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
|
||||||
| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
|
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
|
||||||
| `name` | str | A name to identify the vectors table. |
|
| `keys` | iterable | A sequence of keys aligned with the data. |
|
||||||
| **RETURNS** | `Vectors` | The newly created object. |
|
| `name` | str | A name to identify the vectors table. |
|
||||||
|
| **RETURNS** | `Vectors` | The newly created object. |
|
||||||
|
|
||||||
## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
|
## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
|
||||||
|
|
||||||
|
@ -138,12 +139,13 @@ mapping separately. If you need to manage the strings, you should use the
|
||||||
> nlp.vocab.vectors.add("dog", row=0)
|
> nlp.vocab.vectors.add("dog", row=0)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------------------------- | ----------------------------------------------------- |
|
| -------------- | ---------------------------------- | ----------------------------------------------------- |
|
||||||
| `key` | str / int | The key to add. |
|
| `key` | str / int | The key to add. |
|
||||||
| `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. |
|
| _keyword-only_ | | |
|
||||||
| `row` | int | An optional row number of a vector to map the key to. |
|
| `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. |
|
||||||
| **RETURNS** | int | The row the vector was added to. |
|
| `row` | int | An optional row number of a vector to map the key to. |
|
||||||
|
| **RETURNS** | int | The row the vector was added to. |
|
||||||
|
|
||||||
## Vectors.resize {#resize tag="method"}
|
## Vectors.resize {#resize tag="method"}
|
||||||
|
|
||||||
|
@ -225,13 +227,14 @@ Look up one or more keys by row, or vice versa.
|
||||||
> keys = nlp.vocab.vectors.find(rows=[18, 256, 985])
|
> keys = nlp.vocab.vectors.find(rows=[18, 256, 985])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------------------------------------- | ------------------------------------------------------------------------ |
|
| -------------- | ------------------------------------- | ------------------------------------------------------------------------ |
|
||||||
| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. |
|
| _keyword-only_ | | |
|
||||||
| `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. |
|
| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. |
|
||||||
| `row` | int | Find the first key that points to the row. Returns int. |
|
| `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. |
|
||||||
| `rows` | iterable | Find the keys that point to the rows. Returns ndarray. |
|
| `row` | int | Find the first key that points to the row. Returns int. |
|
||||||
| **RETURNS** | The requested key, keys, row or rows. |
|
| `rows` | iterable | Find the keys that point to the rows. Returns `ndarray`. |
|
||||||
|
| **RETURNS** | - | The requested key, keys, row or rows. |
|
||||||
|
|
||||||
## Vectors.shape {#shape tag="property"}
|
## Vectors.shape {#shape tag="property"}
|
||||||
|
|
||||||
|
@ -318,13 +321,14 @@ performed in chunks, to avoid consuming too much memory. You can set the
|
||||||
> most_similar = nlp.vocab.vectors.most_similar(queries, n=10)
|
> most_similar = nlp.vocab.vectors.most_similar(queries, n=10)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------ | --------- | ------------------------------------------------------------------ |
|
| -------------- | --------- | ------------------------------------------------------------------ |
|
||||||
| `queries` | `ndarray` | An array with one or more vectors. |
|
| `queries` | `ndarray` | An array with one or more vectors. |
|
||||||
| `batch_size` | int | The batch size to use. Default to `1024`. |
|
| _keyword-only_ | | |
|
||||||
| `n` | int | The number of entries to return for each query. Defaults to `1`. |
|
| `batch_size` | int | The batch size to use. Defaults to `1024`. |
|
||||||
| `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. |
|
| `n` | int | The number of entries to return for each query. Defaults to `1`. |
|
||||||
| **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. |
|
| `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. |
|
||||||
|
| **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. |
|
||||||
|
|
||||||
## Vectors.to_disk {#to_disk tag="method"}
|
## Vectors.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -136,10 +136,11 @@ have to call this to change the size of the vectors. Only one of the `width` and
|
||||||
> nlp.vocab.reset_vectors(width=300)
|
> nlp.vocab.reset_vectors(width=300)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------- | ---- | -------------------------------------- |
|
| -------------- | ---- | -------------------------------------- |
|
||||||
| `width` | int | The new width (keyword argument only). |
|
| _keyword-only_ | | |
|
||||||
| `shape` | int | The new shape (keyword argument only). |
|
| `width` | int | The new width (keyword argument only). |
|
||||||
|
| `shape` | int | The new shape (keyword argument only). |
|
||||||
|
|
||||||
## Vocab.prune_vectors {#prune_vectors tag="method" new="2"}
|
## Vocab.prune_vectors {#prune_vectors tag="method" new="2"}
|
||||||
|
|
||||||
|
|
|
@ -1,30 +1,33 @@
|
||||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
|
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="1155" height="221" viewBox="0 0 1155 221">
|
||||||
<style>
|
<defs>
|
||||||
.svg__pipeline__text { fill: #1a1e23; font: 20px Arial, sans-serif }
|
<rect id="a" width="735" height="170" x="210" y="25" rx="30"/>
|
||||||
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px Arial, sans-serif }
|
<mask id="b" width="735" height="170" x="0" y="0" fill="#fff" maskContentUnits="userSpaceOnUse" maskUnits="objectBoundingBox">
|
||||||
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace }
|
<use xlink:href="#a"/>
|
||||||
</style>
|
</mask>
|
||||||
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
|
</defs>
|
||||||
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
|
<g fill="none" fill-rule="evenodd" transform="translate(0 26)">
|
||||||
<text class="svg__pipeline__text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
|
<rect width="145" height="80" x="2.5" y="2.5" fill="#D8D8D8" stroke="#6A6A6A" stroke-width="5" rx="10" transform="translate(0 70)"/>
|
||||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121.2 84.7h29.4"/>
|
<path fill="#3D4251" fill-rule="nonzero" d="M55.4 99.7v3.9h-7.6V125H43v-21.4h-7.7v-3.9h20zm10.2 7c1 0 2.1.2 3 .6a6.8 6.8 0 014.1 4.1 9.6 9.6 0 01.6 4.3l-.2.5-.3.3H61.3c0 2 .6 3.3 1.4 4.1.9.9 2 1.3 3.5 1.3a6 6 0 001.8-.2l1.3-.6 1-.5.8-.3c.2 0 .3 0 .5.2l.3.2 1.3 1.6c-.5.6-1 1-1.6 1.4a9 9 0 01-3.9 1.4l-2 .2c-1.2 0-2.3-.2-3.4-.7-1-.4-2-1-2.8-1.8a8.6 8.6 0 01-1.9-3 11.6 11.6 0 010-7.6c.3-1.1.9-2 1.6-2.8a8 8 0 012.7-2 9 9 0 013.7-.6zm0 3.2a4 4 0 00-3 1c-.6.7-1 1.8-1.3 3h8.1c0-.5 0-1-.2-1.5-.1-.5-.4-1-.7-1.3-.3-.4-.7-.7-1.2-1a4 4 0 00-1.7-.2zm15.5 5.8l-5.9-8.7h4.2c.3 0 .5 0 .7.2l.4.4 3.7 6a4.9 4.9 0 01.6-1.2l3-4.7.4-.5.6-.2h4l-6 8.5L93 125h-4.2c-.3 0-.5 0-.7-.2l-.5-.6-3.8-6.3-.4 1.1-3.4 5.2-.5.5a1 1 0 01-.7.3H75l6-9.3zm20.5 9.6c-1.5 0-2.7-.5-3.5-1.3a5 5 0 01-1.3-3.7v-10H95c-.3 0-.5 0-.6-.2-.2-.2-.3-.4-.3-.7v-1.7l2.9-.5 1-5c0-.1 0-.3.2-.5l.7-.2h2.2v5.7h4.7v3h-4.7v9.8c0 .6.2 1 .4 1.3.3.3.7.5 1.2.5l.6-.1a3.7 3.7 0 00.9-.4l.3-.1.3.1.3.3 1.2 2c-.6.6-1.3 1-2.1 1.3a8 8 0 01-2.6.4z"/>
|
||||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M156.6 84.7l-8 4 2-4-2-4z"/>
|
<rect width="145" height="80" x="2.5" y="2.5" fill="#D7CCF4" stroke="#8978B5" stroke-width="5" rx="10" transform="translate(1005 70)"/>
|
||||||
<path fill="#f5f5f5" stroke="#999" stroke-width="2" d="M1 55h120v60H1z"/>
|
<path fill="#3D4251" fill-rule="nonzero" d="M1050.3 101.5a58.8 58.8 0 016.8-.4c2.2 0 4 .4 5.4 1 1.4.6 2.5 1.5 3.4 2.6a10 10 0 011.7 4 23.2 23.2 0 010 9.6c-.3 1.5-1 2.9-1.8 4-.8 1.3-2 2.2-3.5 3-1.5.7-3.4 1-5.8 1a37.3 37.3 0 01-5-.1l-1.2-.2v-24.5zm7 4a15.6 15.6 0 00-2.3 0V122h.5a158 158 0 001.6.1 6 6 0 003.2-.7c.8-.5 1.4-1.2 1.8-2 .4-.8.7-1.8.8-2.8a27.3 27.3 0 000-5.8 8 8 0 00-.7-2.6c-.4-.8-1-1.5-1.8-2-.7-.5-1.8-.8-3.1-.8zm13.4 11.8c0-1.5.2-2.8.7-4a8 8 0 014.8-4.7c1.1-.4 2.4-.6 3.8-.6 1.5 0 2.8.2 4 .7 1 .4 2 1 2.9 1.8.8.9 1.4 1.8 1.8 3 .4 1.1.6 2.4.6 3.7 0 1.5-.2 2.8-.7 4a8 8 0 01-4.8 4.7c-1.1.4-2.4.6-3.8.6a11 11 0 01-4-.7c-1-.4-2-1-2.9-1.8a7.9 7.9 0 01-1.8-3c-.4-1.1-.6-2.4-.6-3.8zm4.7 0c0 .7.1 1.4.3 2 .2.7.5 1.3 1 1.8a4.1 4.1 0 003.3 1.5c1.4 0 2.5-.4 3.3-1.3.9-.8 1.3-2.2 1.3-4a6 6 0 00-1.2-4c-.8-1-2-1.4-3.4-1.4-.7 0-1.3 0-1.8.3-.6.2-1 .5-1.5 1-.4.4-.7 1-1 1.6-.2.7-.3 1.5-.3 2.4zm34.2 7c-1 .7-2 1.3-3.3 1.6-1.3.4-2.7.6-4 .6-1.6 0-3-.2-4.1-.7-1.2-.4-2.2-1-3-1.8a8 8 0 01-1.8-3 10.9 10.9 0 010-7.7 8.2 8.2 0 015.2-4.7 14.3 14.3 0 017.6-.2l2.6 1v6.1h-3.8v-3.2l-2.2-.3c-.7 0-1.3.1-2 .3a4.8 4.8 0 00-2.9 2.6c-.3.7-.5 1.4-.5 2.3 0 .8.2 1.5.4 2.1a5 5 0 002.8 2.8 8.2 8.2 0 005.6-.2l1.9-1 1.5 3.4z"/>
|
||||||
<text class="svg__pipeline__text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
|
<use stroke="#3AC" stroke-dasharray="5 10" stroke-width="10" mask="url(#b)" xlink:href="#a"/>
|
||||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M760 84.7h33"/>
|
<g transform="translate(540)">
|
||||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M799 84.7l-8 4 2-4-2-4z"/>
|
<rect width="95" height="50" x="2.5" y="2.5" fill="#C3E7F1" stroke="#3AC" stroke-width="5" rx="10"/>
|
||||||
<rect width="75" height="39" x="422" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="5.8" ry="5.8"/>
|
<path fill="#3D4251" fill-rule="nonzero" d="M27.8 24.5h4.4l.3 1.6h.1a5.2 5.2 0 014.2-2c.7 0 1.3.1 1.8.3.6.2 1 .4 1.4.8.4.4.7 1 1 1.6.1.6.3 1.5.3 2.4V37H38v-7.1c0-1-.2-1.8-.7-2.2-.4-.5-1-.7-1.7-.7-.6 0-1.2.2-1.7.6-.5.3-.9.8-1 1.3V37h-3.3v-9.8h-1.8v-2.7zm16.9-5H50v11.6c0 1.2.2 2.1.5 2.6s.8.8 1.5.8c.5 0 1 0 1.3-.2l1-.4 1.2 2.2a15.3 15.3 0 01-1.8 1 6.1 6.1 0 01-2.3.3c-1.5 0-2.7-.4-3.5-1.3-.8-.8-1.1-1.9-1.1-3.4V22.3h-2.1v-2.7zm12.8 5h4.3L62 26h.1c.9-1.2 2.3-1.9 4.2-1.9a6 6 0 012.1.4c.7.3 1.2.6 1.7 1.1.4.6.8 1.2 1 2 .3.8.4 1.7.4 2.8 0 1-.1 2-.4 3-.3.8-.7 1.5-1.2 2.1-.6.6-1.2 1-2 1.4-.7.3-1.6.5-2.6.5-.5 0-1 0-1.5-.2-.5 0-1-.2-1.3-.3V42h-3.2V27.2h-1.9v-2.7zm8 2.4c-.7 0-1.3.2-1.8.5s-.9.8-1 1.4V34c.2.2.5.3 1 .4l1.3.2c.4 0 .9 0 1.3-.2s.7-.4 1-.8c.3-.4.6-.8.7-1.3.2-.6.3-1.2.3-2 0-1-.3-1.9-.8-2.5-.6-.6-1.2-.9-2-.9z"/>
|
||||||
<text class="svg__pipeline__text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
|
</g>
|
||||||
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/>
|
<path fill="#3AC" d="M205 112.5L180 125v-25z"/>
|
||||||
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
|
<path stroke="#3AC" stroke-linecap="square" stroke-width="5" d="M180 112.5h-23.1"/>
|
||||||
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/>
|
<path fill="#3AC" d="M1000 112.5L975 125v-25z"/>
|
||||||
<text class="svg__pipeline__text-small" dy="0.75em" dx="8" width="62" height="14" transform="translate(342.5 80.5)">tagger</text>
|
<path stroke="#3AC" stroke-linecap="square" stroke-width="5" d="M975 112.5h-23.1"/>
|
||||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/>
|
<path fill="#EAC1CC" stroke="#F03969" stroke-linejoin="round" stroke-width="3.8" d="M230 75h135l23.5 43.4L365 160H230l23.5-41.5z"/>
|
||||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/>
|
<path fill="#F2D7B2" stroke="#F0A439" stroke-linejoin="round" stroke-width="3.8" d="M395 75h135l23.5 43.4L530 160H395l23.5-41.5z"/>
|
||||||
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/>
|
<path fill="#F2E7A6" stroke="#CDB217" stroke-linejoin="round" stroke-width="3.8" d="M515 75h135l23.5 43.4L650 160H515l23.5-41.5z"/>
|
||||||
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">parser</text>
|
<path fill="#D7E99A" stroke="#B2D73A" stroke-linejoin="round" stroke-width="3.8" d="M640 75h135l23.5 43.4L775 160H640l23.5-41.5z"/>
|
||||||
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M519 58h103.3L639 88l-16.8 30H519l16.8-30z"/>
|
<path fill="#B5F3D4" stroke="#3AD787" stroke-linejoin="round" stroke-width="3.8" d="M765 75h135l23.5 43.4L900 160H765l23.5-41.5z"/>
|
||||||
<text class="svg__pipeline__text-small" dy="0.75em" dx="8" width="40" height="14" transform="translate(558.5 80.5)">ner</text>
|
<path fill="#3D4251" fill-rule="nonzero" d="M265.9 125.2c-1.1 0-2-.3-2.6-1-.6-.6-.9-1.4-.9-2.5v-7.2h-1.3c-.2 0-.3 0-.4-.2-.2 0-.2-.2-.2-.5v-1.2l2-.3.7-3.5.2-.4.5-.2h1.6v4h3.4v2.3h-3.4v7c0 .3 0 .6.3.9.2.2.5.3.8.3h.5a2.6 2.6 0 00.6-.3l.2-.1h.2l.2.3 1 1.5-1.6.8-1.8.3zm10.9-13.2c1 0 1.8.1 2.6.4a5.6 5.6 0 013.3 3.4c.3.8.4 1.8.4 2.8 0 1-.1 1.9-.4 2.7a5.5 5.5 0 01-3.3 3.4 7 7 0 01-2.6.5 7 7 0 01-2.6-.5 5.6 5.6 0 01-3.3-3.4 7.8 7.8 0 010-5.5c.3-.8.7-1.5 1.3-2 .5-.6 1.2-1 2-1.4a7 7 0 012.6-.4zm0 10.8c1 0 1.9-.3 2.4-1 .5-.8.7-1.8.7-3.2 0-1.4-.2-2.4-.7-3.2-.5-.7-1.3-1-2.4-1-1 0-1.9.3-2.4 1-.5.8-.8 1.8-.8 3.2 0 1.4.3 2.4.8 3.1.5.8 1.3 1.1 2.4 1.1zm11.9-16.4v10.7h.5l.5-.1.4-.3 3.2-4 .4-.4.7-.1h2.8l-4 4.7-.4.5-.5.4.4.4.4.6 4.3 6.2h-2.8l-.6-.1c-.2-.1-.3-.2-.4-.5l-3.3-4.8a1 1 0 00-.4-.4h-1.2v5.8h-3.1v-18.6h3zm16 5.6c.7 0 1.5.1 2.2.4a4.9 4.9 0 012.9 3 6.9 6.9 0 01.3 3v.3l-.3.2h-8.3c.1 1.4.5 2.4 1.1 3 .6.6 1.4.9 2.4.9.6 0 1 0 1.3-.2a22 22 0 001.7-.8l.6-.1h.3l.3.3.9 1c-.4.5-.8.8-1.2 1a6.4 6.4 0 01-2.7 1c-.5.2-1 .2-1.4.2-1 0-1.7-.2-2.5-.5s-1.4-.7-2-1.3c-.6-.5-1-1.3-1.4-2.1a8.3 8.3 0 010-5.5 5.7 5.7 0 013.2-3.4c.7-.3 1.6-.4 2.5-.4zm0 2.2c-1 0-1.6.2-2.1.8-.5.5-.9 1.2-1 2.1h5.8c0-.4 0-.8-.2-1.1 0-.4-.2-.7-.5-1l-.8-.6-1.2-.2zm8 10.8v-12.8h1.9c.4 0 .6.2.8.5l.2 1a7 7 0 011.7-1.2 4.6 4.6 0 012.2-.5c.7 0 1.4 0 1.9.3l1.4 1 .8 1.6c.2.6.3 1.2.3 2v8.1h-3.1v-8.2c0-.7-.2-1.4-.6-1.8-.3-.4-.9-.6-1.6-.6l-1.5.3c-.5.3-1 .6-1.3 1v9.3h-3.1zm17.5-12.8V125H327v-12.8h3zm.4-3.8l-.1.8a2 2 0 01-1 1 2 2 0 01-2.2-.4 2 2 0 01-.4-.6l-.2-.8a2 2 0 01.6-1.4 2 2 0 011.3-.5l.8.1a2 2 0 011 1l.3.8zm12.3 5v.7l-.3.5-6.2 8h6.4v2.4h-10v-1.3l.2-.5c0-.2.1-.4.3-.5l6.1-8.2h-6.2v-2.3h9.8v1.3zm7.8-1.4c.8 0 1.6.1 2.2.4a4.9 4.9 0 013 3 6.9 6.9 0 01.3 3v.3l-.3.2h-8.3c.1 1.4.5 2.4 1 3 .7.6 1.5.9 2.5.9.5 0 1 0 1.3-.2a22 22 0 001.7-.8l.6-.1h.3l.3.3.8 1c-.3.5-.7.8-1.1 1a6.4 6.4 0 01-2.7 1c-.5.2-1 .2-1.4.2-1 0-1.8-.2-2.5-.5-.8-.3-1.5-.7-2-1.3-.6-.5-1-1.3-1.4-2.1a8.3 8.3 0 010-5.5 5.7 5.7 0 013.2-3.4c.7-.3 1.6-.4 2.5-.4zm0 2.2c-.8 0-1.5.2-2 
.8-.5.5-.9 1.2-1 2.1h5.8c0-.4 0-.8-.2-1.1 0-.4-.2-.7-.5-1l-.8-.6-1.2-.2zm8 10.8v-12.8h1.9l.6.1c.2.2.3.4.3.7l.2 1.5a6 6 0 011.6-1.9c.6-.4 1.3-.7 2-.7s1.2.2 1.6.5l-.3 2.3-.2.3-.3.1h-.6l-.8-.2c-.7 0-1.2.2-1.7.6a4 4 0 00-1.1 1.5v8h-3.1z"/>
|
||||||
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M622 58h103.3L742 88l-16.8 30H622l16.8-30z"/>
|
<path fill="#3D4251" fill-rule="nonzero" d="M440.9 125.2c-1.1 0-2-.3-2.6-1-.6-.6-.9-1.4-.9-2.5v-7.2h-1.3c-.2 0-.3 0-.4-.2-.2 0-.2-.2-.2-.5v-1.2l2-.3.7-3.5.2-.4.5-.2h1.6v4h3.4v2.3h-3.4v7c0 .3 0 .6.3.9.2.2.5.3.8.3h.5a2.6 2.6 0 00.6-.3l.2-.1h.2l.2.3 1 1.5-1.6.8-1.8.3zm15.5-.2H455l-.7-.1c-.2-.1-.3-.3-.4-.6l-.3-.9a10.6 10.6 0 01-1.9 1.3 5 5 0 01-1 .4 6.4 6.4 0 01-2.8-.1l-1.2-.7a3 3 0 01-.7-1c-.2-.5-.3-1-.3-1.6 0-.5.1-1 .4-1.4.2-.5.6-.9 1.2-1.3s1.4-.7 2.4-1c1-.2 2.2-.3 3.7-.3v-.8c0-.9-.2-1.5-.6-2-.3-.3-.9-.5-1.6-.5a3.8 3.8 0 00-2 .5l-.8.4c-.2.2-.4.2-.6.2-.3 0-.4 0-.6-.2l-.3-.3-.6-1c1.5-1.4 3.2-2 5.3-2 .8 0 1.4 0 2 .3a4.3 4.3 0 012.5 2.6c.2.6.3 1.3.3 2v8.1zm-6-2h.9a3.3 3.3 0 001.4-.7l.7-.6v-2.2c-1 0-1.7.1-2.3.3a6 6 0 00-1.5.4l-.7.6c-.2.2-.3.5-.3.8 0 .5.2.9.5 1.1.3.3.8.4 1.3.4zm13.5-11l1.5.1 1.3.5h3.7v1.2l-.1.4-.6.2-1.1.3a4 4 0 01.3 1.4 3.8 3.8 0 01-1.5 3c-.4.4-1 .7-1.6.9a6.5 6.5 0 01-3.4.1c-.4.3-.6.5-.6.8 0 .3.2.5.4.6l1 .3h1.3a27.5 27.5 0 013 .3l1.3.5 1 1c.2.3.3.8.3 1.5 0 .5-.2 1-.4 1.6-.3.5-.7 1-1.2 1.4-.6.4-1.2.8-2 1a10.1 10.1 0 01-5.2.1 6 6 0 01-1.7-.7c-.5-.3-.9-.7-1-1.1-.3-.4-.4-.8-.4-1.3 0-.6.1-1 .5-1.5.4-.4.9-.7 1.5-1-.3-.1-.5-.4-.7-.7a2 2 0 01-.3-1.1v-.6l.4-.6.5-.6.8-.5a3.7 3.7 0 01-2-3.5 3.8 3.8 0 011.3-3l1.6-.8c.6-.2 1.3-.3 2-.3zm3.3 13.6c0-.3 0-.5-.2-.6-.1-.2-.3-.3-.6-.4l-1-.2a16.7 16.7 0 00-2.2-.2H462c-.4.1-.6.4-.8.6-.3.3-.4.6-.4 1 0 .2 0 .4.2.6l.5.5 1 .3 1.4.1 1.5-.1c.4-.1.8-.2 1-.4l.7-.5.1-.7zm-3.3-7.3c.3 0 .7 0 1-.2l.7-.4.4-.7.1-.8a2 2 0 00-.5-1.5c-.4-.4-1-.6-1.8-.6-.7 0-1.3.2-1.7.6a2 2 0 00-.5 1.5l.1.8a1.8 1.8 0 001.2 1.1l1 .2zm12.9-6.3l1.5.1 1.4.5h3.7v1.2l-.2.4-.5.2-1.2.3a4 4 0 01.3 1.4 3.8 3.8 0 01-1.4 3c-.5.4-1 .7-1.6.9a6.5 6.5 0 01-3.4.1c-.4.3-.6.5-.6.8 0 .3 0 .5.3.6l1 .3h1.3a27.5 27.5 0 013 .3l1.3.5 1 1c.2.3.3.8.3 1.5 0 .5-.1 1-.4 1.6-.3.5-.7 1-1.2 1.4-.5.4-1.2.8-2 1a10.1 10.1 0 01-5.2.1 6 6 0 01-1.7-.7c-.5-.3-.8-.7-1-1.1-.3-.4-.4-.8-.4-1.3 0-.6.2-1 .5-1.5.4-.4 1-.7 1.6-1-.3-.1-.6-.4-.8-.7a2 2 0 01-.3-1.1l.1-.6.3-.6.6-.6.7-.5a3.7 3.7 0 01-2-3.5 3.8 3.8 
0 011.3-3c.5-.3 1-.6 1.7-.8.6-.2 1.3-.3 2-.3zm3.4 13.6c0-.3-.1-.5-.3-.6-.1-.2-.3-.3-.6-.4l-.9-.2a16.7 16.7 0 00-2.3-.2H475l-.8.6c-.2.3-.3.6-.3 1l.1.6.6.5 1 .3 1.4.1 1.5-.1 1-.4c.3-.1.5-.3.6-.5l.2-.7zm-3.4-7.3c.4 0 .7 0 1-.2.3 0 .5-.2.7-.4l.4-.7.2-.8a2 2 0 00-.6-1.5c-.4-.4-1-.6-1.7-.6-.8 0-1.3.2-1.7.6a2 2 0 00-.6 1.5c0 .3 0 .6.2.8a1.8 1.8 0 001 1.1l1 .2zm13.8-6.3c.8 0 1.5.1 2.2.4a4.9 4.9 0 013 3 6.9 6.9 0 01.3 3l-.1.3-.2.2h-8.3c0 1.4.4 2.4 1 3 .7.6 1.5.9 2.5.9.5 0 1 0 1.3-.2a22 22 0 001.7-.8l.6-.1h.3l.2.3 1 1c-.4.5-.8.8-1.2 1a6.4 6.4 0 01-2.8 1c-.4.2-1 .2-1.4.2-.9 0-1.7-.2-2.4-.5-.8-.3-1.5-.7-2-1.3-.6-.5-1-1.3-1.4-2.1a8.3 8.3 0 010-5.5 5.7 5.7 0 013.2-3.4c.7-.3 1.5-.4 2.5-.4zm0 2.2c-.9 0-1.6.2-2 .8-.6.5-.9 1.2-1 2.1h5.8c0-.4 0-.8-.2-1.1l-.5-1-.8-.6-1.3-.2zm8 10.8v-12.8h1.9l.6.1c.2.2.2.4.3.7l.2 1.5a6 6 0 011.6-1.9c.6-.4 1.3-.7 2-.7s1.2.2 1.6.5l-.4 2.3-.1.3-.4.1h-.5l-.8-.2c-.7 0-1.2.2-1.7.6a4 4 0 00-1.2 1.5v8h-3z"/>
|
||||||
<text class="svg__pipeline__text-small" dy="0.75em" dx="8" width="20" height="14" transform="translate(671.5 80.5)">...</text>
|
<path fill="#3D4251" fill-rule="nonzero" d="M556.6 129.2v-17h2l.4.1c.2.1.3.2.3.4l.3 1.2c.5-.6 1-1 1.8-1.4a4.8 4.8 0 014.2-.1c.6.3 1.1.7 1.5 1.2a6 6 0 011 2 10.3 10.3 0 010 5.6c-.3.8-.7 1.5-1.1 2a5.1 5.1 0 01-6 1.7l-1.3-1v5.3h-3zm6-14.8c-.6 0-1.1.1-1.6.4-.4.3-.9.6-1.3 1.1v5.8a3 3 0 002.5 1.1c.5 0 .9 0 1.3-.2l1-.8c.2-.4.4-.8.5-1.4a8.6 8.6 0 000-3.8c0-.5-.2-1-.4-1.3a2 2 0 00-.9-.7c-.3-.2-.6-.2-1-.2zm18.2 10.6h-1.3l-.7-.1c-.2-.1-.3-.3-.4-.6l-.3-.9a10.6 10.6 0 01-2 1.3 5 5 0 01-1 .4 6.4 6.4 0 01-2.7-.1c-.5-.2-.9-.4-1.2-.7a3 3 0 01-.8-1c-.2-.5-.3-1-.3-1.6 0-.5.2-1 .4-1.4.3-.5.7-.9 1.3-1.3.6-.4 1.4-.7 2.4-1 1-.2 2.2-.3 3.6-.3v-.8c0-.9-.2-1.5-.5-2-.4-.3-1-.5-1.6-.5a3.8 3.8 0 00-2.1.5l-.7.4c-.2.2-.4.2-.7.2-.2 0-.4 0-.5-.2-.2 0-.3-.2-.4-.3l-.5-1c1.4-1.4 3.2-2 5.3-2a4.3 4.3 0 014.4 3c.2.5.3 1.2.3 1.9v8.1zm-6-2h1a3.3 3.3 0 001.4-.7l.6-.6v-2.2c-.9 0-1.6.1-2.2.3a6 6 0 00-1.5.4l-.8.6-.2.8c0 .5.2.9.5 1.1.3.3.7.4 1.2.4zm9 2v-12.8h1.9l.6.1c.2.2.3.4.3.7l.2 1.5a6 6 0 011.6-1.9c.6-.4 1.3-.7 2-.7s1.2.2 1.6.5l-.4 2.3-.1.3-.4.1h-.5l-.8-.2c-.7 0-1.2.2-1.7.6a4 4 0 00-1.1 1.5v8h-3.1zm17.9-10.3l-.3.3h-.8a32.9 32.9 0 00-1.4-.7h-1c-.6 0-1 0-1.4.3-.4.3-.5.6-.5 1 0 .3 0 .5.2.7l.7.5 1 .4a33 33 0 012.3.8c.4.2.8.4 1 .7.4.2.6.5.8 1l.2 1.2c0 .7 0 1.2-.3 1.7-.2.6-.6 1-1 1.4-.4.4-1 .7-1.6.9a7 7 0 01-3.5.2 7.6 7.6 0 01-2.3-.8l-.8-.7.7-1.1c0-.2.2-.3.3-.4h1a12 12 0 001.4.8l1.2.1h1l.6-.4c.1-.2.3-.3.3-.5l.1-.6c0-.3 0-.6-.2-.8l-.7-.5-1-.3a33.5 33.5 0 01-2.4-.9 4 4 0 01-1-.7 3 3 0 01-.7-1 3.7 3.7 0 011-4.2c.4-.3.9-.6 1.5-.8.6-.2 1.3-.3 2-.3 1 0 1.8.1 2.5.4.7.3 1.3.7 1.8 1.2l-.7 1zm8.6-2.7c.8 0 1.6.1 2.2.4a4.9 4.9 0 013 3 6.9 6.9 0 01.3 3v.3l-.3.2h-8.3c.1 1.4.5 2.4 1 3 .7.6 1.5.9 2.5.9.5 0 1 0 1.3-.2a22 22 0 001.7-.8l.6-.1h.3l.3.3.9 1c-.4.5-.8.8-1.2 1a6.4 6.4 0 01-2.7 1c-.5.2-1 .2-1.4.2-1 0-1.8-.2-2.5-.5-.8-.3-1.5-.7-2-1.3-.6-.5-1-1.3-1.4-2.1a8.3 8.3 0 010-5.5 5.7 5.7 0 013.2-3.4c.7-.3 1.6-.4 2.5-.4zm0 2.2c-.8 0-1.5.2-2 .8-.5.5-.9 1.2-1 2.1h5.8c0-.4 0-.8-.2-1.1 0-.4-.2-.7-.5-1l-.8-.6-1.2-.2zm8 
10.8v-12.8h1.9l.6.1c.2.2.3.4.3.7l.2 1.5a6 6 0 011.6-1.9c.6-.4 1.3-.7 2-.7s1.2.2 1.6.5l-.4 2.3-.1.3-.4.1h-.5l-.8-.2c-.7 0-1.2.2-1.7.6a4 4 0 00-1.1 1.5v8h-3.1z"/>
|
||||||
|
<path fill="#3D4251" fill-rule="nonzero" d="M701.6 125v-12.8h2c.3 0 .6.2.7.5l.2 1a7 7 0 011.8-1.2 4.6 4.6 0 012.2-.5c.7 0 1.3 0 1.9.3.5.3 1 .6 1.3 1 .4.5.7 1 .8 1.6.2.6.3 1.2.3 2v8.1h-3v-8.2c0-.7-.2-1.4-.6-1.8-.4-.4-1-.6-1.6-.6-.6 0-1 .1-1.5.3l-1.4 1v9.3h-3zm19.6-13c.8 0 1.5.1 2.2.4a4.9 4.9 0 012.9 3 6.9 6.9 0 01.4 3l-.1.3-.3.2H718c.2 1.4.5 2.4 1.1 3 .7.6 1.5.9 2.5.9.5 0 1 0 1.3-.2a22 22 0 001.7-.8l.5-.1h.4l.2.3.9 1c-.3.5-.7.8-1.1 1a6.4 6.4 0 01-2.8 1c-.5.2-1 .2-1.4.2-.9 0-1.7-.2-2.5-.5-.7-.3-1.4-.7-2-1.3-.5-.5-1-1.3-1.3-2.1a8.3 8.3 0 010-5.5 5.7 5.7 0 013.2-3.4c.6-.3 1.5-.4 2.5-.4zm0 2.2c-.9 0-1.6.2-2 .8-.6.5-1 1.2-1 2.1h5.7l-.1-1.1-.5-1-.9-.6-1.2-.2zm8 10.8v-12.8h1.8l.7.1.3.7.1 1.5a6 6 0 011.6-1.9c.7-.4 1.4-.7 2.1-.7.7 0 1.2.2 1.6.5l-.4 2.3c0 .1 0 .2-.2.3l-.3.1h-.5l-.9-.2c-.6 0-1.2.2-1.6.6a4 4 0 00-1.2 1.5v8h-3z"/>
|
||||||
|
<path fill="#3D4251" fill-rule="nonzero" d="M831 123.3a2 2 0 01.5-1.3 2 2 0 011.3-.6 1.9 1.9 0 011.4.6 1.9 1.9 0 01.3 2 1.8 1.8 0 01-1 1 2 2 0 01-2-.4c-.2-.1-.3-.3-.4-.6a2 2 0 01-.2-.7zm5.5 0a2 2 0 01.6-1.3 2 2 0 011.3-.6 1.9 1.9 0 011.4.6 1.9 1.9 0 01.4 2 1.8 1.8 0 01-1 1 2 2 0 01-2-.4c-.3-.1-.4-.3-.5-.6a2 2 0 01-.2-.7zm5.7 0a2 2 0 01.5-1.3 2 2 0 011.4-.6 1.9 1.9 0 011.3.6 1.9 1.9 0 01.4 2 1.8 1.8 0 01-1 1 2 2 0 01-2-.4c-.3-.1-.4-.3-.5-.6a2 2 0 01-.1-.7z"/>
|
||||||
|
</g>
|
||||||
</svg>
|
</svg>
|
||||||
|
|
Before Width: | Height: | Size: 3.1 KiB After Width: | Height: | Size: 13 KiB |
|
@ -1,47 +1,60 @@
|
||||||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="827" height="168" viewBox="-10 -10 837 178">
|
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="827" height="168" viewBox="0 0 827 168">
|
||||||
<style>
|
<defs>
|
||||||
.svg__training__text { fill: #1a1e23; font: 18px Arial, sans-serif }
|
<linearGradient id="c" x1="0%" x2="100%" y1="0%" y2="100%">
|
||||||
.svg__training__text-code { fill: #1a1e23; font: bold 16px Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace }
|
<stop offset="0%" stop-color="#B4FE67"/>
|
||||||
</style>
|
<stop offset="100%" stop-color="#FE9A98"/>
|
||||||
<defs>
|
|
||||||
<linearGradient id="a" x1="0%" x2="0%" y1="100%" y2="0%">
|
|
||||||
<stop offset="0%" stop-color="#F99"/>
|
|
||||||
<stop offset="100%" stop-color="#B3FF66"/>
|
|
||||||
</linearGradient>
|
</linearGradient>
|
||||||
</defs>
|
<rect id="a" width="116" height="29" x="0" y="0" rx="6"/>
|
||||||
|
<mask id="b" width="116" height="29" x="0" y="0" fill="#fff" maskContentUnits="userSpaceOnUse" maskUnits="objectBoundingBox">
|
||||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M561 103h-6v46H251v-35.8"/>
|
<use xlink:href="#a"/>
|
||||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M251 107.2l4 8-4-2-4 2z"/>
|
</mask>
|
||||||
<rect fill="#f6f6f6" transform="translate(372 138.5)" width="80" height="20"/>
|
</defs>
|
||||||
<text class="svg__training__text-code" dy="1em" transform="translate(378.5 138.5)" width="65" height="16">PREDICT</text>
|
<g fill="none" fill-rule="evenodd">
|
||||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 73v6h76.8"/>
|
<path stroke="#979797" stroke-linecap="square" stroke-width="2.2" d="M562.8 118v36.2h-99.9"/>
|
||||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M703.8 79l-8 4 2-4-2-4z"/>
|
<path stroke="#979797" stroke-linecap="square" stroke-width="2.2" d="M375 154.6l-110 .1v-27.8"/>
|
||||||
<rect fill="#f6f6f6" transform="translate(630.5 68.5)" width="50" height="20"/>
|
<path fill="#979797" d="M265 117l5 10h-10z"/>
|
||||||
<text class="svg__training__text-code" dy="1em" transform="translate(634.5 68.5)" width="43" height="18">SAVE</text>
|
<path fill="#79E000" d="M378 60l-10 5V55z"/>
|
||||||
<rect width="120" height="60" x="501" y="43" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="9" ry="9"/>
|
<path stroke="#79E000" stroke-linecap="square" stroke-width="2.2" d="M367 60.2h-41"/>
|
||||||
<text class="svg__training__text" dy="0.9em" transform="translate(538.5 63.5)" width="43" height="18">Model</text>
|
<path fill="#979797" d="M502 78l-10 5V73z"/>
|
||||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M121 54h61.8"/>
|
<path stroke="#979797" stroke-linecap="square" stroke-width="2.2" d="M491.2 78H475"/>
|
||||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M188.8 54l-8 4 2-4-2-4z"/>
|
<path fill="#979797" d="M703 78l-10 5V73z"/>
|
||||||
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M121 19h61.8"/>
|
<path stroke="#979797" stroke-linecap="square" stroke-width="2.2" d="M692.2 78H687"/>
|
||||||
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M188.8 19l-8 4 2-4-2-4z"/>
|
<path stroke="#979797" stroke-linecap="square" stroke-width="2.2" d="M629 77.3h-4.8"/>
|
||||||
<rect width="120" height="71" x="1" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="10.7" ry="10.7"/>
|
<path fill="#FF5D59" d="M378 95l-10 5V90z"/>
|
||||||
<text class="svg__training__text" dy="0.9em" transform="translate(10 26.5)" width="93" height="18">Training data</text>
|
<path stroke="#FF5D59" stroke-linecap="square" stroke-width="2.2" d="M367 95.2h-41"/>
|
||||||
<path fill="none" stroke="#87e02d" stroke-width="2" stroke-miterlimit="10" d="M311 54h51.8"/>
|
<path fill="#3AC" d="M203 27l-10 5V22z"/>
|
||||||
<path fill="#87e02d" stroke="#87e02d" stroke-width="2" stroke-miterlimit="10" d="M368.8 54l-8 4 2-4-2-4z"/>
|
<path stroke="#3AC" stroke-linecap="square" stroke-width="2.2" d="M192 27.2h-41"/>
|
||||||
<path fill="#dae8fc" stroke="#09a3d5" stroke-width="2" d="M191 39h120v30H191z"/>
|
<path fill="#3AC" d="M203 61l-10 5V56z"/>
|
||||||
<text class="svg__training__text" dy="0.9em" transform="translate(232.5 44.5)" width="35" height="18">label</text>
|
<path stroke="#3AC" stroke-linecap="square" stroke-width="2.2" d="M192 61.2h-41"/>
|
||||||
<path fill="none" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M311 90h51.8"/>
|
<rect width="117.5" height="73.5" x="25.8" y="8.8" fill="#C3E7F1" stroke="#3AC" stroke-width="3.5" rx="12"/>
|
||||||
<path fill="#f33" stroke="#f33" stroke-width="2" stroke-miterlimit="10" d="M368.8 90l-8 4 2-4-2-4z"/>
|
<g transform="translate(505 46)">
|
||||||
<path fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" d="M191 75h120v30H191z" stroke-dasharray="2 2"/>
|
<rect width="113" height="60" x="1.5" y="1.5" fill="#FFF" stroke="#B7B7B7" stroke-width="3" rx="12"/>
|
||||||
<text class="svg__training__text" dy="0.9em" transform="translate(232.5 80.5)" width="35" height="18">label</text>
|
<path fill="#3D4251" fill-rule="nonzero" d="M40 31.6a7.3 7.3 0 01.5 1.2 20.3 20.3 0 01.6-1.2l3.8-7.2.2-.2.1-.2h2.4v13h-2.2v-8.4a10.7 10.7 0 010-1l-3.9 7.3c0 .2-.2.3-.3.4a1 1 0 01-.5.1h-.3a1 1 0 01-.5-.1 1 1 0 01-.4-.4l-4-7.4a8 8 0 010 1V37h-2V24H35.7l.1.2.2.2 3.9 7.2zm14-4a5 5 0 011.9.4 4 4 0 012.3 2.4c.3.6.4 1.2.4 2 0 .7-.1 1.4-.4 2-.2.5-.5 1-.9 1.4a4 4 0 01-1.4 1 5 5 0 01-1.9.3c-.7 0-1.3 0-1.9-.3a4 4 0 01-2.3-2.5c-.3-.5-.4-1.2-.4-2 0-.7.1-1.3.4-2a4 4 0 012.4-2.4c.5-.2 1.1-.3 1.8-.3zm0 7.8c.8 0 1.3-.2 1.7-.7.4-.6.6-1.3.6-2.3a4 4 0 00-.6-2.3c-.4-.5-1-.8-1.7-.8-.8 0-1.4.3-1.7.8-.4.5-.6 1.3-.6 2.3 0 1 .2 1.7.6 2.2.3.6 1 .8 1.7.8zM66.8 37c-.3 0-.5-.1-.6-.4l-.2-.9-.6.6a3.8 3.8 0 01-1.4.7l-1 .1a3 3 0 01-2.4-1.2c-.3-.4-.5-.9-.7-1.5a7.5 7.5 0 010-3.9c.2-.6.5-1.1.8-1.5.4-.5.8-.8 1.3-1 .5-.3 1-.4 1.6-.4a3.2 3.2 0 012.3.9v-4.9h2.3V37h-1.4zm-3-1.6c.5 0 .9-.1 1.2-.3l1-.8V30a2.2 2.2 0 00-1.9-.8c-.3 0-.6 0-.9.2l-.7.5-.4 1-.1 1.4v1.4l.5.9.5.5.8.2zm10.6-7.8c.6 0 1 .1 1.6.3a3.5 3.5 0 012 2.1 5 5 0 01.3 2.2v.3l-.2.1h-6c.1 1 .4 1.7.8 2.1.5.5 1 .7 1.8.7l1-.1.6-.3c.2 0 .4-.2.5-.3h.7l.1.1.7.8c-.3.3-.5.6-.8.7a4.6 4.6 0 01-2 .8h-1a5 5 0 01-1.8-.3 4 4 0 01-1.5-1c-.4-.3-.7-.9-1-1.5a6 6 0 010-3.9c.2-.5.5-1 1-1.4.3-.5.8-.8 1.3-1 .6-.3 1.2-.4 1.9-.4zm0 1.6a2 2 0 00-1.5.6c-.4.3-.6.8-.7 1.5h4.2l-.1-.8-.4-.7-.6-.4a2 2 0 00-.9-.2zm8.1-5.6V37h-2.2V23.6h2.2z"/>
|
||||||
<rect width="120" height="60" x="706" y="49" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="9" ry="9"/>
|
</g>
|
||||||
<text class="svg__training__text" dy="0.9em" transform="translate(734.5 59.5)" width="61" height="38">Updated
|
<g transform="translate(704 46)">
|
||||||
<tspan dy="1.25em" dx="-3.25em">Model</tspan>
|
<rect width="113" height="60" x="1.5" y="1.5" fill="#FFF" stroke="#B7B7B7" stroke-width="3" rx="12"/>
|
||||||
</text>
|
<path fill="#3D4251" fill-rule="nonzero" d="M29.6 28c.5 0 1 0 1.3-.2a2.6 2.6 0 001.5-1.7c.2-.4.2-.8.2-1.3V17H35v7.8a6 6 0 01-.3 2.1 4.9 4.9 0 01-2.8 2.8 6 6 0 01-2.3.4 6 6 0 01-2.2-.4 4.9 4.9 0 01-2.8-2.8 6 6 0 01-.3-2.1V17h2.4v7.8c0 .5 0 1 .2 1.3.1.4.3.8.6 1 .2.3.5.6.9.7.4.2.8.2 1.2.2zm7.8 5V20.8H39l.2.4.2.8 1.3-1a3.5 3.5 0 013 0c.5.1.8.4 1.1.8.3.4.6 1 .7 1.5a7.4 7.4 0 010 4c-.2.5-.4 1-.8 1.5a3.7 3.7 0 01-4.2 1.1l-1-.7V33h-2.2zm4.3-10.6c-.5 0-.9 0-1.2.2l-.9.9v4.1a2.1 2.1 0 001.8.8l.9-.1.7-.6.4-1 .2-1.4-.1-1.4-.4-.9-.6-.5-.8-.1zM54.1 30c-.3 0-.5-.1-.6-.4l-.2-.9-.6.6a3.8 3.8 0 01-1.5.7l-.9.1A3 3 0 0148 29c-.3-.4-.6-.9-.7-1.5a7.5 7.5 0 010-3.9c.2-.6.5-1.1.8-1.5.4-.5.8-.8 1.3-1 .4-.3 1-.4 1.6-.4a3.2 3.2 0 012.3.9v-4.9h2.2V30h-1.3zm-3-1.6c.5 0 .9-.1 1.2-.3l.9-.8V23a2.2 2.2 0 00-1.8-.8c-.3 0-.6 0-.9.2l-.7.5-.4 1-.2 1.4c0 .6 0 1 .2 1.4 0 .4.2.7.3.9l.6.5.8.2zm14 1.6h-1a1 1 0 01-.5 0l-.3-.5-.2-.6a7.6 7.6 0 01-1.4 1l-.8.2a4.6 4.6 0 01-2-.1l-.8-.5-.5-.8c-.2-.3-.2-.6-.2-1s0-.7.2-1c.2-.4.5-.7 1-1 .4-.3 1-.5 1.7-.7a12 12 0 012.6-.3v-.5c0-.7-.1-1.1-.4-1.4-.3-.3-.6-.4-1.1-.4a2.8 2.8 0 00-1.6.4l-.5.2a1 1 0 01-.4.2l-.4-.1a1 1 0 01-.3-.3l-.4-.7c1-1 2.3-1.5 3.8-1.5.6 0 1 0 1.5.3a3 3 0 011.7 1.8l.3 1.5V30zm-4.4-1.4h.7a2.4 2.4 0 001-.5l.5-.5v-1.5l-1.6.1-1 .3-.6.5a1 1 0 00-.2.5c0 .4.1.7.3.9l1 .2zm9.6 1.5c-.8 0-1.4-.2-1.9-.6-.4-.5-.6-1.1-.6-2v-5h-1l-.3-.2V21l1.4-.2.5-2.5.1-.3H70v2.8h2.4v1.6H70v5c0 .3 0 .5.2.7l.6.3.3-.1a2 2 0 00.5-.2H71.9l.1.1.7 1.1c-.3.3-.7.5-1.1.6-.4.2-.9.2-1.3.2zm7.7-9.5c.6 0 1.1.1 1.6.3a3.5 3.5 0 012.1 2.1 5 5 0 01.3 2.2v.3l-.2.1h-6c0 1 .3 1.7.8 2.1.4.5 1 .7 1.7.7l1-.1.7-.3.5-.3H81l.2.1.6.8c-.2.3-.5.6-.8.7a4.6 4.6 0 01-2 .8h-1a5 5 0 01-1.8-.3 4 4 0 01-1.4-1c-.4-.3-.7-.9-1-1.5a6 6 0 010-3.9c.2-.5.5-1 .9-1.4.4-.5.8-.8 1.4-1 .5-.3 1.1-.4 1.8-.4zm0 1.6a2 2 0 00-1.4.6c-.4.3-.6.8-.7 1.5H80v-.8l-.4-.7-.7-.4a2 2 0 00-.8-.2zM90.4 30c-.3 0-.4-.1-.5-.4l-.2-.9-.6.6a3.8 3.8 0 01-1.5.7l-.9.1a3 3 0 01-2.5-1.2c-.3-.4-.5-.9-.7-1.5a7.5 7.5 0 010-3.9c.3-.6.5-1.1.9-1.5.3-.5.7-.8 1.2-1 
.5-.3 1-.4 1.7-.4a3.2 3.2 0 012.3.9v-4.9h2.2V30h-1.4zm-3-1.6c.5 0 1-.1 1.3-.3l.9-.8V23a2.2 2.2 0 00-1.8-.8c-.3 0-.7 0-1 .2l-.6.5-.5 1-.1 1.4.1 1.4c.1.4.2.7.4.9l.6.5.8.2zM40 42.6a7.3 7.3 0 01.5 1.2 20.3 20.3 0 01.6-1.2l3.8-7.2.2-.2.1-.2h2.4v13h-2.2v-8.4a10.7 10.7 0 010-1l-3.9 7.3c0 .2-.2.3-.3.4a1 1 0 01-.5.1h-.3a1 1 0 01-.5-.1 1 1 0 01-.4-.4l-4-7.4a8 8 0 010 1V48h-2V35H35.7l.1.2.2.2 3.9 7.2zm14-4a5 5 0 011.9.4 4 4 0 012.3 2.4c.3.6.4 1.2.4 2 0 .7-.1 1.4-.4 2-.2.5-.5 1-.9 1.4a4 4 0 01-1.4 1 5 5 0 01-1.9.3c-.7 0-1.3 0-1.9-.3a4 4 0 01-2.3-2.5c-.3-.5-.4-1.2-.4-2 0-.7.1-1.3.4-2a4 4 0 012.4-2.4c.5-.2 1.1-.3 1.8-.3zm0 7.8c.8 0 1.3-.2 1.7-.7.4-.6.6-1.3.6-2.3a4 4 0 00-.6-2.3c-.4-.5-1-.8-1.7-.8-.8 0-1.4.3-1.7.8-.4.5-.6 1.3-.6 2.3 0 1 .2 1.7.6 2.2.3.6 1 .8 1.7.8zM66.8 48c-.3 0-.5-.1-.6-.4l-.2-.9-.6.6a3.8 3.8 0 01-1.4.7l-1 .1a3 3 0 01-2.4-1.2c-.3-.4-.5-.9-.7-1.5a7.5 7.5 0 010-3.9c.2-.6.5-1.1.8-1.5.4-.5.8-.8 1.3-1 .5-.3 1-.4 1.6-.4a3.2 3.2 0 012.3.9v-4.9h2.3V48h-1.4zm-3-1.6c.5 0 .9-.1 1.2-.3l1-.8V41a2.2 2.2 0 00-1.9-.8c-.3 0-.6 0-.9.2l-.7.5-.4 1-.1 1.4v1.4l.5.9.5.5.8.2zm10.6-7.8c.6 0 1 .1 1.6.3a3.5 3.5 0 012 2.1 5 5 0 01.3 2.2v.3l-.2.1h-6c.1 1 .4 1.7.8 2.1.5.5 1 .7 1.8.7l1-.1.6-.3c.2 0 .4-.2.5-.3h.7l.1.1.7.8c-.3.3-.5.6-.8.7a4.6 4.6 0 01-2 .8h-1a5 5 0 01-1.8-.3 4 4 0 01-1.5-1c-.4-.3-.7-.9-1-1.5a6 6 0 010-3.9c.2-.5.5-1 1-1.4.3-.5.8-.8 1.3-1 .6-.3 1.2-.4 1.9-.4zm0 1.6a2 2 0 00-1.5.6c-.4.3-.6.8-.7 1.5h4.2l-.1-.8-.4-.7-.6-.4a2 2 0 00-.9-.2zm8.1-5.6V48h-2.2V34.6h2.2z"/>
|
||||||
<path fill="#dae8fc" stroke="#09a3d5" stroke-width="2" d="M191 4h120v30H191z"/>
|
</g>
|
||||||
<text class="svg__training__text" dy="0.9em" transform="translate(236.5 9.5)" width="27" height="18">text</text>
|
<g transform="translate(207 12)">
|
||||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M461 73h31.8"/>
|
<rect width="113.5" height="26.5" x="1.3" y="1.3" fill="#C3E7F1" stroke="#3AC" stroke-width="2.5" rx="6"/>
|
||||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M498.8 73l-8 4 2-4-2-4z"/>
|
<path fill="#3D4251" fill-rule="nonzero" d="M50 8v2h-4v11h-2.4V10h-3.9V8H50zm5.2 3.6c.6 0 1.1.1 1.6.3A3.5 3.5 0 0159 14a5 5 0 01.3 2.2l-.1.3-.2.1H53c0 1 .3 1.7.7 2.1.5.5 1 .7 1.8.7l1-.1.6-.3c.2 0 .4-.2.5-.3h.7l.2.1.6.8c-.2.3-.5.6-.8.7a4.6 4.6 0 01-2 .8h-1a5 5 0 01-1.8-.3 4 4 0 01-1.4-1c-.4-.3-.7-.9-1-1.5a6 6 0 010-3.9c.2-.5.5-1 .9-1.4.3-.5.8-.8 1.3-1 .6-.3 1.2-.4 2-.4zm0 1.6a2 2 0 00-1.5.6c-.3.3-.6.8-.7 1.5h4.2l-.1-.8-.4-.7c-.1-.2-.3-.3-.6-.4a2 2 0 00-.8-.2zm8 3l-3-4.4h2.5l.2.3 2 3a2.5 2.5 0 01.2-.6l1.5-2.4.3-.3h2.3l-3 4.3 3.1 4.9h-2.1l-.4-.1-.3-.3-2-3.2-.1.5-1.8 2.7-.2.3-.4.1h-2l3.2-4.8zm10.6 5c-.8 0-1.4-.3-1.9-.7-.4-.5-.6-1.1-.6-2v-5h-1l-.3-.2-.1-.3v-1l1.5-.2.4-2.5.2-.3h1.5v2.8h2.4v1.6h-2.4v5c0 .3 0 .5.2.7.2.2.3.3.6.3l.3-.1a2 2 0 00.5-.2H75.4l.1.1.7 1.1c-.3.3-.7.5-1.1.6-.5.2-.9.2-1.3.2z"/>
|
||||||
<path fill="url(#a)" d="M409.5 21L461 72.5 409.5 124 358 72.5z"/>
|
</g>
|
||||||
<text class="svg__training__text-code" dy="0.9em" transform="translate(371.5 64.5)" width="67" height="16">GRADIENT</text>
|
<g transform="translate(207 46)">
|
||||||
|
<rect width="113.5" height="26.5" x="1.3" y="1.3" fill="#C3E7F1" stroke="#3AC" stroke-width="2.5" rx="6"/>
|
||||||
|
<path fill="#3D4251" fill-rule="nonzero" d="M40 19h5.2v2h-7.6V8H40v11zm14 2h-1a1 1 0 01-.5 0l-.3-.5-.2-.6a7.6 7.6 0 01-1.4 1l-.7.2a4.6 4.6 0 01-2-.1l-.9-.5-.5-.8-.2-1c0-.4 0-.7.3-1 .1-.4.5-.7.9-1 .4-.3 1-.5 1.7-.7a12 12 0 012.6-.3v-.5c0-.7-.1-1.1-.4-1.4-.2-.3-.6-.5-1.1-.5a2.8 2.8 0 00-1.5.5l-.5.2a1 1 0 01-.5.2l-.4-.1a1 1 0 01-.3-.3l-.4-.7c1-1 2.4-1.5 3.9-1.5.5 0 1 0 1.4.3a3 3 0 011.8 1.8l.2 1.5V21zm-4.3-1.4h.6a2.4 2.4 0 001-.5l.5-.5v-1.5l-1.6.1-1 .3-.6.5a1 1 0 00-.2.5c0 .4.1.7.4.9l.9.2zm6.5 1.4V7.6h2.2V13l1.3-1a3.6 3.6 0 013 0c.4.2.7.5 1 1 .3.3.6.8.7 1.4a7.4 7.4 0 010 4c-.2.5-.4 1-.8 1.5a3.7 3.7 0 01-2.9 1.3 3.3 3.3 0 01-1.4-.3c-.2 0-.4-.2-.5-.4l-.5-.5v.7l-.3.3-.3.1h-1.5zm4.3-7.7l-1.2.3-.9.9v4.1a2.1 2.1 0 001.8.8l1-.1.6-.6.4-1c.2-.4.2-.9.2-1.4l-.1-1.4-.4-.9-.6-.5-.8-.2zm9.7-1.7c.6 0 1.2.1 1.7.3a3.5 3.5 0 012 2.1 5 5 0 01.3 2.2v.3l-.2.1h-6c0 1 .3 1.7.8 2.1.4.5 1 .7 1.8.7l.9-.1.7-.3c.2 0 .3-.2.5-.3H73.3l.2.1.7.8c-.3.3-.6.6-.9.7a4.6 4.6 0 01-2 .8h-1a5 5 0 01-1.7-.3 4 4 0 01-1.5-1c-.4-.3-.7-.9-1-1.5a6 6 0 010-3.9c.2-.5.5-1 .9-1.4.4-.5.8-.8 1.4-1 .5-.3 1.2-.4 1.8-.4zm0 1.6a2 2 0 00-1.4.6c-.4.3-.6.8-.7 1.5h4.1v-.8l-.4-.7-.6-.4a2 2 0 00-1-.2zm8.2-5.6V21h-2.2V7.6h2.2z"/>
|
||||||
|
</g>
|
||||||
|
<g transform="translate(207 80)">
|
||||||
|
<use stroke="#3AC" stroke-dasharray="3 3" stroke-width="5" mask="url(#b)" xlink:href="#a"/>
|
||||||
|
<path fill="#3D4251" fill-rule="nonzero" d="M40 19h5.2v2h-7.6V8H40v11zm14 2h-1a1 1 0 01-.5 0l-.3-.5-.2-.6a7.6 7.6 0 01-1.4 1l-.7.2a4.6 4.6 0 01-2-.1l-.9-.5-.5-.8-.2-1c0-.4 0-.7.3-1 .1-.4.5-.7.9-1 .4-.3 1-.5 1.7-.7a12 12 0 012.6-.3v-.5c0-.7-.1-1.1-.4-1.4-.2-.3-.6-.5-1.1-.5a2.8 2.8 0 00-1.5.5l-.5.2a1 1 0 01-.5.2l-.4-.1a1 1 0 01-.3-.3l-.4-.7c1-1 2.4-1.5 3.9-1.5.5 0 1 0 1.4.3a3 3 0 011.8 1.8l.2 1.5V21zm-4.3-1.4h.6a2.4 2.4 0 001-.5l.5-.5v-1.5l-1.6.1-1 .3-.6.5a1 1 0 00-.2.5c0 .4.1.7.4.9l.9.2zm6.5 1.4V7.6h2.2V13l1.3-1a3.6 3.6 0 013 0c.4.2.7.5 1 1 .3.3.6.8.7 1.4a7.4 7.4 0 010 4c-.2.5-.4 1-.8 1.5a3.7 3.7 0 01-2.9 1.3 3.3 3.3 0 01-1.4-.3c-.2 0-.4-.2-.5-.4l-.5-.5v.7l-.3.3-.3.1h-1.5zm4.3-7.7l-1.2.3-.9.9v4.1a2.1 2.1 0 001.8.8l1-.1.6-.6.4-1c.2-.4.2-.9.2-1.4l-.1-1.4-.4-.9-.6-.5-.8-.2zm9.7-1.7c.6 0 1.2.1 1.7.3a3.5 3.5 0 012 2.1 5 5 0 01.3 2.2v.3l-.2.1h-6c0 1 .3 1.7.8 2.1.4.5 1 .7 1.8.7l.9-.1.7-.3c.2 0 .3-.2.5-.3H73.3l.2.1.7.8c-.3.3-.6.6-.9.7a4.6 4.6 0 01-2 .8h-1a5 5 0 01-1.7-.3 4 4 0 01-1.5-1c-.4-.3-.7-.9-1-1.5a6 6 0 010-3.9c.2-.5.5-1 .9-1.4.4-.5.8-.8 1.4-1 .5-.3 1.2-.4 1.8-.4zm0 1.6a2 2 0 00-1.4.6c-.4.3-.6.8-.7 1.5h4.1v-.8l-.4-.7-.6-.4a2 2 0 00-1-.2zm8.2-5.6V21h-2.2V7.6h2.2z"/>
|
||||||
|
</g>
|
||||||
|
<path fill="#3D4251" fill-rule="nonzero" d="M61.5 31v2h-4v11h-2.4V33h-3.9v-2h10.3zm1.4 13v-9.2h1.8l.1.5.2 1.1c.3-.5.7-1 1.1-1.3.5-.3 1-.5 1.5-.5s.9.1 1.2.3l-.3 1.7-.1.2H68h-1c-.4 0-.8 0-1.2.3a3 3 0 00-.8 1.1V44H63zm14.6 0h-1a1 1 0 01-.5 0l-.3-.5-.2-.6a7.6 7.6 0 01-1.4 1l-.7.2a4.6 4.6 0 01-2-.1l-.9-.5-.5-.8c-.2-.3-.2-.6-.2-1s0-.7.3-1c.1-.4.4-.7.9-1 .4-.3 1-.5 1.7-.7a12 12 0 012.6-.3v-.5c0-.7-.1-1.1-.4-1.4-.2-.3-.6-.4-1.1-.4a2.8 2.8 0 00-1.5.4l-.5.2a1 1 0 01-.5.2l-.4-.1a1 1 0 01-.3-.3l-.4-.7c1-1 2.4-1.5 3.9-1.5.5 0 1 0 1.4.3a3 3 0 011.7 1.8l.3 1.5V44zm-4.3-1.4h.6a2.4 2.4 0 001-.5l.5-.5v-1.5l-1.6.1-1 .3-.6.5a1 1 0 00-.2.5c0 .4.1.7.4.9l.9.2zm8.8-7.8V44h-2.2v-9.2H82zm.4-2.7c0 .2 0 .3-.2.5a1.5 1.5 0 01-.7.8 1.4 1.4 0 01-1.6-.3c0-.2-.2-.3-.3-.5v-.5a1.4 1.4 0 01.3-1 1.4 1.4 0 011-.4h.6a1.5 1.5 0 01.7.8l.2.6zM84.5 44v-9.2H86c.3 0 .5.1.6.4l.1.7a5 5 0 011.3-1 3.3 3.3 0 011.6-.3c.5 0 1 .1 1.3.3.4.1.7.4 1 .7.3.3.4.7.6 1.1l.2 1.4V44h-2.2v-5.9c0-.5-.2-1-.4-1.3-.3-.3-.7-.4-1.2-.4-.4 0-.7 0-1 .2l-1 .7V44h-2.3zm12.6-9.2V44H95v-9.2h2.2zm.4-2.7l-.1.5a1.5 1.5 0 01-.8.8A1.4 1.4 0 0195 33c0-.2-.2-.3-.3-.5v-.5a1.4 1.4 0 01.3-1 1.4 1.4 0 011-.4h.6a1.5 1.5 0 01.8.8v.6zM99.7 44v-9.2h1.3c.3 0 .5.1.6.4l.1.7a5 5 0 011.3-1 3.3 3.3 0 011.6-.3c.5 0 1 .1 1.3.3.4.1.7.4 1 .7.3.3.5.7.6 1.1l.2 1.4V44h-2.2v-5.9c0-.5-.2-1-.4-1.3-.3-.3-.7-.4-1.2-.4-.4 0-.7 0-1 .2l-1 .7V44h-2.2zm13.4-9.4l1.1.1 1 .4h2.6v.8l-.1.3-.4.2-.8.1a2.9 2.9 0 01.2 1 2.7 2.7 0 01-1 2.3l-1.2.6a4.7 4.7 0 01-2.4 0c-.3.3-.5.4-.5.7 0 .2.1.3.3.4l.7.2h1a19.8 19.8 0 012 .2c.4.1.8.2 1 .4l.7.7c.2.2.3.6.3 1l-.3 1.2c-.2.4-.5.7-.9 1-.4.4-.9.6-1.4.8a7.3 7.3 0 01-3.7 0c-.5 0-1-.3-1.3-.5l-.8-.8-.2-.9c0-.4.1-.8.4-1 .2-.4.6-.6 1-.8l-.5-.5-.2-.8v-.4l.3-.5.4-.4.5-.3a2.7 2.7 0 01-1.5-2.5 2.7 2.7 0 011-2.2l1.2-.6a5 5 0 011.5-.2zm2.4 9.8l-.1-.4a1 1 0 00-.5-.3l-.6-.1a12 12 0 00-1.7-.1l-.9-.1a2 2 0 00-.6.5 1 1 0 00-.2.6l.1.5.4.3.7.3h2.1c.3 0 .6-.2.8-.3l.4-.4.1-.5zm-2.4-5.2c.3 0 .5 0 .7-.2.2 0 .4-.1.5-.3l.3-.4.1-.7c0-.4-.1-.8-.4-1-.3-.3-.7-.4-1.2-.4-.6 0-1 
0-1.2.4-.3.2-.5.6-.5 1l.1.6a1.3 1.3 0 00.9.8l.7.2zM74 62c-.2 0-.4-.1-.5-.4l-.2-.9-.6.6a3.8 3.8 0 01-1.5.7l-.9.1A3 3 0 0168 61c-.3-.4-.6-.9-.7-1.5a7.5 7.5 0 010-3.9c.2-.6.4-1.1.8-1.5.3-.5.8-.8 1.2-1 .5-.3 1-.4 1.7-.4a3.2 3.2 0 012.3.9v-4.9h2.2V62h-1.3zm-3-1.6c.6 0 1-.1 1.3-.3l.9-.8V55a2.2 2.2 0 00-1.8-.8c-.3 0-.6 0-1 .2l-.6.5-.4 1-.2 1.4.1 1.4c.1.4.2.7.4.9l.6.5.8.2zm14 1.6h-1a1 1 0 01-.4 0c-.2-.2-.3-.3-.3-.5l-.2-.6a7.6 7.6 0 01-1.4 1l-.8.2a4.6 4.6 0 01-2-.1l-.8-.5-.5-.8c-.2-.3-.2-.6-.2-1s0-.7.2-1c.2-.4.5-.7 1-1 .4-.3 1-.5 1.6-.7a12 12 0 012.7-.3v-.5c0-.7-.2-1.1-.4-1.4-.3-.3-.7-.4-1.2-.4a2.8 2.8 0 00-1.5.4l-.5.2a1 1 0 01-.5.2l-.3-.1a1 1 0 01-.3-.3l-.4-.7c1-1 2.3-1.5 3.8-1.5.6 0 1 0 1.5.3a3 3 0 011.7 1.8c.2.5.2 1 .2 1.5V62zm-4.3-1.4h.7a2.4 2.4 0 001-.5l.5-.5v-1.5l-1.6.1-1 .3-.7.5a1 1 0 00-.1.5c0 .4 0 .7.3.9l1 .2zm9.6 1.5c-.8 0-1.4-.2-1.9-.6-.4-.5-.6-1.1-.6-2v-5h-1l-.3-.2-.1-.3v-1l1.5-.2.5-2.5.1-.3h1.5v2.8h2.4v1.6h-2.4v5c0 .3 0 .5.2.7.2.2.3.3.6.3l.3-.1a2 2 0 00.5-.2H92l.1.1.7 1.1c-.3.3-.7.5-1.1.6-.4.2-.9.2-1.3.2zm11.1-.1h-1a1 1 0 01-.5 0l-.2-.5-.2-.6a7.6 7.6 0 01-1.4 1l-.8.2a4.6 4.6 0 01-2-.1l-.8-.5-.6-.8-.2-1c0-.4.1-.7.3-1 .2-.4.5-.7 1-1 .4-.3 1-.5 1.6-.7a12 12 0 012.7-.3v-.5c0-.7-.2-1.1-.4-1.4-.3-.3-.7-.4-1.2-.4a2.8 2.8 0 00-1.5.4l-.5.2a1 1 0 01-.5.2L95 55a1 1 0 01-.2-.3l-.4-.7c1-1 2.3-1.5 3.8-1.5.6 0 1 0 1.5.3a3 3 0 011.7 1.8c.2.5.2 1 .2 1.5V62zm-4.3-1.4h.6a2.4 2.4 0 001-.5l.6-.5v-1.5l-1.7.1-1 .3-.6.5a1 1 0 00-.1.5c0 .4 0 .7.3.9l1 .2z"/>
|
||||||
|
<path fill="url(#c)" d="M384.1 42.1h73v73h-73z" transform="rotate(45 420.6 78.6)"/>
|
||||||
|
<path fill="#3D4251" fill-rule="nonzero" d="M393.4 80.2a6 6 0 002.6-.5v-2.4h-1.6l-.4-.1-.1-.4v-1.3h4.3v5.2a7.2 7.2 0 01-3.5 1.4h-1.5c-1 0-1.8-.1-2.6-.5a6.2 6.2 0 01-3.4-3.4c-.4-.9-.5-1.7-.5-2.7 0-1 .1-1.9.5-2.7a6 6 0 013.4-3.5c.9-.3 1.8-.5 2.9-.5 1 0 2 .2 2.7.5.8.3 1.4.7 2 1.2l-.7 1.1c-.2.3-.3.4-.6.4-.1 0-.3 0-.4-.2a34.3 34.3 0 00-1.3-.6 5.4 5.4 0 00-3.6 0c-.5.2-1 .6-1.3 1-.4.4-.6.8-.8 1.4-.2.6-.3 1.2-.3 1.9s0 1.4.3 2c.2.6.5 1 .9 1.5.3.4.8.7 1.3.9.5.2 1.1.3 1.7.3zm7 1.8v-9.2h1.7l.2.5.1 1.1c.4-.5.7-1 1.2-1.3.4-.3 1-.5 1.5-.5.4 0 .8.1 1.1.3l-.3 1.7v.2H404.5c-.5 0-.9 0-1.2.3a3 3 0 00-.8 1.1V82h-2.3zm14.5 0h-1a1 1 0 01-.5 0l-.2-.5-.2-.6a7.6 7.6 0 01-1.4 1l-.8.2a4.6 4.6 0 01-2-.1l-.8-.5c-.3-.2-.4-.5-.6-.8l-.2-1c0-.4.1-.7.3-1 .2-.4.5-.7 1-1 .3-.3.9-.5 1.6-.7a12 12 0 012.6-.3v-.5c0-.7 0-1.1-.3-1.4-.3-.3-.7-.5-1.2-.5a2.8 2.8 0 00-1.5.5l-.5.2a1 1 0 01-.5.2l-.4-.1a1 1 0 01-.2-.3l-.4-.7c1-1 2.3-1.5 3.8-1.5.5 0 1 0 1.4.3a3 3 0 011.8 1.8l.2 1.5V82zm-4.3-1.4h.6a2.4 2.4 0 001-.5l.5-.5v-1.5l-1.6.1-1 .3-.6.5a1 1 0 00-.2.5c0 .4.2.7.4.9l.9.2zm13 1.4c-.3 0-.5-.1-.6-.4l-.1-.9-.6.6a3.8 3.8 0 01-1.5.7l-1 .1a3 3 0 01-2.4-1.2c-.3-.4-.5-.9-.7-1.5a7.5 7.5 0 010-3.9c.2-.6.5-1.1.8-1.5.4-.5.8-.8 1.3-1 .5-.3 1-.4 1.6-.4a3.2 3.2 0 012.3.9v-4.9h2.3V82h-1.4zm-3-1.6c.5 0 .9-.1 1.2-.3l1-.8V75a2.2 2.2 0 00-1.8-.8c-.4 0-.7 0-1 .2l-.7.5-.4 1-.1 1.4v1.4l.5.9c.1.2.3.4.6.5l.7.2zm9.1-7.6V82h-2.2v-9.2h2.2zm.4-2.7c0 .2 0 .3-.2.5a1.5 1.5 0 01-.7.8 1.4 1.4 0 01-1.6-.3c0-.2-.2-.3-.3-.5v-.5a1.4 1.4 0 01.3-1 1.4 1.4 0 011-.4h.6a1.5 1.5 0 01.7.8l.2.6zm6 2.5l1.6.3a3.5 3.5 0 012 2.1 5 5 0 01.3 2.2v.3l-.2.1h-6c.1 1 .4 1.7.8 2.2.4.4 1 .6 1.8.6l.9-.1c.3 0 .5-.2.7-.3.2 0 .3-.2.5-.3h.7l.1.1.7.8c-.3.3-.5.6-.9.7a4.6 4.6 0 01-2 .8h-1a5 5 0 01-1.7-.3 4 4 0 01-1.5-1c-.4-.3-.7-.9-1-1.5a6 6 0 010-3.9c.2-.5.5-1 .9-1.4.4-.5.9-.8 1.4-1 .5-.3 1.2-.4 1.9-.4zm0 1.6a2 2 0 00-1.5.6c-.4.3-.6.8-.7 1.5h4.2c0-.3 0-.5-.2-.8l-.3-.7-.6-.4a2 2 0 00-.9-.2zm5.8 7.8v-9.2h1.3c.3 0 .5.1.6.4l.1.7a5 5 0 011.3-1 3.3 3.3 0 011.6-.3c.5 0 1 .1 
1.3.3.4.1.8.4 1 .7.3.3.5.7.6 1.1l.2 1.4V82h-2.2v-5.9c0-.5-.1-1-.4-1.3-.3-.3-.7-.5-1.2-.5-.4 0-.7.1-1 .3l-1 .7V82h-2.2zm13.2.1c-.8 0-1.4-.2-1.8-.6-.4-.5-.7-1.1-.7-2v-5h-.9l-.3-.2-.1-.3v-1l1.4-.2.5-2.5c0-.1 0-.2.2-.3h1.5v2.8h2.4v1.6h-2.4v5c0 .3 0 .5.2.7.1.2.3.3.6.3l.3-.1a2 2 0 00.4-.2H456.7l.2.1.7 1.1c-.4.3-.7.5-1.2.6-.4.2-.8.2-1.3.2z"/>
|
||||||
|
<rect width="80" height="18" x="378" y="145" fill="#37BBAB" rx="9"/>
|
||||||
|
<g transform="translate(631 69)">
|
||||||
|
<rect width="52" height="18" x="1" fill="#37BBAB" rx="9"/>
|
||||||
|
<path fill="#FFF" fill-rule="nonzero" d="M13.6 5.5c0 .2 0 .2-.2.3H12.8a12.2 12.2 0 00-1.1-.6l-.9-.1H10l-.5.4c-.2 0-.3.2-.4.4v.6c0 .3 0 .5.2.7l.6.5.8.3a41.9 41.9 0 012 .7l.8.6a2.6 2.6 0 01.9 2c0 .6-.1 1-.3 1.5a3.4 3.4 0 01-2 2c-.5.2-1.1.3-1.7.3a5.5 5.5 0 01-3.8-1.5l.6-1 .2-.2H8a13 13 0 001.3.8l1 .2c.6 0 1.1-.2 1.4-.5.4-.3.5-.7.5-1.2 0-.3 0-.6-.2-.8-.1-.2-.3-.3-.6-.4l-.8-.4a28.4 28.4 0 01-2-.7L8 9l-.7-1A3.4 3.4 0 018 4.4a4.3 4.3 0 012.8-1c.7 0 1.3.1 1.9.3.6.2 1 .5 1.5 1l-.6 1zM26.2 15h-1.7a.7.7 0 01-.7-.5l-.9-2.3h-4.8l-.8 2.3a.8.8 0 01-.7.5h-1.7l4.5-11.6h2.2L26.2 15zm-7.5-4.4h3.7L21 6.8a17.6 17.6 0 01-.5-1.4 26.7 26.7 0 01-.4 1.4l-1.4 3.8zm7.5-7.2H28a.7.7 0 01.7.5l2.7 7a9.5 9.5 0 01.5 1.7l.5-1.6L35 4l.2-.4.5-.2h1.7L33 15h-2L26.2 3.4zm19.8 0v1.7h-5v3.3h4V10h-4v3.3h5V15h-7.3V3.4H46z"/>
|
||||||
|
</g>
|
||||||
|
<path fill="#FFF" fill-rule="nonzero" d="M389 156v4h-2.2v-11.6h3.8c.7 0 1.4.1 2 .3.5.2 1 .4 1.4.8l.8 1.1.3 1.5c0 .6-.1 1-.3 1.6l-.9 1.2a4 4 0 01-1.4.7c-.5.2-1.2.3-2 .3H389zm0-1.8h1.6l1-.1.7-.4.5-.7a2.6 2.6 0 000-1.7l-.5-.7a2 2 0 00-.7-.4l-1-.1H389v4.1zm10 1.3v4.5h-2.2v-11.6h3.5c.8 0 1.5.1 2 .3.6.1 1 .4 1.4.7.4.3.7.6.8 1a3.5 3.5 0 01-.4 3.4l-.8.8-1 .5.6.6 3 4.3h-2a1 1 0 01-.5-.1 1 1 0 01-.3-.3l-2.4-3.7-.3-.3a1 1 0 00-.5-.1h-1zm0-1.6h1.3l1-.1.8-.4.4-.7.2-.8c0-.6-.2-1-.6-1.3-.4-.3-1-.5-1.8-.5H399v3.8zm15.5-5.5v1.7h-5.1v3.3h4v1.6h-4v3.3h5.1v1.7h-7.3v-11.6h7.3zm12.1 5.8c0 .9-.1 1.6-.4 2.4a5.4 5.4 0 01-3 3c-.7.3-1.5.4-2.4.4h-4.4v-11.6h4.4c.9 0 1.7.2 2.4.5a5.4 5.4 0 013 3c.3.7.4 1.5.4 2.3zm-2.2 0c0-.6 0-1.2-.2-1.7s-.4-1-.7-1.3c-.4-.3-.7-.6-1.2-.8a4 4 0 00-1.5-.3h-2.3v8.2h2.3c.6 0 1-.1 1.5-.3.5-.2.8-.4 1.2-.8.3-.3.5-.8.7-1.3.2-.5.2-1 .2-1.7zm6.4 5.8h-2.2v-11.6h2.2V160zm10.5-2.7l.3.1.9 1c-.5.5-1 1-1.8 1.3a6 6 0 01-2.4.4 5.1 5.1 0 01-5.2-3.5 7 7 0 010-4.8 5.5 5.5 0 013.1-3 6.4 6.4 0 014.7 0c.6.2 1.2.6 1.6 1l-.7 1-.2.2h-.2l-.4-.1a4.7 4.7 0 00-1.2-.6l-1.2-.2-1.5.3c-.5.2-.8.5-1.2.8-.3.4-.6.8-.7 1.3a5 5 0 00-.3 1.7c0 .7 0 1.2.3 1.8.1.5.4.9.7 1.2.3.4.7.6 1.1.8.4.2 1 .3 1.4.3h.8a3.4 3.4 0 001.2-.5c.2 0 .4-.2.5-.4h.2l.2-.1zm11-8.9v1.8h-3.5v9.8h-2.2v-9.8h-3.5v-1.8h9.1z"/>
|
||||||
|
</g>
|
||||||
</svg>
|
</svg>
|
||||||
|
|
Before Width: | Height: | Size: 3.9 KiB After Width: | Height: | Size: 18 KiB |
|
@ -18,13 +18,13 @@ an **annotated document**. It also orchestrates training and serialization.
|
||||||
|
|
||||||
### Container objects {#architecture-containers}
|
### Container objects {#architecture-containers}
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
|
| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
|
||||||
| [`Span`](/api/span) | A slice from a `Doc` object. |
|
| [`Span`](/api/span) | A slice from a `Doc` object. |
|
||||||
| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
|
| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
|
||||||
| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
|
| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
|
||||||
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
|
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
|
||||||
|
|
||||||
### Processing pipeline {#architecture-pipeline}
|
### Processing pipeline {#architecture-pipeline}
|
||||||
|
|
||||||
|
@ -52,5 +52,3 @@ an **annotated document**. It also orchestrates training and serialization.
|
||||||
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
|
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
|
||||||
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
|
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
|
||||||
| [`Example`](/api/example) | Collection for training annotations. |
|
| [`Example`](/api/example) | Collection for training annotations. |
|
||||||
|
|
||||||
|
|
|
||||||
|
|
|
@ -12,29 +12,32 @@ passed on to the next component.
|
||||||
> - **Creates:** Objects, attributes and properties modified and set by the
|
> - **Creates:** Objects, attributes and properties modified and set by the
|
||||||
> component.
|
> component.
|
||||||
|
|
||||||
| Name | Component | Creates | Description |
|
| Name | Component | Creates | Description |
|
||||||
| ----------------- | ------------------------------------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------ |
|
| ------------- | ------------------------------------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------ |
|
||||||
| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. |
|
| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. |
|
||||||
| **tagger** | [`Tagger`](/api/tagger) | `Doc[i].tag` | Assign part-of-speech tags. |
|
| **tagger** | [`Tagger`](/api/tagger) | `Doc[i].tag` | Assign part-of-speech tags. |
|
||||||
| **parser** | [`DependencyParser`](/api/dependencyparser) | `Doc[i].head`, `Doc[i].dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. |
|
| **parser** | [`DependencyParser`](/api/dependencyparser) | `Doc[i].head`, `Doc[i].dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. |
|
||||||
| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Doc[i].ent_iob`, `Doc[i].ent_type` | Detect and label named entities. |
|
| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Doc[i].ent_iob`, `Doc[i].ent_type` | Detect and label named entities. |
|
||||||
| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. |
|
| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. |
|
||||||
| ... | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. |
|
| ... | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. |
|
||||||
|
|
||||||
The processing pipeline always **depends on the statistical model** and its
|
The processing pipeline always **depends on the statistical model** and its
|
||||||
capabilities. For example, a pipeline can only include an entity recognizer
|
capabilities. For example, a pipeline can only include an entity recognizer
|
||||||
component if the model includes data to make predictions of entity labels. This
|
component if the model includes data to make predictions of entity labels. This
|
||||||
is why each model will specify the pipeline to use in its meta data, as a simple
|
is why each model will specify the pipeline to use in its meta data and
|
||||||
list containing the component names:
|
[config](/usage/training#config), as a simple list containing the component
|
||||||
|
names:
|
||||||
|
|
||||||
```json
|
```ini
|
||||||
"pipeline": ["tagger", "parser", "ner"]
|
pipeline = ["tagger", "parser", "ner"]
|
||||||
```
|
```
|
||||||
|
|
||||||
import Accordion from 'components/accordion.js'
|
import Accordion from 'components/accordion.js'
|
||||||
|
|
||||||
<Accordion title="Does the order of pipeline components matter?" id="pipeline-components-order">
|
<Accordion title="Does the order of pipeline components matter?" id="pipeline-components-order">
|
||||||
|
|
||||||
|
<!-- TODO: note on v3 tok2vec own model vs. upstream listeners -->
|
||||||
|
|
||||||
In spaCy v2.x, the statistical components like the tagger or parser are
|
In spaCy v2.x, the statistical components like the tagger or parser are
|
||||||
independent and don't share any data between themselves. For example, the named
|
independent and don't share any data between themselves. For example, the named
|
||||||
entity recognizer doesn't use any features set by the tagger and parser, and so
|
entity recognizer doesn't use any features set by the tagger and parser, and so
|
||||||
|
@ -48,11 +51,10 @@ pre-defined sentence boundaries, so if a previous component in the pipeline sets
|
||||||
them, its dependency predictions may be different. Similarly, it matters if you
|
them, its dependency predictions may be different. Similarly, it matters if you
|
||||||
add the [`EntityRuler`](/api/entityruler) before or after the statistical entity
|
add the [`EntityRuler`](/api/entityruler) before or after the statistical entity
|
||||||
recognizer: if it's added before, the entity recognizer will take the existing
|
recognizer: if it's added before, the entity recognizer will take the existing
|
||||||
entities into account when making predictions.
|
entities into account when making predictions. The
|
||||||
The [`EntityLinker`](/api/entitylinker), which resolves named entities to
|
[`EntityLinker`](/api/entitylinker), which resolves named entities to knowledge
|
||||||
knowledge base IDs, should be preceded by
|
base IDs, should be preceded by a pipeline component that recognizes entities
|
||||||
a pipeline component that recognizes entities such as the
|
such as the [`EntityRecognizer`](/api/entityrecognizer).
|
||||||
[`EntityRecognizer`](/api/entityrecognizer).
|
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
|
|
|
@ -1,26 +1,30 @@
|
||||||
spaCy's models are **statistical** and every "decision" they make – for example,
|
spaCy's tagger, parser, text categorizer and many other components are powered
|
||||||
|
by **statistical models**. Every "decision" these components make – for example,
|
||||||
which part-of-speech tag to assign, or whether a word is a named entity – is a
|
which part-of-speech tag to assign, or whether a word is a named entity – is a
|
||||||
**prediction**. This prediction is based on the examples the model has seen
|
**prediction** based on the model's current **weight values**. The weight
|
||||||
|
values are estimated based on examples the model has seen
|
||||||
during **training**. To train a model, you first need training data – examples
|
during **training**. To train a model, you first need training data – examples
|
||||||
of text, and the labels you want the model to predict. This could be a
|
of text, and the labels you want the model to predict. This could be a
|
||||||
part-of-speech tag, a named entity or any other information.
|
part-of-speech tag, a named entity or any other information.
|
||||||
|
|
||||||
The model is then shown the unlabelled text and will make a prediction. Because
|
Training is an iterative process in which the model's predictions are compared
|
||||||
we know the correct answer, we can give the model feedback on its prediction in
|
against the reference annotations in order to estimate the **gradient of the
|
||||||
the form of an **error gradient** of the **loss function** that calculates the
|
loss**. The gradient of the loss is then used to calculate the gradient of the
|
||||||
difference between the training example and the expected output. The greater the
|
weights through [backpropagation](https://thinc.ai/backprop101). The gradients
|
||||||
difference, the more significant the gradient and the updates to our model.
|
indicate how the weight values should be changed so that the model's
|
||||||
|
predictions become more similar to the reference labels over time.
|
||||||
|
|
||||||
> - **Training data:** Examples and their annotations.
|
> - **Training data:** Examples and their annotations.
|
||||||
> - **Text:** The input text the model should predict a label for.
|
> - **Text:** The input text the model should predict a label for.
|
||||||
> - **Label:** The label the model should predict.
|
> - **Label:** The label the model should predict.
|
||||||
> - **Gradient:** Gradient of the loss function calculating the difference
|
> - **Gradient:** The direction and rate of change for a numeric value.
|
||||||
> between input and expected output.
|
> Minimising the gradient of the weights should result in predictions that
|
||||||
|
> are closer to the reference labels on the training data.
|
||||||
|
|
||||||
![The training process](../../images/training.svg)
|
![The training process](../../images/training.svg)
|
||||||
|
|
||||||
When training a model, we don't just want it to memorize our examples – we want
|
When training a model, we don't just want it to memorize our examples – we want
|
||||||
it to come up with a theory that can be **generalized across other examples**.
|
it to come up with a theory that can be **generalized across unseen data**.
|
||||||
After all, we don't just want the model to learn that this one instance of
|
After all, we don't just want the model to learn that this one instance of
|
||||||
"Amazon" right here is a company – we want it to learn that "Amazon", in
|
"Amazon" right here is a company – we want it to learn that "Amazon", in
|
||||||
contexts _like this_, is most likely a company. That's why the training data
|
contexts _like this_, is most likely a company. That's why the training data
|
||||||
|
@ -34,5 +38,4 @@ it's learning the right things, you don't only need **training data** – you'll
|
||||||
also need **evaluation data**. If you only test the model with the data it was
|
also need **evaluation data**. If you only test the model with the data it was
|
||||||
trained on, you'll have no idea how well it's generalizing. If you want to train
|
trained on, you'll have no idea how well it's generalizing. If you want to train
|
||||||
a model from scratch, you usually need at least a few hundred examples for both
|
a model from scratch, you usually need at least a few hundred examples for both
|
||||||
training and evaluation. To update an existing model, you can already achieve
|
training and evaluation.
|
||||||
decent results with very few examples – as long as they're representative.
|
|
||||||
|
|
|
@ -909,9 +909,8 @@ If you're using a statistical model, writing to the `nlp.Defaults` or
|
||||||
`English.Defaults` directly won't work, since the regular expressions are read
|
`English.Defaults` directly won't work, since the regular expressions are read
|
||||||
from the model and will be compiled when you load it. If you modify
|
from the model and will be compiled when you load it. If you modify
|
||||||
`nlp.Defaults`, you'll only see the effect if you call
|
`nlp.Defaults`, you'll only see the effect if you call
|
||||||
[`spacy.blank`](/api/top-level#spacy.blank) or `Defaults.create_tokenizer()`. If
|
[`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the tokenizer
|
||||||
you want to modify the tokenizer loaded from a statistical model, you should
|
loaded from a statistical model, you should modify `nlp.tokenizer` directly.
|
||||||
modify `nlp.tokenizer` directly.
|
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
@ -1386,8 +1385,7 @@ import spacy
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
|
||||||
nlp = English() # just the language with no model
|
nlp = English() # just the language with no model
|
||||||
sentencizer = nlp.create_pipe("sentencizer")
|
nlp.add_pipe("sentencizer")
|
||||||
nlp.add_pipe(sentencizer)
|
|
||||||
doc = nlp("This is a sentence. This is another sentence.")
|
doc = nlp("This is a sentence. This is another sentence.")
|
||||||
for sent in doc.sents:
|
for sent in doc.sents:
|
||||||
print(sent.text)
|
print(sent.text)
|
||||||
|
@ -1422,6 +1420,7 @@ take advantage of dependency-based sentence segmentation.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
|
from spacy.language import Language
|
||||||
import spacy
|
import spacy
|
||||||
|
|
||||||
text = "this is a sentence...hello...and another sentence."
|
text = "this is a sentence...hello...and another sentence."
|
||||||
|
@ -1430,13 +1429,14 @@ nlp = spacy.load("en_core_web_sm")
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
print("Before:", [sent.text for sent in doc.sents])
|
print("Before:", [sent.text for sent in doc.sents])
|
||||||
|
|
||||||
|
@Language.component("set_custom_boundaries")
|
||||||
def set_custom_boundaries(doc):
|
def set_custom_boundaries(doc):
|
||||||
for token in doc[:-1]:
|
for token in doc[:-1]:
|
||||||
if token.text == "...":
|
if token.text == "...":
|
||||||
doc[token.i+1].is_sent_start = True
|
doc[token.i + 1].is_sent_start = True
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
nlp.add_pipe(set_custom_boundaries, before="parser")
|
nlp.add_pipe("set_custom_boundaries", before="parser")
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
print("After:", [sent.text for sent in doc.sents])
|
print("After:", [sent.text for sent in doc.sents])
|
||||||
```
|
```
|
||||||
|
|
|
@ -97,32 +97,40 @@ but also your own custom processing functions. A pipeline component can be added
|
||||||
to an already existing `nlp` object, specified when initializing a `Language`
|
to an already existing `nlp` object, specified when initializing a `Language`
|
||||||
class, or defined within a [model package](/usage/saving-loading#models).
|
class, or defined within a [model package](/usage/saving-loading#models).
|
||||||
|
|
||||||
When you load a model, spaCy first consults the model's
|
> #### config.cfg (excerpt)
|
||||||
[`meta.json`](/usage/saving-loading#models). The meta typically includes the
|
|
||||||
model details, the ID of a language class, and an optional list of pipeline
|
|
||||||
components. spaCy then does the following:
|
|
||||||
|
|
||||||
> #### meta.json (excerpt)
|
|
||||||
>
|
>
|
||||||
> ```json
|
> ```ini
|
||||||
> {
|
> [nlp]
|
||||||
> "lang": "en",
|
> lang = "en"
|
||||||
> "name": "core_web_sm",
|
> pipeline = ["tagger", "parser"]
|
||||||
> "description": "Example model for spaCy",
|
>
|
||||||
> "pipeline": ["tagger", "parser", "ner"]
|
> [components]
|
||||||
> }
|
>
|
||||||
|
> [components.tagger]
|
||||||
|
> factory = "tagger"
|
||||||
|
> # settings for the tagger component
|
||||||
|
>
|
||||||
|
> [components.parser]
|
||||||
|
> factory = "parser"
|
||||||
|
> # settings for the parser component
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
When you load a model, spaCy first consults the model's
|
||||||
|
[`meta.json`](/usage/saving-loading#models) and
|
||||||
|
[`config.cfg`](/usage/training#config). The config tells spaCy what language
|
||||||
|
class to use, which components are in the pipeline, and how those components
|
||||||
|
should be created. spaCy will then do the following:
|
||||||
|
|
||||||
1. Load the **language class and data** for the given ID via
|
1. Load the **language class and data** for the given ID via
|
||||||
[`get_lang_class`](/api/top-level#util.get_lang_class) and initialize it. The
|
[`get_lang_class`](/api/top-level#util.get_lang_class) and initialize it. The
|
||||||
`Language` class contains the shared vocabulary, tokenization rules and the
|
`Language` class contains the shared vocabulary, tokenization rules and the
|
||||||
language-specific annotation scheme.
|
language-specific settings.
|
||||||
2. Iterate over the **pipeline names** and create each component using
|
2. Iterate over the **pipeline names** and look up each component name in the
|
||||||
[`create_pipe`](/api/language#create_pipe), which looks them up in
|
`[components]` block. The `factory` tells spaCy which
|
||||||
`Language.factories`.
|
[component factory](#custom-components-factories) to use for adding the
|
||||||
3. Add each pipeline component to the pipeline in order, using
|
component with [`add_pipe`](/api/language#add_pipe). The settings are
|
||||||
[`add_pipe`](/api/language#add_pipe).
|
passed into the factory.
|
||||||
4. Make the **model data** available to the `Language` class by calling
|
3. Make the **model data** available to the `Language` class by calling
|
||||||
[`from_disk`](/api/language#from_disk) with the path to the model data
|
[`from_disk`](/api/language#from_disk) with the path to the model data
|
||||||
directory.
|
directory.
|
||||||
|
|
||||||
|
@ -132,17 +140,25 @@ So when you call this...
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
```
|
```
|
||||||
|
|
||||||
... the model's `meta.json` tells spaCy to use the language `"en"` and the
|
... the model's `config.cfg` tells spaCy to use the language `"en"` and the
|
||||||
pipeline `["tagger", "parser", "ner"]`. spaCy will then initialize
|
pipeline `["tagger", "parser", "ner"]`. spaCy will then initialize
|
||||||
`spacy.lang.en.English`, and create each pipeline component and add it to the
|
`spacy.lang.en.English`, and create each pipeline component and add it to the
|
||||||
processing pipeline. It'll then load in the model's data from its data directory
|
processing pipeline. It'll then load in the model's data from its data directory
|
||||||
and return the modified `Language` class for you to use as the `nlp` object.
|
and return the modified `Language` class for you to use as the `nlp` object.
|
||||||
|
|
||||||
|
<Infobox title="Changed in v3.0" variant="warning">
|
||||||
|
|
||||||
|
spaCy v3.0 introduces a `config.cfg`, which includes more detailed settings for
|
||||||
|
the model pipeline, its components and the
|
||||||
|
[training process](/usage/training#config). You can export the config of your
|
||||||
|
current `nlp` object by calling [`nlp.config.to_disk`](/api/language#config).
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
Fundamentally, a [spaCy model](/models) consists of three components: **the
|
Fundamentally, a [spaCy model](/models) consists of three components: **the
|
||||||
weights**, i.e. binary data loaded in from a directory, a **pipeline** of
|
weights**, i.e. binary data loaded in from a directory, a **pipeline** of
|
||||||
functions called in order, and **language data** like the tokenization rules and
|
functions called in order, and **language data** like the tokenization rules and
|
||||||
annotation scheme. All of this is specific to each model, and defined in the
|
language-specific settings. For example, a Spanish NER model requires different
|
||||||
model's `meta.json` – for example, a Spanish NER model requires different
|
|
||||||
weights, language data and pipeline components than an English parsing and
|
weights, language data and pipeline components than an English parsing and
|
||||||
tagging model. This is also why the pipeline state is always held by the
|
tagging model. This is also why the pipeline state is always held by the
|
||||||
`Language` class. [`spacy.load`](/api/top-level#spacy.load) puts this all
|
`Language` class. [`spacy.load`](/api/top-level#spacy.load) puts this all
|
||||||
|
@ -158,9 +174,8 @@ data_path = "path/to/en_core_web_sm/en_core_web_sm-2.0.0"
|
||||||
cls = spacy.util.get_lang_class(lang) # 1. Get Language instance, e.g. English()
|
cls = spacy.util.get_lang_class(lang) # 1. Get Language instance, e.g. English()
|
||||||
nlp = cls() # 2. Initialize it
|
nlp = cls() # 2. Initialize it
|
||||||
for name in pipeline:
|
for name in pipeline:
|
||||||
component = nlp.create_pipe(name) # 3. Create the pipeline components
|
nlp.add_pipe(name) # 3. Add the component to the pipeline
|
||||||
nlp.add_pipe(component) # 4. Add the component to the pipeline
|
nlp.from_disk(data_path) # 4. Load in the binary data
|
||||||
nlp.from_disk(model_data_path) # 5. Load in the binary data
|
|
||||||
```
|
```
|
||||||
|
|
||||||
When you call `nlp` on a text, spaCy will **tokenize** it and then **call each
|
When you call `nlp` on a text, spaCy will **tokenize** it and then **call each
|
||||||
|
@ -190,36 +205,34 @@ print(nlp.pipe_names)
|
||||||
|
|
||||||
### Built-in pipeline components {#built-in}
|
### Built-in pipeline components {#built-in}
|
||||||
|
|
||||||
spaCy ships with several built-in pipeline components that are also available in
|
spaCy ships with several built-in pipeline components that are registered with
|
||||||
the `Language.factories`. This means that you can initialize them by calling
|
string names. This means that you can initialize them by calling
|
||||||
[`nlp.create_pipe`](/api/language#create_pipe) with their string names and
|
[`nlp.add_pipe`](/api/language#add_pipe) with their names and spaCy will know
|
||||||
require them in the pipeline settings in your model's `meta.json`.
|
how to create them. See the [API documentation](/api) for a full list of
|
||||||
|
available pipeline components and component functions.
|
||||||
|
|
||||||
> #### Usage
|
> #### Usage
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Option 1: Import and initialize
|
> nlp = spacy.blank("en")
|
||||||
> from spacy.pipeline import EntityRuler
|
> nlp.add_pipe("sentencizer")
|
||||||
> ruler = EntityRuler(nlp)
|
> # add_pipe returns the added component
|
||||||
> nlp.add_pipe(ruler)
|
> ruler = nlp.add_pipe("entity_ruler")
|
||||||
>
|
|
||||||
> # Option 2: Using nlp.create_pipe
|
|
||||||
> sentencizer = nlp.create_pipe("sentencizer")
|
|
||||||
> nlp.add_pipe(sentencizer)
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| String name | Component | Description |
|
| String name | Component | Description |
|
||||||
| ------------------- | ---------------------------------------------------------------- | --------------------------------------------------------------------------------------------- |
|
| --------------- | ------------------------------------------- | ----------------------------------------------------------------------------------------- |
|
||||||
| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. |
|
| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. |
|
||||||
| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. |
|
| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. |
|
||||||
| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. |
|
| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. |
|
||||||
| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
|
| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
|
||||||
| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. |
|
| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. |
|
||||||
| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules. |
|
| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules. |
|
||||||
| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
|
| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
|
||||||
| `merge_noun_chunks` | [`merge_noun_chunks`](/api/pipeline-functions#merge_noun_chunks) | Merge all noun chunks into a single token. Should be added after the tagger and parser. |
|
|
||||||
| `merge_entities` | [`merge_entities`](/api/pipeline-functions#merge_entities) | Merge all entities into a single token. Should be added after the entity recognizer. |
|
<!-- TODO: update with more components -->
|
||||||
| `merge_subtokens` | [`merge_subtokens`](/api/pipeline-functions#merge_subtokens) | Merge subtokens predicted by the parser into single tokens. Should be added after the parser. |
|
|
||||||
|
<!-- TODO: explain default config and factories -->
|
||||||
|
|
||||||
### Disabling and modifying pipeline components {#disabling}
|
### Disabling and modifying pipeline components {#disabling}
|
||||||
|
|
||||||
|
@ -233,7 +246,6 @@ list:
|
||||||
```python
|
```python
|
||||||
### Disable loading
|
### Disable loading
|
||||||
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])
|
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])
|
||||||
nlp = English().from_disk("/model", disable=["ner"])
|
|
||||||
```
|
```
|
||||||
|
|
||||||
In some cases, you do want to load all pipeline components and their weights,
|
In some cases, you do want to load all pipeline components and their weights,
|
||||||
|
@ -297,15 +309,18 @@ nlp.replace_pipe("tagger", my_custom_tagger)
|
||||||
|
|
||||||
## Creating custom pipeline components {#custom-components}
|
## Creating custom pipeline components {#custom-components}
|
||||||
|
|
||||||
A component receives a `Doc` object and can modify it – for example, by using
|
A pipeline component is a function that receives a `Doc` object, modifies it and
|
||||||
the current weights to make a prediction and set some annotation on the
|
returns it – for example, by using the current weights to make a prediction
|
||||||
document. By adding a component to the pipeline, you'll get access to the `Doc`
|
and set some annotation on the document. By adding a component to the pipeline,
|
||||||
at any point **during processing** – instead of only being able to modify it
|
you'll get access to the `Doc` at any point **during processing** – instead of
|
||||||
afterwards.
|
only being able to modify it afterwards.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
|
> from spacy.language import Language
|
||||||
|
>
|
||||||
|
> @Language.component("my_component")
|
||||||
> def my_component(doc):
|
> def my_component(doc):
|
||||||
> # do something to the doc here
|
> # do something to the doc here
|
||||||
> return doc
|
> return doc
|
||||||
|
@ -316,6 +331,12 @@ afterwards.
|
||||||
| `doc` | `Doc` | The `Doc` object processed by the previous component. |
|
| `doc` | `Doc` | The `Doc` object processed by the previous component. |
|
||||||
| **RETURNS** | `Doc` | The `Doc` object processed by this pipeline component. |
|
| **RETURNS** | `Doc` | The `Doc` object processed by this pipeline component. |
|
||||||
|
|
||||||
|
The [`@Language.component`](/api/language#component) decorator lets you turn a
|
||||||
|
simple function into a pipeline component. It takes at least one argument, the
|
||||||
|
**name** of the component factory. You can use this name to add an instance of
|
||||||
|
your component to the pipeline. It can also be listed in your model config, so
|
||||||
|
you can save, load and train models using your component.
|
||||||
|
|
||||||
Custom components can be added to the pipeline using the
|
Custom components can be added to the pipeline using the
|
||||||
[`add_pipe`](/api/language#add_pipe) method. Optionally, you can either specify
|
[`add_pipe`](/api/language#add_pipe) method. Optionally, you can either specify
|
||||||
a component to add it **before or after**, tell spaCy to add it **first or
|
a component to add it **before or after**, tell spaCy to add it **first or
|
||||||
|
@ -325,23 +346,43 @@ last** in the pipeline, or define a **custom name**. If no name is set and no
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> nlp.add_pipe(my_component)
|
> nlp.add_pipe("my_component")
|
||||||
> nlp.add_pipe(my_component, first=True)
|
> nlp.add_pipe("my_component", first=True)
|
||||||
> nlp.add_pipe(my_component, before="parser")
|
> nlp.add_pipe("my_component", before="parser")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| -------- | ---- | ------------------------------------------------------------------------ |
|
| -------- | --------- | ------------------------------------------------------------------------ |
|
||||||
| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
|
| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
|
||||||
| `first` | bool | If set to `True`, component is added **first** in the pipeline. |
|
| `first` | bool | If set to `True`, component is added **first** in the pipeline. |
|
||||||
| `before` | str | String name of component to add the new component **before**. |
|
| `before` | str / int | String name or index to add the new component **before**. |
|
||||||
| `after` | str | String name of component to add the new component **after**. |
|
| `after` | str / int | String name or index to add the new component **after**. |
|
||||||
|
|
||||||
### Example: A simple pipeline component {#custom-components-simple}
|
<Infobox title="Changed in v3.0" variant="warning">
|
||||||
|
|
||||||
|
As of v3.0, components need to be registered using the
|
||||||
|
[`@Language.component`](/api/language#component) or
|
||||||
|
[`@Language.factory`](/api/language#factory) decorator so spaCy knows that a
|
||||||
|
function is a component. [`nlp.add_pipe`](/api/language#add_pipe) now takes the
|
||||||
|
**string name** of the component factory instead of the component function. This
|
||||||
|
doesn't only save you lines of code, it also allows spaCy to validate and track
|
||||||
|
your custom components, and make sure they can be saved and loaded.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- ruler = nlp.create_pipe("entity_ruler")
|
||||||
|
- nlp.add_pipe(ruler)
|
||||||
|
+ ruler = nlp.add_pipe("entity_ruler")
|
||||||
|
```
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
### Examples: Simple stateless pipeline components {#custom-components-simple}
|
||||||
|
|
||||||
The following component receives the `Doc` in the pipeline and prints some
|
The following component receives the `Doc` in the pipeline and prints some
|
||||||
information about it: the number of tokens, the part-of-speech tags of the
|
information about it: the number of tokens, the part-of-speech tags of the
|
||||||
tokens and a conditional message based on the document length.
|
tokens and a conditional message based on the document length. The
|
||||||
|
[`@Language.component`](/api/language#component) decorator lets you register the
|
||||||
|
component under the name `"info_component"`.
|
||||||
|
|
||||||
> #### ✏️ Things to try
|
> #### ✏️ Things to try
|
||||||
>
|
>
|
||||||
|
@ -352,11 +393,16 @@ tokens and a conditional message based on the document length.
|
||||||
> this change reflected in `nlp.pipe_names`.
|
> this change reflected in `nlp.pipe_names`.
|
||||||
> 3. Print `nlp.pipeline`. You'll see a list of tuples describing the component
|
> 3. Print `nlp.pipeline`. You'll see a list of tuples describing the component
|
||||||
> name and the function that's called on the `Doc` object in the pipeline.
|
> name and the function that's called on the `Doc` object in the pipeline.
|
||||||
|
> 4. Change the first argument to `@Language.component`, the name, to something
|
||||||
|
> else. spaCy should now complain that it doesn't know a component of the
|
||||||
|
> name `"info_component"`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.language import Language
|
||||||
|
|
||||||
|
@Language.component("info_component")
|
||||||
def my_component(doc):
|
def my_component(doc):
|
||||||
print(f"After tokenization, this doc has {len(doc)} tokens.")
|
print(f"After tokenization, this doc has {len(doc)} tokens.")
|
||||||
print("The part-of-speech tags are:", [token.pos_ for token in doc])
|
print("The part-of-speech tags are:", [token.pos_ for token in doc])
|
||||||
|
@ -365,76 +411,16 @@ def my_component(doc):
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
nlp.add_pipe(my_component, name="print_info", last=True)
|
nlp.add_pipe("info_component", name="print_info", last=True)
|
||||||
print(nlp.pipe_names) # ['tagger', 'parser', 'ner', 'print_info']
|
print(nlp.pipe_names) # ['tagger', 'parser', 'ner', 'print_info']
|
||||||
doc = nlp("This is a sentence.")
|
doc = nlp("This is a sentence.")
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Of course, you can also wrap your component as a class to allow initializing it
|
Here's another example of a pipeline component that implements custom logic to
|
||||||
with custom settings and hold state within the component. This is useful for
|
improve the sentence boundaries set by the dependency parser. The custom logic
|
||||||
**stateful components**, especially ones which **depend on shared data**. In the
|
should therefore be applied **after** tokenization, but _before_ the dependency
|
||||||
following example, the custom component `EntityMatcher` can be initialized with
|
parsing – this way, the parser can also take advantage of the sentence
|
||||||
`nlp` object, a terminology list and an entity label. Using the
|
boundaries.
|
||||||
[`PhraseMatcher`](/api/phrasematcher), it then matches the terms in the `Doc`
|
|
||||||
and adds them to the existing entities.
|
|
||||||
|
|
||||||
<Infobox title="Important note" variant="warning">
|
|
||||||
|
|
||||||
As of v2.1.0, spaCy ships with the [`EntityRuler`](/api/entityruler), a pipeline
|
|
||||||
component for easy, rule-based named entity recognition. Its implementation is
|
|
||||||
similar to the `EntityMatcher` code shown below, but it includes some additional
|
|
||||||
features like support for phrase patterns and token patterns, handling overlaps
|
|
||||||
with existing entities and pattern export as JSONL.
|
|
||||||
|
|
||||||
We'll still keep the pipeline component example below, as it works well to
|
|
||||||
illustrate complex components. But if you're planning on using this type of
|
|
||||||
component in your application, you might find the `EntityRuler` more convenient.
|
|
||||||
[See here](/usage/rule-based-matching#entityruler) for more details and
|
|
||||||
examples.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
```python
|
|
||||||
### {executable="true"}
|
|
||||||
import spacy
|
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
from spacy.tokens import Span
|
|
||||||
|
|
||||||
class EntityMatcher:
|
|
||||||
name = "entity_matcher"
|
|
||||||
|
|
||||||
def __init__(self, nlp, terms, label):
|
|
||||||
patterns = [nlp.make_doc(text) for text in terms]
|
|
||||||
self.matcher = PhraseMatcher(nlp.vocab)
|
|
||||||
self.matcher.add(label, patterns)
|
|
||||||
|
|
||||||
def __call__(self, doc):
|
|
||||||
matches = self.matcher(doc)
|
|
||||||
for match_id, start, end in matches:
|
|
||||||
span = Span(doc, start, end, label=match_id)
|
|
||||||
doc.ents = list(doc.ents) + [span]
|
|
||||||
return doc
|
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
|
||||||
terms = ("cat", "dog", "tree kangaroo", "giant sea spider")
|
|
||||||
entity_matcher = EntityMatcher(nlp, terms, "ANIMAL")
|
|
||||||
|
|
||||||
nlp.add_pipe(entity_matcher, after="ner")
|
|
||||||
|
|
||||||
print(nlp.pipe_names) # The components in the pipeline
|
|
||||||
|
|
||||||
doc = nlp("This is a text about Barack Obama and a tree kangaroo")
|
|
||||||
print([(ent.text, ent.label_) for ent in doc.ents])
|
|
||||||
```
|
|
||||||
|
|
||||||
### Example: Custom sentence segmentation logic {#component-example1}
|
|
||||||
|
|
||||||
Let's say you want to implement custom logic to improve spaCy's sentence
|
|
||||||
boundary detection. Currently, sentence segmentation is based on the dependency
|
|
||||||
parse, which doesn't always produce ideal results. The custom logic should
|
|
||||||
therefore be applied **after** tokenization, but _before_ the dependency parsing
|
|
||||||
– this way, the parser can also take advantage of the sentence boundaries.
|
|
||||||
|
|
||||||
> #### ✏️ Things to try
|
> #### ✏️ Things to try
|
||||||
>
|
>
|
||||||
|
@ -448,90 +434,318 @@ therefore be applied **after** tokenization, but _before_ the dependency parsing
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.language import Language
|
||||||
|
|
||||||
|
@Language.component("custom_sentencizer")
|
||||||
def custom_sentencizer(doc):
|
def custom_sentencizer(doc):
|
||||||
for i, token in enumerate(doc[:-2]):
|
for i, token in enumerate(doc[:-2]):
|
||||||
# Define sentence start if pipe + titlecase token
|
# Define sentence start if pipe + titlecase token
|
||||||
if token.text == "|" and doc[i+1].is_title:
|
if token.text == "|" and doc[i + 1].is_title:
|
||||||
doc[i+1].is_sent_start = True
|
doc[i + 1].is_sent_start = True
|
||||||
else:
|
else:
|
||||||
# Explicitly set sentence start to False otherwise, to tell
|
# Explicitly set sentence start to False otherwise, to tell
|
||||||
# the parser to leave those tokens alone
|
# the parser to leave those tokens alone
|
||||||
doc[i+1].is_sent_start = False
|
doc[i + 1].is_sent_start = False
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
nlp.add_pipe(custom_sentencizer, before="parser") # Insert before the parser
|
nlp.add_pipe("custom_sentencizer", before="parser") # Insert before the parser
|
||||||
doc = nlp("This is. A sentence. | This is. Another sentence.")
|
doc = nlp("This is. A sentence. | This is. Another sentence.")
|
||||||
for sent in doc.sents:
|
for sent in doc.sents:
|
||||||
print(sent.text)
|
print(sent.text)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Example: Pipeline component for entity matching and tagging with custom attributes {#component-example2}
|
### Component factories and stateful components {#custom-components-factories}
|
||||||
|
|
||||||
This example shows how to create a spaCy extension that takes a terminology list
|
Component factories are callables that take settings and return a **pipeline
|
||||||
(in this case, single- and multi-word company names), matches the occurrences in
|
component function**. This is useful if your component is stateful and if you
|
||||||
a document, labels them as `ORG` entities, merges the tokens and sets custom
|
need to customize its creation, or if you need access to the current `nlp`
|
||||||
`is_tech_org` and `has_tech_org` attributes. For efficient matching, the example
|
object or the shared vocab. Component factories can be registered using the
|
||||||
uses the [`PhraseMatcher`](/api/phrasematcher) which accepts `Doc` objects as
|
[`@Language.factory`](/api/language#factory) decorator and they need at least
|
||||||
match patterns and works well for large terminology lists. It also ensures your
|
**two named arguments** that are filled in automatically when the component is
|
||||||
patterns will always match, even when you customize spaCy's tokenization rules.
|
added to the pipeline:
|
||||||
When you call `nlp` on a text, the custom pipeline component is applied to the
|
|
||||||
`Doc`.
|
> #### Example
|
||||||
|
>
|
||||||
```python
|
> ```python
|
||||||
https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_component_entities.py
|
> from spacy.language import Language
|
||||||
```
|
>
|
||||||
|
> @Language.factory("my_component")
|
||||||
Wrapping this functionality in a pipeline component allows you to reuse the
|
> def my_component(nlp, name):
|
||||||
module with different settings, and have all pre-processing taken care of when
|
> return MyComponent()
|
||||||
you call `nlp` on your text and receive a `Doc` object.
|
> ```
|
||||||
|
|
||||||
### Adding factories {#custom-components-factories}
|
| Argument | Type | Description |
|
||||||
|
| -------- | --------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||||
When spaCy loads a model via its `meta.json`, it will iterate over the
|
| `nlp` | [`Language`](/api/language) | The current `nlp` object. Can be used to access the shared vocab. |
|
||||||
`"pipeline"` setting, look up every component name in the internal factories and
|
| `name` | str | The **instance name** of the component in the pipeline. This lets you identify different instances of the same component. |
|
||||||
call [`nlp.create_pipe`](/api/language#create_pipe) to initialize the individual
|
|
||||||
components, like the tagger, parser or entity recognizer. If your model uses
|
All other settings can be passed in by the user via the `config` argument on
|
||||||
custom components, this won't work – so you'll have to tell spaCy **where to
|
[`nlp.add_pipe`](/api/language#add_pipe). The
|
||||||
find your component**. You can do this by writing to the `Language.factories`:
|
[`@Language.factory`](/api/language#factory) decorator also lets you define a
|
||||||
|
`default_config` that's used as a fallback.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
### With config {highlight="4,9"}
|
||||||
|
import spacy
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
Language.factories["entity_matcher"] = lambda nlp, **cfg: EntityMatcher(nlp, **cfg)
|
|
||||||
|
@Language.factory("my_component", default_config={"some_setting": True})
|
||||||
|
def my_component(nlp, name, some_setting: bool):
|
||||||
|
return MyComponent(some_setting=some_setting)
|
||||||
|
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
nlp.add_pipe("my_component", config={"some_setting": False})
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also ship the above code and your custom component in your packaged
|
<Accordion title="How is @Language.factory different from @Language.component?" id="factories-decorator-component">
|
||||||
model's `__init__.py`, so it's executed when you load your model. The `**cfg`
|
|
||||||
config parameters are passed all the way down from
|
The [`@Language.component`](/api/language#component) decorator is essentially a
|
||||||
[`spacy.load`](/api/top-level#spacy.load), so you can load the model and its
|
**shortcut** for stateless pipeline components that don't need any settings. This
|
||||||
components with custom settings:
|
means you don't have to always write a function that returns your function if
|
||||||
|
there's no state to be passed through – spaCy can just take care of this for
|
||||||
|
you. The following two code examples are equivalent:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
nlp = spacy.load("your_custom_model", terms=["tree kangaroo"], label="ANIMAL")
|
# Stateless component with @Language.factory
|
||||||
|
@Language.factory("my_component")
|
||||||
|
def create_my_component():
|
||||||
|
def my_component(doc):
|
||||||
|
# Do something to the doc
|
||||||
|
return doc
|
||||||
|
|
||||||
|
return my_component
|
||||||
|
|
||||||
|
# Stateless component with @Language.component
|
||||||
|
@Language.component("my_component")
|
||||||
|
def my_component(doc):
|
||||||
|
# Do something to the doc
|
||||||
|
return doc
|
||||||
```
|
```
|
||||||
|
|
||||||
<Infobox title="Important note" variant="warning">
|
</Accordion>
|
||||||
|
|
||||||
When you load a model via its package name, like `en_core_web_sm`, spaCy will
|
<Accordion title="Can I add the @Language.factory decorator to a class?" id="factories-class-decorator" spaced>
|
||||||
import the package and then call its `load()` method. This means that custom
|
|
||||||
code in the model's `__init__.py` will be executed, too. This is **not the
|
Yes, the [`@Language.factory`](/api/language#factory) decorator can be added to
|
||||||
case** if you're loading a model from a path containing the model data. Here,
|
a function or a class. If it's added to a class, it expects the `__init__`
|
||||||
spaCy will only read in the `meta.json`. If you want to use custom factories
|
method to take the arguments `nlp` and `name`, and will populate all other
|
||||||
with a model loaded from a path, you need to add them to `Language.factories`
|
arguments from the config. That said, it's often cleaner and more intuitive to
|
||||||
_before_ you load the model.
|
make your factory a separate function. That's also how spaCy does it internally.
|
||||||
|
|
||||||
|
</Accordion>
|
||||||
|
|
||||||
|
### Example: Stateful component with settings
|
||||||
|
|
||||||
|
This example shows a **stateful** pipeline component for handling acronyms:
|
||||||
|
based on a dictionary, it will detect acronyms and their expanded forms in both
|
||||||
|
directions and add them to a list as the custom `doc._.acronyms`
|
||||||
|
[extension attribute](#custom-components-attributes). Under the hood, it uses
|
||||||
|
the [`PhraseMatcher`](/api/phrasematcher) to find instances of the phrases.
|
||||||
|
|
||||||
|
The factory function takes three arguments: the shared `nlp` object and
|
||||||
|
component instance `name`, which are passed in automatically by spaCy, and a
|
||||||
|
`case_sensitive` config setting that makes the matching and acronym detection
|
||||||
|
case-sensitive.
|
||||||
|
|
||||||
|
> #### ✏️ Things to try
|
||||||
|
>
|
||||||
|
> 1. Change the `config` passed to `nlp.add_pipe` and set `"case_sensitive"` to
|
||||||
|
> `True`. You should see that the expanded acronym for "LOL" isn't detected
|
||||||
|
> anymore.
|
||||||
|
> 2. Add some more terms to the `DICTIONARY` and update the processed text so
|
||||||
|
> they're detected.
|
||||||
|
> 3. Add a `name` argument to `nlp.add_pipe` to change the component name. Print
|
||||||
|
> `nlp.pipe_names` to see the change reflected in the pipeline.
|
||||||
|
> 4. Print the config of the current `nlp` object with
|
||||||
|
> `print(nlp.config.to_str())` and inspect the `[components]` block. You
|
||||||
|
> should see an entry for the acronyms component, referencing the factory
|
||||||
|
> `acronyms` and the config settings.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### {executable="true"}
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
from spacy.matcher import PhraseMatcher
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
DICTIONARY = {"lol": "laughing out loud", "brb": "be right back"}
|
||||||
|
DICTIONARY.update({value: key for key, value in DICTIONARY.items()})
|
||||||
|
|
||||||
|
@Language.factory("acronyms", default_config={"case_sensitive": False})
|
||||||
|
def create_acronym_component(nlp: Language, name: str, case_sensitive: bool):
|
||||||
|
return AcronymComponent(nlp, case_sensitive)
|
||||||
|
|
||||||
|
class AcronymComponent:
|
||||||
|
def __init__(self, nlp: Language, case_sensitive: bool):
|
||||||
|
# Create the matcher and match on Token.lower if case-insensitive
|
||||||
|
matcher_attr = "TEXT" if case_sensitive else "LOWER"
|
||||||
|
self.matcher = PhraseMatcher(nlp.vocab, attr=matcher_attr)
|
||||||
|
self.matcher.add("ACRONYMS", [nlp.make_doc(term) for term in DICTIONARY])
|
||||||
|
self.case_sensitive = case_sensitive
|
||||||
|
# Register custom extension on the Doc
|
||||||
|
if not Doc.has_extension("acronyms"):
|
||||||
|
Doc.set_extension("acronyms", default=[])
|
||||||
|
|
||||||
|
def __call__(self, doc: Doc) -> Doc:
|
||||||
|
# Add the matched spans when doc is processed
|
||||||
|
for _, start, end in self.matcher(doc):
|
||||||
|
span = doc[start:end]
|
||||||
|
acronym = DICTIONARY.get(span.text if self.case_sensitive else span.text.lower())
|
||||||
|
doc._.acronyms.append((span, acronym))
|
||||||
|
return doc
|
||||||
|
|
||||||
|
# Add the component to the pipeline and configure it
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
nlp.add_pipe("acronyms", config={"case_sensitive": False})
|
||||||
|
|
||||||
|
# Process a doc and see the results
|
||||||
|
doc = nlp("LOL, be right back")
|
||||||
|
print(doc._.acronyms)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Python type hints and pydantic validation {#type-hints new="3"}
|
||||||
|
|
||||||
|
spaCy's configs are powered by our machine learning library Thinc's
|
||||||
|
[configuration system](https://thinc.ai/docs/usage-config), which supports
|
||||||
|
[type hints](https://docs.python.org/3/library/typing.html) and even
|
||||||
|
[advanced type annotations](https://thinc.ai/docs/usage-config#advanced-types)
|
||||||
|
using [`pydantic`](https://github.com/samuelcolvin/pydantic). If your component
|
||||||
|
factory provides type hints, the values that are passed in will be **checked
|
||||||
|
against the expected types**. For example, if a setting is annotated as `int` and the value can't be cast to an integer, spaCy
|
||||||
|
will raise an error. `pydantic` also provides strict types like `StrictInt`,
|
||||||
|
which will force the value to be an integer and raise an error if it's not – for
|
||||||
|
instance, if your config defines a float.
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
If you're not using
|
||||||
|
[strict types](https://pydantic-docs.helpmanual.io/usage/types/#strict-types),
|
||||||
|
values that can be **cast to** the given type will still be accepted. For
|
||||||
|
example, `1` can be cast to a `float` or a `bool` type, but not to a
|
||||||
|
`List[str]`. However, if the type is
|
||||||
|
[`StrictFloat`](https://pydantic-docs.helpmanual.io/usage/types/#strict-types),
|
||||||
|
only a float will be accepted.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
The following example shows a custom pipeline component for debugging. It can be
|
||||||
|
added anywhere in the pipeline and logs information about the `nlp` object and
|
||||||
|
the `Doc` that passes through. The `log_level` config setting lets the user
|
||||||
|
customize what log statements are shown – for instance, `"INFO"` will show info
|
||||||
|
logs and more critical logging statements, whereas `"DEBUG"` will show
|
||||||
|
everything. The value is annotated as a `StrictStr`, so it will only accept a
|
||||||
|
string value.
|
||||||
|
|
||||||
|
> #### ✏️ Things to try
|
||||||
|
>
|
||||||
|
> 1. Change the `config` passed to `nlp.add_pipe` to use the log level `"INFO"`.
|
||||||
|
> You should see that only the statement logged with `logger.info` is shown.
|
||||||
|
> 2. Change the `config` passed to `nlp.add_pipe` so that it contains unexpected
|
||||||
|
> values – for example, a boolean instead of a string: `"log_level": False`.
|
||||||
|
> You should see a validation error.
|
||||||
|
> 3. Check out the docs on `pydantic`'s
|
||||||
|
> [constrained types](https://pydantic-docs.helpmanual.io/usage/types/#constrained-types)
|
||||||
|
> and write a type hint for `log_level` that only accepts the exact string
|
||||||
|
> values `"DEBUG"`, `"INFO"` or `"CRITICAL"`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### {executable="true"}
|
||||||
|
import spacy
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
from pydantic import StrictStr
|
||||||
|
import logging
|
||||||
|
|
||||||
|
@Language.factory("debug", default_config={"log_level": "DEBUG"})
|
||||||
|
class DebugComponent:
|
||||||
|
def __init__(self, nlp: Language, name: str, log_level: StrictStr):
|
||||||
|
self.logger = logging.getLogger(f"spacy.{name}")
|
||||||
|
self.logger.setLevel(log_level)
|
||||||
|
self.logger.info(f"Pipeline: {nlp.pipe_names}")
|
||||||
|
|
||||||
|
def __call__(self, doc: Doc) -> Doc:
|
||||||
|
self.logger.debug(f"Doc: {len(doc)} tokens, is_tagged: {doc.is_tagged}")
|
||||||
|
return doc
|
||||||
|
|
||||||
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
nlp.add_pipe("debug", config={"log_level": "DEBUG"})
|
||||||
|
doc = nlp("This is a text...")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Language-specific factories {#factories-language new="3"}
|
||||||
|
|
||||||
|
There are many use cases where you might want your pipeline components to be
|
||||||
|
language-specific. Sometimes this requires an entirely different implementation per
|
||||||
|
language, sometimes the only difference is in the settings or data. spaCy allows
|
||||||
|
you to register factories of the **same name** on both the `Language` base
|
||||||
|
class, as well as its **subclasses** like `English` or `German`. Factories are
|
||||||
|
resolved starting with the specific subclass. If the subclass doesn't define a
|
||||||
|
component of that name, spaCy will check the `Language` base class.
|
||||||
|
|
||||||
|
Here's an example of a pipeline component that overwrites the normalized form of
|
||||||
|
a token, the `Token.norm_`, with an entry from a language-specific lookup table.
|
||||||
|
It's registered twice under the name `"token_normalizer"` – once using
|
||||||
|
`@English.factory` and once using `@German.factory`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
### {executable="true"}
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.lang.de import German
|
||||||
|
|
||||||
|
class TokenNormalizer:
|
||||||
|
def __init__(self, norm_table):
|
||||||
|
self.norm_table = norm_table
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
for token in doc:
|
||||||
|
# Overwrite the token.norm_ if there's an entry in the data
|
||||||
|
token.norm_ = self.norm_table.get(token.text, token.norm_)
|
||||||
|
return doc
|
||||||
|
|
||||||
|
@English.factory("token_normalizer")
|
||||||
|
def create_en_normalizer(nlp, name):
|
||||||
|
return TokenNormalizer({"realise": "realize", "colour": "color"})
|
||||||
|
|
||||||
|
@German.factory("token_normalizer")
|
||||||
|
def create_de_normalizer(nlp, name):
|
||||||
|
return TokenNormalizer({"daß": "dass", "wußte": "wusste"})
|
||||||
|
|
||||||
|
nlp_en = English()
|
||||||
|
nlp_en.add_pipe("token_normalizer") # uses the English factory
|
||||||
|
print([token.norm_ for token in nlp_en("realise colour daß wußte")])
|
||||||
|
|
||||||
|
nlp_de = German()
|
||||||
|
nlp_de.add_pipe("token_normalizer") # uses the German factory
|
||||||
|
print([token.norm_ for token in nlp_de("realise colour daß wußte")])
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Implementation details">
|
||||||
|
|
||||||
|
Under the hood, language-specific factories are added to the
|
||||||
|
[`factories` registry](/api/top-level#registry) prefixed with the language code,
|
||||||
|
e.g. `"en.token_normalizer"`. When resolving the factory in
|
||||||
|
[`nlp.add_pipe`](/api/language#add_pipe), spaCy first checks for a
|
||||||
|
language-specific version of the factory using `nlp.lang` and if none is
|
||||||
|
available, falls back to looking up the regular factory name.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
<!-- TODO:
|
||||||
|
|
||||||
|
### Trainable components {#trainable new="3"}
|
||||||
|
|
||||||
|
-->
|
||||||
|
|
||||||
## Extension attributes {#custom-components-attributes new="2"}
|
## Extension attributes {#custom-components-attributes new="2"}
|
||||||
|
|
||||||
As of v2.0, spaCy allows you to set any custom attributes and methods on the
|
spaCy allows you to set any custom attributes and methods on the `Doc`, `Span`
|
||||||
`Doc`, `Span` and `Token`, which become available as `Doc._`, `Span._` and
|
and `Token`, which become available as `Doc._`, `Span._` and `Token._` – for
|
||||||
`Token._` – for example, `Token._.my_attr`. This lets you store additional
|
example, `Token._.my_attr`. This lets you store additional information relevant
|
||||||
information relevant to your application, add new features and functionality to
|
to your application, add new features and functionality to spaCy, and implement
|
||||||
spaCy, and implement your own models trained with other machine learning
|
your own models trained with other machine learning libraries. It also lets you
|
||||||
libraries. It also lets you take advantage of spaCy's data structures and the
|
take advantage of spaCy's data structures and the `Doc` object as the "single
|
||||||
`Doc` object as the "single source of truth".
|
source of truth".
|
||||||
|
|
||||||
<Accordion title="Why ._ and not just a top-level attribute?" id="why-dot-underscore">
|
<Accordion title="Why ._ and not just a top-level attribute?" id="why-dot-underscore">
|
||||||
|
|
||||||
|
@ -641,7 +855,73 @@ attributes on the `Doc`, `Span` and `Token` – for example, the capital,
|
||||||
latitude/longitude coordinates and even the country flag.
|
latitude/longitude coordinates and even the country flag.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_component_countries_api.py
|
### {executable="true"}
|
||||||
|
import requests
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.matcher import PhraseMatcher
|
||||||
|
from spacy.tokens import Doc, Span, Token
|
||||||
|
|
||||||
|
@Language.factory("rest_countries")
|
||||||
|
class RESTCountriesComponent:
|
||||||
|
def __init__(self, nlp, name, label="GPE"):
|
||||||
|
r = requests.get("https://restcountries.eu/rest/v2/all")
|
||||||
|
r.raise_for_status() # make sure requests raises an error if it fails
|
||||||
|
countries = r.json()
|
||||||
|
# Convert API response to dict keyed by country name for easy lookup
|
||||||
|
self.countries = {c["name"]: c for c in countries}
|
||||||
|
self.label = label
|
||||||
|
# Set up the PhraseMatcher with Doc patterns for each country name
|
||||||
|
self.matcher = PhraseMatcher(nlp.vocab)
|
||||||
|
self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
|
||||||
|
# Register attribute on the Token. We'll be overwriting this based on
|
||||||
|
# the matches, so we're only setting a default value, not a getter.
|
||||||
|
Token.set_extension("is_country", default=False)
|
||||||
|
Token.set_extension("country_capital", default=False)
|
||||||
|
Token.set_extension("country_latlng", default=False)
|
||||||
|
Token.set_extension("country_flag", default=False)
|
||||||
|
# Register attributes on Doc and Span via a getter that checks if one of
|
||||||
|
# the contained tokens is set to is_country == True.
|
||||||
|
Doc.set_extension("has_country", getter=self.has_country)
|
||||||
|
Span.set_extension("has_country", getter=self.has_country)
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
spans = [] # keep the spans for later so we can merge them afterwards
|
||||||
|
for _, start, end in self.matcher(doc):
|
||||||
|
# Generate Span representing the entity & set label
|
||||||
|
entity = Span(doc, start, end, label=self.label)
|
||||||
|
spans.append(entity)
|
||||||
|
# Set custom attribute on each token of the entity
|
||||||
|
# Can be extended with other data returned by the API, like
|
||||||
|
# currencies, country code, flag, calling code etc.
|
||||||
|
for token in entity:
|
||||||
|
token._.set("is_country", True)
|
||||||
|
token._.set("country_capital", self.countries[entity.text]["capital"])
|
||||||
|
token._.set("country_latlng", self.countries[entity.text]["latlng"])
|
||||||
|
token._.set("country_flag", self.countries[entity.text]["flag"])
|
||||||
|
# Iterate over all spans and merge them into one token
|
||||||
|
with doc.retokenize() as retokenizer:
|
||||||
|
for span in spans:
|
||||||
|
retokenizer.merge(span)
|
||||||
|
# Overwrite doc.ents and add entity – be careful not to replace!
|
||||||
|
doc.ents = list(doc.ents) + spans
|
||||||
|
return doc # don't forget to return the Doc!
|
||||||
|
|
||||||
|
def has_country(self, tokens):
|
||||||
|
"""Getter for Doc and Span attributes. Since the getter is only called
|
||||||
|
when we access the attribute, we can refer to the Token's 'is_country'
|
||||||
|
attribute here, which is already set in the processing step."""
|
||||||
|
return any([t._.get("is_country") for t in tokens])
|
||||||
|
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe("rest_countries", config={"label": "GPE"})
|
||||||
|
doc = nlp("Some text about Colombia and the Czech Republic")
|
||||||
|
print("Pipeline", nlp.pipe_names) # pipeline contains component name
|
||||||
|
print("Doc has countries", doc._.has_country) # Doc contains countries
|
||||||
|
for token in doc:
|
||||||
|
if token._.is_country:
|
||||||
|
print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)
|
||||||
|
print("Entities", [(e.text, e.label_) for e in doc.ents])
|
||||||
```
|
```
|
||||||
|
|
||||||
In this case, all data can be fetched on initialization in one request. However,
|
In this case, all data can be fetched on initialization in one request. However,
|
||||||
|
@ -800,11 +1080,6 @@ function that takes a `Doc`, modifies it and returns it.
|
||||||
[`load_model_from_path`](/api/top-level#util.load_model_from_path) utility
|
[`load_model_from_path`](/api/top-level#util.load_model_from_path) utility
|
||||||
functions.
|
functions.
|
||||||
|
|
||||||
```diff
|
|
||||||
+ nlp.add_pipe(my_custom_component)
|
|
||||||
+ return nlp.from_disk(model_path)
|
|
||||||
```
|
|
||||||
|
|
||||||
- Once you're ready to share your extension with others, make sure to **add docs
|
- Once you're ready to share your extension with others, make sure to **add docs
|
||||||
and installation instructions** (you can always link to this page for more
|
and installation instructions** (you can always link to this page for more
|
||||||
info). Make it easy for others to install and use your extension, for example
|
info). Make it easy for others to install and use your extension, for example
|
||||||
|
@ -838,10 +1113,12 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
|
||||||
> overlapping entity spans are not allowed.
|
> overlapping entity spans are not allowed.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {highlight="1,6-7"}
|
### {highlight="1,8-9"}
|
||||||
import your_custom_entity_recognizer
|
import your_custom_entity_recognizer
|
||||||
from spacy.gold import offsets_from_biluo_tags
|
from spacy.gold import offsets_from_biluo_tags
|
||||||
|
from spacy.language import Language
|
||||||
|
|
||||||
|
@Language.component("custom_ner_wrapper")
|
||||||
def custom_ner_wrapper(doc):
|
def custom_ner_wrapper(doc):
|
||||||
words = [token.text for token in doc]
|
words = [token.text for token in doc]
|
||||||
custom_entities = your_custom_entity_recognizer(words)
|
custom_entities = your_custom_entity_recognizer(words)
|
||||||
|
@ -865,22 +1142,24 @@ because it returns the integer ID of the string _and_ makes sure it's added to
|
||||||
the vocab. This is especially important if the custom model uses a different
|
the vocab. This is especially important if the custom model uses a different
|
||||||
label scheme than spaCy's default models.
|
label scheme than spaCy's default models.
|
||||||
|
|
||||||
> #### Example: spacy-stanfordnlp
|
> #### Example: spacy-stanza
|
||||||
>
|
>
|
||||||
> For an example of an end-to-end wrapper for statistical tokenization, tagging
|
> For an example of an end-to-end wrapper for statistical tokenization, tagging
|
||||||
> and parsing, check out
|
> and parsing, check out
|
||||||
> [`spacy-stanfordnlp`](https://github.com/explosion/spacy-stanfordnlp). It uses
|
> [`spacy-stanza`](https://github.com/explosion/spacy-stanza). It uses a very
|
||||||
> a very similar approach to the example in this section – the only difference
|
> similar approach to the example in this section – the only difference is that
|
||||||
> is that it fully replaces the `nlp` object instead of providing a pipeline
|
> it fully replaces the `nlp` object instead of providing a pipeline component,
|
||||||
> component, since it also needs to handle tokenization.
|
> since it also needs to handle tokenization.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {highlight="1,9,15-17"}
|
### {highlight="1,11,17-19"}
|
||||||
import your_custom_model
|
import your_custom_model
|
||||||
|
from spacy.language import Language
|
||||||
from spacy.symbols import POS, TAG, DEP, HEAD
|
from spacy.symbols import POS, TAG, DEP, HEAD
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
@Language.component("custom_model_wrapper")
|
||||||
def custom_model_wrapper(doc):
|
def custom_model_wrapper(doc):
|
||||||
words = [token.text for token in doc]
|
words = [token.text for token in doc]
|
||||||
spaces = [token.whitespace for token in doc]
|
spaces = [token.whitespace for token in doc]
|
||||||
|
|
|
@ -450,6 +450,14 @@ git init # Initialize a Git repo
|
||||||
dvc init # Initialize a DVC project
|
dvc init # Initialize a DVC project
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<Infobox title="Important note on privacy" variant="warning">
|
||||||
|
|
||||||
|
DVC enables usage analytics by default, so if you're working in a
|
||||||
|
privacy-sensitive environment, make sure to
|
||||||
|
[**opt out manually**](https://dvc.org/doc/user-guide/analytics#opting-out).
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
The [`spacy project dvc`](/api/cli#project-dvc) command creates a `dvc.yaml`
|
The [`spacy project dvc`](/api/cli#project-dvc) command creates a `dvc.yaml`
|
||||||
config file based on a workflow defined in your `project.yml`. Whenever you
|
config file based on a workflow defined in your `project.yml`. Whenever you
|
||||||
update your project, you can re-run the command to update your DVC config. You
|
update your project, you can re-run the command to update your DVC config. You
|
||||||
|
|
|
@ -506,11 +506,16 @@ attribute `bad_html` on the token.
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.language import Language
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
from spacy.tokens import Token
|
from spacy.tokens import Token
|
||||||
|
|
||||||
# We're using a class because the component needs to be initialized with
|
# We're using a component factory because the component needs to be initialized
|
||||||
# the shared vocab via the nlp object
|
# with the shared vocab via the nlp object
|
||||||
|
@Language.factory("html_merger")
|
||||||
|
def create_bad_html_merger(nlp, name):
|
||||||
|
return BadHTMLMerger(nlp)
|
||||||
|
|
||||||
class BadHTMLMerger:
|
class BadHTMLMerger:
|
||||||
def __init__(self, nlp):
|
def __init__(self, nlp):
|
||||||
patterns = [
|
patterns = [
|
||||||
|
@ -536,8 +541,7 @@ class BadHTMLMerger:
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
html_merger = BadHTMLMerger(nlp)
|
nlp.add_pipe("html_merger", last=True) # Add component to the pipeline
|
||||||
nlp.add_pipe(html_merger, last=True) # Add component to the pipeline
|
|
||||||
doc = nlp("Hello<br>world! <br/> This is a test.")
|
doc = nlp("Hello<br>world! <br/> This is a test.")
|
||||||
for token in doc:
|
for token in doc:
|
||||||
print(token.text, token._.bad_html)
|
print(token.text, token._.bad_html)
|
||||||
|
@ -546,10 +550,16 @@ for token in doc:
|
||||||
|
|
||||||
Instead of hard-coding the patterns into the component, you could also make it
|
Instead of hard-coding the patterns into the component, you could also make it
|
||||||
take a path to a JSON file containing the patterns. This lets you reuse the
|
take a path to a JSON file containing the patterns. This lets you reuse the
|
||||||
component with different patterns, depending on your application:
|
component with different patterns, depending on your application. When adding
|
||||||
|
the component to the pipeline with [`nlp.add_pipe`](/api/language#add_pipe), you
|
||||||
|
can pass in the argument via the `config`:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
html_merger = BadHTMLMerger(nlp, path="/path/to/patterns.json")
|
@Language.factory("html_merger", default_config={"path": None})
|
||||||
|
def create_bad_html_merger(nlp, name, path):
|
||||||
|
return BadHTMLMerger(nlp, path=path)
|
||||||
|
|
||||||
|
nlp.add_pipe("html_merger", config={"path": "/path/to/patterns.json"})
|
||||||
```
|
```
|
||||||
|
|
||||||
<Infobox title="Processing pipelines" emoji="📖">
|
<Infobox title="Processing pipelines" emoji="📖">
|
||||||
|
@ -835,7 +845,7 @@ patterns can contain single or multiple tokens.
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.matcher import PhraseMatcher
|
from spacy.matcher import PhraseMatcher
|
||||||
|
|
||||||
nlp = spacy.load('en_core_web_sm')
|
nlp = spacy.load("en_core_web_sm")
|
||||||
matcher = PhraseMatcher(nlp.vocab)
|
matcher = PhraseMatcher(nlp.vocab)
|
||||||
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
|
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
|
||||||
# Only run nlp.make_doc to speed things up
|
# Only run nlp.make_doc to speed things up
|
||||||
|
@ -975,14 +985,12 @@ chosen.
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
|
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ruler = EntityRuler(nlp)
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
patterns = [{"label": "ORG", "pattern": "Apple"},
|
patterns = [{"label": "ORG", "pattern": "Apple"},
|
||||||
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
|
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
|
|
||||||
doc = nlp("Apple is opening its first big office in San Francisco.")
|
doc = nlp("Apple is opening its first big office in San Francisco.")
|
||||||
print([(ent.text, ent.label_) for ent in doc.ents])
|
print([(ent.text, ent.label_) for ent in doc.ents])
|
||||||
|
@ -1000,13 +1008,11 @@ can set `overwrite_ents=True` on initialization.
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
ruler = EntityRuler(nlp)
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}]
|
patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}]
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
|
|
||||||
doc = nlp("MyCorp Inc. is a company in the U.S.")
|
doc = nlp("MyCorp Inc. is a company in the U.S.")
|
||||||
print([(ent.text, ent.label_) for ent in doc.ents])
|
print([(ent.text, ent.label_) for ent in doc.ents])
|
||||||
|
@ -1014,12 +1020,12 @@ print([(ent.text, ent.label_) for ent in doc.ents])
|
||||||
|
|
||||||
#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation new="2.1.8"}
|
#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation new="2.1.8"}
|
||||||
|
|
||||||
The `EntityRuler` can validate patterns against a JSON schema with the option
|
The entity ruler can validate patterns against a JSON schema with the config
|
||||||
`validate=True`. See details under
|
setting `"validate"`. See details under
|
||||||
[Validating and debugging patterns](#pattern-validation).
|
[Validating and debugging patterns](#pattern-validation).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
ruler = EntityRuler(nlp, validate=True)
|
ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
|
||||||
```
|
```
|
||||||
|
|
||||||
### Adding IDs to patterns {#entityruler-ent-ids new="2.2.2"}
|
### Adding IDs to patterns {#entityruler-ent-ids new="2.2.2"}
|
||||||
|
@ -1031,15 +1037,13 @@ the same entity.
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.pipeline import EntityRuler
|
|
||||||
|
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ruler = EntityRuler(nlp)
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
|
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
|
||||||
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
|
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
|
||||||
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
|
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
|
|
||||||
doc1 = nlp("Apple is opening its first big office in San Francisco.")
|
doc1 = nlp("Apple is opening its first big office in San Francisco.")
|
||||||
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
|
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
|
||||||
|
@ -1068,7 +1072,7 @@ line.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
ruler.to_disk("./patterns.jsonl")
|
ruler.to_disk("./patterns.jsonl")
|
||||||
new_ruler = EntityRuler(nlp).from_disk("./patterns.jsonl")
|
new_ruler = nlp.add_pipe("entity_ruler").from_disk("./patterns.jsonl")
|
||||||
```
|
```
|
||||||
|
|
||||||
<Infobox title="Integration with Prodigy">
|
<Infobox title="Integration with Prodigy">
|
||||||
|
@ -1086,9 +1090,8 @@ pipeline, its patterns are automatically exported to the model directory:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
ruler = EntityRuler(nlp)
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
||||||
nlp.add_pipe(ruler)
|
|
||||||
nlp.to_disk("/path/to/model")
|
nlp.to_disk("/path/to/model")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -1100,35 +1103,30 @@ powerful model packages with binary weights _and_ rules included!
|
||||||
|
|
||||||
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
|
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
|
||||||
|
|
||||||
|
<!-- TODO: double-check that this still works if the ruler is added to the pipeline on creation, and include suggestion if needed -->
|
||||||
|
|
||||||
When using a large number of **phrase patterns** (roughly > 10000) it's useful
|
When using a large number of **phrase patterns** (roughly > 10000) it's useful
|
||||||
to understand how the `add_patterns` function of the EntityRuler works. For each
|
to understand how the `add_patterns` function of the entity ruler works. For
|
||||||
**phrase pattern**, the EntityRuler calls the nlp object to construct a doc
|
each **phrase pattern**, the EntityRuler calls the nlp object to construct a doc
|
||||||
object. This happens in case you try to add the EntityRuler at the end of an
|
object. This happens in case you try to add the EntityRuler at the end of an
|
||||||
existing pipeline with, for example, a POS tagger and want to extract matches
|
existing pipeline with, for example, a POS tagger and want to extract matches
|
||||||
based on the pattern's POS signature.
|
based on the pattern's POS signature. In this case you would pass a config value
|
||||||
|
of `"phrase_matcher_attr": "POS"` for the entity ruler.
|
||||||
In this case you would pass a config value of `phrase_matcher_attr="POS"` for
|
|
||||||
the EntityRuler.
|
|
||||||
|
|
||||||
Running the full language pipeline across every pattern in a large list scales
|
Running the full language pipeline across every pattern in a large list scales
|
||||||
linearly and can therefore take a long time on large amounts of phrase patterns.
|
linearly and can therefore take a long time on large amounts of phrase patterns.
|
||||||
|
|
||||||
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
|
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
|
||||||
`nlp.pipe` on all phrase patterns resulting in about a 10x-20x speed up with
|
`nlp.pipe` on all phrase patterns resulting in about a 10x-20x speed up with
|
||||||
5,000-100,000 phrase patterns respectively.
|
5,000-100,000 phrase patterns respectively. Even with this speedup (but
|
||||||
|
especially if you're using an older version) the `add_patterns` function can
|
||||||
Even with this speedup (but especially if you're using an older version) the
|
still take a long time. An easy workaround to make this function run faster is
|
||||||
`add_patterns` function can still take a long time.
|
disabling the other language pipes while adding the phrase patterns.
|
||||||
|
|
||||||
An easy workaround to make this function run faster is disabling the other
|
|
||||||
language pipes while adding the phrase patterns.
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
entityruler = EntityRuler(nlp)
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)]
|
patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)]
|
||||||
|
|
||||||
with nlp.select_pipes(enable="tagger"):
|
with nlp.select_pipes(enable="tagger"):
|
||||||
entityruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Combining models and rules {#models-rules}
|
## Combining models and rules {#models-rules}
|
||||||
|
@ -1189,9 +1187,11 @@ have in common is that _if_ they occur, they occur in the **previous token**
|
||||||
right before the person entity.
|
right before the person entity.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {highlight="7-11"}
|
### {highlight="9-13"}
|
||||||
|
from spacy.language import Language
|
||||||
from spacy.tokens import Span
|
from spacy.tokens import Span
|
||||||
|
|
||||||
|
@Language.component("expand_person_entities")
|
||||||
def expand_person_entities(doc):
|
def expand_person_entities(doc):
|
||||||
new_ents = []
|
new_ents = []
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
|
@ -1210,18 +1210,20 @@ def expand_person_entities(doc):
|
||||||
```
|
```
|
||||||
|
|
||||||
The above function takes a `Doc` object, modifies its `doc.ents` and returns it.
|
The above function takes a `Doc` object, modifies its `doc.ents` and returns it.
|
||||||
This is exactly what a [pipeline component](/usage/processing-pipelines) does,
|
Using the [`@Language.component`](/api/language#component) decorator, we can
|
||||||
so in order to let it run automatically when processing a text with the `nlp`
|
register it as a [pipeline component](/usage/processing-pipelines) so it can run
|
||||||
object, we can use [`nlp.add_pipe`](/api/language#add_pipe) to add it to the
|
automatically when processing a text. We can use
|
||||||
current pipeline.
|
[`nlp.add_pipe`](/api/language#add_pipe) to add it to the current pipeline.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.language import Language
|
||||||
from spacy.tokens import Span
|
from spacy.tokens import Span
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
|
||||||
|
@Language.component("expand_person_entities")
|
||||||
def expand_person_entities(doc):
|
def expand_person_entities(doc):
|
||||||
new_ents = []
|
new_ents = []
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
|
@ -1236,7 +1238,7 @@ def expand_person_entities(doc):
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
# Add the component after the named entity recognizer
|
# Add the component after the named entity recognizer
|
||||||
nlp.add_pipe(expand_person_entities, after='ner')
|
nlp.add_pipe("expand_person_entities", after="ner")
|
||||||
|
|
||||||
doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
|
doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
|
||||||
print([(ent.text, ent.label_) for ent in doc.ents])
|
print([(ent.text, ent.label_) for ent in doc.ents])
|
||||||
|
@ -1347,7 +1349,7 @@ for ent in person_entities:
|
||||||
# children, e.g. at -> Acme Corp Inc.
|
# children, e.g. at -> Acme Corp Inc.
|
||||||
orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
|
orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
|
||||||
# If the verb is in past tense, the company was a previous company
|
# If the verb is in past tense, the company was a previous company
|
||||||
print({'person': ent, 'orgs': orgs, 'past': head.tag_ == "VBD"})
|
print({"person": ent, "orgs": orgs, "past": head.tag_ == "VBD"})
|
||||||
```
|
```
|
||||||
|
|
||||||
To apply this logic automatically when we process a text, we can add it to the
|
To apply this logic automatically when we process a text, we can add it to the
|
||||||
|
@ -1374,11 +1376,12 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.pipeline import merge_entities
|
from spacy.language import Language
|
||||||
from spacy import displacy
|
from spacy import displacy
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
|
||||||
|
@Language.component("extract_person_orgs")
|
||||||
def extract_person_orgs(doc):
|
def extract_person_orgs(doc):
|
||||||
person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
|
person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
|
||||||
for ent in person_entities:
|
for ent in person_entities:
|
||||||
|
@ -1391,12 +1394,12 @@ def extract_person_orgs(doc):
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
# To make the entities easier to work with, we'll merge them into single tokens
|
# To make the entities easier to work with, we'll merge them into single tokens
|
||||||
nlp.add_pipe(merge_entities)
|
nlp.add_pipe("merge_entities")
|
||||||
nlp.add_pipe(extract_person_orgs)
|
nlp.add_pipe("extract_person_orgs")
|
||||||
|
|
||||||
doc = nlp("Alex Smith worked at Acme Corp Inc.")
|
doc = nlp("Alex Smith worked at Acme Corp Inc.")
|
||||||
# If you're not in a Jupyter / IPython environment, use displacy.serve
|
# If you're not in a Jupyter / IPython environment, use displacy.serve
|
||||||
displacy.render(doc, options={'fine_grained': True})
|
displacy.render(doc, options={"fine_grained": True})
|
||||||
```
|
```
|
||||||
|
|
||||||
If you change the sentence structure above, for example to "was working", you'll
|
If you change the sentence structure above, for example to "was working", you'll
|
||||||
|
@ -1409,7 +1412,8 @@ information is in the attached auxiliary "was":
|
||||||
To solve this, we can adjust the rules to also check for the above construction:
|
To solve this, we can adjust the rules to also check for the above construction:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {highlight="9-11"}
|
### {highlight="10-12"}
|
||||||
|
@Language.component("extract_person_orgs")
|
||||||
def extract_person_orgs(doc):
|
def extract_person_orgs(doc):
|
||||||
person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
|
person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
|
||||||
for ent in person_entities:
|
for ent in person_entities:
|
||||||
|
|
|
@ -15,6 +15,8 @@ import Serialization101 from 'usage/101/\_serialization.md'
|
||||||
|
|
||||||
### Serializing the pipeline {#pipeline}
|
### Serializing the pipeline {#pipeline}
|
||||||
|
|
||||||
|
<!-- TODO: update this -->
|
||||||
|
|
||||||
When serializing the pipeline, keep in mind that this will only save out the
|
When serializing the pipeline, keep in mind that this will only save out the
|
||||||
**binary data for the individual components** to allow spaCy to restore them –
|
**binary data for the individual components** to allow spaCy to restore them –
|
||||||
not the entire objects. This is a good thing, because it makes serialization
|
not the entire objects. This is a good thing, because it makes serialization
|
||||||
|
@ -22,32 +24,35 @@ safe. But it also means that you have to take care of storing the language name
|
||||||
and pipeline component names as well, and restoring them separately before you
|
and pipeline component names as well, and restoring them separately before you
|
||||||
can load in the data.
|
can load in the data.
|
||||||
|
|
||||||
> #### Saving the model meta
|
> #### Saving the meta and config
|
||||||
>
|
>
|
||||||
> The `nlp.meta` attribute is a JSON-serializable dictionary and contains all
|
> The [`nlp.meta`](/api/language#meta) attribute is a JSON-serializable
|
||||||
> model meta information, like the language and pipeline, but also author and
|
> dictionary and contains all model meta information like the author and license
|
||||||
> license information.
|
> information. The [`nlp.config`](/api/language#config) attribute is a
|
||||||
|
> dictionary containing the training configuration, pipeline component factories
|
||||||
|
> and other settings. It is saved out with a model as the `config.cfg`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Serialize
|
### Serialize
|
||||||
bytes_data = nlp.to_bytes()
|
bytes_data = nlp.to_bytes()
|
||||||
lang = nlp.meta["lang"] # "en"
|
lang = nlp.config["nlp"]["lang"] # "en"
|
||||||
pipeline = nlp.meta["pipeline"] # ["tagger", "parser", "ner"]
|
pipeline = nlp.config["nlp"]["pipeline"] # ["tagger", "parser", "ner"]
|
||||||
```
|
```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Deserialize
|
### Deserialize
|
||||||
nlp = spacy.blank(lang)
|
nlp = spacy.blank(lang)
|
||||||
for pipe_name in pipeline:
|
for pipe_name in pipeline:
|
||||||
pipe = nlp.create_pipe(pipe_name)
|
nlp.add_pipe(pipe_name)
|
||||||
nlp.add_pipe(pipe)
|
|
||||||
nlp.from_bytes(bytes_data)
|
nlp.from_bytes(bytes_data)
|
||||||
```
|
```
|
||||||
|
|
||||||
This is also how spaCy does it under the hood when loading a model: it loads the
|
This is also how spaCy does it under the hood when loading a model: it loads the
|
||||||
model's `meta.json` containing the language and pipeline information,
|
model's `config.cfg` containing the language and pipeline information,
|
||||||
initializes the language class, creates and adds the pipeline components and
|
initializes the language class, creates and adds the pipeline components based
|
||||||
_then_ loads in the binary data. You can read more about this process
|
on the defined
|
||||||
|
[factories](/usage/processing-pipelines#custom-components-factories) and _then_
|
||||||
|
loads in the binary data. You can read more about this process
|
||||||
[here](/usage/processing-pipelines#pipelines).
|
[here](/usage/processing-pipelines#pipelines).
|
||||||
|
|
||||||
### Serializing Doc objects efficiently {#docs new="2.2"}
|
### Serializing Doc objects efficiently {#docs new="2.2"}
|
||||||
|
@ -192,10 +197,9 @@ add to that data and saves and loads the data to and from a JSON file.
|
||||||
> recognizer and including all rules _with_ the model data.
|
> recognizer and including all rules _with_ the model data.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {highlight="15-19,21-26"}
|
### {highlight="14-18,20-25"}
|
||||||
|
@Language.factory("my_component")
|
||||||
class CustomComponent:
|
class CustomComponent:
|
||||||
name = "my_component"
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.data = []
|
self.data = []
|
||||||
|
|
||||||
|
@ -228,9 +232,8 @@ component's `to_disk` method.
|
||||||
```python
|
```python
|
||||||
### {highlight="2-4"}
|
### {highlight="2-4"}
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
my_component = CustomComponent()
|
my_component = nlp.add_pipe("my_component")
|
||||||
my_component.add({"hello": "world"})
|
my_component.add({"hello": "world"})
|
||||||
nlp.add_pipe(my_component)
|
|
||||||
nlp.to_disk("/path/to/model")
|
nlp.to_disk("/path/to/model")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -247,7 +250,8 @@ file `data.json` in its subdirectory:
|
||||||
├── parser # data for "parser" component
|
├── parser # data for "parser" component
|
||||||
├── tagger # data for "tagger" component
|
├── tagger # data for "tagger" component
|
||||||
├── vocab # model vocabulary
|
├── vocab # model vocabulary
|
||||||
├── meta.json # model meta.json with name, language and pipeline
|
├── meta.json # model meta.json
|
||||||
|
├── config.cfg # model config
|
||||||
└── tokenizer # tokenization rules
|
└── tokenizer # tokenization rules
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -260,19 +264,14 @@ instance, you could add a
|
||||||
trained with a different library like TensorFlow or PyTorch and make spaCy load
|
trained with a different library like TensorFlow or PyTorch and make spaCy load
|
||||||
its weights automatically when you load the model package.
|
its weights automatically when you load the model package.
|
||||||
|
|
||||||
<Infobox title="Important note on loading components" variant="warning">
|
<Infobox title="Important note on loading custom components" variant="warning">
|
||||||
|
|
||||||
When you load a model from disk, spaCy will check the `"pipeline"` in the
|
When you load back a model with custom components, make sure that the components
|
||||||
model's `meta.json` and look up the component name in the internal factories. To
|
are **available** and that the [`@Language.component`](/api/language#component)
|
||||||
make sure spaCy knows how to initialize `"my_component"`, you'll need to add it
|
or [`@Language.factory`](/api/language#factory) decorators are executed _before_
|
||||||
to the factories:
|
your model is loaded back. Otherwise, spaCy won't know how to resolve the string
|
||||||
|
name of a component factory like `"my_component"` back to a function. For more
|
||||||
```python
|
details, see the documentation on
|
||||||
from spacy.language import Language
|
|
||||||
Language.factories["my_component"] = lambda nlp, **cfg: CustomComponent()
|
|
||||||
```
|
|
||||||
|
|
||||||
For more details, see the documentation on
|
|
||||||
[adding factories](/usage/processing-pipelines#custom-components-factories) or
|
[adding factories](/usage/processing-pipelines#custom-components-factories) or
|
||||||
use [entry points](#entry-points) to make your extension package expose your
|
use [entry points](#entry-points) to make your extension package expose your
|
||||||
custom components to spaCy automatically.
|
custom components to spaCy automatically.
|
||||||
|
@ -293,40 +292,31 @@ installed in the same environment – that's it.
|
||||||
|
|
||||||
| Entry point | Description |
|
| Entry point | Description |
|
||||||
| ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| [`spacy_factories`](#entry-points-components) | Group of entry points for pipeline component factories to add to [`Language.factories`](/usage/processing-pipelines#custom-components-factories), keyed by component name. |
|
| [`spacy_factories`](#entry-points-components) | Group of entry points for pipeline component factories, keyed by component name. Can be used to expose custom components defined by another package. |
|
||||||
| [`spacy_languages`](#entry-points-languages) | Group of entry points for custom [`Language` subclasses](/usage/adding-languages), keyed by language shortcut. |
|
| [`spacy_languages`](#entry-points-languages) | Group of entry points for custom [`Language` subclasses](/usage/adding-languages), keyed by language shortcut. |
|
||||||
| `spacy_lookups` <Tag variant="new">2.2</Tag> | Group of entry points for custom [`Lookups`](/api/lookups), including lemmatizer data. Used by spaCy's [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package. |
|
| `spacy_lookups` <Tag variant="new">2.2</Tag> | Group of entry points for custom [`Lookups`](/api/lookups), including lemmatizer data. Used by spaCy's [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package. |
|
||||||
| [`spacy_displacy_colors`](#entry-points-displacy) <Tag variant="new">2.2</Tag> | Group of entry points of custom label colors for the [displaCy visualizer](/usage/visualizers#ent). The key name doesn't matter, but it should point to a dict of labels and color values. Useful for custom models that predict different entity types. |
|
| [`spacy_displacy_colors`](#entry-points-displacy) <Tag variant="new">2.2</Tag> | Group of entry points of custom label colors for the [displaCy visualizer](/usage/visualizers#ent). The key name doesn't matter, but it should point to a dict of labels and color values. Useful for custom models that predict different entity types. |
|
||||||
|
|
||||||
### Custom components via entry points {#entry-points-components}
|
### Custom components via entry points {#entry-points-components}
|
||||||
|
|
||||||
When you load a model, spaCy will generally use the model's `meta.json` to set
|
When you load a model, spaCy will generally use the model's `config.cfg` to set
|
||||||
up the language class and construct the pipeline. The pipeline is specified as a
|
up the language class and construct the pipeline. The pipeline is specified as a
|
||||||
list of strings, e.g. `"pipeline": ["tagger", "paser", "ner"]`. For each of
|
list of strings, e.g. `pipeline = ["tagger", "parser", "ner"]`. For each of those
|
||||||
those strings, spaCy will call `nlp.create_pipe` and look up the name in the
|
strings, spaCy will call `nlp.add_pipe` and look up the name in all factories
|
||||||
[built-in factories](/usage/processing-pipelines#custom-components-factories).
|
defined by the decorators [`@Language.component`](/api/language#component) and
|
||||||
If your model wanted to specify its own custom components, you usually have to
|
[`@Language.factory`](/api/language#factory). This means that you have to import
|
||||||
write to `Language.factories` _before_ loading the model.
|
your custom components _before_ loading the model.
|
||||||
|
|
||||||
```python
|
Using entry points, model packages and extension packages can define their own
|
||||||
pipe = nlp.create_pipe("custom_component") # fails 👎
|
`"spacy_factories"`, which will be loaded automatically in the background when
|
||||||
|
the `Language` class is initialized. So if a user has your package installed,
|
||||||
Language.factories["custom_component"] = CustomComponentFactory
|
they'll be able to use your components – even if they **don't import them**!
|
||||||
pipe = nlp.create_pipe("custom_component") # works 👍
|
|
||||||
```
|
|
||||||
|
|
||||||
This is inconvenient and usually required shipping a bunch of component
|
|
||||||
initialization code with the model. Using entry points, model packages and
|
|
||||||
extension packages can now define their own `"spacy_factories"`, which will be
|
|
||||||
added to the built-in factories when the `Language` class is initialized. If a
|
|
||||||
package in the same environment exposes spaCy entry points, all of this happens
|
|
||||||
automatically and no further user action is required.
|
|
||||||
|
|
||||||
To stick with the theme of
|
To stick with the theme of
|
||||||
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
||||||
consider the following custom spaCy extension which is initialized with the
|
consider the following custom spaCy
|
||||||
shared `nlp` object and will print a snake when it's called as a pipeline
|
[pipeline component](/usage/processing-pipelines#custom-components) that prints a
|
||||||
component.
|
snake when it's called:
|
||||||
|
|
||||||
> #### Package directory structure
|
> #### Package directory structure
|
||||||
>
|
>
|
||||||
|
@ -337,32 +327,38 @@ component.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### snek.py
|
### snek.py
|
||||||
|
from spacy.language import Language
|
||||||
|
|
||||||
snek = """
|
snek = """
|
||||||
--..,_ _,.--.
|
--..,_ _,.--.
|
||||||
`'.'. .'`__ o `;__.
|
`'.'. .'`__ o `;__. {text}
|
||||||
'.'. .'.'` '---'` `
|
'.'. .'.'` '---'` `
|
||||||
'.`'--....--'`.'
|
'.`'--....--'`.'
|
||||||
`'--....--'`
|
`'--....--'`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
class SnekFactory:
|
@Language.component("snek")
|
||||||
def __init__(self, nlp, **cfg):
|
def snek_component(doc):
|
||||||
self.nlp = nlp
|
print(snek.format(text=doc.text))
|
||||||
|
return doc
|
||||||
def __call__(self, doc):
|
|
||||||
print(snek)
|
|
||||||
return doc
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Since it's a very complex and sophisticated module, you want to split it off
|
Since it's a very complex and sophisticated module, you want to split it off
|
||||||
into its own package so you can version it and upload it to PyPI. You also want
|
into its own package so you can version it and upload it to PyPI. You also want
|
||||||
your custom model to be able to define `"pipeline": ["snek"]` in its
|
your custom model to be able to define `pipeline = ["snek"]` in its
|
||||||
`meta.json`. For that, you need to be able to tell spaCy where to find the
|
`config.cfg`. For that, you need to be able to tell spaCy where to find the
|
||||||
factory for `"snek"`. If you don't do this, spaCy will raise an error when you
|
component `"snek"`. If you don't do this, spaCy will raise an error when you try
|
||||||
try to load the model because there's no built-in `"snek"` factory. To add an
|
to load the model because there's no built-in `"snek"` component. To add an
|
||||||
entry to the factories, you can now expose it in your `setup.py` via the
|
entry to the factories, you can now expose it in your `setup.py` via the
|
||||||
`entry_points` dictionary:
|
`entry_points` dictionary:
|
||||||
|
|
||||||
|
> #### Entry point syntax
|
||||||
|
>
|
||||||
|
> Python entry points for a group are formatted as a **list of strings**, with
|
||||||
|
> each string following the syntax of `name = module:object`. In this example,
|
||||||
|
> the created entry point is named `snek` and points to the function
|
||||||
|
> `snek_component` in the module `snek`, i.e. `snek.py`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### setup.py {highlight="5-7"}
|
### setup.py {highlight="5-7"}
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
@ -370,73 +366,74 @@ from setuptools import setup
|
||||||
setup(
|
setup(
|
||||||
name="snek",
|
name="snek",
|
||||||
entry_points={
|
entry_points={
|
||||||
"spacy_factories": ["snek = snek:SnekFactory"]
|
"spacy_factories": ["snek = snek:snek_component"]
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
The entry point definition tells spaCy that the name `snek` can be found in the
|
The same package can expose multiple entry points, by the way. To make them
|
||||||
module `snek` (i.e. `snek.py`) as `SnekFactory`. The same package can expose
|
available to spaCy, all you need to do is install the package in your
|
||||||
multiple entry points. To make them available to spaCy, all you need to do is
|
environment:
|
||||||
install the package:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ python setup.py develop
|
$ python setup.py develop
|
||||||
```
|
```
|
||||||
|
|
||||||
spaCy is now able to create the pipeline component `'snek'`:
|
spaCy is now able to create the pipeline component `"snek"` – even though you
|
||||||
|
never imported `snek_component`. When you save the
|
||||||
|
[`nlp.config`](/api/language#config) to disk, it includes an entry for your
|
||||||
|
`"snek"` component and any model you train with this config will include the
|
||||||
|
component and know how to load it – if your `snek` package is installed.
|
||||||
|
|
||||||
|
> #### config.cfg (excerpt)
|
||||||
|
>
|
||||||
|
> ```diff
|
||||||
|
> [nlp]
|
||||||
|
> lang = "en"
|
||||||
|
> + pipeline = ["snek"]
|
||||||
|
>
|
||||||
|
> [components]
|
||||||
|
>
|
||||||
|
> + [components.snek]
|
||||||
|
> + factory = "snek"
|
||||||
|
> ```
|
||||||
|
|
||||||
```
|
```
|
||||||
>>> from spacy.lang.en import English
|
>>> from spacy.lang.en import English
|
||||||
>>> nlp = English()
|
>>> nlp = English()
|
||||||
>>> snek = nlp.create_pipe("snek") # this now works! 🐍🎉
|
>>> nlp.add_pipe("snek") # this now works! 🐍🎉
|
||||||
>>> nlp.add_pipe(snek)
|
|
||||||
>>> doc = nlp("I am snek")
|
>>> doc = nlp("I am snek")
|
||||||
--..,_ _,.--.
|
--..,_ _,.--.
|
||||||
`'.'. .'`__ o `;__.
|
`'.'. .'`__ o `;__. I am snek
|
||||||
'.'. .'.'` '---'` `
|
'.'. .'.'` '---'` `
|
||||||
'.`'--....--'`.'
|
'.`'--....--'`.'
|
||||||
`'--....--'`
|
`'--....--'`
|
||||||
```
|
```
|
||||||
|
|
||||||
Arguably, this gets even more exciting when you train your `en_core_snek_sm`
|
Instead of making your snek component a simple
|
||||||
model. To make sure `snek` is installed with the model, you can add it to the
|
[stateless component](/usage/processing-pipelines#custom-components-simple), you
|
||||||
model's `setup.py`. You can then tell spaCy to construct the model pipeline with
|
could also make it a
|
||||||
the `snek` component by setting `"pipeline": ["snek"]` in the `meta.json`.
|
[factory](/usage/processing-pipelines#custom-components-factories) that takes
|
||||||
|
settings. Your users can then pass in an optional `config` when they add your
|
||||||
|
component to the pipeline and customize its appearance – for example, the
|
||||||
|
`snek_style`.
|
||||||
|
|
||||||
> #### meta.json
|
> #### config.cfg (excerpt)
|
||||||
>
|
>
|
||||||
> ```diff
|
> ```diff
|
||||||
> {
|
> [components.snek]
|
||||||
> "lang": "en",
|
> factory = "snek"
|
||||||
> "name": "core_snek_sm",
|
> + snek_style = "basic"
|
||||||
> "version": "1.0.0",
|
|
||||||
> + "pipeline": ["snek"]
|
|
||||||
> }
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
In theory, the entry point mechanism also lets you overwrite built-in factories
|
|
||||||
– including the tokenizer. By default, spaCy will output a warning in these
|
|
||||||
cases, to prevent accidental overwrites and unintended results.
|
|
||||||
|
|
||||||
#### Advanced components with settings {#advanced-cfg}
|
|
||||||
|
|
||||||
The `**cfg` keyword arguments that the factory receives are passed down all the
|
|
||||||
way from `spacy.load`. This means that the factory can respond to custom
|
|
||||||
settings defined when loading the model – for example, the style of the snake to
|
|
||||||
load:
|
|
||||||
|
|
||||||
```python
|
|
||||||
nlp = spacy.load("en_core_snek_sm", snek_style="cute")
|
|
||||||
```
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
SNEKS = {"basic": snek, "cute": cute_snek} # collection of sneks
|
SNEKS = {"basic": snek, "cute": cute_snek} # collection of sneks
|
||||||
|
|
||||||
|
@Language.factory("snek", default_config={"snek_style": "basic"})
|
||||||
class SnekFactory:
|
class SnekFactory:
|
||||||
def __init__(self, nlp, **cfg):
|
def __init__(self, nlp: Language, name: str, snek_style: str):
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
self.snek_style = cfg.get("snek_style", "basic")
|
self.snek_style = snek_style
|
||||||
self.snek = SNEKS[self.snek_style]
|
self.snek = SNEKS[self.snek_style]
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
|
@ -444,6 +441,14 @@ class SnekFactory:
|
||||||
return doc
|
return doc
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```diff
|
||||||
|
### setup.py
|
||||||
|
entry_points={
|
||||||
|
- "spacy_factories": ["snek = snek:snek_component"]
|
||||||
|
+ "spacy_factories": ["snek = snek:SnekFactory"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
The factory can also implement other pipeline component methods like `to_disk` and
|
The factory can also implement other pipeline component methods like `to_disk` and
|
||||||
`from_disk` for serialization, or even `update` to make the component trainable.
|
`from_disk` for serialization, or even `update` to make the component trainable.
|
||||||
If a component exposes a `from_disk` method and is included in a model's
|
If a component exposes a `from_disk` method and is included in a model's
|
||||||
|
@ -452,12 +457,12 @@ model. When you save out a model using `nlp.to_disk` and the component exposes a
|
||||||
`to_disk` method, it will be called with the disk path.
|
`to_disk` method, it will be called with the disk path.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def to_disk(self, path, **kwargs):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
snek_path = path / "snek.txt"
|
snek_path = path / "snek.txt"
|
||||||
with snek_path.open("w", encoding="utf8") as snek_file:
|
with snek_path.open("w", encoding="utf8") as snek_file:
|
||||||
snek_file.write(self.snek)
|
snek_file.write(self.snek)
|
||||||
|
|
||||||
def from_disk(self, path, **cfg):
|
def from_disk(self, path, exclude=tuple()):
|
||||||
snek_path = path / "snek.txt"
|
snek_path = path / "snek.txt"
|
||||||
with snek_path.open("r", encoding="utf8") as snek_file:
|
with snek_path.open("r", encoding="utf8") as snek_file:
|
||||||
self.snek = snek_file.read()
|
self.snek = snek_file.read()
|
||||||
|
@ -473,24 +478,20 @@ the `snek.txt` and make it available to the component.
|
||||||
To stay with the theme of the previous example and
|
To stay with the theme of the previous example and
|
||||||
[this blog post on entry points](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
[this blog post on entry points](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
||||||
let's imagine you wanted to implement your own `SnekLanguage` class for your
|
let's imagine you wanted to implement your own `SnekLanguage` class for your
|
||||||
custom model – but you don't necessarily want to modify spaCy's code to
|
custom model – but you don't necessarily want to modify spaCy's code to add a
|
||||||
[add a language](/usage/adding-languages). In your package, you could then
|
language. In your package, you could then implement the following
|
||||||
implement the following:
|
[custom language subclass](/usage/linguistic-features#language-subclass):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### snek.py
|
### snek.py
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.attrs import LANG
|
|
||||||
|
|
||||||
class SnekDefaults(Language.Defaults):
|
class SnekDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
stop_words = set(["sss", "hiss"])
|
||||||
lex_attr_getters[LANG] = lambda text: "snk"
|
|
||||||
|
|
||||||
|
|
||||||
class SnekLanguage(Language):
|
class SnekLanguage(Language):
|
||||||
lang = "snk"
|
lang = "snk"
|
||||||
Defaults = SnekDefaults
|
Defaults = SnekDefaults
|
||||||
# Some custom snek language stuff here
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Alongside the `spacy_factories`, there's also an entry point option for
|
Alongside the `spacy_factories`, there's also an entry point option for
|
||||||
|
@ -510,31 +511,12 @@ setup(
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
In spaCy, you can then load the custom `sk` language and it will be resolved to
|
In spaCy, you can then load the custom `snk` language and it will be resolved to
|
||||||
`SnekLanguage` via the custom entry point. This is especially relevant for model
|
`SnekLanguage` via the custom entry point. This is especially relevant for model
|
||||||
packages, which could then specify `"lang": "snk"` in their `meta.json` without
|
packages you train, which could then specify `lang = "snk"` in their `config.cfg`
|
||||||
spaCy raising an error because the language is not available in the core
|
without spaCy raising an error because the language is not available in the core
|
||||||
library.
|
library.
|
||||||
|
|
||||||
> #### meta.json
|
|
||||||
>
|
|
||||||
> ```diff
|
|
||||||
> {
|
|
||||||
> - "lang": "en",
|
|
||||||
> + "lang": "snk",
|
|
||||||
> "name": "core_snek_sm",
|
|
||||||
> "version": "1.0.0",
|
|
||||||
> "pipeline": ["snek"]
|
|
||||||
> }
|
|
||||||
> ```
|
|
||||||
|
|
||||||
```python
|
|
||||||
from spacy.util import get_lang_class
|
|
||||||
|
|
||||||
SnekLanguage = get_lang_class("snk")
|
|
||||||
nlp = SnekLanguage()
|
|
||||||
```
|
|
||||||
|
|
||||||
### Custom displaCy colors via entry points {#entry-points-displacy new="2.2"}
|
### Custom displaCy colors via entry points {#entry-points-displacy new="2.2"}
|
||||||
|
|
||||||
If you're training a named entity recognition model for a custom domain, you may
|
If you're training a named entity recognition model for a custom domain, you may
|
||||||
|
@ -611,7 +593,7 @@ manually and place it in the model data directory, or supply a path to it using
|
||||||
the `--meta` flag. For more info on this, see the [`package`](/api/cli#package)
|
the `--meta` flag. For more info on this, see the [`package`](/api/cli#package)
|
||||||
docs.
|
docs.
|
||||||
|
|
||||||
> #### meta.json
|
> #### meta.json (example)
|
||||||
>
|
>
|
||||||
> ```json
|
> ```json
|
||||||
> {
|
> {
|
||||||
|
@ -622,8 +604,7 @@ docs.
|
||||||
> "description": "Example model for spaCy",
|
> "description": "Example model for spaCy",
|
||||||
> "author": "You",
|
> "author": "You",
|
||||||
> "email": "you@example.com",
|
> "email": "you@example.com",
|
||||||
> "license": "CC BY-SA 3.0",
|
> "license": "CC BY-SA 3.0"
|
||||||
> "pipeline": ["tagger", "parser", "ner"]
|
|
||||||
> }
|
> }
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -631,66 +612,39 @@ docs.
|
||||||
$ python -m spacy package /home/me/data/en_example_model /home/me/my_models
|
$ python -m spacy package /home/me/data/en_example_model /home/me/my_models
|
||||||
```
|
```
|
||||||
|
|
||||||
This command will create a model package directory that should look like this:
|
This command will create a model package directory and will run
|
||||||
|
`python setup.py sdist` in that directory to create a `.tar.gz` archive of your
|
||||||
|
model package that can be installed using `pip install`.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
### Directory structure
|
### Directory structure
|
||||||
└── /
|
└── /
|
||||||
├── MANIFEST.in # to include meta.json
|
├── MANIFEST.in # to include meta.json
|
||||||
├── meta.json # model meta data
|
├── meta.json # model meta data
|
||||||
├── setup.py # setup file for pip installation
|
├── setup.py # setup file for pip installation
|
||||||
└── en_example_model # model directory
|
├── en_example_model # model directory
|
||||||
├── __init__.py # init for pip installation
|
│ ├── __init__.py # init for pip installation
|
||||||
└── en_example_model-1.0.0 # model data
|
│ └── en_example_model-1.0.0 # model data
|
||||||
|
└── dist
|
||||||
|
└── en_example_model-1.0.0.tar.gz # installable package
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also find templates for all files on
|
You can also find templates for all files in the
|
||||||
[GitHub](https://github.com/explosion/spacy-models/tree/master/template). If
|
[`cli/package.py` source](https://github.com/explosion/spacy/tree/master/spacy/cli/package.py).
|
||||||
you're creating the package manually, keep in mind that the directories need to
|
If you're creating the package manually, keep in mind that the directories need
|
||||||
be named according to the naming conventions of `lang_name` and
|
to be named according to the naming conventions of `lang_name` and
|
||||||
`lang_name-version`.
|
`lang_name-version`.
|
||||||
|
|
||||||
### Customizing the model setup {#models-custom}
|
### Customizing the model setup {#models-custom}
|
||||||
|
|
||||||
The meta.json includes the model details, like name, requirements and license,
|
|
||||||
and lets you customize how the model should be initialized and loaded. You can
|
|
||||||
define the language data to be loaded and the
|
|
||||||
[processing pipeline](/usage/processing-pipelines) to execute.
|
|
||||||
|
|
||||||
| Setting | Type | Description |
|
|
||||||
| ---------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `lang` | str | ID of the language class to initialize. |
|
|
||||||
| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. |
|
|
||||||
|
|
||||||
The `load()` method that comes with our model package templates will take care
|
The `load()` method that comes with our model package templates will take care
|
||||||
of putting all this together and returning a `Language` object with the loaded
|
of putting all this together and returning a `Language` object with the loaded
|
||||||
pipeline and data. If your model requires custom
|
pipeline and data. If your model requires custom
|
||||||
[pipeline components](/usage/processing-pipelines) or a custom language class,
|
[pipeline components](/usage/processing-pipelines) or a custom language class,
|
||||||
you can also **ship the code with your model**. For examples of this, check out
|
you can also **ship the code with your model** and include it in the
|
||||||
the implementations of spaCy's
|
`__init__.py` – for example, to register custom
|
||||||
[`load_model_from_init_py`](/api/top-level#util.load_model_from_init_py) and
|
[pipeline components](/usage/processing-pipelines#custom-components) before the
|
||||||
[`load_model_from_path`](/api/top-level#util.load_model_from_path) utility
|
`nlp` object is created.
|
||||||
functions.
|
|
||||||
|
|
||||||
### Building the model package {#models-building}
|
|
||||||
|
|
||||||
To build the package, run the following command from within the directory. For
|
|
||||||
more information on building Python packages, see the docs on Python's
|
|
||||||
[setuptools](https://setuptools.readthedocs.io/en/latest/).
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ python setup.py sdist
|
|
||||||
```
|
|
||||||
|
|
||||||
This will create a `.tar.gz` archive in a directory `/dist`. The model can be
|
|
||||||
installed by pointing pip to the path of the archive:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ pip install /path/to/en_example_model-1.0.0.tar.gz
|
|
||||||
```
|
|
||||||
|
|
||||||
You can then load the model via its name, `en_example_model`, or import it
|
|
||||||
directly as a module and then call its `load()` method.
|
|
||||||
|
|
||||||
### Loading a custom model package {#loading}
|
### Loading a custom model package {#loading}
|
||||||
|
|
||||||
|
|
|
@ -149,12 +149,12 @@ not just define static settings, but also construct objects like architectures,
|
||||||
schedules, optimizers or any other custom components. The main top-level
|
schedules, optimizers or any other custom components. The main top-level
|
||||||
sections of a config file are:
|
sections of a config file are:
|
||||||
|
|
||||||
| Section | Description |
|
| Section | Description |
|
||||||
| ------------- | -------------------------------------------------------------------------------------------------------------------- |
|
| ------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `training` | Settings and controls for the training and evaluation process. |
|
| `training` | Settings and controls for the training and evaluation process. |
|
||||||
| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
|
| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
|
||||||
| `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/docs/processing-pipelines) component names. |
|
| `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. |
|
||||||
| `components` | Definitions of the [pipeline components](/docs/processing-pipelines) and their models. |
|
| `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. |
|
||||||
|
|
||||||
<Infobox title="Config format and settings" emoji="📖">
|
<Infobox title="Config format and settings" emoji="📖">
|
||||||
|
|
||||||
|
@ -328,18 +328,15 @@ spaCy's configs are powered by our machine learning library Thinc's
|
||||||
[type hints](https://docs.python.org/3/library/typing.html) and even
|
[type hints](https://docs.python.org/3/library/typing.html) and even
|
||||||
[advanced type annotations](https://thinc.ai/docs/usage-config#advanced-types)
|
[advanced type annotations](https://thinc.ai/docs/usage-config#advanced-types)
|
||||||
using [`pydantic`](https://github.com/samuelcolvin/pydantic). If your registered
|
using [`pydantic`](https://github.com/samuelcolvin/pydantic). If your registered
|
||||||
function provides For example, `start: int` in the example above will ensure
|
function provides type hints, the values that are passed in will be checked
|
||||||
that the value received as the argument `start` is an integer. If the value
|
against the expected types. For example, `start: int` in the example above will
|
||||||
can't be cast to an integer, spaCy will raise an error.
|
ensure that the value received as the argument `start` is an integer. If the
|
||||||
|
value can't be cast to an integer, spaCy will raise an error.
|
||||||
`start: pydantic.StrictInt` will force the value to be an integer and raise an
|
`start: pydantic.StrictInt` will force the value to be an integer and raise an
|
||||||
error if it's not – for instance, if your config defines a float.
|
error if it's not – for instance, if your config defines a float.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
### Defining custom architectures {#custom-architectures}
|
|
||||||
|
|
||||||
<!-- TODO: this could maybe be a more general example of using Thinc to compose some layers? We don't want to go too deep here and probably want to focus on a simple architecture example to show how it works -->
|
|
||||||
|
|
||||||
### Wrapping PyTorch and TensorFlow {#custom-frameworks}
|
### Wrapping PyTorch and TensorFlow {#custom-frameworks}
|
||||||
|
|
||||||
<!-- TODO: -->
|
<!-- TODO: -->
|
||||||
|
@ -352,6 +349,10 @@ mattis pretium.
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|
||||||
|
### Defining custom architectures {#custom-architectures}
|
||||||
|
|
||||||
|
<!-- TODO: this could maybe be a more general example of using Thinc to compose some layers? We don't want to go too deep here and probably want to focus on a simple architecture example to show how it works -->
|
||||||
|
|
||||||
## Parallel Training with Ray {#parallel-training}
|
## Parallel Training with Ray {#parallel-training}
|
||||||
|
|
||||||
<!-- TODO: document Ray integration -->
|
<!-- TODO: document Ray integration -->
|
||||||
|
@ -445,19 +446,6 @@ annotations:
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
> - **Training data**: The training examples.
|
|
||||||
> - **Text and label**: The current example.
|
|
||||||
> - **Doc**: A `Doc` object created from the example text.
|
|
||||||
> - **Example**: An `Example` object holding both predictions and gold-standard
|
|
||||||
> annotations.
|
|
||||||
> - **nlp**: The `nlp` object with the model.
|
|
||||||
> - **Optimizer**: A function that holds state between updates.
|
|
||||||
> - **Update**: Update the model's weights.
|
|
||||||
|
|
||||||
<!-- TODO: update graphic & related text -->
|
|
||||||
|
|
||||||
![The training loop](../images/training-loop.svg)
|
|
||||||
|
|
||||||
Of course, it's not enough to only show a model a single example once.
|
Of course, it's not enough to only show a model a single example once.
|
||||||
Especially if you only have few examples, you'll want to train for a **number of
|
Especially if you only have few examples, you'll want to train for a **number of
|
||||||
iterations**. At each iteration, the training data is **shuffled** to ensure the
|
iterations**. At each iteration, the training data is **shuffled** to ensure the
|
||||||
|
@ -468,12 +456,16 @@ it harder for the model to memorize the training data. For example, a `0.25`
|
||||||
dropout means that each feature or internal representation has a 1/4 likelihood
|
dropout means that each feature or internal representation has a 1/4 likelihood
|
||||||
of being dropped.
|
of being dropped.
|
||||||
|
|
||||||
> - [`begin_training`](/api/language#begin_training): Start the training and
|
> - [`nlp`](/api/language): The `nlp` object with the model.
|
||||||
> return an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object to
|
> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
|
||||||
> update the model's weights.
|
> return an optimizer to update the model's weights.
|
||||||
> - [`update`](/api/language#update): Update the model with the training
|
> - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
|
||||||
> examplea.
|
> state between updates.
|
||||||
> - [`to_disk`](/api/language#to_disk): Save the updated model to a directory.
|
> - [`nlp.update`](/api/language#update): Update model with examples.
|
||||||
|
> - [`Example`](/api/example): object holding predictions and gold-standard
|
||||||
|
> annotations.
|
||||||
|
> - [`nlp.to_disk`](/api/language#to_disk): Save the updated model to a
|
||||||
|
> directory.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Example training loop
|
### Example training loop
|
||||||
|
|
6
website/docs/usage/transformers.md
Normal file
6
website/docs/usage/transformers.md
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
---
|
||||||
|
title: Transformers
|
||||||
|
teaser: Using transformer models like BERT in spaCy
|
||||||
|
---
|
||||||
|
|
||||||
|
TODO: ...
|
|
@ -6,6 +6,7 @@ menu:
|
||||||
- ['New Features', 'features']
|
- ['New Features', 'features']
|
||||||
- ['Backwards Incompatibilities', 'incompat']
|
- ['Backwards Incompatibilities', 'incompat']
|
||||||
- ['Migrating from v2.x', 'migrating']
|
- ['Migrating from v2.x', 'migrating']
|
||||||
|
- ['Migrating plugins', 'plugins']
|
||||||
---
|
---
|
||||||
|
|
||||||
## Summary {#summary}
|
## Summary {#summary}
|
||||||
|
@ -14,20 +15,250 @@ menu:
|
||||||
|
|
||||||
## Backwards Incompatibilities {#incompat}
|
## Backwards Incompatibilities {#incompat}
|
||||||
|
|
||||||
### Removed deprecated methods, attributes and arguments {#incompat-removed}
|
### Removed or renamed objects, methods, attributes and arguments {#incompat-removed}
|
||||||
|
|
||||||
|
| Removed | Replacement |
|
||||||
|
| -------------------------------------------------------- | ----------------------------------------- |
|
||||||
|
| `GoldParse` | [`Example`](/api/example) |
|
||||||
|
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
|
||||||
|
| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated |
|
||||||
|
|
||||||
|
### Removed deprecated methods, attributes and arguments {#incompat-removed-deprecated}
|
||||||
|
|
||||||
The following deprecated methods, attributes and arguments were removed in v3.0.
|
The following deprecated methods, attributes and arguments were removed in v3.0.
|
||||||
Most of them have been deprecated for quite a while now and many would
|
Most of them have been **deprecated for a while** and many would previously
|
||||||
previously raise errors. Many of them were also mostly internals. If you've been
|
raise errors. Many of them were also mostly internals. If you've been working
|
||||||
working with more recent versions of spaCy v2.x, it's unlikely that your code
|
with more recent versions of spaCy v2.x, it's **unlikely** that your code relied
|
||||||
relied on them.
|
on them.
|
||||||
|
|
||||||
| Class | Removed |
|
| Removed | Replacement |
|
||||||
| --------------------- | ------------------------------------------------------- |
|
| ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| [`Doc`](/api/doc) | `Doc.tokens_from_list`, `Doc.merge` |
|
| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) |
|
||||||
| [`Span`](/api/span) | `Span.merge`, `Span.upper`, `Span.lower`, `Span.string` |
|
| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) |
|
||||||
| [`Token`](/api/token) | `Token.string` |
|
| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) |
|
||||||
|
| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) |
|
||||||
<!-- TODO: complete (see release notes Dropbox Paper doc) -->
|
| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` |
|
||||||
|
| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` |
|
||||||
|
| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer) |
|
||||||
|
|
||||||
## Migrating from v2.x {#migrating}
|
## Migrating from v2.x {#migrating}
|
||||||
|
|
||||||
|
### Custom pipeline components and factories {#migrating-pipeline-components}
|
||||||
|
|
||||||
|
Custom pipeline components now have to be registered explicitly using the
|
||||||
|
[`@Language.component`](/api/language#component) or
|
||||||
|
[`@Language.factory`](/api/language#factory) decorator. For simple functions
|
||||||
|
that take a `Doc` and return it, all you have to do is add the
|
||||||
|
`@Language.component` decorator to it and assign it a name:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
### Stateless function components
|
||||||
|
+ from spacy.language import Language
|
||||||
|
|
||||||
|
+ @Language.component("my_component")
|
||||||
|
def my_component(doc):
|
||||||
|
return doc
|
||||||
|
```
|
||||||
|
|
||||||
|
For class components that are initialized with settings and/or the shared `nlp`
|
||||||
|
object, you can use the `@Language.factory` decorator. Also make sure that
|
||||||
|
the method used to initialize the factory has **two named arguments**: `nlp`
|
||||||
|
(the current `nlp` object) and `name` (the string name of the component
|
||||||
|
instance).
|
||||||
|
|
||||||
|
```diff
|
||||||
|
### Stateful class components
|
||||||
|
+ from spacy.language import Language
|
||||||
|
|
||||||
|
+ @Language.factory("my_component")
|
||||||
|
class MyComponent:
|
||||||
|
- def __init__(self, nlp):
|
||||||
|
+ def __init__(self, nlp, name):
|
||||||
|
self.nlp = nlp
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
return doc
|
||||||
|
```
|
||||||
|
|
||||||
|
Instead of decorating your class, you could also add a factory function that
|
||||||
|
takes the arguments `nlp` and `name` and returns an instance of your component:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
### Stateful class components with factory function
|
||||||
|
+ from spacy.language import Language
|
||||||
|
|
||||||
|
+ @Language.factory("my_component")
|
||||||
|
+ def create_my_component(nlp, name):
|
||||||
|
+ return MyComponent(nlp)
|
||||||
|
|
||||||
|
class MyComponent:
|
||||||
|
def __init__(self, nlp):
|
||||||
|
self.nlp = nlp
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
return doc
|
||||||
|
```
|
||||||
|
|
||||||
|
The `@Language.component` and `@Language.factory` decorators now take care of
|
||||||
|
adding an entry to the component factories, so spaCy knows how to load a
|
||||||
|
component back in from its string name. You won't have to write to
|
||||||
|
`Language.factories` manually anymore.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- Language.factories["my_component"] = lambda nlp, **cfg: MyComponent(nlp)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Adding components to the pipeline {#migrating-add-pipe}
|
||||||
|
|
||||||
|
The [`nlp.add_pipe`](/api/language#add_pipe) method now takes the **string
|
||||||
|
name** of the component factory instead of a callable component. This allows
|
||||||
|
spaCy to track and serialize components that have been added and their settings.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
+ @Language.component("my_component")
|
||||||
|
def my_component(doc):
|
||||||
|
return doc
|
||||||
|
|
||||||
|
- nlp.add_pipe(my_component)
|
||||||
|
+ nlp.add_pipe("my_component")
|
||||||
|
```
|
||||||
|
|
||||||
|
[`nlp.add_pipe`](/api/language#add_pipe) now also returns the pipeline component
|
||||||
|
itself, so you can access its attributes. The
|
||||||
|
[`nlp.create_pipe`](/api/language#create_pipe) method is now mostly internals
|
||||||
|
and you typically shouldn't have to use it in your code.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- parser = nlp.create_pipe("parser")
|
||||||
|
- nlp.add_pipe(parser)
|
||||||
|
+ parser = nlp.add_pipe("parser")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Training models {#migrating-training}
|
||||||
|
|
||||||
|
To train your models, you should now pretty much always use the
|
||||||
|
[`spacy train`](/api/cli#train) CLI. You shouldn't have to put together your own
|
||||||
|
training scripts anymore, unless you _really_ want to. The training commands now
|
||||||
|
use a [flexible config file](/usage/training#config) that describes all training
|
||||||
|
settings and hyperparameters, as well as your pipeline, model components and
|
||||||
|
architectures to use. The `--code` argument lets you pass in code containing
|
||||||
|
[custom registered functions](/usage/training#custom-code) that you can
|
||||||
|
reference in your config.
|
||||||
|
|
||||||
|
#### Binary .spacy training data format {#migrating-training-format}
|
||||||
|
|
||||||
|
spaCy now uses a new
|
||||||
|
[binary training data format](/api/data-formats#binary-training), which is much
|
||||||
|
smaller and consists of `Doc` objects, serialized via the
|
||||||
|
[`DocBin`](/api/docbin). You can convert your existing JSON-formatted data using
|
||||||
|
the [`spacy convert`](/api/cli#convert) command, which outputs `.spacy` files:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy convert ./training.json ./output
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Training config {#migrating-training-config}
|
||||||
|
|
||||||
|
<!-- TODO: update once we have recommended "getting started with a new config" workflow -->
|
||||||
|
|
||||||
|
```diff
|
||||||
|
### {wrap="true"}
|
||||||
|
- python -m spacy train en ./output ./train.json ./dev.json --pipeline tagger,parser --cnn-window 1 --bilstm-depth 0
|
||||||
|
+ python -m spacy train ./train.spacy ./dev.spacy ./config.cfg --output ./output
|
||||||
|
```
|
||||||
|
|
||||||
|
<Project id="some_example_project">
|
||||||
|
|
||||||
|
The easiest way to get started with an end-to-end training process is to clone a
|
||||||
|
[project](/usage/projects) template. Projects let you manage multi-step
|
||||||
|
workflows, from data preprocessing to training and packaging your model.
|
||||||
|
|
||||||
|
</Project>
|
||||||
|
|
||||||
|
#### Migrating training scripts to CLI command and config {#migrating-training-scripts}
|
||||||
|
|
||||||
|
<!-- TODO: write -->
|
||||||
|
|
||||||
|
#### Packaging models {#migrating-training-packaging}
|
||||||
|
|
||||||
|
The [`spacy package`](/api/cli#package) command now automatically builds the
|
||||||
|
installable `.tar.gz` sdist of the Python package, so you don't have to run this
|
||||||
|
step manually anymore. You can disable the behavior by setting the `--no-sdist`
|
||||||
|
flag.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
python -m spacy package ./model ./packages
|
||||||
|
- cd /output/en_model-0.0.0
|
||||||
|
- python setup.py sdist
|
||||||
|
```
|
||||||
|
|
||||||
|
## Migration notes for plugin maintainers {#plugins}
|
||||||
|
|
||||||
|
Thanks to everyone who's been contributing to the spaCy ecosystem by developing
|
||||||
|
and maintaining one of the many awesome [plugins and extensions](/universe).
|
||||||
|
We've tried to keep breaking changes to a minimum and make it as easy as
|
||||||
|
possible for you to upgrade your packages for spaCy v3.
|
||||||
|
|
||||||
|
### Custom pipeline components
|
||||||
|
|
||||||
|
The most common use case for plugins is providing pipeline components and
|
||||||
|
extension attributes.
|
||||||
|
|
||||||
|
- Use the [`@Language.factory`](/api/language#factory) decorator to register
|
||||||
|
your component and assign it a name. This allows users to refer to your
|
||||||
|
components by name and serialize pipelines referencing them. Remove all manual
|
||||||
|
entries to the `Language.factories`.
|
||||||
|
- Make sure your component factories take at least two **named arguments**:
|
||||||
|
`nlp` (the current `nlp` object) and `name` (the instance name of the added
|
||||||
|
component so you can identify multiple instances of the same component).
|
||||||
|
- Update all references to [`nlp.add_pipe`](/api/language#add_pipe) in your docs
|
||||||
|
to use **string names** instead of the component functions.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### {highlight="1-5"}
|
||||||
|
from spacy.language import Language
|
||||||
|
|
||||||
|
@Language.factory("my_component", default_config={"some_setting": False})
|
||||||
|
def create_component(nlp: Language, name: str, some_setting: bool):
|
||||||
|
return MyCoolComponent(some_setting=some_setting)
|
||||||
|
|
||||||
|
|
||||||
|
class MyCoolComponent:
|
||||||
|
def __init__(self, some_setting):
|
||||||
|
self.some_setting = some_setting
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
# Do something to the doc
|
||||||
|
return doc
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Result in config.cfg
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [components.my_component]
|
||||||
|
> factory = "my_component"
|
||||||
|
> some_setting = true
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```diff
|
||||||
|
import spacy
|
||||||
|
from your_plugin import MyCoolComponent
|
||||||
|
|
||||||
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
- component = MyCoolComponent(some_setting=True)
|
||||||
|
- nlp.add_pipe(component)
|
||||||
|
+ nlp.add_pipe("my_component", config={"some_setting": True})
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Important note on registering factories" variant="warning">
|
||||||
|
|
||||||
|
The [`@Language.factory`](/api/language#factory) decorator takes care of letting
|
||||||
|
spaCy know that a component of that name is available. This means that your
|
||||||
|
users can add it to the pipeline using its **string name**. However, this
|
||||||
|
requires the decorator to be executed – so users will still have to **import
|
||||||
|
your plugin**. Alternatively, your plugin could expose an
|
||||||
|
[entry point](/usage/saving-loading#entry-points), which spaCy can read from.
|
||||||
|
This means that spaCy knows how to initialize `my_component`, even if your
|
||||||
|
package isn't imported.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
|
@ -229,3 +229,5 @@ vectors.data = torch.Tensor(vectors.data).cuda(0)
|
||||||
## Other embeddings {#embeddings}
|
## Other embeddings {#embeddings}
|
||||||
|
|
||||||
<!-- TODO: explain spacy-transformers, doc.tensor, tok2vec? -->
|
<!-- TODO: explain spacy-transformers, doc.tensor, tok2vec? -->
|
||||||
|
|
||||||
|
<!-- TODO: mention sense2vec somewhere? -->
|
||||||
|
|
|
@ -14,6 +14,10 @@ function getNodeTitle({ childMdx }) {
|
||||||
return (frontmatter.title || '').replace("'", '’')
|
return (frontmatter.title || '').replace("'", '’')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function findNode(pages, slug) {
|
||||||
|
return slug ? pages.find(({ node }) => node.fields.slug === slug) : null
|
||||||
|
}
|
||||||
|
|
||||||
exports.createPages = ({ graphql, actions }) => {
|
exports.createPages = ({ graphql, actions }) => {
|
||||||
const { createPage } = actions
|
const { createPage } = actions
|
||||||
|
|
||||||
|
@ -70,6 +74,9 @@ exports.createPages = ({ graphql, actions }) => {
|
||||||
title
|
title
|
||||||
teaser
|
teaser
|
||||||
source
|
source
|
||||||
|
api_base_class
|
||||||
|
api_string_name
|
||||||
|
api_trainable
|
||||||
tag
|
tag
|
||||||
new
|
new
|
||||||
next
|
next
|
||||||
|
@ -115,10 +122,18 @@ exports.createPages = ({ graphql, actions }) => {
|
||||||
const section = frontmatter.section || page.node.relativeDirectory
|
const section = frontmatter.section || page.node.relativeDirectory
|
||||||
const sectionMeta = sections[section] || {}
|
const sectionMeta = sections[section] || {}
|
||||||
const title = getNodeTitle(page.node)
|
const title = getNodeTitle(page.node)
|
||||||
const next = frontmatter.next
|
const next = findNode(pages, frontmatter.next)
|
||||||
? pages.find(({ node }) => node.fields.slug === frontmatter.next)
|
const baseClass = findNode(pages, frontmatter.api_base_class)
|
||||||
: null
|
const apiDetails = {
|
||||||
|
stringName: frontmatter.api_string_name,
|
||||||
|
baseClass: baseClass
|
||||||
|
? {
|
||||||
|
title: getNodeTitle(baseClass.node),
|
||||||
|
slug: frontmatter.api_base_class,
|
||||||
|
}
|
||||||
|
: null,
|
||||||
|
trainable: frontmatter.api_trainable,
|
||||||
|
}
|
||||||
createPage({
|
createPage({
|
||||||
path: replacePath(page.node.fields.slug),
|
path: replacePath(page.node.fields.slug),
|
||||||
component: DEFAULT_TEMPLATE,
|
component: DEFAULT_TEMPLATE,
|
||||||
|
@ -131,6 +146,7 @@ exports.createPages = ({ graphql, actions }) => {
|
||||||
sectionTitle: sectionMeta.title,
|
sectionTitle: sectionMeta.title,
|
||||||
menu: frontmatter.menu || [],
|
menu: frontmatter.menu || [],
|
||||||
teaser: frontmatter.teaser,
|
teaser: frontmatter.teaser,
|
||||||
|
apiDetails,
|
||||||
source: frontmatter.source,
|
source: frontmatter.source,
|
||||||
sidebar: frontmatter.sidebar,
|
sidebar: frontmatter.sidebar,
|
||||||
tag: frontmatter.tag,
|
tag: frontmatter.tag,
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
{ "text": "Rule-based Matching", "url": "/usage/rule-based-matching" },
|
{ "text": "Rule-based Matching", "url": "/usage/rule-based-matching" },
|
||||||
{ "text": "Processing Pipelines", "url": "/usage/processing-pipelines" },
|
{ "text": "Processing Pipelines", "url": "/usage/processing-pipelines" },
|
||||||
{ "text": "Vectors & Embeddings", "url": "/usage/vectors-embeddings" },
|
{ "text": "Vectors & Embeddings", "url": "/usage/vectors-embeddings" },
|
||||||
|
{ "text": "Transformers", "url": "/usage/transformers", "tag": "new" },
|
||||||
{ "text": "Training Models", "url": "/usage/training", "tag": "new" },
|
{ "text": "Training Models", "url": "/usage/training", "tag": "new" },
|
||||||
{ "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" },
|
{ "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" },
|
||||||
{ "text": "Saving & Loading", "url": "/usage/saving-loading" },
|
{ "text": "Saving & Loading", "url": "/usage/saving-loading" },
|
||||||
|
@ -66,6 +67,7 @@
|
||||||
{
|
{
|
||||||
"label": "Containers",
|
"label": "Containers",
|
||||||
"items": [
|
"items": [
|
||||||
|
{ "text": "Language", "url": "/api/language" },
|
||||||
{ "text": "Doc", "url": "/api/doc" },
|
{ "text": "Doc", "url": "/api/doc" },
|
||||||
{ "text": "Token", "url": "/api/token" },
|
{ "text": "Token", "url": "/api/token" },
|
||||||
{ "text": "Span", "url": "/api/span" },
|
{ "text": "Span", "url": "/api/span" },
|
||||||
|
@ -77,7 +79,6 @@
|
||||||
{
|
{
|
||||||
"label": "Pipeline",
|
"label": "Pipeline",
|
||||||
"items": [
|
"items": [
|
||||||
{ "text": "Language", "url": "/api/language" },
|
|
||||||
{ "text": "Tokenizer", "url": "/api/tokenizer" },
|
{ "text": "Tokenizer", "url": "/api/tokenizer" },
|
||||||
{ "text": "Tok2Vec", "url": "/api/tok2vec" },
|
{ "text": "Tok2Vec", "url": "/api/tok2vec" },
|
||||||
{ "text": "Lemmatizer", "url": "/api/lemmatizer" },
|
{ "text": "Lemmatizer", "url": "/api/lemmatizer" },
|
||||||
|
@ -85,14 +86,21 @@
|
||||||
{ "text": "Tagger", "url": "/api/tagger" },
|
{ "text": "Tagger", "url": "/api/tagger" },
|
||||||
{ "text": "DependencyParser", "url": "/api/dependencyparser" },
|
{ "text": "DependencyParser", "url": "/api/dependencyparser" },
|
||||||
{ "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
|
{ "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
|
||||||
|
{ "text": "EntityRuler", "url": "/api/entityruler" },
|
||||||
{ "text": "EntityLinker", "url": "/api/entitylinker" },
|
{ "text": "EntityLinker", "url": "/api/entitylinker" },
|
||||||
{ "text": "TextCategorizer", "url": "/api/textcategorizer" },
|
{ "text": "TextCategorizer", "url": "/api/textcategorizer" },
|
||||||
{ "text": "Matcher", "url": "/api/matcher" },
|
|
||||||
{ "text": "PhraseMatcher", "url": "/api/phrasematcher" },
|
|
||||||
{ "text": "EntityRuler", "url": "/api/entityruler" },
|
|
||||||
{ "text": "Sentencizer", "url": "/api/sentencizer" },
|
{ "text": "Sentencizer", "url": "/api/sentencizer" },
|
||||||
{ "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
|
{ "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
|
||||||
{ "text": "Other Functions", "url": "/api/pipeline-functions" }
|
{ "text": "Other Functions", "url": "/api/pipeline-functions" },
|
||||||
|
{ "text": "Pipe", "url": "/api/pipe" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "Matchers",
|
||||||
|
"items": [
|
||||||
|
{ "text": "Matcher", "url": "/api/matcher" },
|
||||||
|
{ "text": "PhraseMatcher", "url": "/api/phrasematcher" },
|
||||||
|
{ "text": "DependencyMatcher", "url": "/api/dependencymatcher" }
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -87,6 +87,17 @@ const Link = ({
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export const OptionalLink = ({ to, href, children, ...props }) => {
|
||||||
|
const dest = to || href
|
||||||
|
return dest ? (
|
||||||
|
<Link to={dest} {...props}>
|
||||||
|
{children}
|
||||||
|
</Link>
|
||||||
|
) : (
|
||||||
|
children || null
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
Link.defaultProps = {
|
Link.defaultProps = {
|
||||||
hidden: false,
|
hidden: false,
|
||||||
hideIcon: false,
|
hideIcon: false,
|
||||||
|
|
|
@ -4,43 +4,105 @@ import classNames from 'classnames'
|
||||||
|
|
||||||
import Button from './button'
|
import Button from './button'
|
||||||
import Tag from './tag'
|
import Tag from './tag'
|
||||||
import { H1 } from './typography'
|
import { OptionalLink } from './link'
|
||||||
|
import { InlineCode } from './code'
|
||||||
|
import { H1, Label, InlineList, Help } from './typography'
|
||||||
|
import Icon from './icon'
|
||||||
|
|
||||||
import classes from '../styles/title.module.sass'
|
import classes from '../styles/title.module.sass'
|
||||||
|
|
||||||
const Title = ({ id, title, tag, version, teaser, source, image, children, ...props }) => (
|
const MetaItem = ({ label, url, children, help }) => (
|
||||||
<header className={classes.root}>
|
<span>
|
||||||
{(image || source) && (
|
<Label className={classes.label}>{label}:</Label>
|
||||||
<div className={classes.corner}>
|
<OptionalLink to={url}>{children}</OptionalLink>
|
||||||
{source && (
|
{help && (
|
||||||
<Button to={source} icon="code">
|
<>
|
||||||
Source
|
{' '}
|
||||||
</Button>
|
<Help>{help}</Help>
|
||||||
)}
|
</>
|
||||||
|
|
||||||
{image && (
|
|
||||||
<div className={classes.image}>
|
|
||||||
<img src={image} width={100} height={100} alt="" />
|
|
||||||
</div>
|
|
||||||
)}
|
|
||||||
</div>
|
|
||||||
)}
|
)}
|
||||||
<H1 className={classes.h1} id={id} {...props}>
|
</span>
|
||||||
{title}
|
|
||||||
</H1>
|
|
||||||
{tag && <Tag spaced>{tag}</Tag>}
|
|
||||||
{version && (
|
|
||||||
<Tag variant="new" spaced>
|
|
||||||
{version}
|
|
||||||
</Tag>
|
|
||||||
)}
|
|
||||||
|
|
||||||
{teaser && <div className={classNames('heading-teaser', classes.teaser)}>{teaser}</div>}
|
|
||||||
|
|
||||||
{children}
|
|
||||||
</header>
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const Title = ({
|
||||||
|
id,
|
||||||
|
title,
|
||||||
|
tag,
|
||||||
|
version,
|
||||||
|
teaser,
|
||||||
|
source,
|
||||||
|
image,
|
||||||
|
apiDetails,
|
||||||
|
children,
|
||||||
|
...props
|
||||||
|
}) => {
|
||||||
|
const hasApiDetails = Object.values(apiDetails).some(v => v)
|
||||||
|
const metaIconProps = { className: classes.metaIcon, width: 18 }
|
||||||
|
return (
|
||||||
|
<header className={classes.root}>
|
||||||
|
{(image || source) && (
|
||||||
|
<div className={classes.corner}>
|
||||||
|
{source && (
|
||||||
|
<Button to={source} icon="code">
|
||||||
|
Source
|
||||||
|
</Button>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{image && (
|
||||||
|
<div className={classes.image}>
|
||||||
|
<img src={image} width={100} height={100} alt="" />
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
<H1 className={classes.h1} id={id} {...props}>
|
||||||
|
{title}
|
||||||
|
</H1>
|
||||||
|
{(tag || version) && (
|
||||||
|
<div className={classes.tags}>
|
||||||
|
{tag && <Tag spaced>{tag}</Tag>}
|
||||||
|
{version && (
|
||||||
|
<Tag variant="new" spaced>
|
||||||
|
{version}
|
||||||
|
</Tag>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{hasApiDetails && (
|
||||||
|
<InlineList Component="div" className={classes.teaser}>
|
||||||
|
{apiDetails.stringName && (
|
||||||
|
<MetaItem
|
||||||
|
label="String name"
|
||||||
|
//help="String name of the component to use with nlp.add_pipe"
|
||||||
|
>
|
||||||
|
<InlineCode>{apiDetails.stringName}</InlineCode>
|
||||||
|
</MetaItem>
|
||||||
|
)}
|
||||||
|
{apiDetails.baseClass && (
|
||||||
|
<MetaItem label="Base class" url={apiDetails.baseClass.slug}>
|
||||||
|
<InlineCode>{apiDetails.baseClass.title}</InlineCode>
|
||||||
|
</MetaItem>
|
||||||
|
)}
|
||||||
|
{apiDetails.trainable != null && (
|
||||||
|
<MetaItem label="Trainable">
|
||||||
|
<span aria-label={apiDetails.trainable ? 'yes' : 'no'}>
|
||||||
|
{apiDetails.trainable ? (
|
||||||
|
<Icon name="yes" variant="success" {...metaIconProps} />
|
||||||
|
) : (
|
||||||
|
<Icon name="no" {...metaIconProps} />
|
||||||
|
)}
|
||||||
|
</span>
|
||||||
|
</MetaItem>
|
||||||
|
)}
|
||||||
|
</InlineList>
|
||||||
|
)}
|
||||||
|
{teaser && <div className={classNames('heading-teaser', classes.teaser)}>{teaser}</div>}
|
||||||
|
{children}
|
||||||
|
</header>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
Title.propTypes = {
|
Title.propTypes = {
|
||||||
title: PropTypes.string,
|
title: PropTypes.string,
|
||||||
tag: PropTypes.string,
|
tag: PropTypes.string,
|
||||||
|
|
|
@ -59,9 +59,9 @@ export const InlineList = ({ Component = 'p', gutterBottom = true, className, ch
|
||||||
return <Component className={listClassNames}>{children}</Component>
|
return <Component className={listClassNames}>{children}</Component>
|
||||||
}
|
}
|
||||||
|
|
||||||
export const Help = ({ children }) => (
|
export const Help = ({ children, size = 16 }) => (
|
||||||
<span className={classes.help} data-tooltip={children}>
|
<span className={classes.help} data-tooltip={children}>
|
||||||
<Icon name="help2" width={16} />
|
<Icon name="help2" width={size} />
|
||||||
</span>
|
</span>
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -414,7 +414,7 @@ body [id]:target
|
||||||
.cm-number
|
.cm-number
|
||||||
color: var(--syntax-number)
|
color: var(--syntax-number)
|
||||||
|
|
||||||
.cm-def
|
.cm-def, .cm-meta
|
||||||
color: var(--syntax-function)
|
color: var(--syntax-function)
|
||||||
|
|
||||||
// Jupyter
|
// Jupyter
|
||||||
|
|
|
@ -17,13 +17,17 @@
|
||||||
background: var(--color-subtle-opaque)
|
background: var(--color-subtle-opaque)
|
||||||
|
|
||||||
.footer
|
.footer
|
||||||
background: var(--color-theme-light)
|
--color-inline-code-bg: var(--color-theme-opaque)
|
||||||
|
background: var(--color-theme-light) !important
|
||||||
border-top: 2px solid var(--color-theme)
|
border-top: 2px solid var(--color-theme)
|
||||||
|
|
||||||
& > td:first-child
|
& > td:first-child
|
||||||
font-family: var(--font-secondary)
|
font-family: var(--font-secondary)
|
||||||
color: var(--color-theme)
|
color: var(--color-theme)
|
||||||
|
|
||||||
|
& > td:nth-child(2) a
|
||||||
|
border: 0
|
||||||
|
|
||||||
.td
|
.td
|
||||||
padding: 1rem
|
padding: 1rem
|
||||||
|
|
||||||
|
@ -79,6 +83,13 @@
|
||||||
white-space: nowrap
|
white-space: nowrap
|
||||||
z-index: 5
|
z-index: 5
|
||||||
|
|
||||||
|
&:first-child // directly after thead/tr
|
||||||
|
border-bottom: 0
|
||||||
|
|
||||||
|
td em
|
||||||
|
top: -7px
|
||||||
|
|
||||||
|
|
||||||
// Responsive table
|
// Responsive table
|
||||||
// Shadows adapted from "CSS only Responsive Tables" by David Bushell
|
// Shadows adapted from "CSS only Responsive Tables" by David Bushell
|
||||||
// http://codepen.io/dbushell/pen/wGaamR
|
// http://codepen.io/dbushell/pen/wGaamR
|
||||||
|
|
|
@ -34,3 +34,16 @@
|
||||||
|
|
||||||
.corner
|
.corner
|
||||||
float: right
|
float: right
|
||||||
|
|
||||||
|
.label
|
||||||
|
display: inline
|
||||||
|
margin-right: var(--spacing-xs)
|
||||||
|
|
||||||
|
.meta-icon
|
||||||
|
position: relative
|
||||||
|
top: -2px
|
||||||
|
|
||||||
|
.tags
|
||||||
|
display: inline-block
|
||||||
|
position: relative
|
||||||
|
top: 0.5rem
|
||||||
|
|
|
@ -30,6 +30,7 @@ const Docs = ({ pageContext, children }) => (
|
||||||
menu,
|
menu,
|
||||||
theme,
|
theme,
|
||||||
version,
|
version,
|
||||||
|
apiDetails,
|
||||||
} = pageContext
|
} = pageContext
|
||||||
const { sidebars = [], modelsRepo, languages, nightly } = site.siteMetadata
|
const { sidebars = [], modelsRepo, languages, nightly } = site.siteMetadata
|
||||||
const isModels = section === 'models'
|
const isModels = section === 'models'
|
||||||
|
@ -102,6 +103,7 @@ const Docs = ({ pageContext, children }) => (
|
||||||
tag={tag}
|
tag={tag}
|
||||||
version={version}
|
version={version}
|
||||||
id="_title"
|
id="_title"
|
||||||
|
apiDetails={apiDetails}
|
||||||
/>
|
/>
|
||||||
{children}
|
{children}
|
||||||
{subFooter}
|
{subFooter}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user