diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 7368bcef3..c5f60adfc 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -16,6 +16,7 @@ from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
+from .init_labels import init_labels_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
diff --git a/spacy/cli/init_labels.py b/spacy/cli/init_labels.py
new file mode 100644
index 000000000..29cb23072
--- /dev/null
+++ b/spacy/cli/init_labels.py
@@ -0,0 +1,50 @@
+from typing import Optional
+import logging
+from pathlib import Path
+from wasabi import msg
+import typer
+import srsly
+
+from .. import util
+from ..training.initialize import init_nlp, convert_vectors
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu
+
+
+@init_cli.command(
+    "labels",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def init_labels_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the labels"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    """Build the pipeline described by a config file and save each
+    component's label data as JSON.
+
+    Every component whose `label_data` attribute is not None is written to
+    `<output_path>/<component name>.json`; the others are reported and
+    skipped. Extra command-line arguments are parsed as config overrides.
+    """
+    # Create missing parent directories too, so nested output paths work.
+    output_path.mkdir(parents=True, exist_ok=True)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
+    for name, component in nlp.pipeline:
+        if getattr(component, "label_data", None) is not None:
+            srsly.write_json(output_path / f"{name}.json", component.label_data)
+            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
+        else:
+            msg.info(f"No labels found for {name}")
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 9b28a7ca1..c9798a638 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional
+from typing import Optional, Union, Dict
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from itertools import islice
@@ -101,6 +101,11 @@ class Morphologizer(Tagger):
         """RETURNS (Tuple[str]): The labels currently added to the component."""
         return tuple(self.cfg["labels_morph"].keys())
 
+    @property
+    def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
+        """RETURNS (Dict): A dictionary with all labels data."""
+        return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]}
+
     def add_label(self, label):
         """Add a new label to the pipe.
 
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index b8961f307..481430a2c 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True
+from typing import Optional, Tuple
 import srsly
 from thinc.api import set_dropout_rate, Model
 
@@ -32,6 +33,21 @@ cdef class Pipe:
         self.name = name
         self.cfg = dict(cfg)
 
+    @property
+    def labels(self) -> Optional[Tuple[str]]:
+        """RETURNS (Optional[Tuple[str]]): The configured labels, or None if unset."""
+        if "labels" in self.cfg:
+            return tuple(self.cfg["labels"])
+        else:
+            return None
+
+    @property
+    def label_data(self):
+        """Optional JSON-serializable data that would be sufficient to recreate
+        the label set if provided to the `pipe.initialize()` method.
+        """
+        return None
+
     def __call__(self, Doc doc):
         """Apply the pipe to one document. The document is modified in place,
         and returned. This usually happens under the hood when the nlp object
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index ec635de5c..65c17c771 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -71,6 +71,11 @@ class SentenceRecognizer(Tagger):
         # are 0
         return tuple(["I", "S"])
 
+    @property
+    def label_data(self):
+        """RETURNS (Tuple[str]): The same value as the `labels` property."""
+        return self.labels
+
     def set_annotations(self, docs, batch_tag_ids):
         """Modify a batch of documents, using pre-computed scores.
 
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 3d5aca14e..253b6f08c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -90,6 +90,16 @@ class Tagger(Pipe):
         """
         return tuple(self.cfg["labels"])
 
+    @property
+    def label_data(self):
+        """Data about the labels currently added to the component.
+
+        RETURNS (Tuple[str]): The labels data.
+
+        DOCS: https://nightly.spacy.io/api/tagger#labels
+        """
+        return tuple(self.cfg["labels"])
+
     def __call__(self, doc):
         """Apply the pipe to a Doc.
 
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index ea058ad31..63b040333 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -154,8 +154,23 @@ class TextCategorizer(Pipe):
 
     @labels.setter
     def labels(self, value: List[str]) -> None:
+        # TODO: This really shouldn't be here. I had a look and I added it when
+        # I added the labels property, but it's pretty nasty to have this, and
+        # will lead to problems.
         self.cfg["labels"] = tuple(value)
 
+    @property
+    def label_data(self) -> Dict:
+        """RETURNS (Dict): Information about the component's labels.
+
+        DOCS: https://nightly.spacy.io/api/textcategorizer#labels
+        """
+        return {
+            "labels": self.labels,
+            "positive": self.cfg["positive_label"],
+            "threshold": self.cfg["threshold"],
+        }
+
     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index c250d2522..9f165cb15 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -95,6 +95,11 @@ cdef class Parser(Pipe):
         class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
         return class_names
 
+    @property
+    def label_data(self):
+        """RETURNS: The label data held by `self.moves` (its `labels` attribute)."""
+        return self.moves.labels
+
     @property
     def tok2vec(self):
         """Return the embedding and convolutional layer of the model."""