Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare

2025-12-22 09:34:23 +03:00 · 2020-09-29 16:53:48 +02:00 · 2020-09-29 16:53:48 +02:00 · 30c76dbd67
commit 30c76dbd67
parent fd594cfb9b e957d66b92
8 changed files with 98 additions and 1 deletions
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -16,6 +16,7 @@ from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
+from .init_labels import init_labels_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
--- a/spacy/cli/init_labels.py
+++ b/spacy/cli/init_labels.py
@ -0,0 +1,43 @@
+from typing import Optional
+import logging
+from pathlib import Path
+from wasabi import msg
+import typer
+import srsly
+
+from .. import util
+from ..training.initialize import init_nlp, convert_vectors
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu
+
+
+@init_cli.command(
+    "labels",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def init_labels_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the labels"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    """Save the label data of each pipeline component to a JSON file.
+
+    The pipeline is initialized from the config. For every component whose
+    `label_data` is not None, the data is written to `{name}.json` in the
+    output directory. Extra command-line arguments are parsed as config
+    overrides.
+    """
+    # Create missing parent directories too, so that a nested output path
+    # doesn't fail, and don't complain if the directory already exists.
+    output_path.mkdir(parents=True, exist_ok=True)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
+    for name, component in nlp.pipeline:
+        # Read the property once: it may be computed on every access.
+        label_data = getattr(component, "label_data", None)
+        if label_data is not None:
+            srsly.write_json(output_path / f"{name}.json", label_data)
+            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
+        else:
+            msg.info(f"No labels found for {name}")
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional
+from typing import Optional, Union, Dict
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from itertools import islice
@ -101,6 +101,11 @@ class Morphologizer(Tagger):
        """RETURNS (Tuple[str]): The labels currently added to the component."""
        return tuple(self.cfg["labels_morph"].keys())

+    @property
+    def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
+        """RETURNS (Dict): A dictionary with all labels data: the
+        "labels_morph" config entry under the key "morph" and the
+        "labels_pos" config entry under the key "pos".
+        """
+        return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]}
+
    def add_label(self, label):
        """Add a new label to the pipe.

--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True
+from typing import Optional, Tuple
 import srsly
 from thinc.api import set_dropout_rate, Model

@ -32,6 +33,20 @@ cdef class Pipe:
        self.name = name
        self.cfg = dict(cfg)

+    @property
+    def labels(self) -> Optional[Tuple[str]]:
+        """RETURNS (Optional[Tuple[str]]): The "labels" entry of the
+        component's config as a tuple, or None if the config has no
+        "labels" entry.
+        """
+        if "labels" in self.cfg:
+            return tuple(self.cfg["labels"])
+        else:
+            return None
+    
+    @property
+    def label_data(self):
+        """Optional JSON-serializable data that would be sufficient to recreate
+        the label set if provided to the `pipe.initialize()` method.
+
+        RETURNS (None): This base implementation provides no label data.
+        """
+        return None
+
    def __call__(self, Doc doc):
        """Apply the pipe to one document. The document is modified in place,
        and returned. This usually happens under the hood when the nlp object
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
        # are 0
        return tuple(["I", "S"])

+    @property
+    def label_data(self):
+        """RETURNS (Tuple[str]): The labels data, which for this component is
+        the same value as the `labels` property.
+        """
+        return self.labels
+
    def set_annotations(self, docs, batch_tag_ids):
        """Modify a batch of documents, using pre-computed scores.

--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -90,6 +90,16 @@ class Tagger(Pipe):
        """
        return tuple(self.cfg["labels"])

+    @property
+    def label_data(self):
+        """Data about the labels currently added to the component.
+
+        RETURNS (Tuple[str]): The labels data, i.e. the configured labels.
+
+        DOCS: https://nightly.spacy.io/api/tagger#labels
+        """
+        return tuple(self.cfg["labels"])
+
    def __call__(self, doc):
        """Apply the pipe to a Doc.

--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -154,8 +154,23 @@ class TextCategorizer(Pipe):

    @labels.setter
    def labels(self, value: List[str]) -> None:
+        # TODO: This setter really shouldn't be here. It was added along with
+        # the labels property, but it's pretty nasty to have, and it will lead
+        # to problems.
        self.cfg["labels"] = tuple(value)

+    @property
+    def label_data(self) -> Dict:
+        """RETURNS (Dict): Information about the component's labels: the
+        labels themselves ("labels"), the "positive_label" config entry
+        ("positive") and the "threshold" config entry ("threshold").
+
+        DOCS: https://nightly.spacy.io/api/textcategorizer#labels
+        """
+        return {
+            "labels": self.labels,
+            "positive": self.cfg["positive_label"],
+            "threshold": self.cfg["threshold"]
+        }
+
    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
        """Apply the pipe to a stream of documents. This usually happens under
        the hood when the nlp object is called on a text and all components are
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@ -95,6 +95,10 @@ cdef class Parser(Pipe):
        class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
        return class_names

+    @property
+    def label_data(self):
+        """RETURNS: The labels data, taken from the `labels` attribute of the
+        component's transition system (`self.moves`).
+        """
+        return self.moves.labels
+
    @property
    def tok2vec(self):
        """Return the embedding and convolutional layer of the model."""