Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare

This commit is contained in:
Ines Montani 2020-09-29 16:53:48 +02:00
commit 30c76dbd67
8 changed files with 98 additions and 1 deletions

View File

@ -16,6 +16,7 @@ from .debug_model import debug_model # noqa: F401
from .evaluate import evaluate # noqa: F401 from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401 from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401 from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_labels import init_labels_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401 from .init_config import init_config, fill_config # noqa: F401
from .validate import validate # noqa: F401 from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401 from .project.clone import project_clone # noqa: F401

43
spacy/cli/init_labels.py Normal file
View File

@ -0,0 +1,43 @@
from typing import Optional
import logging
from pathlib import Path
from wasabi import msg
import typer
import srsly
from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
@init_cli.command(
    "labels",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Path = Arg(..., help="Output directory for the labels"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """Initialize the pipeline from a config file and write the label data of
    each pipeline component to a JSON file in the output directory. Components
    that expose no label data are reported and skipped.
    """
    # Create missing parent directories as well, and tolerate the directory
    # already existing, so nested output paths work on the first run.
    output_path.mkdir(parents=True, exist_ok=True)
    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
    # Extra command-line arguments are treated as config overrides.
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
    for name, component in nlp.pipeline:
        # Read the attribute once: components without a `label_data`
        # attribute, or whose `label_data` is None, have nothing to save.
        label_data = getattr(component, "label_data", None)
        if label_data is None:
            msg.info(f"No labels found for {name}")
            continue
        srsly.write_json(output_path / f"{name}.json", label_data)
        msg.good(f"Saving {name} labels to {output_path}/{name}.json")

View File

@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Optional from typing import Optional, Union, Dict
import srsly import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice from itertools import islice
@ -101,6 +101,11 @@ class Morphologizer(Tagger):
"""RETURNS (Tuple[str]): The labels currently added to the component.""" """RETURNS (Tuple[str]): The labels currently added to the component."""
return tuple(self.cfg["labels_morph"].keys()) return tuple(self.cfg["labels_morph"].keys())
@property
def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
    """Bundle the two label tables stored in the component's config.

    RETURNS (Dict): The "labels_morph" config entry under the key "morph"
        and the "labels_pos" config entry under the key "pos".
    """
    cfg = self.cfg
    morph_labels = cfg["labels_morph"]
    pos_labels = cfg["labels_pos"]
    return {"morph": morph_labels, "pos": pos_labels}
def add_label(self, label): def add_label(self, label):
"""Add a new label to the pipe. """Add a new label to the pipe.

View File

@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True # cython: infer_types=True, profile=True
from typing import Optional, Tuple
import srsly import srsly
from thinc.api import set_dropout_rate, Model from thinc.api import set_dropout_rate, Model
@ -32,6 +33,20 @@ cdef class Pipe:
self.name = name self.name = name
self.cfg = dict(cfg) self.cfg = dict(cfg)
@property
def labels(self) -> Optional[Tuple[str, ...]]:
    """The labels stored in the component's config, if any.

    RETURNS (Optional[Tuple[str, ...]]): The "labels" config entry converted
        to a tuple, or None if the config has no "labels" entry.
    """
    if "labels" not in self.cfg:
        return None
    return tuple(self.cfg["labels"])
@property
def label_data(self):
    """Hook for exposing a component's label set as JSON-serializable data.

    The data should be sufficient to recreate the label set when provided to
    the `pipe.initialize()` method. This base implementation has no label
    data to offer.

    RETURNS (None): Always None.
    """
    return None
def __call__(self, Doc doc): def __call__(self, Doc doc):
"""Apply the pipe to one document. The document is modified in place, """Apply the pipe to one document. The document is modified in place,
and returned. This usually happens under the hood when the nlp object and returned. This usually happens under the hood when the nlp object

View File

@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
# are 0 # are 0
return tuple(["I", "S"]) return tuple(["I", "S"])
@property
def label_data(self):
    """Label data for this component, which is the same as its labels.

    RETURNS: The value of the `labels` property, unchanged.
    """
    current_labels = self.labels
    return current_labels
def set_annotations(self, docs, batch_tag_ids): def set_annotations(self, docs, batch_tag_ids):
"""Modify a batch of documents, using pre-computed scores. """Modify a batch of documents, using pre-computed scores.

View File

@ -90,6 +90,16 @@ class Tagger(Pipe):
""" """
return tuple(self.cfg["labels"]) return tuple(self.cfg["labels"])
@property
def label_data(self):
    """Data about the labels currently stored in the component's config.

    RETURNS (tuple): The entries of the "labels" config value, as a tuple.

    DOCS: https://nightly.spacy.io/api/tagger#labels
    """
    stored_labels = self.cfg["labels"]
    return tuple(stored_labels)
def __call__(self, doc): def __call__(self, doc):
"""Apply the pipe to a Doc. """Apply the pipe to a Doc.

View File

@ -154,8 +154,23 @@ class TextCategorizer(Pipe):
@labels.setter @labels.setter
def labels(self, value: List[str]) -> None: def labels(self, value: List[str]) -> None:
# TODO: Remove this setter. It was added together with the labels property,
# but allowing the labels to be overwritten directly like this is fragile
# and will lead to problems.
self.cfg["labels"] = tuple(value) self.cfg["labels"] = tuple(value)
@property
def label_data(self) -> Dict:
    """Collect the component's labels along with its label-related settings.

    RETURNS (Dict): The `labels` property under "labels", the
        "positive_label" config entry under "positive" and the "threshold"
        config entry under "threshold".

    DOCS: https://nightly.spacy.io/api/textcategorizer#labels
    """
    data = {"labels": self.labels}
    cfg = self.cfg
    data["positive"] = cfg["positive_label"]
    data["threshold"] = cfg["threshold"]
    return data
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under """Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are the hood when the nlp object is called on a text and all components are

View File

@ -95,6 +95,10 @@ cdef class Parser(Pipe):
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)] class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
return class_names return class_names
@property
def label_data(self):
    """Label data for this component, taken from `self.moves`.

    RETURNS: The `labels` attribute of `self.moves`, unchanged.
    """
    moves = self.moves
    return moves.labels
@property @property
def tok2vec(self): def tok2vec(self):
"""Return the embedding and convolutional layer of the model.""" """Return the embedding and convolutional layer of the model."""