mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare
This commit is contained in:
commit
30c76dbd67
|
@ -16,6 +16,7 @@ from .debug_model import debug_model # noqa: F401
|
||||||
from .evaluate import evaluate # noqa: F401
|
from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||||
|
from .init_labels import init_labels_cli # noqa: F401
|
||||||
from .init_config import init_config, fill_config # noqa: F401
|
from .init_config import init_config, fill_config # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project.clone import project_clone # noqa: F401
|
from .project.clone import project_clone # noqa: F401
|
||||||
|
|
43
spacy/cli/init_labels.py
Normal file
43
spacy/cli/init_labels.py
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
from typing import Optional
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import typer
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
from .. import util
|
||||||
|
from ..training.initialize import init_nlp, convert_vectors
|
||||||
|
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
|
from ._util import import_code, setup_gpu
|
||||||
|
|
||||||
|
|
||||||
|
@init_cli.command(
    "labels",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Path = Arg(..., help="Output directory for the labels"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """Initialize a pipeline from a config and write each component's label
    data to a JSON file in the output directory.

    The config is loaded from `config_path`, with any extra command-line
    arguments applied as overrides, and the nlp object is created with
    `init_nlp`. For every component in `nlp.pipeline` whose `label_data`
    attribute is not None, the data is written to `<output_path>/<name>.json`.
    Components without label data are reported and skipped.

    ctx (typer.Context): Only used to read additional arguments (overrides).
    config_path (Path): Path to the config file.
    output_path (Path): Output directory for the label files. Created,
        including missing parent directories, if it does not exist.
    code_path (Optional[Path]): Python file with additional code to import.
    verbose (bool): Log at DEBUG level instead of ERROR.
    use_gpu (int): GPU ID, or -1 for CPU.
    """
    if not output_path.exists():
        # parents=True so that a nested output path (e.g. "corpus/labels")
        # works even when the intermediate directories are missing.
        output_path.mkdir(parents=True)
    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
    for name, component in nlp.pipeline:
        # Read the property once: it may be computed on access.
        label_data = getattr(component, "label_data", None)
        if label_data is not None:
            srsly.write_json(output_path / f"{name}.json", label_data)
            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
        else:
            msg.info(f"No labels found for {name}")
|
|
@ -1,5 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from typing import Optional
|
from typing import Optional, Union, Dict
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
@ -101,6 +101,11 @@ class Morphologizer(Tagger):
|
||||||
"""RETURNS (Tuple[str]): The labels currently added to the component."""
|
"""RETURNS (Tuple[str]): The labels currently added to the component."""
|
||||||
return tuple(self.cfg["labels_morph"].keys())
|
return tuple(self.cfg["labels_morph"].keys())
|
||||||
|
|
||||||
|
@property
def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
    """RETURNS (Dict): The label tables stored in the component config,
    keyed by "morph" (`cfg["labels_morph"]`) and "pos" (`cfg["labels_pos"]`).
    """
    settings = self.cfg
    morph_labels = settings["labels_morph"]
    pos_labels = settings["labels_pos"]
    return dict(morph=morph_labels, pos=pos_labels)
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
"""Add a new label to the pipe.
|
"""Add a new label to the pipe.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
|
from typing import Optional, Tuple
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import set_dropout_rate, Model
|
from thinc.api import set_dropout_rate, Model
|
||||||
|
|
||||||
|
@ -32,6 +33,20 @@ cdef class Pipe:
|
||||||
self.name = name
|
self.name = name
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
|
@property
def labels(self) -> Optional[Tuple[str]]:
    """RETURNS (Optional[Tuple[str]]): The entries of `cfg["labels"]` as a
    tuple, or None if the config has no "labels" key.
    """
    return tuple(self.cfg["labels"]) if "labels" in self.cfg else None
|
||||||
|
|
||||||
|
@property
def label_data(self):
    """JSON-serializable data sufficient to recreate the label set when
    passed to the `pipe.initialize()` method. Optional: this base
    implementation has no label data and always returns None.
    """
    return None
|
||||||
|
|
||||||
def __call__(self, Doc doc):
|
def __call__(self, Doc doc):
|
||||||
"""Apply the pipe to one document. The document is modified in place,
|
"""Apply the pipe to one document. The document is modified in place,
|
||||||
and returned. This usually happens under the hood when the nlp object
|
and returned. This usually happens under the hood when the nlp object
|
||||||
|
|
|
@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
|
||||||
# are 0
|
# are 0
|
||||||
return tuple(["I", "S"])
|
return tuple(["I", "S"])
|
||||||
|
|
||||||
|
@property
def label_data(self):
    """RETURNS: The component's label data, which for this component is
    simply the value of its `labels` property.
    """
    current_labels = self.labels
    return current_labels
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
"""Modify a batch of documents, using pre-computed scores.
|
"""Modify a batch of documents, using pre-computed scores.
|
||||||
|
|
||||||
|
|
|
@ -90,6 +90,16 @@ class Tagger(Pipe):
|
||||||
"""
|
"""
|
||||||
return tuple(self.cfg["labels"])
|
return tuple(self.cfg["labels"])
|
||||||
|
|
||||||
|
@property
def label_data(self):
    """Data about the labels currently added to the component.

    RETURNS (Tuple[str]): The labels data, i.e. the entries of
        `cfg["labels"]` as a tuple.

    DOCS: https://nightly.spacy.io/api/tagger#labels
    """
    return tuple(self.cfg["labels"])
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
"""Apply the pipe to a Doc.
|
"""Apply the pipe to a Doc.
|
||||||
|
|
||||||
|
|
|
@ -154,8 +154,23 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
@labels.setter
|
@labels.setter
|
||||||
def labels(self, value: List[str]) -> None:
|
def labels(self, value: List[str]) -> None:
|
||||||
|
# TODO: This really shouldn't be here. I had a look and I added it when
|
||||||
|
# I added the labels property, but it's pretty nasty to have this, and
|
||||||
|
# will lead to problems.
|
||||||
self.cfg["labels"] = tuple(value)
|
self.cfg["labels"] = tuple(value)
|
||||||
|
|
||||||
|
@property
def label_data(self) -> Dict:
    """RETURNS (Dict): Information about the component's labels: the
    current labels under "labels", plus the "positive_label" and
    "threshold" config values under "positive" and "threshold".

    DOCS: https://nightly.spacy.io/api/textcategorizer#labels
    """
    data = {"labels": self.labels}
    settings = self.cfg
    data["positive"] = settings["positive_label"]
    data["threshold"] = settings["threshold"]
    return data
|
||||||
|
|
||||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||||
"""Apply the pipe to a stream of documents. This usually happens under
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
the hood when the nlp object is called on a text and all components are
|
the hood when the nlp object is called on a text and all components are
|
||||||
|
|
|
@ -95,6 +95,10 @@ cdef class Parser(Pipe):
|
||||||
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
|
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
|
||||||
return class_names
|
return class_names
|
||||||
|
|
||||||
|
@property
def label_data(self):
    """RETURNS: The label data of this component, taken from the `labels`
    attribute of `self.moves`.
    """
    transition_system = self.moves
    return transition_system.labels
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tok2vec(self):
|
def tok2vec(self):
|
||||||
"""Return the embedding and convolutional layer of the model."""
|
"""Return the embedding and convolutional layer of the model."""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user