Add label_data property to pipeline

This commit is contained in:
Matthew Honnibal 2020-09-29 16:22:13 +02:00
parent 591038b1a4
commit 58c8d4b414
6 changed files with 54 additions and 1 deletions

View File

@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Optional from typing import Optional, Union, Dict
import srsly import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice from itertools import islice
@ -101,6 +101,11 @@ class Morphologizer(Tagger):
"""RETURNS (Tuple[str]): The labels currently added to the component.""" """RETURNS (Tuple[str]): The labels currently added to the component."""
return tuple(self.cfg["labels_morph"].keys()) return tuple(self.cfg["labels_morph"].keys())
@property
def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
    """Collect the label tables kept in the component's config.

    RETURNS (Dict): A dictionary holding the "labels_morph" config entry
        under the key "morph" and the "labels_pos" entry under "pos".
    """
    cfg = self.cfg
    morph_labels = cfg["labels_morph"]
    pos_labels = cfg["labels_pos"]
    return {"morph": morph_labels, "pos": pos_labels}
def add_label(self, label): def add_label(self, label):
"""Add a new label to the pipe. """Add a new label to the pipe.

View File

@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True # cython: infer_types=True, profile=True
from typing import Optional, Tuple
import srsly import srsly
from thinc.api import set_dropout_rate, Model from thinc.api import set_dropout_rate, Model
@ -32,6 +33,20 @@ cdef class Pipe:
self.name = name self.name = name
self.cfg = dict(cfg) self.cfg = dict(cfg)
@property
def labels(self) -> Optional[Tuple[str, ...]]:
    """RETURNS (Optional[Tuple[str, ...]]): The entries stored under
    cfg["labels"] as a tuple, or None if the config has no "labels" key.
    """
    # A missing key yields None; an empty label list still yields ().
    if "labels" not in self.cfg:
        return None
    return tuple(self.cfg["labels"])
@property
def label_data(self):
    """Base implementation of the label data hook.

    The value is meant to be optional, JSON-serializable data that would be
    enough to recreate the label set when handed to the `pipe.initialize()`
    method.

    RETURNS (None): This implementation always returns None.
    """
    return None
def __call__(self, Doc doc): def __call__(self, Doc doc):
"""Apply the pipe to one document. The document is modified in place, """Apply the pipe to one document. The document is modified in place,
and returned. This usually happens under the hood when the nlp object and returned. This usually happens under the hood when the nlp object

View File

@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
# are 0 # are 0
return tuple(["I", "S"]) return tuple(["I", "S"])
@property
def label_data(self):
    """RETURNS: The value of the `labels` property, passed through as is."""
    current_labels = self.labels
    return current_labels
def set_annotations(self, docs, batch_tag_ids): def set_annotations(self, docs, batch_tag_ids):
"""Modify a batch of documents, using pre-computed scores. """Modify a batch of documents, using pre-computed scores.

View File

@ -90,6 +90,16 @@ class Tagger(Pipe):
""" """
return tuple(self.cfg["labels"]) return tuple(self.cfg["labels"])
@property
def label_data(self):
    """Data about the labels currently added to the component.

    RETURNS (Tuple[str]): The entries of cfg["labels"], as a tuple.

    DOCS: https://nightly.spacy.io/api/tagger#labels
    """
    return tuple(self.cfg["labels"])
def __call__(self, doc): def __call__(self, doc):
"""Apply the pipe to a Doc. """Apply the pipe to a Doc.

View File

@ -154,8 +154,23 @@ class TextCategorizer(Pipe):
@labels.setter @labels.setter
def labels(self, value: List[str]) -> None: def labels(self, value: List[str]) -> None:
# TODO: This setter really shouldn't be here. On review, it was added together
# with the labels property, but it's pretty nasty to have, and it
# will lead to problems.
self.cfg["labels"] = tuple(value) self.cfg["labels"] = tuple(value)
@property
def label_data(self) -> Dict:
    """Bundle the label-related settings of the component.

    RETURNS (Dict): The value of the `labels` property under "labels", the
        "positive_label" config entry under "positive" and the "threshold"
        config entry under "threshold".

    DOCS: https://nightly.spacy.io/api/textcategorizer#labels
    """
    data = {"labels": self.labels}
    data["positive"] = self.cfg["positive_label"]
    data["threshold"] = self.cfg["threshold"]
    return data
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under """Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are the hood when the nlp object is called on a text and all components are

View File

@ -95,6 +95,10 @@ cdef class Parser(Pipe):
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)] class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
return class_names return class_names
@property
def label_data(self):
    """RETURNS: The `labels` attribute of the component's `moves` object."""
    transition_system = self.moves
    return transition_system.labels
@property @property
def tok2vec(self): def tok2vec(self):
"""Return the embedding and convolutional layer of the model.""" """Return the embedding and convolutional layer of the model."""