diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 26931606b..a87c20c31 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -13,6 +13,7 @@ from .sentencizer import Sentencizer from .tagger import Tagger from .textcat import TextCategorizer from .spancat import SpanCategorizer +from .spancat_exclusive import SpanCategorizerExclusive from .span_ruler import SpanRuler from .textcat_multilabel import MultiLabel_TextCategorizer from .tok2vec import Tok2Vec @@ -31,6 +32,7 @@ __all__ = [ "SentenceRecognizer", "Sentencizer", "SpanCategorizer", + "SpanCategorizerExclusive", "SpanRuler", "Tagger", "TextCategorizer", diff --git a/spacy/pipeline/spancat_exclusive.py b/spacy/pipeline/spancat_exclusive.py index a7253c9f7..3d169b199 100644 --- a/spacy/pipeline/spancat_exclusive.py +++ b/spacy/pipeline/spancat_exclusive.py @@ -78,7 +78,7 @@ def make_spancat( model: Model[Tuple[List[Doc], Ragged], Floats2d], spans_key: str, scorer: Optional[Callable], - negative_weight: Optional[float] = 1.0, + negative_weight: float = 1.0, allow_overlap: Optional[bool] = True, ) -> "SpanCategorizerExclusive": """Create a SpanCategorizer component. The span categorizer consists of two @@ -95,7 +95,7 @@ def make_spancat( spans_key (str): Key of the doc.spans dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. - negative_weight (Optional[float]): Multiplier for the loss terms. + negative_weight (float): Multiplier for the loss terms. Can be used to down weigh the negative samples if there are too many. allow_overlap (Optional[bool]): If True the data is assumed to contain overlapping spans. @@ -133,7 +133,6 @@ class Ranges: return False -# TODO: Documentation class SpanCategorizerExclusive(TrainablePipe): """Pipeline component to label spans of text. 
@@ -148,7 +147,7 @@ class SpanCategorizerExclusive(TrainablePipe): name: str = "spancat_exclusive", *, spans_key: str = "spans", - negative_weight: Optional[float], + negative_weight: float = 1.0, scorer: Optional[Callable] = spancat_score, allow_overlap: Optional[bool] = True, ) -> None: @@ -161,7 +160,7 @@ class SpanCategorizerExclusive(TrainablePipe): During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. - negative_weight (Optional[float]): Multiplier for the loss terms. + negative_weight (float): Multiplier for the loss terms. Can be used to down weigh the negative samples if there are too many. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the Doc.spans[spans_key] with overlapping @@ -394,13 +393,15 @@ class SpanCategorizerExclusive(TrainablePipe): ) -> None: """Initialize the pipe for training, using a representative set of data examples. + get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. nlp (Optional[Language]): The current nlp object the component is part of. labels (Optional[List[str]]): The labels to add to the component, typically generated by the `init labels` command. If no labels are provided, the get_examples callback is used to extract the labels from the data. 
- DOCS: https://spacy.io/api/spancategorizer#initialize + + DOCS: https://spacy.io/api/spancategorizerexclusive#initialize """ subbatch: List[Example] = [] if labels is not None: @@ -419,9 +420,11 @@ class SpanCategorizerExclusive(TrainablePipe): # + 1 for the "no-label" category Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels) self.model.initialize(X=(docs, spans), Y=Y) - # FIXME I think this branch is broken else: - raise ValueError("Cannot initialize without examples.") + # FIXME: Ideally we would raise an explicit error here instead of + # letting one surface implicitly when initializing without examples. + # For now, we just copy over what `spancat` does. + self.model.initialize() def _validate_categories(self, examples: Iterable[Example]): # TODO