From f84b59d68ae84415a1f2060c1ad457901255c5b7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 2 Jun 2023 18:20:23 +0200 Subject: [PATCH] Add docs and unify default configs for spancat and span finder * Add `allow_overlap=True` to span finder scorer --- spacy/pipeline/span_finder.py | 16 +- spacy/pipeline/spancat.py | 4 +- spacy/tests/pipeline/test_spancat.py | 4 +- website/docs/api/spancategorizer.mdx | 21 +- website/docs/api/spanfinder.mdx | 372 +++++++++++++++++++++++++++ website/meta/sidebars.json | 1 + 6 files changed, 407 insertions(+), 11 deletions(-) create mode 100644 website/docs/api/spanfinder.mdx diff --git a/spacy/pipeline/span_finder.py b/spacy/pipeline/span_finder.py index 8727be953..af4109c76 100644 --- a/spacy/pipeline/span_finder.py +++ b/spacy/pipeline/span_finder.py @@ -27,8 +27,8 @@ nO = 2 [model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v2" width = 96 -rows = [5000, 2000, 1000, 1000] -attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"] +rows = [5000, 1000, 2500, 1000] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] include_static_vectors = false [model.tok2vec.encode] @@ -63,23 +63,26 @@ def make_span_finder( nlp: Language, name: str, model: Model[Iterable[Doc], Floats2d], - scorer: Optional[Callable], + spans_key: str, threshold: float, max_length: Optional[int], min_length: Optional[int], - spans_key: str, + scorer: Optional[Callable], ) -> "SpanFinder": """Create a SpanFinder component. The component predicts whether a token is the start or the end of a potential span. model (Model[List[Doc], Floats2d]): A model instance that is given a list of documents and predicts a probability for each token. - threshold (float): Minimum probability to consider a prediction positive. spans_key (str): Key of the doc.spans dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. + threshold (float): Minimum probability to consider a prediction positive. max_length (Optional[int]): Max length of the produced spans, defaults to None meaning unlimited length. min_length (Optional[int]): Min length of the produced spans, defaults to None meaining shortest span is length 1. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the Doc.spans[spans_key] with overlapping + spans allowed. """ return SpanFinder( nlp, @@ -107,6 +110,7 @@ def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], []) ) kwargs.setdefault("has_annotation", lambda doc: key in doc.spans) + kwargs.setdefault("allow_overlap", True) scores = Scorer.score_spans(examples, **kwargs) scores.pop(f"{kwargs['attr']}_per_type", None) return scores @@ -141,11 +145,11 @@ class SpanFinder(TrainablePipe): model: Model[Iterable[Doc], Floats2d], name: str = "span_finder", *, + spans_key: str = DEFAULT_SPANS_KEY, threshold: float = 0.5, max_length: Optional[int] = None, min_length: Optional[int] = None, scorer: Optional[Callable] = span_finder_score, - spans_key: str = DEFAULT_SPANS_KEY, ) -> None: """Initialize the span boundary detector. model (thinc.api.Model): The Thinc Model powering the pipeline component. diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 185c0cd5d..08a5478a9 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -31,8 +31,8 @@ hidden_size = 128 [model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v2" width = 96 -rows = [5000, 2000, 1000, 1000] -attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"] +rows = [5000, 1000, 2500, 1000] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] include_static_vectors = false [model.tok2vec.encode] diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 1d29c89ec..eb8685294 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -445,7 +445,7 @@ def test_overfitting_IO(): spans = doc.spans[SPAN_KEY] assert len(spans) == 2 assert len(spans.attrs["scores"]) == 2 - assert min(spans.attrs["scores"]) > 0.9 + assert min(spans.attrs["scores"]) > 0.8 assert set([span.text for span in spans]) == {"London", "Berlin"} assert set([span.label_ for span in spans]) == {"LOC"} @@ -457,7 +457,7 @@ def test_overfitting_IO(): spans2 = doc2.spans[SPAN_KEY] assert len(spans2) == 2 assert len(spans2.attrs["scores"]) == 2 - assert min(spans2.attrs["scores"]) > 0.9 + assert min(spans2.attrs["scores"]) > 0.8 assert set([span.text for span in spans2]) == {"London", "Berlin"} assert set([span.label_ for span in spans2]) == {"LOC"} diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index f54a8687b..81a473ac2 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -105,7 +105,7 @@ architectures and their arguments and hyperparameters. > > # Construction via add_pipe with custom model > config = {"model": {"@architectures": "my_spancat"}} -> parser = nlp.add_pipe("spancat", config=config) +> spancat = nlp.add_pipe("spancat", config=config) > > # Construction from class > from spacy.pipeline import SpanCategorizer @@ -524,3 +524,22 @@ has two columns, indicating the start and end position. | `min_size` | The minimal phrase lengths to suggest (inclusive). ~~[int]~~ | | `max_size` | The maximal phrase lengths to suggest (exclusive). ~~[int]~~ | | **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | + +### spacy.preset_spans_suggester.v1 {id="preset_spans_suggester"} + +> #### Example Config +> +> ```ini +> [components.spancat.suggester] +> @misc = "spacy.preset_spans_suggester.v1" +> spans_key = "my_spans" +> ``` + +Suggest all spans that are already stored in doc.spans[spans_key]. This is +useful when an upstream component is used to set the spans on the Doc such as a +[`SpanRuler`](/api/spanruler) or [`SpanFinder`](/api/spanfinder). + +| Name | Description | +| ----------- | ----------------------------------------------------------------------------- | +| `spans_key` | Key of [`Doc.spans`](/api/doc/#spans) that provides spans to suggest. ~~str~~ | +| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | diff --git a/website/docs/api/spanfinder.mdx b/website/docs/api/spanfinder.mdx new file mode 100644 index 000000000..acc3b7445 --- /dev/null +++ b/website/docs/api/spanfinder.mdx @@ -0,0 +1,372 @@ +--- +title: SpanFinder +tag: class,experimental +source: spacy/pipeline/span_finder.py +version: 3.6 +teaser: + 'Pipeline component for identifying potentially overlapping spans of text' +api_base_class: /api/pipe +api_string_name: span_finder +api_trainable: true +--- + +The span finder identifies potentially overlapping, unlabeled spans. It +identifies tokens that start or end spans and annotates unlabeled spans between +starts and ends, with optional filters for min and max span length. It is +intended for use in combination with a component like +[`SpanCategorizer`](/api/spancategorizer) that may further filter or label the +spans. Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the +doc under `doc.spans[spans_key]`, where `spans_key` is a component config +setting. + +## Assigned Attributes {id="assigned-attributes"} + +Predictions will be saved to `Doc.spans[spans_key]` as a +[`SpanGroup`](/api/spangroup). + +`spans_key` defaults to `"sc"`, but can be passed as a parameter. The +`span_finder` component will overwrite any existing spans under the spans key +`doc.spans[spans_key]`. + +| Location | Value | +| ---------------------- | ---------------------------------- | +| `Doc.spans[spans_key]` | The unlabeled spans. ~~SpanGroup~~ | + +## Config and implementation {id="config"} + +The default config is defined by the pipeline component factory and describes +how the component should be configured. You can override its settings via the +`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your +[`config.cfg` for training](/usage/training#config). See the +[model architectures](/api/architectures) documentation for details on the +architectures and their arguments and hyperparameters. + +> #### Example +> +> ```python +> from spacy.pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL +> config = { +> "threshold": 0.5, +> "spans_key": "my_spans", +> "max_length": None, +> "min_length": None, +> "model": DEFAULT_SPAN_FINDER_MODEL, +> } +> nlp.add_pipe("span_finder", config=config) +> ``` + +| Setting | Description | +| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | +| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ | +| `max_length` | Max length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ | +| `min_length` | Min length of the produced spans, defaults to `None` meaning shortest span is length 1. ~~Optional[int]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | + +```python +%%GITHUB_SPACY/spacy/pipeline/span_finder.py +``` + +## SpanFinder.\_\_init\_\_ {id="init",tag="method"} + +> #### Example +> +> ```python +> # Construction via add_pipe with default model +> span_finder = nlp.add_pipe("span_finder") +> +> # Construction via add_pipe with custom model +> config = {"model": {"@architectures": "my_span_finder"}} +> span_finder = nlp.add_pipe("span_finder", config=config) +> +> # Construction from class +> from spacy.pipeline import SpanFinder +> span_finder = SpanFinder(nlp.vocab, model) +> ``` + +Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.add_pipe`](/api/language#create_pipe). + +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | +| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ | +| `max_length` | Max length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ | +| `min_length` | Min length of the produced spans, defaults to `None` meaning shortest span is length 1. ~~Optional[int]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | + +## SpanFinder.\_\_call\_\_ {id="call",tag="method"} + +Apply the pipe to one document. The document is modified in place, and returned. +This usually happens under the hood when the `nlp` object is called on a text +and all pipeline components are applied to the `Doc` in order. Both +[`__call__`](/api/spanfinder#call) and [`pipe`](/api/spanfinder#pipe) delegate +to the [`predict`](/api/spanfinder#predict) and +[`set_annotations`](/api/spanfinder#set_annotations) methods. + +> #### Example +> +> ```python +> doc = nlp("This is a sentence.") +> span_finder = nlp.add_pipe("span_finder") +> # This usually happens under the hood +> processed = span_finder(doc) +> ``` + +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | + +## SpanFinder.pipe {id="pipe",tag="method"} + +Apply the pipe to a stream of documents. This usually happens under the hood +when the `nlp` object is called on a text and all pipeline components are +applied to the `Doc` in order. Both [`__call__`](/api/spanfinder#call) and +[`pipe`](/api/spanfinder#pipe) delegate to the +[`predict`](/api/spanfinder#predict) and +[`set_annotations`](/api/spanfinder#set_annotations) methods. + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> for doc in span_finder.pipe(docs, batch_size=50): +> pass +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | + +## SpanFinder.initialize {id="initialize",tag="method"} + +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. **At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network and +[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) This +method is typically called by [`Language.initialize`](/api/language#initialize) +and lets you customize arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> span_finder.initialize(lambda: examples, nlp=nlp) +> ``` + +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | + +## SpanFinder.predict {id="predict",tag="method"} + +Apply the component's model to a batch of [`Doc`](/api/doc) objects without +modifying them. + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> scores = span_finder.predict([doc1, doc2]) +> ``` + +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | + +## SpanFinder.set_annotations {id="set_annotations",tag="method"} + +Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores. + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> scores = span_finder.predict(docs) +> span_finder.set_annotations(docs, scores) +> ``` + +| Name | Description | +| -------- | ---------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `SpanFinder.predict`. | + +## SpanFinder.update {id="update",tag="method"} + +Learn from a batch of [`Example`](/api/example) objects containing the +predictions and gold-standard annotations, and update the component's model. +Delegates to [`predict`](/api/spanfinder#predict) and +[`get_loss`](/api/spanfinder#get_loss). + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> optimizer = nlp.initialize() +> losses = span_finder.update(examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + +## SpanFinder.get_loss {id="get_loss",tag="method"} + +Find the loss and gradient of loss for the batch of documents and their +predicted scores. + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> scores = span_finder.predict([eg.predicted for eg in examples]) +> loss, d_loss = span_finder.get_loss(examples, scores) +> ``` + +| Name | Description | +| -------------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `spans_scores` | Scores representing the model's predictions. ~~Tuple[Ragged, Floats2d]~~ | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + +## SpanFinder.create_optimizer {id="create_optimizer",tag="method"} + +Create an optimizer for the pipeline component. + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> optimizer = span_finder.create_optimizer() +> ``` + +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | + +## SpanFinder.use_params {id="use_params",tag="method, contextmanager"} + +Modify the pipe's model to use the given parameter values. + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> with span_finder.use_params(optimizer.averages): +> span_finder.to_disk("/best_model") +> ``` + +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | + +## SpanFinder.to_disk {id="to_disk",tag="method"} + +Serialize the pipe to disk. + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> span_finder.to_disk("/path/to/span_finder") +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | + +## SpanFinder.from_disk {id="from_disk",tag="method"} + +Load the pipe from disk. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> span_finder.from_disk("/path/to/span_finder") +> ``` + +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `SpanFinder` object. ~~SpanFinder~~ | + +## SpanFinder.to_bytes {id="to_bytes",tag="method"} + +> #### Example +> +> ```python +> span_finder = nlp.add_pipe("span_finder") +> span_finder_bytes = span_finder.to_bytes() +> ``` + +Serialize the pipe to a bytestring. + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `SpanFinder` object. ~~bytes~~ | + +## SpanFinder.from_bytes {id="from_bytes",tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> span_finder_bytes = span_finder.to_bytes() +> span_finder = nlp.add_pipe("span_finder") +> span_finder.from_bytes(span_finder_bytes) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `SpanFinder` object. ~~SpanFinder~~ | + +## Serialization fields {id="serialization-fields"} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = span_finder.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index b5c555da6..12c3fce35 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -106,6 +106,7 @@ { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" }, { "text": "Sentencizer", "url": "/api/sentencizer" }, { "text": "SpanCategorizer", "url": "/api/spancategorizer" }, + { "text": "SpanFinder", "url": "/api/spanfinder" }, { "text": "SpanResolver", "url": "/api/span-resolver" }, { "text": "SpanRuler", "url": "/api/spanruler" }, { "text": "Tagger", "url": "/api/tagger" },