From 6c268d4ed9eeac591a9826212403fb7fba725fdb Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 6 Feb 2023 11:35:32 +0100
Subject: [PATCH] Add Language.pipe_as_tuples

As part of the transition to v4, add `Language.pipe_as_tuples()` and
deprecate `Language.pipe(as_tuples=True)`.
---
 spacy/errors.py                             |  3 ++
 spacy/language.py                           | 41 +++++++++++++++++++
 spacy/tests/test_language.py                | 20 ++++++++-
 website/docs/api/language.mdx               | 47 ++++++++++++++++-----
 website/docs/usage/processing-pipelines.mdx | 13 +++---
 5 files changed, 106 insertions(+), 18 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index d143e341c..0438a3127 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -215,6 +215,9 @@ class Warnings(metaclass=ErrorsWithCodes):
     W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+    W125 = ("As of spaCy v3.6, `nlp.pipe(as_tuples=True)` has been deprecated "
+            "in favor of `nlp.pipe_as_tuples()`. `nlp.pipe(as_tuples=True)` "
+            "will be removed in spaCy v4.0.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
diff --git a/spacy/language.py b/spacy/language.py
index 9fdcf6328..855871df4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1516,6 +1516,7 @@ class Language:
         DOCS: https://spacy.io/api/language#pipe
         """
         if as_tuples:
+            warnings.warn(Warnings.W125, DeprecationWarning)
             texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
             docs_with_contexts = (
                 self._ensure_doc_with_context(text, context) for text, context in texts
@@ -1574,6 +1575,46 @@ class Language:
         for doc in docs:
             yield doc
 
+    def pipe_as_tuples(
+        self,
+        texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
+        *,
+        batch_size: Optional[int] = None,
+        disable: Iterable[str] = SimpleFrozenList(),
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        n_process: int = 1,
+    ) -> Iterator[Tuple[Doc, _AnyContext]]:
+        """Process `(text, context)` tuples as a stream, and yield
+        `(Doc, context)` tuples in order.
+
+        texts (Iterable[Tuple[Union[str, Doc], _AnyContext]]): A sequence of
+            `(text, context)` tuples.
+        batch_size (Optional[int]): The number of texts to buffer.
+        disable (Iterable[str]): Names of pipeline components to disable.
+        component_cfg (Optional[Dict[str, Dict[str, Any]]]): An optional
+            dictionary with keyword arguments for specific components, keyed
+            by component name.
+        n_process (int): Number of processors to use. Defaults to 1.
+        YIELDS (Tuple[Doc, _AnyContext]): `(doc, context)` tuples in the
+            order of the original texts.
+
+        DOCS: https://spacy.io/api/language#pipe_as_tuples
+        """
+        docs_with_contexts = (
+            self._ensure_doc_with_context(text, context) for text, context in texts
+        )
+        docs = self.pipe(
+            docs_with_contexts,
+            batch_size=batch_size,
+            disable=disable,
+            n_process=n_process,
+            component_cfg=component_cfg,
+        )
+        for doc in docs:
+            context = doc._context
+            doc._context = None
+            yield (doc, context)
+
     def _has_gpu_model(self, disable: Iterable[str]):
         for name, proc in self.pipeline:
             is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable  # type: ignore
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 236856dad..b36828648 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -362,6 +362,22 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process):
     assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
 
 
+def test_language_pipe_as_tuples():
+    nlp = English()
+    texts = [
+        ("TEXT 111", 111),
+        ("TEXT 222", 222),
+        ("TEXT 333", 333),
+        ("TEXT 342", 342),
+        ("TEXT 666", 666),
+    ]
+    with pytest.warns(DeprecationWarning):
+        docs_contexts = list(nlp.pipe(texts, as_tuples=True))
+    assert len(docs_contexts) == len(texts)
+    docs_contexts = list(nlp.pipe_as_tuples(texts))
+    assert len(docs_contexts) == len(texts)
+
+
 @pytest.mark.parametrize("n_process", [1, 2])
 def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
     """Test the error handling of nlp.pipe with input as tuples"""
@@ -378,11 +394,11 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
         ("TEXT 666", 666),
     ]
     with pytest.raises(ValueError):
-        list(nlp.pipe(texts, as_tuples=True))
+        list(nlp.pipe_as_tuples(texts))
     nlp.set_error_handler(warn_error)
     logger = logging.getLogger("spacy")
     with mock.patch.object(logger, "warning") as mock_warning:
-        tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
+        tuples = list(nlp.pipe_as_tuples(texts, n_process=n_process))
         # HACK/TODO? the warnings in child processes don't seem to be
         # detected by the mock logger
         if n_process == 1:
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 93ddd79a2..60f9f766f 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -198,16 +198,43 @@ tokenization is skipped but the rest of the pipeline is run.
 >     assert doc.has_annotation("DEP")
 > ```
 
-| Name            | Description |
-| --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `texts`         | A sequence of strings (or `Doc` objects). ~~Iterable[Union[str, Doc]]~~ |
-| _keyword-only_  | |
-| `as_tuples`     | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
-| `batch_size`    | The number of texts to buffer. ~~Optional[int]~~ |
-| `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| `n_process`     | Number of processors to use. Defaults to `1`. ~~int~~ |
-| **YIELDS**      | Documents in the order of the original text. ~~Doc~~ |
+| Name            | Description |
+| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `texts`         | A sequence of strings (or `Doc` objects). ~~Iterable[Union[str, Doc]]~~ |
+| _keyword-only_  | |
+| `as_tuples`     | Deprecated in v3.6 in favor of [`Language.pipe_as_tuples`](#pipe_as_tuples). If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
+| `batch_size`    | The number of texts to buffer. ~~Optional[int]~~ |
+| `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process`     | Number of processors to use. Defaults to `1`. ~~int~~ |
+| **YIELDS**      | Documents in the order of the original texts. ~~Doc~~ |
+
+## Language.pipe_as_tuples {id="pipe_as_tuples",tag="method",version="3.6"}
+
+Process `(text, context)` tuples as a stream, and yield `(Doc, context)` tuples
+in order. This is usually more efficient than processing texts one-by-one.
+
+> #### Example
+>
+> ```python
+> texts = [
+>     ("One document.", {"id": 1}),
+>     # ... more (text, context) tuples ...
+>     ("Lots of documents", {"id": 1000}),
+> ]
+> for doc, context in nlp.pipe_as_tuples(texts, batch_size=50):
+>     assert doc.has_annotation("DEP")
+> ```
+
+| Name            | Description |
+| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `texts`         | A sequence of `(text, context)` tuples, where the texts are strings or `Doc` objects. ~~Iterable[Tuple[Union[str, Doc], Any]]~~ |
+| _keyword-only_  | |
+| `batch_size`    | The number of texts to buffer. ~~Optional[int]~~ |
+| `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process`     | Number of processors to use. Defaults to `1`. ~~int~~ |
+| **YIELDS**      | `(doc, context)` tuples in the order of the original texts. ~~Tuple[Doc, Any]~~ |
 
 ## Language.set_error_handler {id="set_error_handler",tag="method",version="3"}
 
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index 307cb9dcb..973a80ef3 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -88,11 +88,12 @@ have to call `list()` on it first:
 
 </Infobox>
 
-You can use the `as_tuples` option to pass additional context along with each
-doc when using [`nlp.pipe`](/api/language#pipe). If `as_tuples` is `True`, then
-the input should be a sequence of `(text, context)` tuples and the output will
-be a sequence of `(doc, context)` tuples. For example, you can pass metadata in
-the context and save it in a [custom attribute](#custom-components-attributes):
+You can use the [`nlp.pipe_as_tuples`](/api/language#pipe_as_tuples) method to
+pass additional context along with each doc when processing texts as a stream,
+just as you would with [`nlp.pipe`](/api/language#pipe). The input should be a
+sequence of `(text, context)` tuples and the output will be a sequence of
+`(doc, context)` tuples. For example, you can pass metadata in the context and
+save it in a [custom attribute](#custom-components-attributes):
 
 ```python {executable="true"}
 import spacy
@@ -107,7 +108,7 @@ text_tuples = [
 ]
 
 nlp = spacy.load("en_core_web_sm")
-doc_tuples = nlp.pipe(text_tuples, as_tuples=True)
+doc_tuples = nlp.pipe_as_tuples(text_tuples)
 
 docs = []
 for doc, context in doc_tuples:
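-- 
For reference, a minimal usage sketch of the API added by this patch (kept
below the signature delimiter so `git am` ignores it). It assumes the patch
is applied on top of a current spaCy v3.x checkout; the blank `en` pipeline
and the `{"id": ...}` contexts are illustrative placeholders.

```python
import warnings

import spacy

nlp = spacy.blank("en")
text_tuples = [
    ("A first text.", {"id": 1}),
    ("A second text.", {"id": 2}),
]

# New API: yields (Doc, context) tuples in the order of the input texts.
for doc, context in nlp.pipe_as_tuples(text_tuples):
    print(context["id"], doc.text)

# Deprecated spelling: still works, but now emits W125 as a DeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    docs = list(nlp.pipe(text_tuples, as_tuples=True))
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```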