From 6c268d4ed9eeac591a9826212403fb7fba725fdb Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 6 Feb 2023 11:35:32 +0100
Subject: [PATCH] Add Language.pipe_as_tuples

As part of the transition to v4, add `Language.pipe_as_tuples()` and
deprecate `Language.pipe(as_tuples=True)`.
---
 spacy/errors.py                             |  3 ++
 spacy/language.py                           | 41 +++++++++++++++++++
 spacy/tests/test_language.py                | 20 ++++++++-
 website/docs/api/language.mdx               | 47 ++++++++++++++++-----
 website/docs/usage/processing-pipelines.mdx | 13 +++---
 5 files changed, 106 insertions(+), 18 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index d143e341c..0438a3127 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -215,6 +215,9 @@ class Warnings(metaclass=ErrorsWithCodes):
     W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+    W125 = ("As of spaCy v3.6, `nlp.pipe(as_tuples=True)` has been deprecated "
+            "in favor of `nlp.pipe_as_tuples()`. `nlp.pipe(as_tuples=True)` "
+            "will be removed in spaCy v4.0.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
diff --git a/spacy/language.py b/spacy/language.py
index 9fdcf6328..855871df4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1516,6 +1516,7 @@ class Language:
         DOCS: https://spacy.io/api/language#pipe
         """
         if as_tuples:
+            warnings.warn(Warnings.W125, DeprecationWarning)
             texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
             docs_with_contexts = (
                 self._ensure_doc_with_context(text, context) for text, context in texts
@@ -1574,6 +1575,46 @@ class Language:
         for doc in docs:
             yield doc
 
+    def pipe_as_tuples(
+        self,
+        texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
+        *,
+        batch_size: Optional[int] = None,
+        disable: Iterable[str] = SimpleFrozenList(),
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        n_process: int = 1,
+    ) -> Iterator[Tuple[Doc, _AnyContext]]:
+        """Process `(text, context)` tuples as a stream, and yield
+        `(Doc, context)` tuples in order.
+
+        texts (Iterable[Tuple[Union[str, Doc], _AnyContext]]): A sequence of
+            `(text, context)` tuples.
+        batch_size (Optional[int]): The number of texts to buffer.
+        disable (Iterable[str]): Names of pipeline components to disable.
+        component_cfg (Optional[Dict[str, Dict[str, Any]]]): An optional
+            dictionary with keyword arguments for specific components, keyed
+            by component name.
+        n_process (int): Number of processors to use. Defaults to 1.
+        YIELDS (Tuple[Doc, _AnyContext]): `(doc, context)` tuples in the
+            order of the original texts.
+
+        DOCS: https://spacy.io/api/language#pipe_as_tuples
+        """
+        docs_with_contexts = (
+            self._ensure_doc_with_context(text, context) for text, context in texts
+        )
+        docs = self.pipe(
+            docs_with_contexts,
+            batch_size=batch_size,
+            disable=disable,
+            n_process=n_process,
+            component_cfg=component_cfg,
+        )
+        for doc in docs:
+            context = doc._context
+            doc._context = None
+            yield (doc, context)
+
     def _has_gpu_model(self, disable: Iterable[str]):
         for name, proc in self.pipeline:
             is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable  # type: ignore
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 236856dad..b36828648 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -362,6 +362,22 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process):
     assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
 
 
+def test_language_pipe_as_tuples():
+    nlp = English()
+    texts = [
+        ("TEXT 111", 111),
+        ("TEXT 222", 222),
+        ("TEXT 333", 333),
+        ("TEXT 342", 342),
+        ("TEXT 666", 666),
+    ]
+    with pytest.warns(DeprecationWarning):
+        docs_contexts = list(nlp.pipe(texts, as_tuples=True))
+    assert len(docs_contexts) == len(texts)
+    docs_contexts = list(nlp.pipe_as_tuples(texts))
+    assert len(docs_contexts) == len(texts)
+
+
 @pytest.mark.parametrize("n_process", [1, 2])
 def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
     """Test the error handling of nlp.pipe with input as tuples"""
@@ -378,11 +394,11 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
         ("TEXT 666", 666),
     ]
     with pytest.raises(ValueError):
-        list(nlp.pipe(texts, as_tuples=True))
+        list(nlp.pipe_as_tuples(texts))
     nlp.set_error_handler(warn_error)
     logger = logging.getLogger("spacy")
     with mock.patch.object(logger, "warning") as mock_warning:
-        tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
+        tuples = list(nlp.pipe_as_tuples(texts, n_process=n_process))
         # HACK/TODO? the warnings in child processes don't seem to be
         # detected by the mock logger
         if n_process == 1:
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 93ddd79a2..60f9f766f 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -198,16 +198,43 @@ tokenization is skipped but the rest of the pipeline is run.
 >     assert doc.has_annotation("DEP")
 > ```
 
-| Name            | Description |
-| --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `texts`         | A sequence of strings (or `Doc` objects). ~~Iterable[Union[str, Doc]]~~ |
-| _keyword-only_  | |
-| `as_tuples`     | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
-| `batch_size`    | The number of texts to buffer. ~~Optional[int]~~ |
-| `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| `n_process`     | Number of processors to use. Defaults to `1`. ~~int~~ |
-| **YIELDS**      | Documents in the order of the original text. ~~Doc~~ |
+| Name            | Description |
+| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `texts`         | A sequence of strings (or `Doc` objects). ~~Iterable[Union[str, Doc]]~~ |
+| _keyword-only_  | |
+| `as_tuples`     | Deprecated in v3.6 in favor of [`Language.pipe_as_tuples`](#pipe_as_tuples). If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
+| `batch_size`    | The number of texts to buffer. ~~Optional[int]~~ |
+| `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process`     | Number of processors to use. Defaults to `1`. ~~int~~ |
+| **YIELDS**      | Documents in the order of the original texts. ~~Doc~~ |
+
+## Language.pipe_as_tuples {id="pipe_as_tuples",tag="method",version="3.6"}
+
+Process `(text, context)` tuples as a stream, and yield `(Doc, context)` tuples
+in order. This is usually more efficient than processing texts one-by-one.
+
+> #### Example
+>
+> ```python
+> texts = [
+>     ("One document.", {"id": 1}),
+>     # ... more (text, context) tuples ...
+>     ("Lots of documents", {"id": 1000}),
+> ]
+> for doc, context in nlp.pipe_as_tuples(texts, batch_size=50):
+>     assert doc.has_annotation("DEP")
+> ```
+
+| Name            | Description |
+| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `texts`         | A sequence of `(text, context)` tuples, where the texts are strings or `Doc` objects. ~~Iterable[Tuple[Union[str, Doc], Any]]~~ |
+| _keyword-only_  | |
+| `batch_size`    | The number of texts to buffer. ~~Optional[int]~~ |
+| `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process`     | Number of processors to use. Defaults to `1`. ~~int~~ |
+| **YIELDS**      | `(doc, context)` tuples in the order of the original texts. ~~Tuple[Doc, Any]~~ |
 
 ## Language.set_error_handler {id="set_error_handler",tag="method",version="3"}
 
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index 307cb9dcb..973a80ef3 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -88,11 +88,12 @@ have to call `list()` on it first:
 
 </Infobox>
 
-You can use the `as_tuples` option to pass additional context along with each
-doc when using [`nlp.pipe`](/api/language#pipe). If `as_tuples` is `True`, then
-the input should be a sequence of `(text, context)` tuples and the output will
-be a sequence of `(doc, context)` tuples. For example, you can pass metadata in
-the context and save it in a [custom attribute](#custom-components-attributes):
+You can use the [`nlp.pipe_as_tuples`](/api/language#pipe_as_tuples) method to
+pass additional context along with each doc when processing texts as a stream,
+just as you would with [`nlp.pipe`](/api/language#pipe). The input should be a
+sequence of `(text, context)` tuples and the output will be a sequence of
+`(doc, context)` tuples. For example, you can pass metadata in the context and
+save it in a [custom attribute](#custom-components-attributes):
 
 ```python {executable="true"}
 import spacy
@@ -107,7 +108,7 @@ text_tuples = [
 ]
 
 nlp = spacy.load("en_core_web_sm")
-doc_tuples = nlp.pipe(text_tuples, as_tuples=True)
+doc_tuples = nlp.pipe_as_tuples(text_tuples)
 
 docs = []
 for doc, context in doc_tuples:
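-- 
For reference, a minimal usage sketch of the API added by this patch (kept
below the signature delimiter so `git am` ignores it). It assumes the patch
is applied on top of a current spaCy v3.x checkout; the blank `en` pipeline
and the `{"id": ...}` contexts are illustrative placeholders.

```python
import warnings

import spacy

nlp = spacy.blank("en")
text_tuples = [
    ("A first text.", {"id": 1}),
    ("A second text.", {"id": 2}),
]

# New API: yields (Doc, context) tuples in the order of the input texts.
for doc, context in nlp.pipe_as_tuples(text_tuples):
    print(context["id"], doc.text)

# Deprecated spelling: still works, but now emits W125 as a DeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    docs = list(nlp.pipe(text_tuples, as_tuples=True))
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```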