Refactor pipe(as_tuples) into a separate method

2025-09-10 22:22:39 +03:00 · 2022-08-17 09:26:16 +02:00 · 2022-08-17 09:26:16 +02:00 · 6b36d85920
commit 6b36d85920
parent 551e73ccfc
5 changed files with 90 additions and 70 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -539,6 +539,10 @@ class Errors(metaclass=ErrorsWithCodes):
            "issue tracker: http://github.com/explosion/spaCy/issues")
    E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")

+    # New errors added in v4.x
+    E300 = ("nlp.pipe(text_tuples, as_tuples=True) has been replaced with:\n"
+            "nlp.pipe_as_tuples(text_tuples)")
+
    # New errors added in v3.x
    E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
            "permit overlapping spans.")
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1470,51 +1470,20 @@ class Language:
                except StopIteration:
                    pass

-    @overload
    def pipe(
        self,
        texts: Iterable[Union[str, Doc]],
        *,
-        as_tuples: Literal[False] = ...,
-        batch_size: Optional[int] = ...,
-        disable: Iterable[str] = ...,
-        component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
-        n_process: int = ...,
-    ) -> Iterator[Doc]:
-        ...
-
-    @overload
-    def pipe(  # noqa: F811
-        self,
-        texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
-        *,
-        as_tuples: Literal[True] = ...,
-        batch_size: Optional[int] = ...,
-        disable: Iterable[str] = ...,
-        component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
-        n_process: int = ...,
-    ) -> Iterator[Tuple[Doc, _AnyContext]]:
-        ...
-
-    def pipe(  # noqa: F811
-        self,
-        texts: Union[
-            Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
-        ],
-        *,
-        as_tuples: bool = False,
        batch_size: Optional[int] = None,
        disable: Iterable[str] = SimpleFrozenList(),
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
        n_process: int = 1,
-    ) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
+        as_tuples: Optional[bool] = None, # deprecated
+    ) -> Iterator[Doc]:
        """Process texts as a stream, and yield `Doc` objects in order.

        texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
            process.
-        as_tuples (bool): If set to True, inputs should be a sequence of
-            (text, context) tuples. Output will then be a sequence of
-            (doc, context) tuples. Defaults to False.
        batch_size (Optional[int]): The number of texts to buffer.
        disable (List[str]): Names of the pipeline components to disable.
        component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
@ -1524,25 +1493,8 @@ class Language:

        DOCS: https://spacy.io/api/language#pipe
        """
-        if as_tuples:
-            texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
-            docs_with_contexts = (
-                self._ensure_doc_with_context(text, context) for text, context in texts
-            )
-            docs = self.pipe(
-                docs_with_contexts,
-                batch_size=batch_size,
-                disable=disable,
-                n_process=n_process,
-                component_cfg=component_cfg,
-            )
-            for doc in docs:
-                context = doc._context
-                doc._context = None
-                yield (doc, context)
-            return
-
-        texts = cast(Iterable[Union[str, Doc]], texts)
+        if as_tuples is not None:
+            raise ValueError(Errors.E300)

        # Set argument defaults
        if n_process == -1:
@ -1583,6 +1535,31 @@ class Language:
        for doc in docs:
            yield doc

+    def pipe_as_tuples(
+        self,
+        texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
+        *,
+        batch_size: Optional[int] = None,
+        disable: Iterable[str] = SimpleFrozenList(),
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        n_process: int = 1,
+    ) -> Iterator[Tuple[Doc, _AnyContext]]:
+        """Process (text, context) tuples as a stream, and yield
+        (doc, context) tuples in order.
+
+        texts (Iterable[Tuple[Union[str, Doc], Any]]): A sequence of
+            (text, context) tuples, where each text is a string or a doc.
+        batch_size (Optional[int]): The number of texts to buffer.
+        disable (List[str]): Names of the pipeline components to disable.
+        component_cfg (Dict[str, Dict]): An optional dictionary of keyword
+            arguments for components, keyed by component names.
+        n_process (int): Number of processors to use.
+        YIELDS (Tuple[Doc, Any]): (doc, context) tuples, one per input tuple.
+
+        DOCS: https://spacy.io/api/language#pipe_as_tuples
+        """
+        # Lazily pair every input with its context; the context is read back
+        # from `doc._context` below (presumably attached there by
+        # `_ensure_doc_with_context` -- confirm against that helper).
+        docs_with_contexts = (
+            self._ensure_doc_with_context(text, context) for text, context in texts
+        )
+        # All processing options are forwarded unchanged to `pipe`.
+        docs = self.pipe(
+            docs_with_contexts,
+            batch_size=batch_size,
+            disable=disable,
+            n_process=n_process,
+            component_cfg=component_cfg,
+        )
+        for doc in docs:
+            # Detach the context from the doc before handing both back, so
+            # the yielded doc no longer carries it.
+            context = doc._context
+            doc._context = None
+            yield (doc, context)
+        return
+
    def _has_gpu_model(self, disable: Iterable[str]):
        for name, proc in self.pipeline:
            is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable  # type: ignore
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@ -271,11 +271,11 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
            ("TEXT 666", 666),
        ]
        with pytest.raises(ValueError):
-            list(nlp.pipe(texts, as_tuples=True))
+            list(nlp.pipe_as_tuples(texts))
        nlp.set_error_handler(warn_error)
        logger = logging.getLogger("spacy")
        with mock.patch.object(logger, "warning") as mock_warning:
-            tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
+            tuples = list(nlp.pipe_as_tuples(texts, n_process=n_process))
            # HACK/TODO? the warnings in child processes don't seem to be
            # detected by the mock logger
            if n_process == 1:
@ -287,6 +287,18 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
            assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)


+def test_language_previous_pipe_as_tuples_error(nlp):
+    """Test that passing `as_tuples` to `nlp.pipe` raises a ValueError whose
+    message mentions `nlp.pipe_as_tuples`"""
+    texts = [
+        ("TEXT 111", 111),
+        ("TEXT 222", 222),
+        ("TEXT 333", 333),
+        ("TEXT 342", 342),
+        ("TEXT 666", 666),
+    ]
+    with pytest.raises(ValueError, match="nlp.pipe_as_tuples"):
+        list(nlp.pipe(texts, as_tuples=True))
+
+
@pytest.mark.parametrize("n_process", [1, 2])
 def test_language_pipe_error_handler_pipe(en_vocab, n_process):
    """Test the error handling of a component's pipe method"""
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@ -191,16 +191,42 @@ more efficient than processing texts one-by-one.
 >     assert doc.has_annotation("DEP")
 > ```

-| Name                                       | Description                                                                                                                                                         |
-| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `texts`                                    | A sequence of strings. ~~Iterable[str]~~                                                                                                                            |
-| _keyword-only_                             |                                                                                                                                                                     |
-| `as_tuples`                                | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
-| `batch_size`                               | The number of texts to buffer. ~~Optional[int]~~                                                                                                                    |
-| `disable`                                  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~                                                                     |
-| `component_cfg`                            | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                      |
-| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                                               |
-| **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                                                |
+| Name                                       | Description                                                                                                                                    |
+| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `texts`                                    | A sequence of strings. ~~Iterable[str]~~                                                                                                       |
+| _keyword-only_                             |                                                                                                                                                |
+| `batch_size`                               | The number of texts to buffer. ~~Optional[int]~~                                                                                               |
+| `disable`                                  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~                                                |
+| `component_cfg`                            | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                          |
+| **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                           |
+
+## Language.pipe_as_tuples {#pipe_as_tuples tag="method"}
+
+Process `(text, context)` tuples as a stream, and yield `(Doc, context)` tuples
+in order. This is usually more efficient than processing texts one-by-one.
+
+> #### Example
+>
+> ```python
+> texts = [
+>     ("One document.", {"id": 1}),
+>     # ...
+>     ("Lots of documents", {"id": 1000}),
+> ]
+> for doc, context in nlp.pipe_as_tuples(texts, batch_size=50):
+>     assert doc.has_annotation("DEP")
+> ```
+
+| Name                                       | Description                                                                                                                                    |
+| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `texts`                                    | A sequence of `(text, context)` tuples. ~~Iterable[Tuple[str, Any]]~~                                                                          |
+| _keyword-only_                             |                                                                                                                                                |
+| `batch_size`                               | The number of texts to buffer. ~~Optional[int]~~                                                                                               |
+| `disable`                                  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~                                                |
+| `component_cfg`                            | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                          |
+| **YIELDS**                                 | `(doc, context)` tuples in the order of the original text. ~~Tuple[Doc, Any]~~                                                                 |

 ## Language.set_error_handler {#set_error_handler tag="method" new="3"}

--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@ -91,11 +91,12 @@ have to call `list()` on it first:

 </Infobox>

-You can use the `as_tuples` option to pass additional context along with each
-doc when using [`nlp.pipe`](/api/language#pipe). If `as_tuples` is `True`, then
-the input should be a sequence of `(text, context)` tuples and the output will
-be a sequence of `(doc, context)` tuples. For example, you can pass metadata in
-the context and save it in a [custom attribute](#custom-components-attributes):
+You can use the [`nlp.pipe_as_tuples`](/api/language#pipe_as_tuples) method to
+pass additional context along with each doc when using the functionality of
+[`nlp.pipe`](/api/language#pipe). The input should be a sequence of
+`(text, context)` tuples and the output will be a sequence of `(doc, context)`
+tuples. For example, you can pass metadata in the context and save it in a
+[custom attribute](#custom-components-attributes):

 ```python
 ### {executable="true"}
@ -111,7 +112,7 @@ text_tuples = [
 ]

 nlp = spacy.load("en_core_web_sm")
-doc_tuples = nlp.pipe(text_tuples, as_tuples=True)
+doc_tuples = nlp.pipe_as_tuples(text_tuples)

 docs = []
 for doc, context in doc_tuples: