diff --git a/spacy/errors.py b/spacy/errors.py
index fd412a4da..9b7473dc8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -539,6 +539,10 @@ class Errors(metaclass=ErrorsWithCodes):
"issue tracker: http://github.com/explosion/spaCy/issues")
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
+ # New errors added in v4.x
+ E300 = ("nlp.pipe(text_tuples, as_tuples=True) has been replaced with:\n"
+ "nlp.pipe_as_tuples(text_tuples)")
+
# New errors added in v3.x
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
"permit overlapping spans.")
diff --git a/spacy/language.py b/spacy/language.py
index 816bd6531..00909718a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1470,51 +1470,20 @@ class Language:
except StopIteration:
pass
- @overload
def pipe(
self,
texts: Iterable[Union[str, Doc]],
*,
- as_tuples: Literal[False] = ...,
- batch_size: Optional[int] = ...,
- disable: Iterable[str] = ...,
- component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
- n_process: int = ...,
- ) -> Iterator[Doc]:
- ...
-
- @overload
- def pipe( # noqa: F811
- self,
- texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
- *,
- as_tuples: Literal[True] = ...,
- batch_size: Optional[int] = ...,
- disable: Iterable[str] = ...,
- component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
- n_process: int = ...,
- ) -> Iterator[Tuple[Doc, _AnyContext]]:
- ...
-
- def pipe( # noqa: F811
- self,
- texts: Union[
- Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
- ],
- *,
- as_tuples: bool = False,
batch_size: Optional[int] = None,
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
n_process: int = 1,
- ) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
+ as_tuples: Optional[bool] = None, # deprecated
+ ) -> Iterator[Doc]:
"""Process texts as a stream, and yield `Doc` objects in order.
texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
process.
- as_tuples (bool): If set to True, inputs should be a sequence of
- (text, context) tuples. Output will then be a sequence of
- (doc, context) tuples. Defaults to False.
batch_size (Optional[int]): The number of texts to buffer.
disable (List[str]): Names of the pipeline components to disable.
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
@@ -1524,25 +1493,8 @@ class Language:
DOCS: https://spacy.io/api/language#pipe
"""
- if as_tuples:
- texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
- docs_with_contexts = (
- self._ensure_doc_with_context(text, context) for text, context in texts
- )
- docs = self.pipe(
- docs_with_contexts,
- batch_size=batch_size,
- disable=disable,
- n_process=n_process,
- component_cfg=component_cfg,
- )
- for doc in docs:
- context = doc._context
- doc._context = None
- yield (doc, context)
- return
-
- texts = cast(Iterable[Union[str, Doc]], texts)
+ if as_tuples is not None:
+ raise ValueError(Errors.E300)
# Set argument defaults
if n_process == -1:
@@ -1583,6 +1535,31 @@ class Language:
for doc in docs:
yield doc
+ def pipe_as_tuples(
+ self,
+ texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
+ *,
+ batch_size: Optional[int] = None,
+ disable: Iterable[str] = SimpleFrozenList(),
+ component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+ n_process: int = 1,
+ ) -> Iterator[Tuple[Doc, _AnyContext]]:
+ docs_with_contexts = (
+ self._ensure_doc_with_context(text, context) for text, context in texts
+ )
+ docs = self.pipe(
+ docs_with_contexts,
+ batch_size=batch_size,
+ disable=disable,
+ n_process=n_process,
+ component_cfg=component_cfg,
+ )
+ for doc in docs:
+ context = doc._context
+ doc._context = None
+ yield (doc, context)
+ return
+
def _has_gpu_model(self, disable: Iterable[str]):
for name, proc in self.pipeline:
is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable # type: ignore
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index c5fdc8eb0..4b38822b6 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -271,11 +271,11 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
("TEXT 666", 666),
]
with pytest.raises(ValueError):
- list(nlp.pipe(texts, as_tuples=True))
+ list(nlp.pipe_as_tuples(texts))
nlp.set_error_handler(warn_error)
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
- tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
+ tuples = list(nlp.pipe_as_tuples(texts, n_process=n_process))
# HACK/TODO? the warnings in child processes don't seem to be
# detected by the mock logger
if n_process == 1:
@@ -287,6 +287,18 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
+def test_language_previous_pipe_as_tuples_error(nlp):
+ texts = [
+ ("TEXT 111", 111),
+ ("TEXT 222", 222),
+ ("TEXT 333", 333),
+ ("TEXT 342", 342),
+ ("TEXT 666", 666),
+ ]
+ with pytest.raises(ValueError, match="nlp.pipe_as_tuples"):
+ list(nlp.pipe(texts, as_tuples=True))
+
+
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
"""Test the error handling of a component's pipe method"""
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 9a413efaf..ddf34681d 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -191,16 +191,42 @@ more efficient than processing texts one-by-one.
> assert doc.has_annotation("DEP")
> ```
-| Name | Description |
-| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `texts` | A sequence of strings. ~~Iterable[str]~~ |
-| _keyword-only_ | |
-| `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
-| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
-| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ |
-| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
+| Name | Description |
+| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `texts` | A sequence of strings. ~~Iterable[str]~~ |
+| _keyword-only_ | |
+| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
+| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ |
+| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
+
+## Language.pipe_as_tuples {#pipe_as_tuples tag="method" new="4"}
+
+Process `(text, context)` tuples as a stream, and yield `(Doc, context)` tuples
+in order. This is usually more efficient than processing texts one-by-one.
+
+> #### Example
+>
+> ```python
+> texts = [
+> ("One document.", {"id": 1}),
+> ("...", {"id": 2}),
+> ("Lots of documents", {"id": 1000}),
+> ]
+> for doc, context in nlp.pipe_as_tuples(texts, batch_size=50):
+> assert doc.has_annotation("DEP")
+> ```
+
+| Name | Description |
+| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `texts`                                    | A sequence of `(text, context)` tuples. ~~Iterable[Tuple[Union[str, Doc], Any]]~~                                                               |
+| _keyword-only_ | |
+| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
+| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ |
+| **YIELDS**                                 | `(Doc, context)` tuples in the order of the original texts. ~~Tuple[Doc, Any]~~                                                                  |
## Language.set_error_handler {#set_error_handler tag="method" new="3"}
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index bd28810ae..6b7e76619 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -91,11 +91,12 @@ have to call `list()` on it first:
-You can use the `as_tuples` option to pass additional context along with each
-doc when using [`nlp.pipe`](/api/language#pipe). If `as_tuples` is `True`, then
-the input should be a sequence of `(text, context)` tuples and the output will
-be a sequence of `(doc, context)` tuples. For example, you can pass metadata in
-the context and save it in a [custom attribute](#custom-components-attributes):
+You can use the [`nlp.pipe_as_tuples`](/api/language#pipe_as_tuples) method to
+pass additional context along with each doc when using the functionality of
+[`nlp.pipe`](/api/language#pipe). The input should be a sequence of
+`(text, context)` tuples and the output will be a sequence of `(doc, context)`
+tuples. For example, you can pass metadata in the context and save it in a
+[custom attribute](#custom-components-attributes):
```python
### {executable="true"}
@@ -111,7 +112,7 @@ text_tuples = [
]
nlp = spacy.load("en_core_web_sm")
-doc_tuples = nlp.pipe(text_tuples, as_tuples=True)
+doc_tuples = nlp.pipe_as_tuples(text_tuples)
docs = []
for doc, context in doc_tuples: