From 6b36d85920fd5604d9a86dc60ea24b1a0837b688 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 17 Aug 2022 09:26:16 +0200
Subject: [PATCH] Refactor pipe(as_tuples) into a separate method

---
 spacy/errors.py                            |  4 ++
 spacy/language.py                          | 81 ++++++++----------------
 spacy/tests/test_language.py               | 16 ++++-
 website/docs/api/language.md               | 46 +++++++++---
 website/docs/usage/processing-pipelines.md | 13 ++--
 5 files changed, 90 insertions(+), 70 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index fd412a4da..9b7473dc8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -539,6 +539,10 @@ class Errors(metaclass=ErrorsWithCodes):
             "issue tracker: http://github.com/explosion/spaCy/issues")
     E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
 
+    # New errors added in v4.x
+    E300 = ("nlp.pipe(text_tuples, as_tuples=True) has been replaced with:\n"
+            "nlp.pipe_as_tuples(text_tuples)")
+
     # New errors added in v3.x
     E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
             "permit overlapping spans.")
diff --git a/spacy/language.py b/spacy/language.py
index 816bd6531..00909718a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1470,51 +1470,20 @@ class Language:
             except StopIteration:
                 pass
 
-    @overload
     def pipe(
         self,
         texts: Iterable[Union[str, Doc]],
         *,
-        as_tuples: Literal[False] = ...,
-        batch_size: Optional[int] = ...,
-        disable: Iterable[str] = ...,
-        component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
-        n_process: int = ...,
-    ) -> Iterator[Doc]:
-        ...
-
-    @overload
-    def pipe(  # noqa: F811
-        self,
-        texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
-        *,
-        as_tuples: Literal[True] = ...,
-        batch_size: Optional[int] = ...,
-        disable: Iterable[str] = ...,
-        component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
-        n_process: int = ...,
-    ) -> Iterator[Tuple[Doc, _AnyContext]]:
-        ...
-
-    def pipe(  # noqa: F811
-        self,
-        texts: Union[
-            Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
-        ],
-        *,
-        as_tuples: bool = False,
         batch_size: Optional[int] = None,
         disable: Iterable[str] = SimpleFrozenList(),
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         n_process: int = 1,
-    ) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
+        as_tuples: Optional[bool] = None,  # deprecated
+    ) -> Iterator[Doc]:
         """Process texts as a stream, and yield `Doc` objects in order.
 
         texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to process.
-        as_tuples (bool): If set to True, inputs should be a sequence of
-            (text, context) tuples. Output will then be a sequence of
-            (doc, context) tuples. Defaults to False.
         batch_size (Optional[int]): The number of texts to buffer.
         disable (List[str]): Names of the pipeline components to disable.
         component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
@@ -1524,25 +1493,8 @@ class Language:
 
         DOCS: https://spacy.io/api/language#pipe
         """
-        if as_tuples:
-            texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
-            docs_with_contexts = (
-                self._ensure_doc_with_context(text, context) for text, context in texts
-            )
-            docs = self.pipe(
-                docs_with_contexts,
-                batch_size=batch_size,
-                disable=disable,
-                n_process=n_process,
-                component_cfg=component_cfg,
-            )
-            for doc in docs:
-                context = doc._context
-                doc._context = None
-                yield (doc, context)
-            return
-
-        texts = cast(Iterable[Union[str, Doc]], texts)
+        if as_tuples is not None:
+            raise ValueError(Errors.E300)
 
         # Set argument defaults
         if n_process == -1:
@@ -1583,6 +1535,31 @@ class Language:
         for doc in docs:
             yield doc
 
+    def pipe_as_tuples(
+        self,
+        texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
+        *,
+        batch_size: Optional[int] = None,
+        disable: Iterable[str] = SimpleFrozenList(),
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        n_process: int = 1,
+    ) -> Iterator[Tuple[Doc, _AnyContext]]:
+        docs_with_contexts = (
+            self._ensure_doc_with_context(text, context) for text, context in texts
+        )
+        docs = self.pipe(
+            docs_with_contexts,
+            batch_size=batch_size,
+            disable=disable,
+            n_process=n_process,
+            component_cfg=component_cfg,
+        )
+        for doc in docs:
+            context = doc._context
+            doc._context = None
+            yield (doc, context)
+        return
+
     def _has_gpu_model(self, disable: Iterable[str]):
         for name, proc in self.pipeline:
             is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable  # type: ignore
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index c5fdc8eb0..4b38822b6 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -271,11 +271,11 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
         ("TEXT 666", 666),
     ]
     with pytest.raises(ValueError):
-        list(nlp.pipe(texts, as_tuples=True))
+        list(nlp.pipe_as_tuples(texts))
     nlp.set_error_handler(warn_error)
     logger = logging.getLogger("spacy")
     with mock.patch.object(logger, "warning") as mock_warning:
-        tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
+        tuples = list(nlp.pipe_as_tuples(texts, n_process=n_process))
         # HACK/TODO? the warnings in child processes don't seem to be
         # detected by the mock logger
         if n_process == 1:
@@ -287,6 +287,18 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
     assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
 
 
+def test_language_previous_pipe_as_tuples_error(nlp):
+    texts = [
+        ("TEXT 111", 111),
+        ("TEXT 222", 222),
+        ("TEXT 333", 333),
+        ("TEXT 342", 342),
+        ("TEXT 666", 666),
+    ]
+    with pytest.raises(ValueError, match="nlp.pipe_as_tuples"):
+        list(nlp.pipe(texts, as_tuples=True))
+
+
 @pytest.mark.parametrize("n_process", [1, 2])
 def test_language_pipe_error_handler_pipe(en_vocab, n_process):
     """Test the error handling of a component's pipe method"""
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 9a413efaf..ddf34681d 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -191,16 +191,42 @@ more efficient than processing texts one-by-one.
 >     assert doc.has_annotation("DEP")
 > ```
 
-| Name                                       | Description                                                                                                                                                          |
-| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `texts`                                    | A sequence of strings. ~~Iterable[str]~~                                                                                                                             |
-| _keyword-only_                             |                                                                                                                                                                      |
-| `as_tuples`                                | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
-| `batch_size`                               | The number of texts to buffer. ~~Optional[int]~~                                                                                                                     |
-| `disable`                                  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~                                                                      |
-| `component_cfg`                            | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                       |
-| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                                                |
-| **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                                                 |
+| Name                                       | Description                                                                                                                                     |
+| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `texts`                                    | A sequence of strings. ~~Iterable[str]~~                                                                                                        |
+| _keyword-only_                             |                                                                                                                                                 |
+| `batch_size`                               | The number of texts to buffer. ~~Optional[int]~~                                                                                                |
+| `disable`                                  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~                                                 |
+| `component_cfg`                            | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                           |
+| **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                            |
+
+## Language.pipe_as_tuples {#pipe_as_tuples tag="method"}
+
+Process `(text, context)` tuples as a stream, and yield `(Doc, context)` tuples
+in order. This is usually more efficient than processing texts one-by-one.
+
+> #### Example
+>
+> ```python
+> texts = [
+>     ("One document.", {"id": 1}),
+>     ("...", {"id": 2}),
+>     ("Lots of documents", {"id": 1000}),
+> ]
+> for doc, context in nlp.pipe_as_tuples(texts, batch_size=50):
+>     assert doc.has_annotation("DEP")
+> ```
+
+| Name                                       | Description                                                                                                                                     |
+| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `texts`                                    | A sequence of `(text, context)` tuples. ~~Iterable[Tuple[str, Any]]~~                                                                           |
+| _keyword-only_                             |                                                                                                                                                 |
+| `batch_size`                               | The number of texts to buffer. ~~Optional[int]~~                                                                                                |
+| `disable`                                  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~                                                 |
+| `component_cfg`                            | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                           |
+| **YIELDS**                                 | `(doc, context)` tuples in the order of the original texts. ~~Tuple[Doc, Any]~~                                                                 |
 
 ## Language.set_error_handler {#set_error_handler tag="method" new="3"}
 
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index bd28810ae..6b7e76619 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -91,11 +91,12 @@ have to call `list()` on it first:
 
 </Infobox>
 
-You can use the `as_tuples` option to pass additional context along with each
-doc when using [`nlp.pipe`](/api/language#pipe). If `as_tuples` is `True`, then
-the input should be a sequence of `(text, context)` tuples and the output will
-be a sequence of `(doc, context)` tuples. For example, you can pass metadata in
-the context and save it in a [custom attribute](#custom-components-attributes):
+You can use the [`nlp.pipe_as_tuples`](/api/language#pipe_as_tuples) method to
+pass additional context along with each doc while processing texts as a
+stream, just like [`nlp.pipe`](/api/language#pipe). The input should be a
+sequence of `(text, context)` tuples and the output will be a sequence of
+`(doc, context)` tuples. For example, you can pass metadata in the context and
+save it in a [custom attribute](#custom-components-attributes):
 
 ```python
 ### {executable="true"}
 import spacy
 
 text_tuples = [
 ]
 
 nlp = spacy.load("en_core_web_sm")
-doc_tuples = nlp.pipe(text_tuples, as_tuples=True)
+doc_tuples = nlp.pipe_as_tuples(text_tuples)
 
 docs = []
 for doc, context in doc_tuples:
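For reference, the migration implied by error E300 looks like this in user code. A minimal sketch, assuming a build with this patch applied and an installed pipeline such as `en_core_web_sm`; the texts and `text_id` metadata mirror the documentation example in this patch:

```python
import spacy

# (text, context) tuples; the context can be any object, e.g. a metadata dict.
text_tuples = [
    ("This is the first text.", {"text_id": "text1"}),
    ("This is the second text.", {"text_id": "text2"}),
]

nlp = spacy.load("en_core_web_sm")  # assumes this pipeline is installed

# Before this patch:
#     doc_tuples = nlp.pipe(text_tuples, as_tuples=True)
# After this patch the call above raises ValueError (E300); use the new method:
for doc, context in nlp.pipe_as_tuples(text_tuples, batch_size=50):
    print(doc.text, context["text_id"])
```

The remaining keyword arguments (`batch_size`, `disable`, `component_cfg`, `n_process`) keep the same meaning as in `nlp.pipe`.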