mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 11:20:19 +03:00
Refactor pipe(as_tuples) into a separate method
This commit is contained in:
parent
551e73ccfc
commit
6b36d85920
|
@ -539,6 +539,10 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
|
# New errors added in v4.x
|
||||||
|
E300 = ("nlp.pipe(text_tuples, as_tuples=True) has been replaced with:\n"
|
||||||
|
"nlp.pipe_as_tuples(text_tuples)")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
||||||
"permit overlapping spans.")
|
"permit overlapping spans.")
|
||||||
|
|
|
@ -1470,51 +1470,20 @@ class Language:
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@overload
|
|
||||||
def pipe(
|
def pipe(
|
||||||
self,
|
self,
|
||||||
texts: Iterable[Union[str, Doc]],
|
texts: Iterable[Union[str, Doc]],
|
||||||
*,
|
*,
|
||||||
as_tuples: Literal[False] = ...,
|
|
||||||
batch_size: Optional[int] = ...,
|
|
||||||
disable: Iterable[str] = ...,
|
|
||||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
|
|
||||||
n_process: int = ...,
|
|
||||||
) -> Iterator[Doc]:
|
|
||||||
...
|
|
||||||
|
|
||||||
@overload
|
|
||||||
def pipe( # noqa: F811
|
|
||||||
self,
|
|
||||||
texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
|
|
||||||
*,
|
|
||||||
as_tuples: Literal[True] = ...,
|
|
||||||
batch_size: Optional[int] = ...,
|
|
||||||
disable: Iterable[str] = ...,
|
|
||||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
|
|
||||||
n_process: int = ...,
|
|
||||||
) -> Iterator[Tuple[Doc, _AnyContext]]:
|
|
||||||
...
|
|
||||||
|
|
||||||
def pipe( # noqa: F811
|
|
||||||
self,
|
|
||||||
texts: Union[
|
|
||||||
Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
|
|
||||||
],
|
|
||||||
*,
|
|
||||||
as_tuples: bool = False,
|
|
||||||
batch_size: Optional[int] = None,
|
batch_size: Optional[int] = None,
|
||||||
disable: Iterable[str] = SimpleFrozenList(),
|
disable: Iterable[str] = SimpleFrozenList(),
|
||||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||||
n_process: int = 1,
|
n_process: int = 1,
|
||||||
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
|
as_tuples: Optional[bool] = None, # deprecated
|
||||||
|
) -> Iterator[Doc]:
|
||||||
"""Process texts as a stream, and yield `Doc` objects in order.
|
"""Process texts as a stream, and yield `Doc` objects in order.
|
||||||
|
|
||||||
texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
|
texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
|
||||||
process.
|
process.
|
||||||
as_tuples (bool): If set to True, inputs should be a sequence of
|
|
||||||
(text, context) tuples. Output will then be a sequence of
|
|
||||||
(doc, context) tuples. Defaults to False.
|
|
||||||
batch_size (Optional[int]): The number of texts to buffer.
|
batch_size (Optional[int]): The number of texts to buffer.
|
||||||
disable (List[str]): Names of the pipeline components to disable.
|
disable (List[str]): Names of the pipeline components to disable.
|
||||||
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
|
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
|
||||||
|
@ -1524,25 +1493,8 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#pipe
|
DOCS: https://spacy.io/api/language#pipe
|
||||||
"""
|
"""
|
||||||
if as_tuples:
|
if as_tuples is not None:
|
||||||
texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
|
raise ValueError(Errors.E300)
|
||||||
docs_with_contexts = (
|
|
||||||
self._ensure_doc_with_context(text, context) for text, context in texts
|
|
||||||
)
|
|
||||||
docs = self.pipe(
|
|
||||||
docs_with_contexts,
|
|
||||||
batch_size=batch_size,
|
|
||||||
disable=disable,
|
|
||||||
n_process=n_process,
|
|
||||||
component_cfg=component_cfg,
|
|
||||||
)
|
|
||||||
for doc in docs:
|
|
||||||
context = doc._context
|
|
||||||
doc._context = None
|
|
||||||
yield (doc, context)
|
|
||||||
return
|
|
||||||
|
|
||||||
texts = cast(Iterable[Union[str, Doc]], texts)
|
|
||||||
|
|
||||||
# Set argument defaults
|
# Set argument defaults
|
||||||
if n_process == -1:
|
if n_process == -1:
|
||||||
|
@ -1583,6 +1535,31 @@ class Language:
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
|
def pipe_as_tuples(
|
||||||
|
self,
|
||||||
|
texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
|
||||||
|
*,
|
||||||
|
batch_size: Optional[int] = None,
|
||||||
|
disable: Iterable[str] = SimpleFrozenList(),
|
||||||
|
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||||
|
n_process: int = 1,
|
||||||
|
) -> Iterator[Tuple[Doc, _AnyContext]]:
|
||||||
|
docs_with_contexts = (
|
||||||
|
self._ensure_doc_with_context(text, context) for text, context in texts
|
||||||
|
)
|
||||||
|
docs = self.pipe(
|
||||||
|
docs_with_contexts,
|
||||||
|
batch_size=batch_size,
|
||||||
|
disable=disable,
|
||||||
|
n_process=n_process,
|
||||||
|
component_cfg=component_cfg,
|
||||||
|
)
|
||||||
|
for doc in docs:
|
||||||
|
context = doc._context
|
||||||
|
doc._context = None
|
||||||
|
yield (doc, context)
|
||||||
|
return
|
||||||
|
|
||||||
def _has_gpu_model(self, disable: Iterable[str]):
|
def _has_gpu_model(self, disable: Iterable[str]):
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable # type: ignore
|
is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable # type: ignore
|
||||||
|
|
|
@ -271,11 +271,11 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
|
||||||
("TEXT 666", 666),
|
("TEXT 666", 666),
|
||||||
]
|
]
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(nlp.pipe(texts, as_tuples=True))
|
list(nlp.pipe_as_tuples(texts))
|
||||||
nlp.set_error_handler(warn_error)
|
nlp.set_error_handler(warn_error)
|
||||||
logger = logging.getLogger("spacy")
|
logger = logging.getLogger("spacy")
|
||||||
with mock.patch.object(logger, "warning") as mock_warning:
|
with mock.patch.object(logger, "warning") as mock_warning:
|
||||||
tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
|
tuples = list(nlp.pipe_as_tuples(texts, n_process=n_process))
|
||||||
# HACK/TODO? the warnings in child processes don't seem to be
|
# HACK/TODO? the warnings in child processes don't seem to be
|
||||||
# detected by the mock logger
|
# detected by the mock logger
|
||||||
if n_process == 1:
|
if n_process == 1:
|
||||||
|
@ -287,6 +287,18 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
|
||||||
assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
|
assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
|
||||||
|
|
||||||
|
|
||||||
|
def test_language_previous_pipe_as_tuples_error(nlp):
|
||||||
|
texts = [
|
||||||
|
("TEXT 111", 111),
|
||||||
|
("TEXT 222", 222),
|
||||||
|
("TEXT 333", 333),
|
||||||
|
("TEXT 342", 342),
|
||||||
|
("TEXT 666", 666),
|
||||||
|
]
|
||||||
|
with pytest.raises(ValueError, match="nlp.pipe_as_tuples"):
|
||||||
|
list(nlp.pipe(texts, as_tuples=True))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("n_process", [1, 2])
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
|
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
|
||||||
"""Test the error handling of a component's pipe method"""
|
"""Test the error handling of a component's pipe method"""
|
||||||
|
|
|
@ -191,16 +191,42 @@ more efficient than processing texts one-by-one.
|
||||||
> assert doc.has_annotation("DEP")
|
> assert doc.has_annotation("DEP")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `texts` | A sequence of strings. ~~Iterable[str]~~ |
|
| `texts` | A sequence of strings. ~~Iterable[str]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
|
| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
|
||||||
| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
|
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
|
||||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
|
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
|
||||||
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
|
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
|
||||||
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
|
|
||||||
|
## Language.pipe_as_tuples {#pipe_as_tuples tag="method"}
|
||||||
|
|
||||||
|
Process `(text, context)` tuples as a stream, and yield `(Doc, context)` tuples
|
||||||
|
in order. This is usually more efficient than processing texts one-by-one.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> texts = [
|
||||||
|
> ("One document.", {"id": 1}),
|
||||||
|
> ("...", {"id": 2}),
|
||||||
|
> ("Lots of documents", {"id": 1000}),
|
||||||
|
> ]
|
||||||
|
> for doc, context in nlp.pipe_as_tuples(texts, batch_size=50):
|
||||||
|
> assert doc.has_annotation("DEP")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `texts` | A sequence of `(text, context)` tuples, where each text is a string or a `Doc`. ~~Iterable[Tuple[Union[str, Doc], Any]]~~ |
|
||||||
|
| _keyword-only_ | |
|
||||||
|
| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
|
||||||
|
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
|
||||||
|
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||||
|
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
|
||||||
|
| **YIELDS** | `(doc, context)` tuples in the order of the original input. ~~Tuple[Doc, Any]~~ |
|
||||||
|
|
||||||
## Language.set_error_handler {#set_error_handler tag="method" new="3"}
|
## Language.set_error_handler {#set_error_handler tag="method" new="3"}
|
||||||
|
|
||||||
|
|
|
@ -91,11 +91,12 @@ have to call `list()` on it first:
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
You can use the `as_tuples` option to pass additional context along with each
|
You can use the [`nlp.pipe_as_tuples`](/api/language#pipe_as_tuples) method to
|
||||||
doc when using [`nlp.pipe`](/api/language#pipe). If `as_tuples` is `True`, then
|
pass additional context along with each doc when using the functionality of
|
||||||
the input should be a sequence of `(text, context)` tuples and the output will
|
[`nlp.pipe`](/api/language#pipe). The input should be a sequence of
|
||||||
be a sequence of `(doc, context)` tuples. For example, you can pass metadata in
|
`(text, context)` tuples and the output will be a sequence of `(doc, context)`
|
||||||
the context and save it in a [custom attribute](#custom-components-attributes):
|
tuples. For example, you can pass metadata in the context and save it in a
|
||||||
|
[custom attribute](#custom-components-attributes):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
|
@ -111,7 +112,7 @@ text_tuples = [
|
||||||
]
|
]
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
doc_tuples = nlp.pipe(text_tuples, as_tuples=True)
|
doc_tuples = nlp.pipe_as_tuples(text_tuples)
|
||||||
|
|
||||||
docs = []
|
docs = []
|
||||||
for doc, context in doc_tuples:
|
for doc, context in doc_tuples:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user