Add Language.pipe_as_tuples
As part of the transition to v4, add `Language.pipe_as_tuples()` and deprecate `Language.pipe(as_tuples=True)`.
Commit 6c268d4ed9 (parent 9a454676f3)
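In practice, the change swaps a keyword flag for a dedicated method. A minimal before/after sketch, assuming spaCy v3.6 with this change installed (the blank pipeline and data are only illustrative):

```python
import spacy

nlp = spacy.blank("en")
data = [("A first text.", {"id": 1}), ("A second text.", {"id": 2})]

# Deprecated as of v3.6: emits the W125 deprecation warning and is
# scheduled for removal in v4.0.
old_style = list(nlp.pipe(data, as_tuples=True))

# Preferred replacement added by this commit.
new_style = list(nlp.pipe_as_tuples(data))
```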
@@ -215,6 +215,9 @@ class Warnings(metaclass=ErrorsWithCodes):
     W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+    W125 = ("As of spaCy v3.6, `nlp.pipe(as_tuples=True)` has been deprecated "
+            "in favor of `nlp.pipe_as_tuples()`. `nlp.pipe(as_tuples=True)` "
+            "will be removed in spaCy v4.0.")


 class Errors(metaclass=ErrorsWithCodes):
@@ -1516,6 +1516,7 @@ class Language:

         DOCS: https://spacy.io/api/language#pipe
         """
         if as_tuples:
+            warnings.warn(Warnings.W125, DeprecationWarning)
             texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
             docs_with_contexts = (
                 self._ensure_doc_with_context(text, context) for text, context in texts
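Because the deprecated path now goes through `warnings.warn`, callers can surface or silence it with the standard `warnings` machinery. A small sketch of that behavior, under the same assumptions as the example above:

```python
import warnings

import spacy

nlp = spacy.blank("en")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Force the generator to run so the deprecated branch executes.
    list(nlp.pipe([("hello", 0)], as_tuples=True))

# The deprecated call should have raised W125 as a DeprecationWarning.
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```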
@@ -1574,6 +1575,31 @@ class Language:
         for doc in docs:
             yield doc

+    def pipe_as_tuples(
+        self,
+        texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
+        *,
+        batch_size: Optional[int] = None,
+        disable: Iterable[str] = SimpleFrozenList(),
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        n_process: int = 1,
+    ) -> Iterator[Tuple[Doc, _AnyContext]]:
+        docs_with_contexts = (
+            self._ensure_doc_with_context(text, context) for text, context in texts
+        )
+        docs = self.pipe(
+            docs_with_contexts,
+            batch_size=batch_size,
+            disable=disable,
+            n_process=n_process,
+            component_cfg=component_cfg,
+        )
+        for doc in docs:
+            context = doc._context
+            doc._context = None
+            yield (doc, context)
+        return
+
     def _has_gpu_model(self, disable: Iterable[str]):
         for name, proc in self.pipeline:
             is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable  # type: ignore
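As the implementation shows, `pipe_as_tuples` attaches each context to its `Doc` via `_context`, streams everything through `pipe`, and detaches the context again before yielding. A short usage sketch (pipeline and texts are illustrative):

```python
import spacy

nlp = spacy.blank("en")
pairs = [("First text.", {"id": 1}), ("Second text.", {"id": 2})]

for doc, context in nlp.pipe_as_tuples(pairs, batch_size=2):
    # The context comes back alongside its Doc; doc._context has been
    # reset to None before the tuple is yielded.
    print(doc.text, context["id"])
```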
@@ -362,6 +362,22 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process):
     assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]


+def test_language_pipe_as_tuples():
+    nlp = English()
+    texts = [
+        ("TEXT 111", 111),
+        ("TEXT 222", 222),
+        ("TEXT 333", 333),
+        ("TEXT 342", 342),
+        ("TEXT 666", 666),
+    ]
+    with pytest.warns(DeprecationWarning):
+        docs_contexts = list(nlp.pipe(texts, as_tuples=True))
+    assert len(docs_contexts) == len(texts)
+    docs_contexts = list(nlp.pipe_as_tuples(texts))
+    assert len(docs_contexts) == len(texts)
+
+
 @pytest.mark.parametrize("n_process", [1, 2])
 def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
     """Test the error handling of nlp.pipe with input as tuples"""
@@ -378,11 +394,11 @@ def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
         ("TEXT 666", 666),
     ]
     with pytest.raises(ValueError):
-        list(nlp.pipe(texts, as_tuples=True))
+        list(nlp.pipe_as_tuples(texts))
     nlp.set_error_handler(warn_error)
     logger = logging.getLogger("spacy")
     with mock.patch.object(logger, "warning") as mock_warning:
-        tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
+        tuples = list(nlp.pipe_as_tuples(texts))
         # HACK/TODO? the warnings in child processes don't seem to be
         # detected by the mock logger
         if n_process == 1:
@@ -198,16 +198,43 @@ tokenization is skipped but the rest of the pipeline is run.
 >     assert doc.has_annotation("DEP")
 > ```

-| Name            | Description |
-| --------------- | ----------- |
-| `texts`         | A sequence of strings (or `Doc` objects). ~~Iterable[Union[str, Doc]]~~ |
-| _keyword-only_  |             |
-| `as_tuples`     | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
-| `batch_size`    | The number of texts to buffer. ~~Optional[int]~~ |
-| `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| `n_process`     | Number of processors to use. Defaults to `1`. ~~int~~ |
-| **YIELDS**      | Documents in the order of the original text. ~~Doc~~ |
+| Name            | Description |
+| --------------- | ----------- |
+| `texts`         | A sequence of strings (or `Doc` objects). ~~Iterable[Union[str, Doc]]~~ |
+| _keyword-only_  |             |
+| `as_tuples`     | Deprecated in v3.6 in favor of [`Language.pipe_as_tuples`](#pipe_as_tuples). If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
+| `batch_size`    | The number of texts to buffer. ~~Optional[int]~~ |
+| `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process`     | Number of processors to use. Defaults to `1`. ~~int~~ |
+| **YIELDS**      | Documents in the order of the original text. ~~Doc~~ |
+
+## Language.pipe_as_tuples {id="pipe_as_tuples",tag="method",version="3.6"}
+
+Process `(text, context)` tuples as a stream and yield `(Doc, context)` tuples
+in order. This is usually more efficient than processing texts one-by-one.
+
+> #### Example
+>
+> ```python
+> texts = [
+>     ("One document.", {"id": 1}),
+>     ("...", {"id": 2}),
+>     ("Lots of documents", {"id": 1000}),
+> ]
+> for doc, context in nlp.pipe_as_tuples(texts, batch_size=50):
+>     assert doc.has_annotation("DEP")
+> ```
+
+| Name            | Description |
+| --------------- | ----------- |
+| `texts`         | A sequence of `(text, context)` tuples, where each text is a string or `Doc`. ~~Iterable[Tuple[Union[str, Doc], Any]]~~ |
+| _keyword-only_  |             |
+| `batch_size`    | The number of texts to buffer. ~~Optional[int]~~ |
+| `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `n_process`     | Number of processors to use. Defaults to `1`. ~~int~~ |
+| **YIELDS**      | `(doc, context)` tuples in the order of the original texts. ~~Tuple[Doc, Any]~~ |

 ## Language.set_error_handler {id="set_error_handler",tag="method",version="3"}
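To make the difference between the two documented signatures concrete, here is a small typing sketch (annotations follow the tables above; the blank pipeline is illustrative):

```python
from typing import Any, Iterator, Tuple

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")

# nlp.pipe yields plain Doc objects...
docs: Iterator[Doc] = nlp.pipe(["one", "two"])

# ...while nlp.pipe_as_tuples yields (Doc, context) pairs.
pairs: Iterator[Tuple[Doc, Any]] = nlp.pipe_as_tuples([("one", 1), ("two", 2)])
```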
@@ -88,11 +88,12 @@ have to call `list()` on it first:

 </Infobox>

-You can use the `as_tuples` option to pass additional context along with each
-doc when using [`nlp.pipe`](/api/language#pipe). If `as_tuples` is `True`, then
-the input should be a sequence of `(text, context)` tuples and the output will
-be a sequence of `(doc, context)` tuples. For example, you can pass metadata in
-the context and save it in a [custom attribute](#custom-components-attributes):
+You can use the [`nlp.pipe_as_tuples`](/api/language#pipe_as_tuples) method to
+pass additional context along with each doc while processing texts as a stream,
+just like [`nlp.pipe`](/api/language#pipe). The input should be a sequence of
+`(text, context)` tuples and the output will be a sequence of `(doc, context)`
+tuples. For example, you can pass metadata in the context and save it in a
+[custom attribute](#custom-components-attributes):

 ```python {executable="true"}
 import spacy
@@ -107,7 +108,7 @@ text_tuples = [
 ]

 nlp = spacy.load("en_core_web_sm")
-doc_tuples = nlp.pipe(text_tuples, as_tuples=True)
+doc_tuples = nlp.pipe_as_tuples(text_tuples)

 docs = []
 for doc, context in doc_tuples:
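The hunk above is cut off before the body of the usage example. For orientation, the pattern it leads into stores the context on a custom `Doc` extension; a sketch of that pattern follows, with the extension name and attribute handling as assumptions not shown in the hunk, and a blank pipeline substituted for `en_core_web_sm` so the snippet runs without a downloaded model:

```python
import spacy
from spacy.tokens import Doc

# Hypothetical extension name; the docs example defines its own custom attribute.
if not Doc.has_extension("text_id"):
    Doc.set_extension("text_id", default=None)

text_tuples = [
    ("This is the first text.", {"text_id": "text1"}),
    ("This is the second text.", {"text_id": "text2"}),
]

# spacy.blank("en") keeps the sketch self-contained; the docs use a trained pipeline.
nlp = spacy.blank("en")

docs = []
for doc, context in nlp.pipe_as_tuples(text_tuples):
    doc._.text_id = context["text_id"]
    docs.append(doc)

for doc in docs:
    print(doc.text, doc._.text_id)
```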