Deprecate (Phrase)Matcher.pipe

Ines Montani 2020-08-31 17:01:24 +02:00
parent db9f8896f5
commit add9de5487
9 changed files with 30 additions and 75 deletions

spacy/errors.py

@@ -112,6 +112,9 @@ class Warnings:
             "word segmenters: {supported}. Defaulting to {default}.")
     W104 = ("Skipping modifications for '{target}' segmenter. The current "
             "segmenter is '{current}'.")
+    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
+            "need to match on a stream of documents, you can use nlp.pipe and "
+            "call the {matcher} on each Doc object.")
 
 
 @add_codes
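The new warning text points users at `nlp.pipe`. A minimal sketch of that replacement pattern (assuming a loaded pipeline `nlp`, an example pattern, and a `texts` iterable — none of which appear in this commit):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")  # assumed model; any pipeline works
matcher = Matcher(nlp.vocab)
# Illustrative pattern, not part of this commit
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

texts = ["Hello world!", "hello there"]
# Instead of matcher.pipe(docs): stream Docs with nlp.pipe and call
# the matcher on each Doc, as the W105 message suggests.
for doc in nlp.pipe(texts):
    matches = matcher(doc)
```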

spacy/matcher/matcher.pyx

@@ -176,18 +176,10 @@ cdef class Matcher:
         return (self._callbacks[key], self._patterns[key])
 
     def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
         if as_tuples:
            for doc, context in docs:
                matches = self(doc)
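For code that relied on the removed `return_matches=True` behavior, the same `(doc, matches)` tuples can be rebuilt in one line. A sketch, reusing the hypothetical `nlp`, `matcher`, and `texts` from above:

```python
# Equivalent of the removed matcher.pipe(docs, return_matches=True)
results = ((doc, matcher(doc)) for doc in nlp.pipe(texts))
for doc, matches in results:
    for match_id, start, end in matches:
        print(doc[start:end].text)
```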

spacy/matcher/phrasematcher.pyx

@@ -292,20 +292,10 @@ cdef class PhraseMatcher:
             idx += 1
 
     def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (iterable): A stream of documents.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
-
-        DOCS: https://spacy.io/api/phrasematcher#pipe
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
         if as_tuples:
            for doc, context in stream:
                matches = self(doc)
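The removed `as_tuples` option has a direct counterpart on `nlp.pipe`, which also accepts `(text, context)` tuples and yields `(doc, context)` pairs. A sketch with made-up data, again assuming the `nlp` pipeline from above:

```python
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
matcher.add("TECH", [nlp.make_doc("machine learning")])

data = [("I like machine learning.", {"id": 1}), ("Nothing here.", {"id": 2})]
# Replaces matcher.pipe(stream, as_tuples=True): nlp.pipe carries the
# (text, context) tuples through, and the matcher runs on each Doc.
for doc, context in nlp.pipe(data, as_tuples=True):
    matches = matcher(doc)
```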

spacy/tests/matcher/test_matcher_api.py

@@ -484,3 +484,12 @@ def test_matcher_as_spans(matcher):
     assert isinstance(matches[1], Span)
     assert matches[1].text == "Java"
     assert matches[1].label_ == "Java"
+
+
+def test_matcher_deprecated(matcher):
+    doc = Doc(matcher.vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)

spacy/tests/matcher/test_phrase_matcher.py

@@ -303,3 +303,14 @@ def test_phrase_matcher_as_spans(en_vocab):
     assert isinstance(matches[1], Span)
     assert matches[1].text == "test"
     assert matches[1].label_ == "B"
+
+
+def test_phrase_matcher_deprecated(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)

website/docs/api/matcher.md

@@ -123,27 +123,6 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 | `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
 | **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
-
-## Matcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import Matcher
-> matcher = Matcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name | Description |
-| ---- | ----------- |
-| `docs` | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
-| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS** | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
 
 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
 
 Get the number of rules added to the matcher. Note that this only returns the
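If the removed `batch_size` argument mattered for throughput, `nlp.pipe` takes the same keyword. A one-line sketch under the earlier assumptions (`nlp`, `matcher`, `texts`):

```python
# batch_size moves from Matcher.pipe to nlp.pipe
for doc in nlp.pipe(texts, batch_size=50):
    matches = matcher(doc)
```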

website/docs/api/phrasematcher.md

@@ -76,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]
 </Infobox>
-
-## PhraseMatcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import PhraseMatcher
-> matcher = PhraseMatcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name | Description |
-| ---- | ----------- |
-| `docs` | A stream of documents. ~~Iterable[Doc]~~ |
-| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS** | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
 
 ## PhraseMatcher.\_\_len\_\_ {#len tag="method"}
 
 Get the number of rules added to the matcher. Note that this only returns the
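The fully general case, `return_matches=True` combined with `as_tuples=True`, can be wrapped in a small helper. This is an illustrative sketch, not an API shipped by spaCy:

```python
def match_stream(matcher, data, nlp):
    """Yield ((doc, matches), context) tuples, mirroring the removed
    PhraseMatcher.pipe(..., return_matches=True, as_tuples=True)."""
    for doc, context in nlp.pipe(data, as_tuples=True):
        yield (doc, matcher(doc)), context
```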

website/docs/usage/rule-based-matching.md

@@ -856,15 +856,6 @@ for token in doc:
     print(token.text, token._.is_hashtag)
 ```
-
-To process a stream of social media posts, we can use
-[`Language.pipe`](/api/language#pipe), which will return a stream of `Doc`
-objects that we can pass to [`Matcher.pipe`](/api/matcher#pipe).
-
-```python
-docs = nlp.pipe(LOTS_OF_TWEETS)
-matches = matcher.pipe(docs)
-```
 
 ## Efficient phrase matching {#phrasematcher}
 
 If you need to match large terminology lists, you can also use the
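A post-v3.0 version of the removed usage snippet might read as follows, keeping the page's `LOTS_OF_TWEETS` placeholder and assuming `matcher` is set up as earlier on that page:

```python
# nlp.pipe still streams Docs efficiently; the matcher is now called
# on each Doc instead of wrapping the stream.
for doc in nlp.pipe(LOTS_OF_TWEETS):
    matches = matcher(doc)
```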

website/docs/usage/v3.md

@@ -389,6 +389,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `GoldParse` | [`Example`](/api/example) |
 | `GoldCorpus` | [`Corpus`](/api/corpus) |
 | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
+| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
 | `spacy init-model` | [`spacy init model`](/api/cli#init-model) |
 | `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
 | `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |