Merge pull request #6003 from explosion/feature/matcher-as-spans

commit 9af82f3f11
Ines Montani, 2020-08-31 17:50:56 +02:00 (committed by GitHub)
9 changed files with 129 additions and 93 deletions

spacy/errors.py

@@ -112,6 +112,9 @@ class Warnings:
             "word segmenters: {supported}. Defaulting to {default}.")
     W104 = ("Skipping modifications for '{target}' segmenter. The current "
             "segmenter is '{current}'.")
+    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
+            "need to match on a stream of documents, you can use nlp.pipe and "
+            "call the {matcher} on each Doc object.")

 @add_codes
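
For context: a minimal sketch of the replacement pattern this new warning points to, namely streaming `Doc` objects with `nlp.pipe` and calling the matcher on each one. The blank pipeline, the "HELLO" rule and `texts` below are illustrative stand-ins, not part of the diff:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO", [[{"LOWER": "hello"}]])  # stand-in pattern

texts = ["hello world", "another doc"]  # stand-in input stream
# Instead of matcher.pipe(docs), stream Docs with nlp.pipe and call
# the matcher on each Doc, as the W105 warning suggests:
for doc in nlp.pipe(texts):
    matches = matcher(doc)  # list of (match_id, start, end) tuples
```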

spacy/matcher/matcher.pyx

@@ -176,18 +176,10 @@ cdef class Matcher:
         return (self._callbacks[key], self._patterns[key])

     def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in docs:
                 matches = self(doc)
@@ -203,13 +195,16 @@ cdef class Matcher:
         else:
             yield doc

-    def __call__(self, object doclike):
+    def __call__(self, object doclike, *, as_spans=False):
         """Find all token sequences matching the supplied pattern.

         doclike (Doc or Span): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.
         """
         if isinstance(doclike, Doc):
             doc = doclike
@@ -262,7 +257,10 @@ cdef class Matcher:
             on_match = self._callbacks.get(key, None)
             if on_match is not None:
                 on_match(self, doc, i, final_matches)
-        return final_matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+        else:
+            return final_matches

     def _normalize_key(self, key):
         if isinstance(key, basestring):
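
To make the new `as_spans` branch concrete: calling the matcher with `as_spans=True` should be equivalent to building `Span` objects by hand from the returned tuples, exactly as the list comprehension above does. A minimal sketch (the blank pipeline and "TEST" pattern are illustrative assumptions):

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("TEST", [[{"LOWER": "world"}]])  # illustrative pattern
doc = nlp("hello world")

# as_spans=True returns Span objects labeled with the match key...
spans = matcher(doc, as_spans=True)
# ...equivalent to constructing them from the (match_id, start, end) tuples:
manual = [Span(doc, start, end, label=key) for key, start, end in matcher(doc)]
assert [(s.text, s.label_) for s in spans] == [(s.text, s.label_) for s in manual]
```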

spacy/matcher/phrasematcher.pyx

@@ -7,6 +7,7 @@ import warnings
 from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
+from ..tokens.span cimport Span
 from ..typedefs cimport attr_t

 from ..schemas import TokenPattern
@@ -216,13 +217,16 @@ cdef class PhraseMatcher:
             result = internal_node
         map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)

-    def __call__(self, doc):
+    def __call__(self, doc, *, as_spans=False):
         """Find all sequences matching the supplied patterns on the `Doc`.

         doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.

         DOCS: https://spacy.io/api/phrasematcher#call
         """
@@ -239,7 +243,10 @@ cdef class PhraseMatcher:
             on_match = self._callbacks.get(self.vocab.strings[ent_id])
             if on_match is not None:
                 on_match(self, doc, i, matches)
-        return matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in matches]
+        else:
+            return matches

     cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
         cdef MapStruct* current_node = self.c_map
@@ -285,20 +292,10 @@ cdef class PhraseMatcher:
             idx += 1

     def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (iterable): A stream of documents.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
-
-        DOCS: https://spacy.io/api/phrasematcher#pipe
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in stream:
                 matches = self(doc)

spacy/tests/matcher/test_matcher_api.py

@@ -2,7 +2,8 @@ import pytest
 import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
-from spacy.tokens import Doc, Token
+from spacy.tokens import Doc, Token, Span

 from ..doc.test_underscore import clean_underscore  # noqa: F401
@@ -469,3 +470,26 @@ def test_matcher_span(matcher):
     assert len(matcher(doc)) == 2
     assert len(matcher(span_js)) == 1
     assert len(matcher(span_java)) == 1
+
+
+def test_matcher_as_spans(matcher):
+    """Test the new as_spans=True API."""
+    text = "JavaScript is good but Java is better"
+    doc = Doc(matcher.vocab, words=text.split())
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "JavaScript"
+    assert matches[0].label_ == "JS"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "Java"
+    assert matches[1].label_ == "Java"
+
+
+def test_matcher_deprecated(matcher):
+    doc = Doc(matcher.vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)

spacy/tests/matcher/test_phrase_matcher.py

@@ -2,7 +2,7 @@ import pytest
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span

 from ..util import get_doc
@@ -287,3 +287,30 @@ def test_phrase_matcher_pickle(en_vocab):
     # clunky way to vaguely check that callback is unpickled
     (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
     assert isinstance(callbacks.get("TEST2"), Mock)
+
+
+def test_phrase_matcher_as_spans(en_vocab):
+    """Test the new as_spans=True API."""
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
+    matcher.add("B", [Doc(en_vocab, words=["test"])])
+    doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "hello world"
+    assert matches[0].label_ == "A"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "test"
+    assert matches[1].label_ == "B"
+
+
+def test_phrase_matcher_deprecated(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)

website/docs/api/matcher.md

@@ -116,31 +116,12 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > matches = matcher(doc)
 > ```

-| Name        | Description |
-| ----------- | ----------- |
-| `doclike`   | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
-| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
-
-## Matcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import Matcher
-> matcher = Matcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name | Description |
-| ---- | ----------- |
-| `docs` | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
-| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS** | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
+| Name | Description |
+| ---- | ----------- |
+| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
+| _keyword-only_ | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}

website/docs/api/phrasematcher.md

@@ -57,10 +57,12 @@ Find all token sequences matching the supplied patterns on the `Doc`.
 > matches = matcher(doc)
 > ```

-| Name        | Description |
-| ----------- | ----------- |
-| `doc`       | The document to match over. ~~Doc~~ |
-| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
+| Name | Description |
+| ---- | ----------- |
+| `doc` | The document to match over. ~~Doc~~ |
+| _keyword-only_ | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

 <Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
@@ -74,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]

 </Infobox>

-## PhraseMatcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import PhraseMatcher
-> matcher = PhraseMatcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name | Description |
-| ---- | ----------- |
-| `docs` | A stream of documents. ~~Iterable[Doc]~~ |
-| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS** | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |

 ## PhraseMatcher.\_\_len\_\_ {#len tag="method"}

 Get the number of rules added to the matcher. Note that this only returns the
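
The same keyword argument applies to `PhraseMatcher.__call__` as documented above. A short sketch of the documented behavior (the blank pipeline and the "OBAMA" pattern are illustrative assumptions):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp("Barack Obama")])  # illustrative phrase pattern
doc = nlp("Barack Obama was the 44th president")

for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)  # prints: Barack Obama OBAMA
```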

website/docs/usage/rule-based-matching.md

@@ -493,6 +493,39 @@ you prefer.
 | `i` | Index of the current match (`matches[i]`). ~~int~~ |
 | `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |

+### Creating spans from matches {#matcher-spans}
+
+Creating [`Span`](/api/span) objects from the returned matches is a very common
+use case. spaCy makes this easy by giving you access to the `start` and `end`
+token of each match, which you can use to construct a new span with an optional
+label. As of spaCy v3.0, you can also set `as_spans=True` when calling the
+matcher on a `Doc`, which will return a list of [`Span`](/api/span) objects
+using the `match_id` as the span label.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.matcher import Matcher
+from spacy.tokens import Span
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
+doc = nlp("Barack Obama was the 44th president of the United States")
+
+# 1. Return (match_id, start, end) tuples
+matches = matcher(doc)
+for match_id, start, end in matches:
+    # Create the matched span and assign the match_id as a label
+    span = Span(doc, start, end, label=match_id)
+    print(span.text, span.label_)
+
+# 2. Return Span objects directly
+matches = matcher(doc, as_spans=True)
+for span in matches:
+    print(span.text, span.label_)
+```
+
 ### Using custom pipeline components {#matcher-pipeline}

 Let's say your data also contains some annoying pre-processing artifacts, like
@@ -823,15 +856,6 @@ for token in doc:
     print(token.text, token._.is_hashtag)
 ```

-To process a stream of social media posts, we can use
-[`Language.pipe`](/api/language#pipe), which will return a stream of `Doc`
-objects that we can pass to [`Matcher.pipe`](/api/matcher#pipe).
-
-```python
-docs = nlp.pipe(LOTS_OF_TWEETS)
-matches = matcher.pipe(docs)
-```

 ## Efficient phrase matching {#phrasematcher}

 If you need to match large terminology lists, you can also use the

website/docs/usage/v3.md

@@ -389,6 +389,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `GoldParse` | [`Example`](/api/example) |
 | `GoldCorpus` | [`Corpus`](/api/corpus) |
 | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
+| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
 | `spacy init-model` | [`spacy init model`](/api/cli#init-model) |
 | `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
 | `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |