Mirror of https://github.com/explosion/spaCy.git

Merge pull request #6003 from explosion/feature/matcher-as-spans (commit 9af82f3f11)
spacy/errors.py
@@ -112,6 +112,9 @@ class Warnings:
             "word segmenters: {supported}. Defaulting to {default}.")
     W104 = ("Skipping modifications for '{target}' segmenter. The current "
             "segmenter is '{current}'.")
+    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
+            "need to match on a stream of documents, you can use nlp.pipe and "
+            "call the {matcher} on each Doc object.")


 @add_codes
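As a quick illustration (not part of the diff): the new template is a plain format string, and the matcher classes below emit it via `warnings.warn`. A minimal sketch of how the rendered message looks:

```python
# Illustrative sketch only: rendering the W105 template added above.
# The real code calls Warnings.W105.format(...) from spacy.errors.
import warnings

W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
        "need to match on a stream of documents, you can use nlp.pipe and "
        "call the {matcher} on each Doc object.")

warnings.warn(W105.format(matcher="Matcher"), DeprecationWarning)
```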
spacy/matcher/matcher.pyx
@@ -176,18 +176,10 @@ cdef class Matcher:
         return (self._callbacks[key], self._patterns[key])

     def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in docs:
                 matches = self(doc)
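For readers migrating: a minimal sketch of the replacement pattern the warning recommends — pipe texts through `nlp.pipe` and call the matcher on each `Doc`. The pattern name and texts here are illustrative, not from the commit:

```python
# Migration sketch (illustrative names/data): instead of matcher.pipe(docs),
# stream Docs with nlp.pipe and call the matcher on each one.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

texts = ["hello world", "so long"]

# Deprecated in v3.0 (now emits W105):
#     for doc in matcher.pipe(nlp.pipe(texts)): ...
# Recommended replacement:
for doc in nlp.pipe(texts):
    matches = matcher(doc)
    print(doc.text, matches)
```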
@@ -203,13 +195,16 @@ cdef class Matcher:
             else:
                 yield doc

-    def __call__(self, object doclike):
+    def __call__(self, object doclike, *, as_spans=False):
         """Find all token sequences matching the supplied pattern.

         doclike (Doc or Span): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.
         """
         if isinstance(doclike, Doc):
             doc = doclike
@@ -262,7 +257,10 @@ cdef class Matcher:
             on_match = self._callbacks.get(key, None)
             if on_match is not None:
                 on_match(self, doc, i, final_matches)
-        return final_matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+        else:
+            return final_matches

     def _normalize_key(self, key):
         if isinstance(key, basestring):
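A small usage sketch of the new flag, mirroring the test added further down (the pattern name and text are illustrative):

```python
# Usage sketch for the new keyword-only as_spans flag.
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("GREETING", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
doc = nlp("hello world")

spans = matcher(doc, as_spans=True)
assert isinstance(spans[0], Span)
assert spans[0].label_ == "GREETING"  # the match_id becomes the span label
```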
spacy/matcher/phrasematcher.pyx
@@ -7,6 +7,7 @@ import warnings

 from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
+from ..tokens.span cimport Span
 from ..typedefs cimport attr_t

 from ..schemas import TokenPattern
@@ -216,13 +217,16 @@ cdef class PhraseMatcher:
             result = internal_node
         map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)

-    def __call__(self, doc):
+    def __call__(self, doc, *, as_spans=False):
         """Find all sequences matching the supplied patterns on the `Doc`.

         doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.

         DOCS: https://spacy.io/api/phrasematcher#call
         """
@@ -239,7 +243,10 @@ cdef class PhraseMatcher:
             on_match = self._callbacks.get(self.vocab.strings[ent_id])
             if on_match is not None:
                 on_match(self, doc, i, matches)
-        return matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in matches]
+        else:
+            return matches

     cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
         cdef MapStruct* current_node = self.c_map
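The same flag works on `PhraseMatcher`. A hedged sketch, with an illustrative key and text (grounded in the tests added below):

```python
# as_spans=True on PhraseMatcher: returns labeled Span objects.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp("Barack Obama")])
doc = nlp("Barack Obama was the 44th president")

for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)  # "Barack Obama" "OBAMA"
```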
@@ -285,20 +292,10 @@ cdef class PhraseMatcher:
             idx += 1

     def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (iterable): A stream of documents.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
-
-        DOCS: https://spacy.io/api/phrasematcher#pipe
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in stream:
                 matches = self(doc)
spacy/tests/matcher/test_matcher_api.py
@@ -2,7 +2,8 @@ import pytest
 import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
-from spacy.tokens import Doc, Token
+from spacy.tokens import Doc, Token, Span
+
 from ..doc.test_underscore import clean_underscore  # noqa: F401

@@ -469,3 +470,26 @@ def test_matcher_span(matcher):
     assert len(matcher(doc)) == 2
     assert len(matcher(span_js)) == 1
     assert len(matcher(span_java)) == 1
+
+
+def test_matcher_as_spans(matcher):
+    """Test the new as_spans=True API."""
+    text = "JavaScript is good but Java is better"
+    doc = Doc(matcher.vocab, words=text.split())
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "JavaScript"
+    assert matches[0].label_ == "JS"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "Java"
+    assert matches[1].label_ == "Java"
+
+
+def test_matcher_deprecated(matcher):
+    doc = Doc(matcher.vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)
spacy/tests/matcher/test_phrase_matcher.py
@@ -2,7 +2,7 @@ import pytest
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from ..util import get_doc

@@ -287,3 +287,30 @@ def test_phrase_matcher_pickle(en_vocab):
     # clunky way to vaguely check that callback is unpickled
     (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
     assert isinstance(callbacks.get("TEST2"), Mock)
+
+
+def test_phrase_matcher_as_spans(en_vocab):
+    """Test the new as_spans=True API."""
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
+    matcher.add("B", [Doc(en_vocab, words=["test"])])
+    doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "hello world"
+    assert matches[0].label_ == "A"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "test"
+    assert matches[1].label_ == "B"
+
+
+def test_phrase_matcher_deprecated(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)
website/docs/api/matcher.md
@@ -116,31 +116,12 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > matches = matcher(doc)
 > ```

-| Name        | Description |
-| ----------- | ----------- |
-| `doclike`   | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
-| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
-
-## Matcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import Matcher
-> matcher = Matcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name                                          | Description |
-| --------------------------------------------- | ----------- |
-| `docs`                                        | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
-| `batch_size`                                  | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples`                                   | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS**                                    | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
+| Name                                  | Description |
+| ------------------------------------- | ----------- |
+| `doclike`                             | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
+| _keyword-only_                        | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS**                           | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
website/docs/api/phrasematcher.md
@@ -57,10 +57,12 @@ Find all token sequences matching the supplied patterns on the `Doc`.
 > matches = matcher(doc)
 > ```

-| Name        | Description |
-| ----------- | ----------- |
-| `doc`       | The document to match over. ~~Doc~~ |
-| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
+| Name                                  | Description |
+| ------------------------------------- | ----------- |
+| `doc`                                 | The document to match over. ~~Doc~~ |
+| _keyword-only_                        | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS**                           | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

 <Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
@@ -74,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]

 </Infobox>

-## PhraseMatcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import PhraseMatcher
-> matcher = PhraseMatcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name                                          | Description |
-| --------------------------------------------- | ----------- |
-| `docs`                                        | A stream of documents. ~~Iterable[Doc]~~ |
-| `batch_size`                                  | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples`                                   | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS**                                    | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
-
 ## PhraseMatcher.\_\_len\_\_ {#len tag="method"}

 Get the number of rules added to the matcher. Note that this only returns the
website/docs/usage/rule-based-matching.md
@@ -493,6 +493,39 @@ you prefer.
 | `i`       | Index of the current match (`matches[i]`). ~~int~~ |
 | `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |

+### Creating spans from matches {#matcher-spans}
+
+Creating [`Span`](/api/span) objects from the returned matches is a very common
+use case. spaCy makes this easy by giving you access to the `start` and `end`
+token of each match, which you can use to construct a new span with an optional
+label. As of spaCy v3.0, you can also set `as_spans=True` when calling the
+matcher on a `Doc`, which will return a list of [`Span`](/api/span) objects
+using the `match_id` as the span label.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.matcher import Matcher
+from spacy.tokens import Span
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
+doc = nlp("Barack Obama was the 44th president of the United States")
+
+# 1. Return (match_id, start, end) tuples
+matches = matcher(doc)
+for match_id, start, end in matches:
+    # Create the matched span and assign the match_id as a label
+    span = Span(doc, start, end, label=match_id)
+    print(span.text, span.label_)
+
+# 2. Return Span objects directly
+matches = matcher(doc, as_spans=True)
+for span in matches:
+    print(span.text, span.label_)
+```
+
 ### Using custom pipeline components {#matcher-pipeline}

 Let's say your data also contains some annoying pre-processing artifacts, like
@@ -823,15 +856,6 @@ for token in doc:
     print(token.text, token._.is_hashtag)
 ```

-To process a stream of social media posts, we can use
-[`Language.pipe`](/api/language#pipe), which will return a stream of `Doc`
-objects that we can pass to [`Matcher.pipe`](/api/matcher#pipe).
-
-```python
-docs = nlp.pipe(LOTS_OF_TWEETS)
-matches = matcher.pipe(docs)
-```
-
 ## Efficient phrase matching {#phrasematcher}

 If you need to match large terminology lists, you can also use the
website/docs/usage/v3.md
@@ -389,6 +389,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `GoldParse`                                     | [`Example`](/api/example) |
 | `GoldCorpus`                                    | [`Corpus`](/api/corpus) |
 | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
+| `Matcher.pipe`, `PhraseMatcher.pipe`            | not needed |
 | `spacy init-model`                              | [`spacy init model`](/api/cli#init-model) |
 | `spacy debug-data`                              | [`spacy debug data`](/api/cli#debug-data) |
 | `spacy profile`                                 | [`spacy debug profile`](/api/cli#debug-profile) |