Merge pull request #6003 from explosion/feature/matcher-as-spans

2025-09-22 12:06:43 +03:00 · 2020-08-31 17:50:56 +02:00 · 2020-08-31 17:50:56 +02:00 · 9af82f3f11
commit 9af82f3f11
parent 3ac620f09d add9de5487
9 changed files with 129 additions and 93 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -112,6 +112,9 @@ class Warnings:
            "word segmenters: {supported}. Defaulting to {default}.")
    W104 = ("Skipping modifications for '{target}' segmenter. The current "
            "segmenter is '{current}'.")
+    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
+            "need to match on a stream of documents, you can use nlp.pipe and "
+            "call the {matcher} on each Doc object.")


@add_codes
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -176,18 +176,10 @@ cdef class Matcher:
        return (self._callbacks[key], self._patterns[key])

    def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
        """
+        warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
        if as_tuples:
            for doc, context in docs:
                matches = self(doc)
@ -203,13 +195,16 @@ cdef class Matcher:
                else:
                    yield doc

-    def __call__(self, object doclike):
+    def __call__(self, object doclike, *, as_spans=False):
        """Find all token sequences matching the supplied pattern.

        doclike (Doc or Span): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
            describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.
        """
        if isinstance(doclike, Doc):
            doc = doclike
@ -262,7 +257,10 @@ cdef class Matcher:
            on_match = self._callbacks.get(key, None)
            if on_match is not None:
                on_match(self, doc, i, final_matches)
-        return final_matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+        else:
+            return final_matches

    def _normalize_key(self, key):
        if isinstance(key, basestring):
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@ -7,6 +7,7 @@ import warnings
 from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
+from ..tokens.span cimport Span
 from ..typedefs cimport attr_t

 from ..schemas import TokenPattern
@ -216,13 +217,16 @@ cdef class PhraseMatcher:
                result = internal_node
            map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)

-    def __call__(self, doc):
+    def __call__(self, doc, *, as_spans=False):
        """Find all sequences matching the supplied patterns on the `Doc`.

        doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
            describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.

        DOCS: https://spacy.io/api/phrasematcher#call
        """
@ -239,7 +243,10 @@ cdef class PhraseMatcher:
            on_match = self._callbacks.get(self.vocab.strings[ent_id])
            if on_match is not None:
                on_match(self, doc, i, matches)
-        return matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in matches]
+        else:
+            return matches

    cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
        cdef MapStruct* current_node = self.c_map
@ -285,20 +292,10 @@ cdef class PhraseMatcher:
            idx += 1

    def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (iterable): A stream of documents.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
-
-        DOCS: https://spacy.io/api/phrasematcher#pipe
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
        """
+        warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
        if as_tuples:
            for doc, context in stream:
                matches = self(doc)
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -2,7 +2,8 @@ import pytest
 import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
-from spacy.tokens import Doc, Token
+from spacy.tokens import Doc, Token, Span
+
 from ..doc.test_underscore import clean_underscore  # noqa: F401


@ -469,3 +470,26 @@ def test_matcher_span(matcher):
    assert len(matcher(doc)) == 2
    assert len(matcher(span_js)) == 1
    assert len(matcher(span_java)) == 1
+
+
+def test_matcher_as_spans(matcher):
+    """Test the new as_spans=True API."""
+    text = "JavaScript is good but Java is better"
+    doc = Doc(matcher.vocab, words=text.split())
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "JavaScript"
+    assert matches[0].label_ == "JS"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "Java"
+    assert matches[1].label_ == "Java"
+
+
+def test_matcher_deprecated(matcher):
+    doc = Doc(matcher.vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@ -2,7 +2,7 @@ import pytest
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from ..util import get_doc


@ -287,3 +287,30 @@ def test_phrase_matcher_pickle(en_vocab):
    # clunky way to vaguely check that callback is unpickled
    (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
    assert isinstance(callbacks.get("TEST2"), Mock)
+
+
+def test_phrase_matcher_as_spans(en_vocab):
+    """Test the new as_spans=True API."""
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
+    matcher.add("B", [Doc(en_vocab, words=["test"])])
+    doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "hello world"
+    assert matches[0].label_ == "A"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "test"
+    assert matches[1].label_ == "B"
+
+
+def test_phrase_matcher_deprecated(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@ -116,31 +116,12 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > matches = matcher(doc)
 > ```

-| Name        | Description                                                                                                                                                                                             |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doclike`   | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~                                                                                                                                                 |
-| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
-
-## Matcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import Matcher
-> matcher = Matcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name                                          | Description                                                                                                                                                                                                                         |
-| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `docs`                                        | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~                                                                                                                                                                      |
-| `batch_size`                                  | The number of documents to accumulate into a working set. ~~int~~                                                                                                                                                                   |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~                                                                                                                                         |
-| `as_tuples`                                   | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS**                                    | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~                                                                                                                                                   |
+| Name                                  | Description                                                                                                                                                                                                                                                                                              |
+| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doclike`                             | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~                                                                                                                                                                                                                                                  |
+| _keyword-only_                        |                                                                                                                                                                                                                                                                                                          |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~                                                                                                                                            |
+| **RETURNS**                           | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}

--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@ -57,10 +57,12 @@ Find all token sequences matching the supplied patterns on the `Doc`.
 > matches = matcher(doc)
 > ```

-| Name        | Description                         |
-| ----------- | ----------------------------------- |
-| `doc`       | The document to match over. ~~Doc~~ |
-| **RETURNS** | list                                | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
+| Name                                  | Description                                                                                                                                                                                                                                                                                              |
+| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doc`                                 | The document to match over. ~~Doc~~                                                                                                                                                                                                                                                                      |
+| _keyword-only_                        |                                                                                                                                                                                                                                                                                                          |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~                                                                                                                                            |
+| **RETURNS**                           | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

 <Infobox title="Note on retrieving the string representation of the match_id" variant="warning">

@ -74,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]

 </Infobox>

-## PhraseMatcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
->   from spacy.matcher import PhraseMatcher
->   matcher = PhraseMatcher(nlp.vocab)
->   for doc in matcher.pipe(docs, batch_size=50):
->       pass
-> ```
-
-| Name                                          | Description                                                                                                                                                                                                                         |
-| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `docs`                                        | A stream of documents. ~~Iterable[Doc]~~                                                                                                                                                                                            |
-| `batch_size`                                  | The number of documents to accumulate into a working set. ~~int~~                                                                                                                                                                   |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~                                                                                                                                         |
-| `as_tuples`                                   | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS**                                    | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~                                                                                                                    |
-
 ## PhraseMatcher.\_\_len\_\_ {#len tag="method"}

 Get the number of rules added to the matcher. Note that this only returns the
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -493,6 +493,39 @@ you prefer.
 | `i`       | Index of the current match (`matches[i]`). ~~int~~                                                                                                 |
 | `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |

+### Creating spans from matches {#matcher-spans}
+
+Creating [`Span`](/api/span) objects from the returned matches is a very common
+use case. spaCy makes this easy by giving you access to the `start` and `end`
+token of each match, which you can use to construct a new span with an optional
+label. As of spaCy v3.0, you can also set `as_spans=True` when calling the
+matcher on a `Doc`, which will return a list of [`Span`](/api/span) objects
+using the `match_id` as the span label.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.matcher import Matcher
+from spacy.tokens import Span
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
+doc = nlp("Barack Obama was the 44th president of the United States")
+
+# 1. Return (match_id, start, end) tuples
+matches = matcher(doc)
+for match_id, start, end in matches:
+    # Create the matched span and assign the match_id as a label
+    span = Span(doc, start, end, label=match_id)
+    print(span.text, span.label_)
+
+# 2. Return Span objects directly
+matches = matcher(doc, as_spans=True)
+for span in matches:
+    print(span.text, span.label_)
+```
+
 ### Using custom pipeline components {#matcher-pipeline}

 Let's say your data also contains some annoying pre-processing artifacts, like
@ -823,15 +856,6 @@ for token in doc:
    print(token.text, token._.is_hashtag)
 ```

-To process a stream of social media posts, we can use
-[`Language.pipe`](/api/language#pipe), which will return a stream of `Doc`
-objects that we can pass to [`Matcher.pipe`](/api/matcher#pipe).
-
-```python
-docs = nlp.pipe(LOTS_OF_TWEETS)
-matches = matcher.pipe(docs)
-```
-
 ## Efficient phrase matching {#phrasematcher}

 If you need to match large terminology lists, you can also use the
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@ -389,6 +389,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `GoldParse`                                              | [`Example`](/api/example)                                                                                    |
 | `GoldCorpus`                                             | [`Corpus`](/api/corpus)                                                                                      |
 | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump`          | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk)                   |
+| `Matcher.pipe`, `PhraseMatcher.pipe`                     | not needed                                                                                                   |
 | `spacy init-model`                                       | [`spacy init model`](/api/cli#init-model)                                                                    |
 | `spacy debug-data`                                       | [`spacy debug data`](/api/cli#debug-data)                                                                    |
 | `spacy profile`                                          | [`spacy debug profile`](/api/cli#debug-profile)                                                              |