Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-11 17:56:30 +03:00)

Commit b5268955d7: Update matcher usage examples [ci skip]
Parent: d36632553a

@@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names.
 ## Matcher.\_\_call\_\_ {#call tag="method"}
 
-Find all token sequences matching the supplied patterns on the `Doc`. As of
-spaCy v2.3, the `Matcher` can also be called on `Span` objects.
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 
 > #### Example
 >
 > ```python
@@ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects.
 >
 > matcher = Matcher(nlp.vocab)
 > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
-> matcher.add("HelloWorld", None, pattern)
+> matcher.add("HelloWorld", [pattern])
 > doc = nlp("hello world!")
 > matches = matcher(doc)
 > ```
 
 | Name        | Type         | Description |
 | ----------- | ------------ | ----------- |
-| `doclike`   | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). |
+| `doclike`   | `Doc`/`Span` | The `Doc` or `Span` to match over. |
 | **RETURNS** | list         | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
 
-<Infobox title="Important note" variant="warning">
-
-By default, the matcher **does not perform any action** on matches, like tagging
-matched phrases with entity types. Instead, actions need to be specified when
-**adding patterns or entities**, by passing in a callback function as the
-`on_match` argument on [`add`](/api/matcher#add). This allows you to define
-custom actions per pattern within the same matcher. For example, you might only
-want to merge some entity types, and set custom flags for other matched
-patterns. For more details and examples, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
-
-</Infobox>
-
 ## Matcher.pipe {#pipe tag="method"}
 
 Match a stream of documents, yielding them in turn.
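Pulling the pieces of this page together, here is a minimal end-to-end sketch of the updated `add` signature, the `(match_id, start, end)` return tuples and `pipe`. It assumes the `en_core_web_sm` model is installed; the callback name is illustrative:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

def print_match(matcher, doc, i, matches):
    # on_match callback: called with the matcher, the Doc, the index of
    # the current match and the list of (match_id, start, end) tuples
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)

pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern], on_match=print_match)

doc = nlp("hello world!")
for match_id, start, end in matcher(doc):
    span = doc[start:end]  # each match tuple describes a slice of the Doc
    print(match_id, span.text)

# Matcher.pipe, as documented above, matches a stream of documents
# and yields each Doc in turn
for doc in matcher.pipe(nlp.pipe(["hello world", "HELLO WORLD"])):
    pass
```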
@@ -92,7 +78,7 @@ patterns.
 > ```python
 > matcher = Matcher(nlp.vocab)
 > assert len(matcher) == 0
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
 > assert len(matcher) == 1
 > ```
@@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID.
 >
 > ```python
 > matcher = Matcher(nlp.vocab)
-> assert 'Rule' not in matcher
-> matcher.add('Rule', None, [{'ORTH': 'test'}])
-> assert 'Rule' in matcher
+> assert "Rule" not in matcher
+> matcher.add("Rule", [[{'ORTH': 'test'}]])
+> assert "Rule" in matcher
 > ```
 
 | Name | Type | Description |
@@ -129,39 +115,39 @@ overwritten.
 > #### Example
 >
 > ```python
-> def on_match(matcher, doc, id, matches):
->     print('Matched!', matches)
->
-> matcher = Matcher(nlp.vocab)
-> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
-> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
-> doc = nlp("HELLO WORLD on Google Maps.")
-> matches = matcher(doc)
+> def on_match(matcher, doc, id, matches):
+>     print('Matched!', matches)
+>
+> matcher = Matcher(nlp.vocab)
+> patterns = [
+>    [{"LOWER": "hello"}, {"LOWER": "world"}],
+>    [{"ORTH": "Google"}, {"ORTH": "Maps"}]
+> ]
+> matcher.add("TEST_PATTERNS", patterns)
+> doc = nlp("HELLO WORLD on Google Maps.")
+> matches = matcher(doc)
 > ```
 
-| Name        | Type               | Description |
-| ----------- | ------------------ | ----------- |
-| `match_id`  | str                | An ID for the thing you're matching. |
-| `on_match`  | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
-| `*patterns` | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
-
-<Infobox title="Changed in v2.2.2" variant="warning">
+<Infobox title="Changed in v3.0" variant="warning">
 
-As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
-the default in the future. The patterns are now the second argument and a list
+As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
+(instead of a variable number of arguments). The `on_match` callback becomes an
+optional keyword argument.
 
 ```diff
 patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
 - matcher.add("GoogleNow", None, *patterns)
 + matcher.add("GoogleNow", patterns)
 - matcher.add("GoogleNow", on_match, *patterns)
 + matcher.add("GoogleNow", patterns, on_match=on_match)
 ```
 
 </Infobox>
 
+| Name       | Type               | Description |
+| ---------- | ------------------ | ----------- |
+| `match_id` | str                | An ID for the thing you're matching. |
+| `patterns` | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
 ## Matcher.remove {#remove tag="method" new="2"}
 
 Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
@@ -170,7 +156,7 @@ exist.
 > #### Example
 >
 > ```python
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
 > assert "Rule" in matcher
 > matcher.remove("Rule")
 > assert "Rule" not in matcher
@@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
 > #### Example
 >
 > ```python
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
 > on_match, patterns = matcher.get("Rule")
 > ```
@@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
 > from spacy.matcher import PhraseMatcher
 >
 > matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > doc = nlp("Barack Obama lifts America one last time in emotional farewell")
 > matches = matcher(doc)
 > ```
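As a usage note, phrase patterns are ordinary `Doc` objects, so they can also be created with `nlp.make_doc` to avoid running the full pipeline. A small sketch, under the same `en_core_web_sm` assumption as above:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# nlp.make_doc only runs the tokenizer, which is enough for a phrase pattern
matcher.add("OBAMA", [nlp.make_doc("Barack Obama")])

doc = nlp("Barack Obama lifts America one last time in emotional farewell")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```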
@@ -104,7 +104,7 @@ patterns.
 > ```python
 > matcher = PhraseMatcher(nlp.vocab)
 > assert len(matcher) == 0
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > assert len(matcher) == 1
 > ```
@@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID.
 > ```python
 > matcher = PhraseMatcher(nlp.vocab)
 > assert "OBAMA" not in matcher
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > assert "OBAMA" in matcher
 > ```
@@ -145,36 +145,32 @@ overwritten.
 >     print('Matched!', matches)
 >
 > matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", on_match, nlp("Barack Obama"))
-> matcher.add("HEALTH", on_match, nlp("health care reform"),
->             nlp("healthcare reform"))
+> matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match)
+> matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match)
 > doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
 > matches = matcher(doc)
 > ```
 
-| Name       | Type               | Description |
-| ---------- | ------------------ | ----------- |
-| `match_id` | str                | An ID for the thing you're matching. |
-| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
-| `*docs`    | `Doc`              | `Doc` objects of the phrases to match. |
-
-<Infobox title="Changed in v2.2.2" variant="warning">
+<Infobox title="Changed in v3.0" variant="warning">
 
-As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
-become the default in the future. The `Doc` patterns are now the second argument
-and a list (instead of a variable number of arguments). The `on_match` callback
+As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
+argument (instead of a variable number of arguments). The `on_match` callback
 becomes an optional keyword argument.
 
 ```diff
 patterns = [nlp("health care reform"), nlp("healthcare reform")]
 - matcher.add("HEALTH", None, *patterns)
 + matcher.add("HEALTH", patterns)
 - matcher.add("HEALTH", on_match, *patterns)
 + matcher.add("HEALTH", patterns, on_match=on_match)
 ```
 
 </Infobox>
 
+| Name       | Type               | Description |
+| ---------- | ------------------ | ----------- |
+| `match_id` | str                | An ID for the thing you're matching. |
+| `docs`     | list               | `Doc` objects of the phrases to match. |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
 ## PhraseMatcher.remove {#remove tag="method" new="2.2"}
 
 Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
@@ -184,7 +180,7 @@ does not exist.
 >
 > ```python
 > matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > assert "OBAMA" in matcher
 > matcher.remove("OBAMA")
 > assert "OBAMA" not in matcher
@@ -407,7 +407,7 @@ class EntityMatcher(object):
     def __init__(self, nlp, terms, label):
         patterns = [nlp.make_doc(text) for text in terms]
         self.matcher = PhraseMatcher(nlp.vocab)
-        self.matcher.add(label, None, *patterns)
+        self.matcher.add(label, patterns)
 
     def __call__(self, doc):
         matches = self.matcher(doc)
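The hunk cuts off inside `__call__`; judging by the surrounding guide, the component goes on to turn matches into entities. For completeness, a plausible full version of the component; the entity-creation body is a reconstruction, not part of the diff:

```python
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

class EntityMatcher(object):
    def __init__(self, nlp, terms, label):
        patterns = [nlp.make_doc(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            # The match ID is the label that was passed to add()
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc
```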
@@ -98,9 +98,7 @@ print([token.text for token in doc])
 
 First, we initialize the `Matcher` with a vocab. The matcher must always share
 the same vocab with the documents it will operate on. We can now call
-[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The
-second argument lets you pass in an optional callback function to invoke on a
-successful match. For now, we set it to `None`.
+[`matcher.add()`](/api/matcher#add) with an ID and a list of patterns.
 
 ```python
 ### {executable="true"}
@@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab)
 # Add match ID "HelloWorld" with no callback and one pattern
 pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
-matcher.add("HelloWorld", None, pattern)
+matcher.add("HelloWorld", [pattern])
 
 doc = nlp("Hello, world! Hello world!")
 matches = matcher(doc)
@@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to
 also match sequences without punctuation between "hello" and "world":
 
 ```python
-matcher.add("HelloWorld", None,
-            [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
-            [{"LOWER": "hello"}, {"LOWER": "world"}])
+patterns = [
+    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
+    [{"LOWER": "hello"}, {"LOWER": "world"}]
+]
+matcher.add("HelloWorld", patterns)
 ```
 
 By default, the matcher will only return the matches and **not do anything
@@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab, validate=True)
 # Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
 pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
-matcher.add("HelloWorld", None, pattern)
+matcher.add("HelloWorld", [pattern])
 # 🚨 Raises an error:
 # MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
 # Pattern 0:
@@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches):
     print(entity.text)
 
 pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
-matcher.add("GoogleIO", add_event_ent, pattern)
+matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
 doc = nlp("This is a text about Google I/O")
 matches = matcher(doc)
 ```
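Only the last line of `add_event_ent` is visible in this hunk; a sketch of the kind of callback the guide describes, with the `EVENT` label assumed from context:

```python
from spacy.tokens import Span

def add_event_ent(matcher, doc, i, matches):
    # Get the matched span and append it to the doc's entities
    # (append, don't overwrite doc.ents)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    doc.ents += (entity,)
    print(entity.text)
```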
@@ -509,19 +509,18 @@ import spacy
 from spacy.matcher import Matcher
 from spacy.tokens import Token
 
-# We're using a class because the component needs to be initialised with
+# We're using a class because the component needs to be initialized with
 # the shared vocab via the nlp object
 class BadHTMLMerger(object):
     def __init__(self, nlp):
+        patterns = [
+            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
+            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
+        ]
         # Register a new token extension to flag bad HTML
         Token.set_extension("bad_html", default=False)
         self.matcher = Matcher(nlp.vocab)
-        self.matcher.add(
-            "BAD_HTML",
-            None,
-            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
-            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
-        )
+        self.matcher.add("BAD_HTML", patterns)
 
     def __call__(self, doc):
         # This method is invoked when the component is called on a Doc
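Here too the `__call__` body is truncated. A plausible full component follows; everything after the truncation point (the merge-and-flag logic) is a reconstruction of what the guide most likely does with the matches:

```python
from spacy.matcher import Matcher
from spacy.tokens import Token

class BadHTMLMerger(object):
    def __init__(self, nlp):
        patterns = [
            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
        ]
        # Register a new token extension to flag bad HTML
        Token.set_extension("bad_html", default=False)
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add("BAD_HTML", patterns)

    def __call__(self, doc):
        # Collect the matched spans, then merge each one into a single
        # token and flag it via the extension registered above
        matches = self.matcher(doc)
        spans = [doc[start:end] for match_id, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.bad_html = True
        return doc
```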
@@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches):
 
 pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
            {"POS": "ADJ"}]
-matcher.add("FacebookIs", collect_sents, pattern)  # add pattern
+matcher.add("FacebookIs", [pattern], on_match=collect_sents)  # add pattern
 doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
 matches = matcher(doc)
@@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab)
 pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
            {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
-matcher.add("PHONE_NUMBER", None, pattern)
+matcher.add("PHONE_NUMBER", [pattern])
 
 doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
 print([t.text for t in doc])
@@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches):
     elif doc.vocab.strings[match_id] == "SAD":
         doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
 
-matcher.add("HAPPY", label_sentiment, *pos_patterns)  # Add positive pattern
-matcher.add("SAD", label_sentiment, *neg_patterns)  # Add negative pattern
+matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
+matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern
 
 # Add pattern for valid hashtag, i.e. '#' plus any ASCII token
-matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
+matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
 
 doc = nlp("Hello world 😀 #MondayMotivation")
 matches = matcher(doc)
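Since `match_id` values are hashes, the callbacks above resolve the rule's string name through the vocab; the same lookup works when iterating matches directly:

```python
for match_id, start, end in matcher(doc):
    string_id = nlp.vocab.strings[match_id]  # e.g. "HAPPY" or "HASHTAG"
    print(string_id, doc[start:end].text)
```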
@@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab)
 terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
 # Only run nlp.make_doc to speed things up
 patterns = [nlp.make_doc(text) for text in terms]
-matcher.add("TerminologyList", None, *patterns)
+matcher.add("TerminologyList", patterns)
 
 doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
           "converse in the Oval Office inside the White House in Washington, D.C.")
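The `nlp.make_doc` call in this hunk is the efficiency point the comment hints at: it only runs the tokenizer, whereas calling `nlp` directly would also run the tagger, parser and any other pipeline components just to build patterns. Schematically:

```python
# Slower: runs the whole pipeline for every phrase
patterns = [nlp(term) for term in terms]
# Faster: tokenization only, which is all a phrase pattern needs
patterns = [nlp.make_doc(term) for term in terms]
```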
@@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher
 nlp = English()
 matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
 patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
-matcher.add("Names", None, *patterns)
+matcher.add("Names", patterns)
 
 doc = nlp("angela merkel and us president barack Obama")
 for match_id, start, end in matcher(doc):
@@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher
 
 nlp = English()
 matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
-matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
+matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")])
 
 doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
 for match_id, start, end in matcher(doc):
@@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab)
 def set_sentiment(matcher, doc, i, matches):
     doc.sentiment += 0.1
 
-pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
-pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
-matcher.add("GoogleIO", None, pattern1)  # Match "Google I/O" or "Google i/o"
-matcher.add("HAPPY", set_sentiment, *pattern2)  # Match one or more happy emoji
+pattern1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]]
+pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
+matcher.add("GoogleIO", pattern1)  # Match "Google I/O" or "Google i/o"
+matcher.add("HAPPY", pattern2, on_match=set_sentiment)  # Match one or more happy emoji
 
 doc = nlp("A text about Google I/O 😀😀")
 matches = matcher(doc)