diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 8210f7094..636354496 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names.
## Matcher.\_\_call\_\_ {#call tag="method"}
-Find all token sequences matching the supplied patterns on the `Doc`. As of
-spaCy v2.3, the `Matcher` can also be called on `Span` objects.
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> #### Example
>
@@ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects.
>
> matcher = Matcher(nlp.vocab)
> pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
-> matcher.add("HelloWorld", None, pattern)
+> matcher.add("HelloWorld", [pattern])
> doc = nlp("hello world!")
> matches = matcher(doc)
> ```
| Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). |
+| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
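+
+For example, you can iterate over the matches, look up each `match_id` hash in
+the vocab's `StringStore` to recover the rule name, and slice the `Doc` to get
+the matched span. This minimal sketch reuses `nlp`, `doc` and `matches` from
+the example above:
+
+```python
+for match_id, start, end in matches:
+    string_id = nlp.vocab.strings[match_id]  # e.g. "HelloWorld"
+    span = doc[start:end]                    # the matched span
+    print(string_id, start, end, span.text)
+```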
-
-
-By default, the matcher **does not perform any action** on matches, like tagging
-matched phrases with entity types. Instead, actions need to be specified when
-**adding patterns or entities**, by passing in a callback function as the
-`on_match` argument on [`add`](/api/matcher#add). This allows you to define
-custom actions per pattern within the same matcher. For example, you might only
-want to merge some entity types, and set custom flags for other matched
-patterns. For more details and examples, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
-
-
-
## Matcher.pipe {#pipe tag="method"}
Match a stream of documents, yielding them in turn.
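+
+For example, a minimal sketch of matching over a stream of texts (here, `texts`
+is assumed to be an iterable of strings; passing `return_matches=True` makes
+the matcher yield `(doc, matches)` tuples):
+
+```python
+docs = nlp.pipe(texts)
+for doc, matches in matcher.pipe(docs, return_matches=True):
+    # Each doc is yielded in order, together with its list of matches
+    print(matches)
+```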
@@ -92,7 +78,7 @@ patterns.
> ```python
> matcher = Matcher(nlp.vocab)
> assert len(matcher) == 0
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
> assert len(matcher) == 1
> ```
@@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID.
>
> ```python
> matcher = Matcher(nlp.vocab)
-> assert 'Rule' not in matcher
-> matcher.add('Rule', None, [{'ORTH': 'test'}])
-> assert 'Rule' in matcher
+> assert "Rule" not in matcher
+> matcher.add("Rule", [[{"ORTH": "test"}]])
+> assert "Rule" in matcher
> ```
| Name | Type | Description |
@@ -129,39 +115,39 @@ overwritten.
> #### Example
>
> ```python
-> def on_match(matcher, doc, id, matches):
-> print('Matched!', matches)
+> def on_match(matcher, doc, id, matches):
+>     print("Matched!", matches)
>
-> matcher = Matcher(nlp.vocab)
-> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
-> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
-> doc = nlp("HELLO WORLD on Google Maps.")
-> matches = matcher(doc)
+> matcher = Matcher(nlp.vocab)
+> patterns = [
+> [{"LOWER": "hello"}, {"LOWER": "world"}],
+> [{"ORTH": "Google"}, {"ORTH": "Maps"}]
+> ]
+> matcher.add("TEST_PATTERNS", patterns, on_match=on_match)
+> doc = nlp("HELLO WORLD on Google Maps.")
+> matches = matcher(doc)
> ```
-| Name | Type | Description |
-| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id` | str | An ID for the thing you're matching. |
-| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
-| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
+
-
-
-As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
-the default in the future. The patterns are now the second argument and a list
+As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
(instead of a variable number of arguments). The `on_match` callback becomes an
optional keyword argument.
```diff
patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
-- matcher.add("GoogleNow", None, *patterns)
-+ matcher.add("GoogleNow", patterns)
+- matcher.add("GoogleNow", on_match, *patterns)
++ matcher.add("GoogleNow", patterns, on_match=on_match)
```
+| Name | Type | Description |
+| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
+| `match_id` | str | An ID for the thing you're matching. |
+| `patterns` | list               | The match patterns. A pattern consists of a list of dicts, where each dict describes a token. |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
## Matcher.remove {#remove tag="method" new="2"}
Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
@@ -170,7 +156,7 @@ exist.
> #### Example
>
> ```python
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
> assert "Rule" in matcher
> matcher.remove("Rule")
> assert "Rule" not in matcher
@@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
> #### Example
>
> ```python
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
> on_match, patterns = matcher.get("Rule")
> ```
diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md
index f02d81de9..9c722297d 100644
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> from spacy.matcher import PhraseMatcher
>
> matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
> doc = nlp("Barack Obama lifts America one last time in emotional farewell")
> matches = matcher(doc)
> ```
@@ -104,7 +104,7 @@ patterns.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert len(matcher) == 0
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
> assert len(matcher) == 1
> ```
@@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert "OBAMA" not in matcher
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
> assert "OBAMA" in matcher
> ```
@@ -145,36 +145,32 @@ overwritten.
> print('Matched!', matches)
>
> matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", on_match, nlp("Barack Obama"))
-> matcher.add("HEALTH", on_match, nlp("health care reform"),
-> nlp("healthcare reform"))
+> matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match)
+> matcher.add("HEALTH", [nlp("health care reform"),
+>                        nlp("healthcare reform")], on_match=on_match)
> doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
> matches = matcher(doc)
> ```
-| Name | Type | Description |
-| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id` | str | An ID for the thing you're matching. |
-| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
-| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
+
-
-
-As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
-become the default in the future. The `Doc` patterns are now the second argument
-and a list (instead of a variable number of arguments). The `on_match` callback
+As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
+argument (instead of a variable number of arguments). The `on_match` callback
becomes an optional keyword argument.
```diff
patterns = [nlp("health care reform"), nlp("healthcare reform")]
-- matcher.add("HEALTH", None, *patterns)
-+ matcher.add("HEALTH", patterns)
+- matcher.add("HEALTH", on_match, *patterns)
++ matcher.add("HEALTH", patterns, on_match=on_match)
```
+| Name | Type | Description |
+| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
+| `match_id` | str | An ID for the thing you're matching. |
+| `docs` | list | `Doc` objects of the phrases to match. |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
## PhraseMatcher.remove {#remove tag="method" new="2.2"}
Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
@@ -184,7 +180,7 @@ does not exist.
>
> ```python
> matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
> assert "OBAMA" in matcher
> matcher.remove("OBAMA")
> assert "OBAMA" not in matcher
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 0ead27a49..6b32dc422 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -407,7 +407,7 @@ class EntityMatcher(object):
def __init__(self, nlp, terms, label):
patterns = [nlp.make_doc(text) for text in terms]
self.matcher = PhraseMatcher(nlp.vocab)
- self.matcher.add(label, None, *patterns)
+ self.matcher.add(label, patterns)
def __call__(self, doc):
matches = self.matcher(doc)
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 9a8f3da7b..d0ee44e49 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -98,9 +98,7 @@ print([token.text for token in doc])
First, we initialize the `Matcher` with a vocab. The matcher must always share
the same vocab with the documents it will operate on. We can now call
-[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The
-second argument lets you pass in an optional callback function to invoke on a
-successful match. For now, we set it to `None`.
+[`matcher.add()`](/api/matcher#add) with an ID and a list of patterns.
```python
### {executable="true"}
@@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
-matcher.add("HelloWorld", None, pattern)
+matcher.add("HelloWorld", [pattern])
doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
@@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to
also match sequences without punctuation between "hello" and "world":
```python
-matcher.add("HelloWorld", None,
- [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
- [{"LOWER": "hello"}, {"LOWER": "world"}])
+patterns = [
+ [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
+ [{"LOWER": "hello"}, {"LOWER": "world"}]
+]
+matcher.add("HelloWorld", patterns)
```
By default, the matcher will only return the matches and **not do anything
@@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab, validate=True)
# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
-matcher.add("HelloWorld", None, pattern)
+matcher.add("HelloWorld", [pattern])
# 🚨 Raises an error:
# MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
# Pattern 0:
@@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches):
print(entity.text)
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
-matcher.add("GoogleIO", add_event_ent, pattern)
+matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
doc = nlp("This is a text about Google I/O")
matches = matcher(doc)
```
@@ -509,19 +509,18 @@ import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token
-# We're using a class because the component needs to be initialised with
+# We're using a class because the component needs to be initialized with
# the shared vocab via the nlp object
class BadHTMLMerger(object):
def __init__(self, nlp):
+ patterns = [
+ [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
+ [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
+ ]
# Register a new token extension to flag bad HTML
Token.set_extension("bad_html", default=False)
self.matcher = Matcher(nlp.vocab)
- self.matcher.add(
- "BAD_HTML",
- None,
- [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
- [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
- )
+ self.matcher.add("BAD_HTML", patterns)
def __call__(self, doc):
# This method is invoked when the component is called on a Doc
@@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches):
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
{"POS": "ADJ"}]
-matcher.add("FacebookIs", collect_sents, pattern) # add pattern
+matcher.add("FacebookIs", [pattern], on_match=collect_sents) # add pattern
doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
matches = matcher(doc)
@@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
{"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
-matcher.add("PHONE_NUMBER", None, pattern)
+matcher.add("PHONE_NUMBER", [pattern])
doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
print([t.text for t in doc])
@@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches):
elif doc.vocab.strings[match_id] == "SAD":
doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment
-matcher.add("HAPPY", label_sentiment, *pos_patterns) # Add positive pattern
-matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern
+matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern
+matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern
# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
-matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
+matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc)
@@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
-matcher.add("TerminologyList", None, *patterns)
+matcher.add("TerminologyList", patterns)
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
"converse in the Oval Office inside the White House in Washington, D.C.")
@@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
-matcher.add("Names", None, *patterns)
+matcher.add("Names", patterns)
doc = nlp("angela merkel and us president barack Obama")
for match_id, start, end in matcher(doc):
@@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
-matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
+matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")])
doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for match_id, start, end in matcher(doc):
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 39d732724..aa8aa59af 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab)
def set_sentiment(matcher, doc, i, matches):
doc.sentiment += 0.1
-pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
-pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
-matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o"
-matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji
+patterns1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]]
+patterns2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
+matcher.add("GoogleIO", patterns1) # Match "Google I/O" or "Google i/o"
+matcher.add("HAPPY", patterns2, on_match=set_sentiment) # Match one or more happy emoji
doc = nlp("A text about Google I/O 😀😀")
matches = matcher(doc)