Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-11 17:56:30 +03:00)

Commit b5268955d7: Update matcher usage examples [ci skip]
Parent: d36632553a

@@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names.
 ## Matcher.\_\_call\_\_ {#call tag="method"}
 
-Find all token sequences matching the supplied patterns on the `Doc`. As of
-spaCy v2.3, the `Matcher` can also be called on `Span` objects.
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 
 > #### Example
 >
 > ```python
@@ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects.
 >
 > matcher = Matcher(nlp.vocab)
 > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
-> matcher.add("HelloWorld", None, pattern)
+> matcher.add("HelloWorld", [pattern])
 > doc = nlp("hello world!")
 > matches = matcher(doc)
 > ```
 
 | Name        | Type         | Description |
 | ----------- | ------------ | ----------- |
-| `doclike`   | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). |
+| `doclike`   | `Doc`/`Span` | The `Doc` or `Span` to match over. |
 | **RETURNS** | list         | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
 
-<Infobox title="Important note" variant="warning">
-
-By default, the matcher **does not perform any action** on matches, like tagging
-matched phrases with entity types. Instead, actions need to be specified when
-**adding patterns or entities**, by passing in a callback function as the
-`on_match` argument on [`add`](/api/matcher#add). This allows you to define
-custom actions per pattern within the same matcher. For example, you might only
-want to merge some entity types, and set custom flags for other matched
-patterns. For more details and examples, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
-
-</Infobox>
-
 ## Matcher.pipe {#pipe tag="method"}
 
 Match a stream of documents, yielding them in turn.
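Pulling the pieces of this page together, here is a minimal end-to-end sketch of the updated `add` signature, the `(match_id, start, end)` return tuples and `pipe`. It assumes the `en_core_web_sm` model is installed; the callback name is illustrative:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

def print_match(matcher, doc, i, matches):
    # on_match callback: called with the matcher, the Doc, the index of
    # the current match and the list of (match_id, start, end) tuples
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)

pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern], on_match=print_match)

doc = nlp("hello world!")
for match_id, start, end in matcher(doc):
    span = doc[start:end]  # each match tuple describes a slice of the Doc
    print(match_id, span.text)

# Matcher.pipe, as documented above, matches a stream of documents
# and yields each Doc in turn
for doc in matcher.pipe(nlp.pipe(["hello world", "HELLO WORLD"])):
    pass
```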
@@ -92,7 +78,7 @@ patterns.
 > ```python
 > matcher = Matcher(nlp.vocab)
 > assert len(matcher) == 0
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
 > assert len(matcher) == 1
 > ```
@@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID.
 >
 > ```python
 > matcher = Matcher(nlp.vocab)
-> assert 'Rule' not in matcher
-> matcher.add('Rule', None, [{'ORTH': 'test'}])
-> assert 'Rule' in matcher
+> assert "Rule" not in matcher
+> matcher.add("Rule", [[{'ORTH': 'test'}]])
+> assert "Rule" in matcher
 > ```
 
 | Name | Type | Description |
@@ -129,39 +115,39 @@ overwritten.
 > #### Example
 >
 > ```python
-> def on_match(matcher, doc, id, matches):
->     print('Matched!', matches)
->
-> matcher = Matcher(nlp.vocab)
-> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
-> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
-> doc = nlp("HELLO WORLD on Google Maps.")
-> matches = matcher(doc)
+> def on_match(matcher, doc, id, matches):
+>     print('Matched!', matches)
+>
+> matcher = Matcher(nlp.vocab)
+> patterns = [
+>    [{"LOWER": "hello"}, {"LOWER": "world"}],
+>    [{"ORTH": "Google"}, {"ORTH": "Maps"}]
+> ]
+> matcher.add("TEST_PATTERNS", patterns)
+> doc = nlp("HELLO WORLD on Google Maps.")
+> matches = matcher(doc)
 > ```
 
-| Name        | Type               | Description |
-| ----------- | ------------------ | ----------- |
-| `match_id`  | str                | An ID for the thing you're matching. |
-| `on_match`  | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
-| `*patterns` | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
-
-<Infobox title="Changed in v2.2.2" variant="warning">
+<Infobox title="Changed in v3.0" variant="warning">
 
-As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
-the default in the future. The patterns are now the second argument and a list
+As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
+(instead of a variable number of arguments). The `on_match` callback becomes an
+optional keyword argument.
 
 ```diff
 patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
 - matcher.add("GoogleNow", None, *patterns)
 + matcher.add("GoogleNow", patterns)
 - matcher.add("GoogleNow", on_match, *patterns)
 + matcher.add("GoogleNow", patterns, on_match=on_match)
 ```
 
 </Infobox>
 
+| Name       | Type               | Description |
+| ---------- | ------------------ | ----------- |
+| `match_id` | str                | An ID for the thing you're matching. |
+| `patterns` | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
 ## Matcher.remove {#remove tag="method" new="2"}
 
 Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
@@ -170,7 +156,7 @@ exist.
 > #### Example
 >
 > ```python
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
 > assert "Rule" in matcher
 > matcher.remove("Rule")
 > assert "Rule" not in matcher
@@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
 > #### Example
 >
 > ```python
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
 > on_match, patterns = matcher.get("Rule")
 > ```
@@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
 > from spacy.matcher import PhraseMatcher
 >
 > matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > doc = nlp("Barack Obama lifts America one last time in emotional farewell")
 > matches = matcher(doc)
 > ```
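As a usage note, phrase patterns are ordinary `Doc` objects, so they can also be created with `nlp.make_doc` to avoid running the full pipeline. A small sketch, under the same `en_core_web_sm` assumption as above:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# nlp.make_doc only runs the tokenizer, which is enough for a phrase pattern
matcher.add("OBAMA", [nlp.make_doc("Barack Obama")])

doc = nlp("Barack Obama lifts America one last time in emotional farewell")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```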
@@ -104,7 +104,7 @@ patterns.
 > ```python
 > matcher = PhraseMatcher(nlp.vocab)
 > assert len(matcher) == 0
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > assert len(matcher) == 1
 > ```
@@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID.
 > ```python
 > matcher = PhraseMatcher(nlp.vocab)
 > assert "OBAMA" not in matcher
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > assert "OBAMA" in matcher
 > ```
@@ -145,36 +145,32 @@ overwritten.
 >     print('Matched!', matches)
 >
 > matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", on_match, nlp("Barack Obama"))
-> matcher.add("HEALTH", on_match, nlp("health care reform"),
->             nlp("healthcare reform"))
+> matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match)
+> matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match)
 > doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
 > matches = matcher(doc)
 > ```
 
-| Name       | Type               | Description |
-| ---------- | ------------------ | ----------- |
-| `match_id` | str                | An ID for the thing you're matching. |
-| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
-| `*docs`    | `Doc`              | `Doc` objects of the phrases to match. |
-
-<Infobox title="Changed in v2.2.2" variant="warning">
+<Infobox title="Changed in v3.0" variant="warning">
 
-As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
-become the default in the future. The `Doc` patterns are now the second argument
-and a list (instead of a variable number of arguments). The `on_match` callback
+As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
+argument (instead of a variable number of arguments). The `on_match` callback
 becomes an optional keyword argument.
 
 ```diff
 patterns = [nlp("health care reform"), nlp("healthcare reform")]
 - matcher.add("HEALTH", None, *patterns)
 + matcher.add("HEALTH", patterns)
 - matcher.add("HEALTH", on_match, *patterns)
 + matcher.add("HEALTH", patterns, on_match=on_match)
 ```
 
 </Infobox>
 
+| Name       | Type               | Description |
+| ---------- | ------------------ | ----------- |
+| `match_id` | str                | An ID for the thing you're matching. |
+| `docs`     | list               | `Doc` objects of the phrases to match. |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
 ## PhraseMatcher.remove {#remove tag="method" new="2.2"}
 
 Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
@@ -184,7 +180,7 @@ does not exist.
 >
 > ```python
 > matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > assert "OBAMA" in matcher
 > matcher.remove("OBAMA")
 > assert "OBAMA" not in matcher
@@ -407,7 +407,7 @@ class EntityMatcher(object):
     def __init__(self, nlp, terms, label):
         patterns = [nlp.make_doc(text) for text in terms]
         self.matcher = PhraseMatcher(nlp.vocab)
-        self.matcher.add(label, None, *patterns)
+        self.matcher.add(label, patterns)
 
     def __call__(self, doc):
         matches = self.matcher(doc)
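The hunk cuts off inside `__call__`; judging by the surrounding guide, the component goes on to turn matches into entities. For completeness, a plausible full version of the component; the entity-creation body is a reconstruction, not part of the diff:

```python
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

class EntityMatcher(object):
    def __init__(self, nlp, terms, label):
        patterns = [nlp.make_doc(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            # The match ID is the label that was passed to add()
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc
```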
@@ -98,9 +98,7 @@ print([token.text for token in doc])
 
 First, we initialize the `Matcher` with a vocab. The matcher must always share
 the same vocab with the documents it will operate on. We can now call
-[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The
-second argument lets you pass in an optional callback function to invoke on a
-successful match. For now, we set it to `None`.
+[`matcher.add()`](/api/matcher#add) with an ID and a list of patterns.
 
 ```python
 ### {executable="true"}
@@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab)
 # Add match ID "HelloWorld" with no callback and one pattern
 pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
-matcher.add("HelloWorld", None, pattern)
+matcher.add("HelloWorld", [pattern])
 
 doc = nlp("Hello, world! Hello world!")
 matches = matcher(doc)
@@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to
 also match sequences without punctuation between "hello" and "world":
 
 ```python
-matcher.add("HelloWorld", None,
-            [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
-            [{"LOWER": "hello"}, {"LOWER": "world"}])
+patterns = [
+    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
+    [{"LOWER": "hello"}, {"LOWER": "world"}]
+]
+matcher.add("HelloWorld", patterns)
 ```
 
 By default, the matcher will only return the matches and **not do anything
@@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab, validate=True)
 # Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
 pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
-matcher.add("HelloWorld", None, pattern)
+matcher.add("HelloWorld", [pattern])
 # 🚨 Raises an error:
 # MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
 # Pattern 0:
@@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches):
     print(entity.text)
 
 pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
-matcher.add("GoogleIO", add_event_ent, pattern)
+matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
 doc = nlp("This is a text about Google I/O")
 matches = matcher(doc)
 ```
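Only the last line of `add_event_ent` is visible in this hunk; a sketch of the kind of callback the guide describes, with the `EVENT` label assumed from context:

```python
from spacy.tokens import Span

def add_event_ent(matcher, doc, i, matches):
    # Get the matched span and append it to the doc's entities
    # (append, don't overwrite doc.ents)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    doc.ents += (entity,)
    print(entity.text)
```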
@@ -509,19 +509,18 @@ import spacy
 from spacy.matcher import Matcher
 from spacy.tokens import Token
 
-# We're using a class because the component needs to be initialised with
+# We're using a class because the component needs to be initialized with
 # the shared vocab via the nlp object
 class BadHTMLMerger(object):
     def __init__(self, nlp):
+        patterns = [
+            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
+            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
+        ]
         # Register a new token extension to flag bad HTML
         Token.set_extension("bad_html", default=False)
         self.matcher = Matcher(nlp.vocab)
-        self.matcher.add(
-            "BAD_HTML",
-            None,
-            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
-            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
-        )
+        self.matcher.add("BAD_HTML", patterns)
 
     def __call__(self, doc):
         # This method is invoked when the component is called on a Doc
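Here too the `__call__` body is truncated. A plausible full component follows; everything after the truncation point (the merge-and-flag logic) is a reconstruction of what the guide most likely does with the matches:

```python
from spacy.matcher import Matcher
from spacy.tokens import Token

class BadHTMLMerger(object):
    def __init__(self, nlp):
        patterns = [
            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
        ]
        # Register a new token extension to flag bad HTML
        Token.set_extension("bad_html", default=False)
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add("BAD_HTML", patterns)

    def __call__(self, doc):
        # Collect the matched spans, then merge each one into a single
        # token and flag it via the extension registered above
        matches = self.matcher(doc)
        spans = [doc[start:end] for match_id, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.bad_html = True
        return doc
```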
@@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches):
 
 pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
            {"POS": "ADJ"}]
-matcher.add("FacebookIs", collect_sents, pattern)  # add pattern
+matcher.add("FacebookIs", [pattern], on_match=collect_sents)  # add pattern
 doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
 matches = matcher(doc)
@@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab)
 pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
            {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
-matcher.add("PHONE_NUMBER", None, pattern)
+matcher.add("PHONE_NUMBER", [pattern])
 
 doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
 print([t.text for t in doc])
@@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches):
     elif doc.vocab.strings[match_id] == "SAD":
         doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
 
-matcher.add("HAPPY", label_sentiment, *pos_patterns)  # Add positive pattern
-matcher.add("SAD", label_sentiment, *neg_patterns)  # Add negative pattern
+matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
+matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern
 
 # Add pattern for valid hashtag, i.e. '#' plus any ASCII token
-matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
+matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
 
 doc = nlp("Hello world 😀 #MondayMotivation")
 matches = matcher(doc)
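Since `match_id` values are hashes, the callbacks above resolve the rule's string name through the vocab; the same lookup works when iterating matches directly:

```python
for match_id, start, end in matcher(doc):
    string_id = nlp.vocab.strings[match_id]  # e.g. "HAPPY" or "HASHTAG"
    print(string_id, doc[start:end].text)
```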
@@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab)
 terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
 # Only run nlp.make_doc to speed things up
 patterns = [nlp.make_doc(text) for text in terms]
-matcher.add("TerminologyList", None, *patterns)
+matcher.add("TerminologyList", patterns)
 
 doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
           "converse in the Oval Office inside the White House in Washington, D.C.")
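The `nlp.make_doc` call in this hunk is the efficiency point the comment hints at: it only runs the tokenizer, whereas calling `nlp` directly would also run the tagger, parser and any other pipeline components just to build patterns. Schematically:

```python
# Slower: runs the whole pipeline for every phrase
patterns = [nlp(term) for term in terms]
# Faster: tokenization only, which is all a phrase pattern needs
patterns = [nlp.make_doc(term) for term in terms]
```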
@@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher
 nlp = English()
 matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
 patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
-matcher.add("Names", None, *patterns)
+matcher.add("Names", patterns)
 
 doc = nlp("angela merkel and us president barack Obama")
 for match_id, start, end in matcher(doc):
@@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher
 
 nlp = English()
 matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
-matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
+matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")])
 
 doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
 for match_id, start, end in matcher(doc):
@@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab)
 def set_sentiment(matcher, doc, i, matches):
     doc.sentiment += 0.1
 
-pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
-pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
-matcher.add("GoogleIO", None, pattern1)  # Match "Google I/O" or "Google i/o"
-matcher.add("HAPPY", set_sentiment, *pattern2)  # Match one or more happy emoji
+pattern1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]]
+pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
+matcher.add("GoogleIO", pattern1)  # Match "Google I/O" or "Google i/o"
+matcher.add("HAPPY", pattern2, on_match=set_sentiment)  # Match one or more happy emoji
 
 doc = nlp("A text about Google I/O 😀😀")
 matches = matcher(doc)