From b5268955d7c35dfa86f8d9ae23caf42569c6e098 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 2 Jul 2020 15:39:45 +0200 Subject: [PATCH] Update matcher usage examples [ci skip] --- website/docs/api/matcher.md | 68 +++++++++------------- website/docs/api/phrasematcher.md | 34 +++++------ website/docs/usage/processing-pipelines.md | 2 +- website/docs/usage/rule-based-matching.md | 47 ++++++++------- website/docs/usage/spacy-101.md | 8 +-- 5 files changed, 70 insertions(+), 89 deletions(-) diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 8210f7094..636354496 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names. ## Matcher.\_\_call\_\_ {#call tag="method"} -Find all token sequences matching the supplied patterns on the `Doc`. As of -spaCy v2.3, the `Matcher` can also be called on `Span` objects. +Find all token sequences matching the supplied patterns on the `Doc` or `Span`. > #### Example > @@ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects. > > matcher = Matcher(nlp.vocab) > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] -> matcher.add("HelloWorld", None, pattern) +> matcher.add("HelloWorld", [pattern]) > doc = nlp("hello world!") > matches = matcher(doc) > ``` | Name | Type | Description | | ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). | +| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. | | **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. 
| - - -By default, the matcher **does not perform any action** on matches, like tagging -matched phrases with entity types. Instead, actions need to be specified when -**adding patterns or entities**, by passing in a callback function as the -`on_match` argument on [`add`](/api/matcher#add). This allows you to define -custom actions per pattern within the same matcher. For example, you might only -want to merge some entity types, and set custom flags for other matched -patterns. For more details and examples, see the usage guide on -[rule-based matching](/usage/rule-based-matching). - - - ## Matcher.pipe {#pipe tag="method"} Match a stream of documents, yielding them in turn. @@ -92,7 +78,7 @@ patterns. > ```python > matcher = Matcher(nlp.vocab) > assert len(matcher) == 0 -> matcher.add("Rule", None, [{"ORTH": "test"}]) +> matcher.add("Rule", [[{"ORTH": "test"}]]) > assert len(matcher) == 1 > ``` @@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID. > > ```python > matcher = Matcher(nlp.vocab) -> assert 'Rule' not in matcher -> matcher.add('Rule', None, [{'ORTH': 'test'}]) -> assert 'Rule' in matcher +> assert "Rule" not in matcher +> matcher.add("Rule", [[{'ORTH': 'test'}]]) +> assert "Rule" in matcher > ``` | Name | Type | Description | @@ -129,39 +115,39 @@ overwritten. 
> #### Example > > ```python -> def on_match(matcher, doc, id, matches): -> print('Matched!', matches) +> def on_match(matcher, doc, id, matches): +> print('Matched!', matches) > -> matcher = Matcher(nlp.vocab) -> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}]) -> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}]) -> doc = nlp("HELLO WORLD on Google Maps.") -> matches = matcher(doc) +> matcher = Matcher(nlp.vocab) +> patterns = [ +> [{"LOWER": "hello"}, {"LOWER": "world"}], +> [{"ORTH": "Google"}, {"ORTH": "Maps"}] +> ] +> matcher.add("TEST_PATTERNS", patterns) +> doc = nlp("HELLO WORLD on Google Maps.") +> matches = matcher(doc) > ``` -| Name | Type | Description | -| ----------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | -| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | + - - -As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become -the default in the future. The patterns are now the second argument and a list +As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument (instead of a variable number of arguments). The `on_match` callback becomes an optional keyword argument. 
```diff patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] -- matcher.add("GoogleNow", None, *patterns) -+ matcher.add("GoogleNow", patterns) - matcher.add("GoogleNow", on_match, *patterns) + matcher.add("GoogleNow", patterns, on_match=on_match) ``` +| Name | Type | Description | +| ---------- | ------------------ | --------------------------------------------------------------------------------------------- | +| `match_id` | str | An ID for the thing you're matching. | +| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | +| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | + ## Matcher.remove {#remove tag="method" new="2"} Remove a rule from the matcher. A `KeyError` is raised if the match ID does not @@ -170,7 +156,7 @@ exist. > #### Example > > ```python -> matcher.add("Rule", None, [{"ORTH": "test"}]) +> matcher.add("Rule", [[{"ORTH": "test"}]]) > assert "Rule" in matcher > matcher.remove("Rule") > assert "Rule" not in matcher @@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an > #### Example > > ```python -> matcher.add("Rule", None, [{"ORTH": "test"}]) +> matcher.add("Rule", [[{"ORTH": "test"}]]) > on_match, patterns = matcher.get("Rule") > ``` diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index f02d81de9..9c722297d 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`. > from spacy.matcher import PhraseMatcher > > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", None, nlp("Barack Obama")) +> matcher.add("OBAMA", [nlp("Barack Obama")]) > doc = nlp("Barack Obama lifts America one last time in emotional farewell") > matches = matcher(doc) > ``` @@ -104,7 +104,7 @@ patterns. 
> ```python > matcher = PhraseMatcher(nlp.vocab) > assert len(matcher) == 0 -> matcher.add("OBAMA", None, nlp("Barack Obama")) +> matcher.add("OBAMA", [nlp("Barack Obama")]) > assert len(matcher) == 1 > ``` @@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID. > ```python > matcher = PhraseMatcher(nlp.vocab) > assert "OBAMA" not in matcher -> matcher.add("OBAMA", None, nlp("Barack Obama")) +> matcher.add("OBAMA", [nlp("Barack Obama")]) > assert "OBAMA" in matcher > ``` @@ -145,36 +145,32 @@ overwritten. > print('Matched!', matches) > > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", on_match, nlp("Barack Obama")) -> matcher.add("HEALTH", on_match, nlp("health care reform"), -> nlp("healthcare reform")) +> matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match) +> matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match) > doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms") > matches = matcher(doc) > ``` -| Name | Type | Description | -| ---------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | -| `*docs` | `Doc` | `Doc` objects of the phrases to match. | + - - -As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will -become the default in the future. The `Doc` patterns are now the second argument -and a list (instead of a variable number of arguments). The `on_match` callback +As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second +argument (instead of a variable number of arguments). The `on_match` callback becomes an optional keyword argument. 
```diff patterns = [nlp("health care reform"), nlp("healthcare reform")] -- matcher.add("HEALTH", None, *patterns) -+ matcher.add("HEALTH", patterns) - matcher.add("HEALTH", on_match, *patterns) + matcher.add("HEALTH", patterns, on_match=on_match) ``` +| Name | Type | Description | +| ---------- | ------------------ | --------------------------------------------------------------------------------------------- | +| `match_id` | str | An ID for the thing you're matching. | +| `docs` | list | `Doc` objects of the phrases to match. | +| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | + ## PhraseMatcher.remove {#remove tag="method" new="2.2"} Remove a rule from the matcher by match ID. A `KeyError` is raised if the key @@ -184,7 +180,7 @@ does not exist. > > ```python > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", None, nlp("Barack Obama")) +> matcher.add("OBAMA", [nlp("Barack Obama")]) > assert "OBAMA" in matcher > matcher.remove("OBAMA") > assert "OBAMA" not in matcher diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 0ead27a49..6b32dc422 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -407,7 +407,7 @@ class EntityMatcher(object): def __init__(self, nlp, terms, label): patterns = [nlp.make_doc(text) for text in terms] self.matcher = PhraseMatcher(nlp.vocab) - self.matcher.add(label, None, *patterns) + self.matcher.add(label, patterns) def __call__(self, doc): matches = self.matcher(doc) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 9a8f3da7b..d0ee44e49 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -98,9 +98,7 @@ print([token.text for token in doc]) First, we initialize the `Matcher` with a vocab. 
The matcher must always share the same vocab with the documents it will operate on. We can now call -[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The -second argument lets you pass in an optional callback function to invoke on a -successful match. For now, we set it to `None`. +[`matcher.add()`](/api/matcher#add) with an ID and a list of patterns. ```python ### {executable="true"} @@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm") matcher = Matcher(nlp.vocab) # Add match ID "HelloWorld" with no callback and one pattern pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] -matcher.add("HelloWorld", None, pattern) +matcher.add("HelloWorld", [pattern]) doc = nlp("Hello, world! Hello world!") matches = matcher(doc) @@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to also match sequences without punctuation between "hello" and "world": ```python -matcher.add("HelloWorld", None, - [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], - [{"LOWER": "hello"}, {"LOWER": "world"}]) +patterns = [ + [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], + [{"LOWER": "hello"}, {"LOWER": "world"}] +] +matcher.add("HelloWorld", patterns) ``` By default, the matcher will only return the matches and **not do anything @@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm") matcher = Matcher(nlp.vocab, validate=True) # Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}] -matcher.add("HelloWorld", None, pattern) +matcher.add("HelloWorld", [pattern]) # 🚨 Raises an error: # MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' # Pattern 0: @@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches): print(entity.text) pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] -matcher.add("GoogleIO", add_event_ent, pattern) 
+matcher.add("GoogleIO", [pattern], on_match=add_event_ent) doc = nlp("This is a text about Google I/O") matches = matcher(doc) ``` @@ -509,19 +509,18 @@ import spacy from spacy.matcher import Matcher from spacy.tokens import Token -# We're using a class because the component needs to be initialised with +# We're using a class because the component needs to be initialized with # the shared vocab via the nlp object class BadHTMLMerger(object): def __init__(self, nlp): + patterns = [ + [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}], + [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}], + ] # Register a new token extension to flag bad HTML Token.set_extension("bad_html", default=False) self.matcher = Matcher(nlp.vocab) - self.matcher.add( - "BAD_HTML", - None, - [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}], - [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}], - ) + self.matcher.add("BAD_HTML", patterns) def __call__(self, doc): # This method is invoked when the component is called on a Doc @@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches): pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, {"POS": "ADJ"}] -matcher.add("FacebookIs", collect_sents, pattern) # add pattern +matcher.add("FacebookIs", [pattern], on_match=collect_sents) # add pattern doc = nlp("I'd say that Facebook is evil. 
– Facebook is pretty cool, right?") matches = matcher(doc) @@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm") matcher = Matcher(nlp.vocab) pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"}, {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}] -matcher.add("PHONE_NUMBER", None, pattern) +matcher.add("PHONE_NUMBER", [pattern]) doc = nlp("Call me at (123) 456 789 or (123) 456 789!") print([t.text for t in doc]) @@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches): elif doc.vocab.strings[match_id] == "SAD": doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment -matcher.add("HAPPY", label_sentiment, *pos_patterns) # Add positive pattern -matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern +matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern +matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern # Add pattern for valid hashtag, i.e. '#' plus any ASCII token -matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) +matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]]) doc = nlp("Hello world 😀 #MondayMotivation") matches = matcher(doc) @@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab) terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."] # Only run nlp.make_doc to speed things up patterns = [nlp.make_doc(text) for text in terms] -matcher.add("TerminologyList", None, *patterns) +matcher.add("TerminologyList", patterns) doc = nlp("German Chancellor Angela Merkel and US President Barack Obama " "converse in the Oval Office inside the White House in Washington, D.C.") @@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher nlp = English() matcher = PhraseMatcher(nlp.vocab, attr="LOWER") patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]] -matcher.add("Names", None, *patterns) +matcher.add("Names", patterns) doc = nlp("angela merkel and us president barack Obama") for match_id, start, end in 
matcher(doc):
@@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher
 nlp = English()
 matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
-matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
+matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")])

 doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
 for match_id, start, end in matcher(doc):
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 39d732724..aa8aa59af 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab)
 def set_sentiment(matcher, doc, i, matches):
     doc.sentiment += 0.1

-pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
-pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
-matcher.add("GoogleIO", None, pattern1)  # Match "Google I/O" or "Google i/o"
-matcher.add("HAPPY", set_sentiment, *pattern2)  # Match one or more happy emoji
+patterns1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]]
+patterns2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
+matcher.add("GoogleIO", patterns1)  # Match "Google I/O" or "Google i/o"
+matcher.add("HAPPY", patterns2, on_match=set_sentiment)  # Match one or more happy emoji

 doc = nlp("A text about Google I/O 😀😀")
 matches = matcher(doc)