2016-10-31 21:04:15 +03:00
|
|
|
|
//- 💫 DOCS > API > MATCHER
|
|
|
|
|
|
2017-10-03 15:27:22 +03:00
|
|
|
|
include ../_includes/_mixins
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-11-07 14:00:43 +03:00
|
|
|
|
+infobox("Changed in v2.0", "⚠️")
|
2017-05-29 02:06:49 +03:00
|
|
|
|
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
|
|
|
|
| are deprecated and have been replaced with a simpler
|
|
|
|
|
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
|
|
|
|
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
|
|
|
|
|
| is now called #[+api("matcher#get") #[code Matcher.get]].
|
|
|
|
|
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
2017-05-30 14:53:06 +03:00
|
|
|
|
| and #[code Matcher.has_entity] (now redundant) have been removed. The
|
|
|
|
|
| concept of "acceptor functions" has also been retired – this logic can
|
|
|
|
|
| now be handled in the callback functions.
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+h(2, "init") Matcher.__init__
|
|
|
|
|
+tag method
|
|
|
|
|
|
2017-05-19 22:47:06 +03:00
|
|
|
|
p Create the rule-based #[code Matcher].
|
|
|
|
|
|
|
|
|
|
+aside-code("Example").
|
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
|
|
2017-05-22 14:54:45 +03:00
|
|
|
|
patterns = {'HelloWorld': [{'LOWER': 'hello'}, {'LOWER': 'world'}]}
|
2017-05-19 22:47:06 +03:00
|
|
|
|
matcher = Matcher(nlp.vocab)
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code vocab]
|
|
|
|
|
+cell #[code Vocab]
|
|
|
|
|
+cell
|
|
|
|
|
| The vocabulary object, which must be shared with the documents
|
|
|
|
|
| the matcher will operate on.
|
|
|
|
|
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code patterns]
|
|
|
|
|
+cell dict
|
2017-05-19 22:47:06 +03:00
|
|
|
|
+cell Patterns to add to the matcher, keyed by ID.
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-10-03 15:27:22 +03:00
|
|
|
|
+row("foot")
|
2017-05-19 01:02:34 +03:00
|
|
|
|
+cell returns
|
2016-10-31 21:04:15 +03:00
|
|
|
|
+cell #[code Matcher]
|
|
|
|
|
+cell The newly constructed object.
|
|
|
|
|
|
|
|
|
|
+h(2, "call") Matcher.__call__
|
|
|
|
|
+tag method
|
|
|
|
|
|
2017-05-19 22:47:06 +03:00
|
|
|
|
p Find all token sequences matching the supplied patterns on the #[code Doc].
|
|
|
|
|
|
|
|
|
|
+aside-code("Example").
|
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
|
|
|
|
|
|
matcher = Matcher(nlp.vocab)
|
2017-05-22 14:54:45 +03:00
|
|
|
|
pattern = [{'LOWER': "hello"}, {'LOWER': "world"}]
|
2017-05-23 12:36:02 +03:00
|
|
|
|
matcher.add("HelloWorld", None, pattern)
|
2017-05-19 22:47:06 +03:00
|
|
|
|
doc = nlp(u'hello world!')
|
|
|
|
|
matches = matcher(doc)
|
|
|
|
|
|
2016-10-31 21:04:15 +03:00
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code doc]
|
|
|
|
|
+cell #[code Doc]
|
|
|
|
|
+cell The document to match over.
|
|
|
|
|
|
2017-10-03 15:27:22 +03:00
|
|
|
|
+row("foot")
|
2017-05-19 01:02:34 +03:00
|
|
|
|
+cell returns
|
2016-10-31 21:04:15 +03:00
|
|
|
|
+cell list
|
|
|
|
|
+cell
|
2017-05-20 02:38:34 +03:00
|
|
|
|
| A list of #[code (match_id, start, end)] tuples, describing the
|
|
|
|
|
| matches. A match tuple describes a span #[code doc[start:end]].
|
|
|
|
|
| The #[code match_id] is the ID of the added match pattern.
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-26 13:43:16 +03:00
|
|
|
|
+infobox("Important note")
|
|
|
|
|
| By default, the matcher #[strong does not perform any action] on matches,
|
|
|
|
|
| like tagging matched phrases with entity types. Instead, actions need to
|
|
|
|
|
| be specified when #[strong adding patterns or entities], by
|
|
|
|
|
| passing in a callback function as the #[code on_match] argument on
|
|
|
|
|
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
|
|
|
|
| actions per pattern within the same matcher. For example, you might only
|
|
|
|
|
| want to merge some entity types, and set custom flags for other matched
|
2017-05-28 17:41:01 +03:00
|
|
|
|
| patterns. For more details and examples, see the usage guide on
|
2017-10-03 15:27:22 +03:00
|
|
|
|
| #[+a("/usage/linguistic-features#rule-based-matching") rule-based matching].
|
2017-05-26 13:43:16 +03:00
|
|
|
|
|
2016-10-31 21:04:15 +03:00
|
|
|
|
+h(2, "pipe") Matcher.pipe
|
|
|
|
|
+tag method
|
|
|
|
|
|
|
|
|
|
p Match a stream of documents, yielding them in turn.
|
|
|
|
|
|
2017-05-20 15:26:10 +03:00
|
|
|
|
+aside-code("Example").
|
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
|
matcher = Matcher(nlp.vocab)
|
2018-02-08 13:28:44 +03:00
|
|
|
|
for doc in matcher.pipe(docs, batch_size=50, n_threads=4):
|
2017-05-20 15:26:10 +03:00
|
|
|
|
pass
|
|
|
|
|
|
2016-10-31 21:04:15 +03:00
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code docs]
|
2017-05-19 22:47:06 +03:00
|
|
|
|
+cell iterable
|
2016-10-31 21:04:15 +03:00
|
|
|
|
+cell A stream of documents.
|
|
|
|
|
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code batch_size]
|
|
|
|
|
+cell int
|
|
|
|
|
+cell The number of documents to accumulate into a working set.
|
|
|
|
|
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code n_threads]
|
|
|
|
|
+cell int
|
|
|
|
|
+cell
|
|
|
|
|
| The number of threads with which to work on the buffer in
|
|
|
|
|
| parallel, if the #[code Matcher] implementation supports
|
|
|
|
|
| multi-threading.
|
|
|
|
|
|
2018-02-18 16:13:58 +03:00
|
|
|
|
+row
|
|
|
|
|
+cell #[code return_matches]
|
2018-02-18 16:15:18 +03:00
|
|
|
|
+tag-new(2.1)
|
2018-02-18 16:13:58 +03:00
|
|
|
|
+cell bool
|
|
|
|
|
+cell
|
|
|
|
|
| Yield the match lists along with the docs, making results
|
|
|
|
|
| #[code (doc, matches)] tuples.
|
|
|
|
|
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code as_tuples]
|
2018-02-18 16:15:18 +03:00
|
|
|
|
+tag-new(2.1)
|
2018-02-18 16:13:58 +03:00
|
|
|
|
+cell bool
|
|
|
|
|
+cell
|
|
|
|
|
| Interpret the input stream as #[code (doc, context)] tuples, and
|
|
|
|
|
| yield #[code (result, context)] tuples out. If both
|
|
|
|
|
| #[code return_matches] and #[code as_tuples] are #[code True],
|
|
|
|
|
| the output will be a sequence of
|
|
|
|
|
| #[code ((doc, matches), context)] tuples.
|
|
|
|
|
|
2017-10-03 15:27:22 +03:00
|
|
|
|
+row("foot")
|
2017-05-19 01:02:34 +03:00
|
|
|
|
+cell yields
|
2016-10-31 21:04:15 +03:00
|
|
|
|
+cell #[code Doc]
|
|
|
|
|
+cell Documents, in order.
|
|
|
|
|
|
2017-05-20 15:26:10 +03:00
|
|
|
|
+h(2, "len") Matcher.__len__
|
|
|
|
|
+tag method
|
2017-05-26 13:42:36 +03:00
|
|
|
|
+tag-new(2)
|
2017-05-20 15:26:10 +03:00
|
|
|
|
|
2017-05-20 15:32:34 +03:00
|
|
|
|
p
|
|
|
|
|
| Get the number of rules added to the matcher. Note that this only returns
|
|
|
|
|
| the number of rules (identical with the number of IDs), not the number
|
|
|
|
|
| of individual patterns.
|
2017-05-20 15:26:10 +03:00
|
|
|
|
|
|
|
|
|
+aside-code("Example").
|
|
|
|
|
matcher = Matcher(nlp.vocab)
|
|
|
|
|
assert len(matcher) == 0
|
2017-05-22 14:54:45 +03:00
|
|
|
|
matcher.add('Rule', None, [{'ORTH': 'test'}])
|
2017-05-20 15:26:10 +03:00
|
|
|
|
assert len(matcher) == 1
|
|
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
2017-10-03 15:27:22 +03:00
|
|
|
|
+row("foot")
|
2017-05-20 15:26:10 +03:00
|
|
|
|
+cell returns
|
|
|
|
|
+cell int
|
|
|
|
|
+cell The number of rules.
|
|
|
|
|
|
|
|
|
|
+h(2, "contains") Matcher.__contains__
|
|
|
|
|
+tag method
|
2017-05-26 13:42:36 +03:00
|
|
|
|
+tag-new(2)
|
2017-05-20 15:26:10 +03:00
|
|
|
|
|
|
|
|
|
p Check whether the matcher contains rules for a match ID.
|
|
|
|
|
|
|
|
|
|
+aside-code("Example").
|
|
|
|
|
matcher = Matcher(nlp.vocab)
|
2017-05-29 02:06:49 +03:00
|
|
|
|
assert 'Rule' not in matcher
|
2017-05-22 14:54:45 +03:00
|
|
|
|
matcher.add('Rule', None, [{'ORTH': 'test'}])
|
2017-05-29 02:06:49 +03:00
|
|
|
|
assert 'Rule' in matcher
|
2017-05-20 15:26:10 +03:00
|
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code key]
|
|
|
|
|
+cell unicode
|
|
|
|
|
+cell The match ID.
|
2017-10-03 15:27:22 +03:00
|
|
|
|
|
|
|
|
|
+row("foot")
|
2017-05-20 15:26:10 +03:00
|
|
|
|
+cell returns
|
|
|
|
|
+cell bool
|
|
|
|
|
+cell Whether the matcher contains rules for this match ID.
|
|
|
|
|
|
|
|
|
|
+h(2, "add") Matcher.add
|
2016-10-31 21:04:15 +03:00
|
|
|
|
+tag method
|
2017-05-26 13:42:36 +03:00
|
|
|
|
+tag-new(2)
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-19 22:47:06 +03:00
|
|
|
|
p
|
2017-05-20 14:54:53 +03:00
|
|
|
|
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
|
2017-05-20 15:26:10 +03:00
|
|
|
|
| a callback function to act on the matches. The callback function will
|
|
|
|
|
| receive the arguments #[code matcher], #[code doc], #[code i] and
|
|
|
|
|
| #[code matches]. If a pattern already exists for the given ID, the
|
|
|
|
|
| patterns will be extended. An #[code on_match] callback will be
|
|
|
|
|
| overwritten.
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-19 22:47:06 +03:00
|
|
|
|
+aside-code("Example").
|
|
|
|
|
def on_match(matcher, doc, i, matches):
|
|
|
|
|
print('Matched!', matches)
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-19 22:47:06 +03:00
|
|
|
|
matcher = Matcher(nlp.vocab)
|
2017-05-22 14:54:45 +03:00
|
|
|
|
matcher.add('HelloWorld', on_match, [{'LOWER': 'hello'}, {'LOWER': 'world'}])
|
|
|
|
|
matcher.add('GoogleMaps', on_match, [{'ORTH': 'Google'}, {'ORTH': 'Maps'}])
|
2017-05-19 22:47:06 +03:00
|
|
|
|
doc = nlp(u'HELLO WORLD on Google Maps.')
|
|
|
|
|
matches = matcher(doc)
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
|
+row
|
2017-05-19 22:47:06 +03:00
|
|
|
|
+cell #[code match_id]
|
|
|
|
|
+cell unicode
|
|
|
|
|
+cell An ID for the thing you're matching.
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+row
|
2017-05-19 22:47:06 +03:00
|
|
|
|
+cell #[code on_match]
|
2017-05-21 14:17:40 +03:00
|
|
|
|
+cell callable or #[code None]
|
2017-05-19 22:47:06 +03:00
|
|
|
|
+cell
|
|
|
|
|
| Callback function to act on matches. Takes the arguments
|
2017-05-20 02:38:34 +03:00
|
|
|
|
| #[code matcher], #[code doc], #[code i] and #[code matches].
|
2017-05-20 13:59:03 +03:00
|
|
|
|
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code *patterns]
|
|
|
|
|
+cell list
|
|
|
|
|
+cell
|
|
|
|
|
| Match pattern. A pattern consists of a list of dicts, where each
|
|
|
|
|
| dict describes a token.
|
2017-05-20 15:26:10 +03:00
|
|
|
|
|
2017-11-07 14:00:43 +03:00
|
|
|
|
+infobox("Changed in v2.0", "⚠️")
|
2017-11-01 16:13:08 +03:00
|
|
|
|
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
|
|
|
|
| are deprecated and have been replaced with a simpler
|
|
|
|
|
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
|
|
|
|
| patterns and a callback for a given match ID.
|
|
|
|
|
|
|
|
|
|
+code-wrapper
|
|
|
|
|
+code-new.
|
|
|
|
|
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
|
|
|
|
|
|
|
|
|
+code-old.
|
|
|
|
|
matcher.add_entity('GoogleNow', on_match=merge_phrases)
|
|
|
|
|
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
2017-05-26 13:43:16 +03:00
|
|
|
|
|
2017-05-20 15:26:10 +03:00
|
|
|
|
+h(2, "remove") Matcher.remove
|
|
|
|
|
+tag method
|
2017-05-26 13:42:36 +03:00
|
|
|
|
+tag-new(2)
|
2017-05-20 15:26:10 +03:00
|
|
|
|
|
|
|
|
|
p
|
|
|
|
|
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
|
|
|
|
|
| ID does not exist.
|
|
|
|
|
|
|
|
|
|
+aside-code("Example").
|
2017-05-22 14:54:45 +03:00
|
|
|
|
matcher.add('Rule', None, [{'ORTH': 'test'}])
|
2017-05-29 02:06:49 +03:00
|
|
|
|
assert 'Rule' in matcher
|
2017-05-20 15:32:34 +03:00
|
|
|
|
matcher.remove('Rule')
|
2017-05-29 02:06:49 +03:00
|
|
|
|
assert 'Rule' not in matcher
|
2017-05-20 15:26:10 +03:00
|
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code key]
|
|
|
|
|
+cell unicode
|
|
|
|
|
+cell The ID of the match rule.
|
2017-05-20 15:43:10 +03:00
|
|
|
|
|
|
|
|
|
+h(2, "get") Matcher.get
|
|
|
|
|
+tag method
|
2017-05-26 13:42:36 +03:00
|
|
|
|
+tag-new(2)
|
2017-05-20 15:43:10 +03:00
|
|
|
|
|
|
|
|
|
p
|
|
|
|
|
| Retrieve the pattern stored for a key. Returns the rule as an
|
|
|
|
|
| #[code (on_match, patterns)] tuple containing the callback and available
|
|
|
|
|
| patterns.
|
|
|
|
|
|
|
|
|
|
+aside-code("Example").
|
2017-05-22 14:54:45 +03:00
|
|
|
|
pattern = [{'ORTH': 'test'}]
|
2017-05-20 15:43:10 +03:00
|
|
|
|
matcher.add('Rule', None, pattern)
|
2017-05-29 02:06:49 +03:00
|
|
|
|
on_match, patterns = matcher.get('Rule')
|
2017-05-20 15:43:10 +03:00
|
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code key]
|
|
|
|
|
+cell unicode
|
|
|
|
|
+cell The ID of the match rule.
|
|
|
|
|
|
2017-10-03 15:27:22 +03:00
|
|
|
|
+row("foot")
|
2017-05-20 15:43:10 +03:00
|
|
|
|
+cell returns
|
|
|
|
|
+cell tuple
|
|
|
|
|
+cell The rule, as an #[code (on_match, patterns)] tuple.
|