spaCy/website/docs/api/matcher.jade

246 lines
7.3 KiB
Plaintext
Raw Normal View History

2016-10-31 21:04:15 +03:00
//- 💫 DOCS > API > MATCHER
include ../../_includes/_mixins
p Match sequences of tokens, based on pattern rules.
2017-05-19 22:47:06 +03:00
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
| is now called #[+api("matcher#get") #[code Matcher.get]].
2017-05-19 22:47:06 +03:00
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
| and #[code Matcher.has_entity] (now redundant) have been removed.
2016-10-31 21:04:15 +03:00
+h(2, "init") Matcher.__init__
+tag method
2017-05-19 22:47:06 +03:00
p Create the rule-based #[code Matcher].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
matcher = Matcher(nlp.vocab)
2016-10-31 21:04:15 +03:00
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell
| The vocabulary object, which must be shared with the documents
| the matcher will operate on.
+row
+cell #[code patterns]
+cell dict
2017-05-19 22:47:06 +03:00
+cell Patterns to add to the matcher, keyed by ID.
2016-10-31 21:04:15 +03:00
+footrow
+cell returns
2016-10-31 21:04:15 +03:00
+cell #[code Matcher]
+cell The newly constructed object.
+h(2, "call") Matcher.__call__
+tag method
2017-05-19 22:47:06 +03:00
p Find all token sequences matching the supplied patterns on the #[code Doc].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
matcher = Matcher(nlp.vocab)
pattern = [{LOWER: "hello"}, {LOWER: "world"}]
matcher.add("HelloWorld", None, pattern)
2017-05-19 22:47:06 +03:00
doc = nlp(u'hello world!')
matches = matcher(doc)
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
2016-10-31 21:04:15 +03:00
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The document to match over.
+footrow
+cell returns
2016-10-31 21:04:15 +03:00
+cell list
+cell
2017-05-20 02:38:34 +03:00
| A list of #[code (match_id, start, end)] tuples, describing the
| matches. A match tuple describes a span #[code doc[start:end]].
| The #[code match_id] is the ID of the added match pattern.
2016-10-31 21:04:15 +03:00
+h(2, "pipe") Matcher.pipe
+tag method
p Match a stream of documents, yielding them in turn.
+aside-code("Example").
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
for doc in matcher.pipe(docs, batch_size=50, n_threads=4):
pass
2016-10-31 21:04:15 +03:00
+table(["Name", "Type", "Description"])
+row
+cell #[code docs]
2017-05-19 22:47:06 +03:00
+cell iterable
2016-10-31 21:04:15 +03:00
+cell A stream of documents.
+row
+cell #[code batch_size]
+cell int
+cell The number of documents to accumulate into a working set.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of threads with which to work on the buffer in
| parallel, if the #[code Matcher] implementation supports
| multi-threading.
+footrow
+cell yields
2016-10-31 21:04:15 +03:00
+cell #[code Doc]
+cell Documents, in order.
+h(2, "len") Matcher.__len__
+tag method
p
| Get the number of rules added to the matcher. Note that this only returns
| the number of rules (identical with the number of IDs), not the number
| of individual patterns.
+aside-code("Example").
matcher = Matcher(nlp.vocab)
assert len(matcher) == 0
matcher.add('Rule', None, [{ORTH: 'test'}])
assert len(matcher) == 1
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of rules.
+h(2, "contains") Matcher.__contains__
+tag method
p Check whether the matcher contains rules for a match ID.
+aside-code("Example").
matcher = Matcher(nlp.vocab)
assert 'Rule' not in matcher
matcher.add('Rule', None, [{ORTH: 'test'}])
assert 'Rule' in matcher
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
+cell unicode
+cell The match ID.
+footrow
+cell returns
+cell bool
+cell Whether the matcher contains rules for this match ID.
+h(2, "add") Matcher.add
2016-10-31 21:04:15 +03:00
+tag method
2017-05-19 22:47:06 +03:00
p
2017-05-20 14:54:53 +03:00
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
| a callback function to act on the matches. The callback function will
| receive the arguments #[code matcher], #[code doc], #[code i] and
| #[code matches]. If a pattern already exists for the given ID, the
| patterns will be extended. An #[code on_match] callback will be
| overwritten.
2016-10-31 21:04:15 +03:00
2017-05-19 22:47:06 +03:00
+aside-code("Example").
def on_match(matcher, doc, i, matches):
print('Matched!', matches)
2016-10-31 21:04:15 +03:00
2017-05-19 22:47:06 +03:00
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])
2017-05-19 22:47:06 +03:00
doc = nlp(u'HELLO WORLD on Google Maps.')
matches = matcher(doc)
2016-10-31 21:04:15 +03:00
+table(["Name", "Type", "Description"])
+row
2017-05-19 22:47:06 +03:00
+cell #[code match_id]
+cell unicode
+cell An ID for the thing you're matching.
2016-10-31 21:04:15 +03:00
+row
2017-05-19 22:47:06 +03:00
+cell #[code on_match]
+cell callable or #[code None]
2017-05-19 22:47:06 +03:00
+cell
| Callback function to act on matches. Takes the arguments
2017-05-20 02:38:34 +03:00
| #[code matcher], #[code doc], #[code i] and #[code matches].
+row
+cell #[code *patterns]
+cell list
+cell
| Match pattern. A pattern consists of a list of dicts, where each
| dict describes a token.
+h(2, "remove") Matcher.remove
+tag method
p
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
| ID does not exist.
+aside-code("Example").
matcher.add('Rule', None, [{ORTH: 'test'}])
assert 'Rule' in matcher
matcher.remove('Rule')
assert 'Rule' not in matcher
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
+cell unicode
+cell The ID of the match rule.
+h(2, "get") Matcher.get
+tag method
p
| Retrieve the pattern stored for a key. Returns the rule as an
| #[code (on_match, patterns)] tuple containing the callback and available
| patterns.
+aside-code("Example").
pattern = [{ORTH: 'test'}]
matcher.add('Rule', None, pattern)
(on_match, patterns) = matcher.get('Rule')
assert patterns == [pattern]
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
+cell unicode
+cell The ID of the match rule.
+footrow
+cell returns
+cell tuple
+cell The rule, as an #[code (on_match, patterns)] tuple.