mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 13:41:21 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			244 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			244 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > API > MATCHER
 | |
| 
 | |
| include ../../_includes/_mixins
 | |
| 
 | |
| p Match sequences of tokens, based on pattern rules.
 | |
| 
 | |
| +infobox("⚠️ Deprecation note")
 | |
|     |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
 | |
|     |  are deprecated and have been replaced with a simpler
 | |
|     |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
 | |
|     |  patterns and a callback for a given match ID. #[code Matcher.get_entity]
 | |
|     |  is now called #[+api("matcher#get") #[code Matcher.get]].
 | |
|     |  #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
 | |
|     |  and #[code Matcher.has_entity] (now redundant) have been removed.
 | |
| 
 | |
| +h(2, "init") Matcher.__init__
 | |
|     +tag method
 | |
| 
 | |
| p Create the rule-based #[code Matcher].
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.matcher import Matcher
 | |
| 
 | |
|     patterns = {'HelloWorld': [{'LOWER': 'hello'}, {'LOWER': 'world'}]}
 | |
|     matcher = Matcher(nlp.vocab)
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code vocab]
 | |
|         +cell #[code Vocab]
 | |
|         +cell
 | |
|             |  The vocabulary object, which must be shared with the documents
 | |
|             |  the matcher will operate on.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code patterns]
 | |
|         +cell dict
 | |
|         +cell Patterns to add to the matcher, keyed by ID.
 | |
| 
 | |
|     +footrow
 | |
|         +cell returns
 | |
|         +cell #[code Matcher]
 | |
|         +cell The newly constructed object.
 | |
| 
 | |
| +h(2, "call") Matcher.__call__
 | |
|     +tag method
 | |
| 
 | |
| p Find all token sequences matching the supplied patterns on the #[code Doc].
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.matcher import Matcher
 | |
| 
 | |
|     matcher = Matcher(nlp.vocab)
 | |
|     pattern = [{'LOWER': "hello"}, {'LOWER': "world"}]
 | |
|     matcher.add("HelloWorld", None, pattern)
 | |
|     doc = nlp(u'hello world!')
 | |
|     matches = matcher(doc)
 | |
| 
 | |
| +infobox("Important note")
 | |
|     |  By default, the matcher #[strong does not perform any action] on matches,
 | |
|     |  like tagging matched phrases with entity types. Instead, actions need to
 | |
|     |  be specified when #[strong adding patterns or entities], by
 | |
|     |  passing in a callback function as the #[code on_match] argument on
 | |
|     |  #[+api("matcher#add") #[code add]]. This allows you to define custom
 | |
|     |  actions per pattern within the same matcher. For example, you might only
 | |
|     |  want to merge some entity types, and set custom flags for other matched
 | |
|     |  patterns. For more details and examples, see the usage workflow on
 | |
|     |  #[+a("/docs/usage/rule-based-matching") rule-based matching].
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code doc]
 | |
|         +cell #[code Doc]
 | |
|         +cell The document to match over.
 | |
| 
 | |
|     +footrow
 | |
|         +cell returns
 | |
|         +cell list
 | |
|         +cell
 | |
|             |  A list of #[code (match_id, start, end)] tuples, describing the
 | |
|             |  matches. A match tuple describes a span #[code doc[start:end]].
 | |
|             |  The #[code match_id] is the ID of the added match pattern.
 | |
| 
 | |
| +h(2, "pipe") Matcher.pipe
 | |
|     +tag method
 | |
| 
 | |
| p Match a stream of documents, yielding them in turn.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.matcher import Matcher
 | |
|     matcher = Matcher(nlp.vocab)
 | |
|     for doc in matcher.pipe(docs, batch_size=50, n_threads=4):
 | |
|         pass
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code docs]
 | |
|         +cell iterable
 | |
|         +cell A stream of documents.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code batch_size]
 | |
|         +cell int
 | |
|         +cell The number of documents to accumulate into a working set.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code n_threads]
 | |
|         +cell int
 | |
|         +cell
 | |
|             |  The number of threads with which to work on the buffer in
 | |
|             |  parallel, if the #[code Matcher] implementation supports
 | |
|             |  multi-threading.
 | |
| 
 | |
|     +footrow
 | |
|         +cell yields
 | |
|         +cell #[code Doc]
 | |
|         +cell Documents, in order.
 | |
| 
 | |
| +h(2, "len") Matcher.__len__
 | |
|     +tag method
 | |
| 
 | |
| p
 | |
|     |  Get the number of rules added to the matcher. Note that this only returns
 | |
|     |  the number of rules (identical with the number of IDs), not the number
 | |
|     |  of individual patterns.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     matcher = Matcher(nlp.vocab)
 | |
|     assert len(matcher) == 0
 | |
|     matcher.add('Rule', None, [{'ORTH': 'test'}])
 | |
|     assert len(matcher) == 1
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +footrow
 | |
|         +cell returns
 | |
|         +cell int
 | |
|         +cell The number of rules.
 | |
| 
 | |
| +h(2, "contains") Matcher.__contains__
 | |
|     +tag method
 | |
| 
 | |
| p Check whether the matcher contains rules for a match ID.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     matcher = Matcher(nlp.vocab)
 | |
|     assert 'Rule' not in matcher
 | |
|     matcher.add('Rule', None, [{'ORTH': 'test'}])
 | |
|     assert 'Rule' in matcher
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code key]
 | |
|         +cell unicode
 | |
|         +cell The match ID.
 | |
|     +footrow
 | |
|         +cell returns
 | |
|         +cell bool
 | |
|         +cell Whether the matcher contains rules for this match ID.
 | |
| 
 | |
| +h(2, "add") Matcher.add
 | |
|     +tag method
 | |
| 
 | |
| p
 | |
|     |  Add a rule to the matcher, consisting of an ID key, one or more patterns, and
 | |
|     |  a callback function to act on the matches. The callback function will
 | |
|     |  receive the arguments #[code matcher], #[code doc], #[code i] and
 | |
|     |  #[code matches]. If a pattern already exists for the given ID, the
 | |
|     |  patterns will be extended. An #[code on_match] callback will be
 | |
|     |  overwritten.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     def on_match(matcher, doc, i, matches):
 | |
|         print('Matched!', matches)
 | |
| 
 | |
|     matcher = Matcher(nlp.vocab)
 | |
|     matcher.add('HelloWorld', on_match, [{'LOWER': 'hello'}, {'LOWER': 'world'}])
 | |
|     matcher.add('GoogleMaps', on_match, [{'ORTH': 'Google'}, {'ORTH': 'Maps'}])
 | |
|     doc = nlp(u'HELLO WORLD on Google Maps.')
 | |
|     matches = matcher(doc)
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code match_id]
 | |
|         +cell unicode
 | |
|         +cell An ID for the thing you're matching.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code on_match]
 | |
|         +cell callable or #[code None]
 | |
|         +cell
 | |
|             |  Callback function to act on matches. Takes the arguments
 | |
|             |  #[code matcher], #[code doc], #[code i] and #[code matches].
 | |
| 
 | |
|     +row
 | |
|         +cell #[code *patterns]
 | |
|         +cell list
 | |
|         +cell
 | |
|             |  Match pattern. A pattern consists of a list of dicts, where each
 | |
|             |  dict describes a token.
 | |
| 
 | |
| +h(2, "remove") Matcher.remove
 | |
|     +tag method
 | |
| 
 | |
| p
 | |
|     |  Remove a rule from the matcher. A #[code KeyError] is raised if the match
 | |
|     |  ID does not exist.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     matcher.add('Rule', None, [{'ORTH': 'test'}])
 | |
|     assert 'Rule' in matcher
 | |
|     matcher.remove('Rule')
 | |
|     assert 'Rule' not in matcher
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code key]
 | |
|         +cell unicode
 | |
|         +cell The ID of the match rule.
 | |
| 
 | |
| +h(2, "get") Matcher.get
 | |
|     +tag method
 | |
| 
 | |
| p
 | |
|     |  Retrieve the pattern stored for a key. Returns the rule as an
 | |
|     |  #[code (on_match, patterns)] tuple containing the callback and available
 | |
|     |  patterns.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     pattern = [{'ORTH': 'test'}]
 | |
|     matcher.add('Rule', None, pattern)
 | |
|     (on_match, patterns) = matcher.get('Rule')
 | |
|     assert patterns == [pattern]
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code key]
 | |
|         +cell unicode
 | |
|         +cell The ID of the match rule.
 | |
| 
 | |
|     +footrow
 | |
|         +cell returns
 | |
|         +cell tuple
 | |
|         +cell The rule, as an #[code (on_match, patterns)] tuple.
 |