include ../../_includes/_mixins

p.u-text-large spaCy features a rule-matching engine that operates over tokens. Rules can refer to token annotations and flags, and matches support callbacks to accept, modify and/or act on the match. The matcher also lets you associate patterns with entity IDs, allowing basic entity linking or disambiguation.

+code("python", "Matcher Example").
    from spacy.matcher import Matcher
    from spacy.attrs import ORTH, TAG
    import spacy

    nlp = spacy.load('en', parser=False, entity=False)

    def merge_phrases(matcher, doc, i, matches):
        '''
        Merge a phrase. We have to be careful here because we'll change the token indices.
        To avoid problems, merge all the phrases once we're called on the last match.
        '''
        if i != len(matches)-1:
            return None
        # Get Span objects
        spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
        for ent_id, label, span in spans:
            span.merge(label=label, tag='NNP' if label else span.root.tag_)

    matcher = Matcher(nlp.vocab)

    matcher.add_entity(
        "GoogleNow", # Entity ID -- helps you act on the match.
        {"ent_type": "PRODUCT", "wiki_en": "Google_Now"}, # Arbitrary attributes (optional)
        acceptor=None, # Callback to accept or modify the match
        on_match=merge_phrases # Callback to act on the matches
    )

    matcher.add_pattern(
        "GoogleNow", # Entity ID -- created if it doesn't exist.
        [ # The pattern is a list of *Token Specifiers*.
            { # This Token Specifier matches tokens whose orth field is "Google"
              ORTH: "Google"
            },
            { # This Token Specifier matches tokens whose orth field is "Now"
              ORTH: "Now"
            }
        ],
        label=None # An optional label to associate with the pattern's matches.
    )

    doc = nlp(u"I prefer Siri to Google Now.")
    matches = matcher(doc)
    for ent_id, label, start, end in matches:
        print(nlp.vocab.strings[ent_id], nlp.vocab.strings[label], doc[start : end].text)
        entity = matcher.get_entity(ent_id)
        print(entity)

    matcher.add_pattern(
        "GoogleNow",
        [ # This surface form matches "google now", verbatim, and requires
          # "google" to have the NNP tag. This helps prevent the pattern from
          # matching cases like "I will google now to look up the time".
          {
            ORTH: "google",
            TAG: "NNP"
          },
          {
            ORTH: "now"
          }
        ]
    )

    doc = nlp(u"I'll google now to find out how the google now service works.")
    matches = matcher(doc)
    for ent_id, label, start, end in matches:
        print(ent_id, label, start, end, doc[start : end].text)
    # Because we specified the on_match=merge_phrases callback,
    # we should see 'google now' as a single token.
    for token in doc:
        print(token.text, token.lemma_, token.tag_, token.ent_type_)
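
p The #[code acceptor] argument is left as #[code None] above. The sketch below shows what an acceptor that filters matches might look like. It is a minimal sketch, not the documented API: it assumes the acceptor is called with #[code doc, ent_id, label, start, end] and that a falsy return value rejects the match, and the entity ID #[code "VerbFiltered"] and function name #[code ignore_verbs] are made up for illustration.

+code("python", "Acceptor Sketch").
    def ignore_verbs(doc, ent_id, label, start, end):
        # Hypothetical acceptor (assumed signature): reject the match when
        # its first token is tagged as a verb, otherwise pass the match
        # tuple through unchanged.
        if doc[start].tag_.startswith('VB'):
            return None
        return (ent_id, label, start, end)

    matcher = Matcher(nlp.vocab)
    matcher.add_entity("VerbFiltered", {}, acceptor=ignore_verbs)
    matcher.add_pattern("VerbFiltered", [{ORTH: "google"}, {ORTH: "now"}])

p Returning the tuple unchanged keeps the match; since the acceptor can also modify the match, returning a tuple with different #[code start] or #[code end] values would adjust the match boundaries instead.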