mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			216 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			216 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > USAGE > RULE-BASED MATCHING
 | ||
| 
 | ||
| include ../../_includes/_mixins
 | ||
| 
 | ||
| p
 | ||
|     |  spaCy features a rule-matching engine that operates over tokens, similar
 | ||
|     |  to regular expressions. The rules can refer to token annotations (e.g.
 | ||
|     |  the token #[code text] or #[code tag_]), and flags (e.g. #[code IS_PUNCT]).
 | ||
|     |  The rule matcher also lets you pass in a custom callback
 | ||
|     |  to act on matches – for example, to merge entities and apply custom labels.
 | ||
|     |  You can also associate patterns with entity IDs, to allow some basic
 | ||
|     |  entity linking or disambiguation.
 | ||
| 
 | ||
| +aside("What about \"real\" regular expressions?")
 | ||
| 
 | ||
| +h(2, "adding-patterns") Adding patterns
 | ||
| 
 | ||
| p
 | ||
|     |  Let's say we want to enable spaCy to find a combination of three tokens:
 | ||
| 
 | ||
| +list("numbers")
 | ||
|     +item
 | ||
|         |  A token whose #[strong lower-case form matches "hello"], e.g. "Hello"
 | ||
|         |  or "HELLO".
 | ||
|     +item
 | ||
|         |  A token whose #[strong #[code is_punct] flag is set to #[code True]],
 | ||
|         |  i.e. any punctuation.
 | ||
|     +item
 | ||
|         |  A token whose #[strong lower-case form matches "world"], e.g. "World"
 | ||
|         |  or "WORLD".
 | ||
| 
 | ||
| +code.
 | ||
|     [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]
 | ||
| 
 | ||
| p
 | ||
|     |  First, we initialise the #[code Matcher] with a vocab. The matcher must
 | ||
|     |  always share the same vocab with the documents it will operate on. We
 | ||
|     |  can now call #[+api("matcher#add") #[code matcher.add()]] with an ID and
 | ||
|     |  our custom pattern. The second argument lets you pass in an optional
 | ||
|     |  callback function to invoke on a successful match. For now, we set it
 | ||
|     |  to #[code None].
 | ||
| 
 | ||
| +code.
 | ||
|     import spacy
 | ||
|     from spacy.matcher import Matcher
 | ||
| 
 | ||
|     nlp = spacy.load('en')
 | ||
|     matcher = Matcher(nlp.vocab)
 | ||
|     # add match ID "HelloWorld" with no callback and one pattern
 | ||
|     matcher.add('HelloWorld', on_match=None,
 | ||
|                 [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}])
 | ||
| 
 | ||
|     doc = nlp(u'Hello, world! Hello world!')
 | ||
|     matches = matcher(doc)
 | ||
| 
 | ||
| p
 | ||
|     |  The matcher returns a list of #[code (match_id, start, end)] tuples – in
 | ||
|     |  this case, #[code [('HelloWorld', 0, 2)]], which maps to the span
 | ||
|     |  #[code doc[0:2]] of our original document. Optionally, we could also
 | ||
|     |  choose to add more than one pattern, for example to also match sequences
 | ||
|     |  without punctuation between "hello" and "world":
 | ||
| 
 | ||
| +code.
 | ||
|     matcher.add('HelloWorld', on_match=None,
 | ||
|                 [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}],
 | ||
|                 [{'LOWER': 'hello'}, {'LOWER': 'world'}])
 | ||
| 
 | ||
| p
 | ||
|     |  By default, the matcher will only return the matches and
 | ||
|     |  #[strong not do anything else], like merge entities or assign labels.
 | ||
|     |  This is all up to you and can be defined individually for each pattern,
 | ||
|     |  by passing in a callback function as the #[code on_match] argument on
 | ||
|     |  #[code add()]. This is useful, because it lets you write entirely custom
 | ||
|     |  and #[strong pattern-specific logic]. For example, you might want to
 | ||
|     |  merge #[em some] patterns into one token, while adding entity labels for
 | ||
|     |  other pattern types. You shouldn't have to create different matchers for
 | ||
|     |  each of those processes.
 | ||
| 
 | ||
| +h(2, "on_match") Adding #[code on_match] rules
 | ||
| 
 | ||
| p
 | ||
|     |  To move on to a more realistic example, let's say you're working with a
 | ||
|     |  large corpus of blog articles, and you want to match all mentions of
 | ||
|     |  "Google I/O" (which spaCy tokenizes as #[code ['Google', 'I', '/', 'O']]).
 | ||
|     |  To be safe, you match on the uppercase form of the tokens, so the
 | ||
|     |  pattern also applies if someone has written it as "Google i/o". You
 | ||
|     |  also add a second pattern with an added
 | ||
|     |  #[code {IS_DIGIT: True}] token – this will make sure you also match on
 | ||
|     |  "Google I/O 2017". If your pattern matches, spaCy should execute your
 | ||
|     |  custom callback function #[code add_event_ent].
 | ||
| 
 | ||
| +code.
 | ||
|     import spacy
 | ||
|     from spacy.matcher import Matcher
 | ||
| 
 | ||
|     nlp = spacy.load('en')
 | ||
|     matcher = Matcher(nlp.vocab)
 | ||
| 
 | ||
|     matcher.add('GoogleIO', on_match=add_event_ent,
 | ||
|                 [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}],
 | ||
|                 [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}])
 | ||
| 
 | ||
|     # Get the ID of the 'EVENT' entity type. This is required to set an entity.
 | ||
|     EVENT = nlp.vocab.strings['EVENT']
 | ||
| 
 | ||
|     def add_event_ent(matcher, doc, i, matches):
 | ||
|         # Get the current match and create tuple of entity label, start and end.
 | ||
|         # Append entity to the doc's entity. (Don't overwrite doc.ents!)
 | ||
|         match_id, start, end = matches[i]
 | ||
|         doc.ents += ((EVENT, start, end),)
 | ||
| 
 | ||
| p
 | ||
|     |  In addition to mentions of "Google I/O", your data also contains some
 | ||
|     |  annoying pre-processing artefacts, like leftover HTML line breaks
 | ||
|     |  (e.g. #[code <br>] or #[code <BR/>]). While you're at it,
 | ||
|     |  you want to merge those into one token and flag them, to make sure you
 | ||
|     |  can easily ignore them later. So you add a second pattern and pass in a
 | ||
|     |  function #[code merge_and_flag]:
 | ||
| 
 | ||
| +code.
 | ||
|     matcher.add('BAD_HTML', on_match=merge_and_flag,
 | ||
|                 [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}],
 | ||
|                 [{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}])
 | ||
| 
 | ||
|     # Add a new custom flag to the vocab, which is always False by default.
 | ||
|     # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
 | ||
|     BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
 | ||
| 
 | ||
|     def merge_and_flag(matcher, doc, i, matches):
 | ||
|         match_id, start, end = matches[i]
 | ||
|         span = doc[start : end]
 | ||
|         span.merge(is_stop=True) # merge (and mark it as a stop word, just in case)
 | ||
|         span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG
 | ||
| 
 | ||
| +aside("Tip: Visualizing matches")
 | ||
|     |  When working with entities, you can use #[+api("displacy") displaCy]
 | ||
|     |  to quickly generate a NER visualization from your updated #[code Doc],
 | ||
|     |  which can be exported as an HTML file:
 | ||
| 
 | ||
|     +code.o-no-block.
 | ||
|         from spacy import displacy
 | ||
|         html = displacy.render(doc, style='ent', page=True,
 | ||
|                                options={'ents': ['EVENT']})
 | ||
| 
 | ||
|     |  For more info and examples, see the usage workflow on
 | ||
|     |  #[+a("/docs/usage/visualizers") visualizing spaCy].
 | ||
| 
 | ||
| p
 | ||
|     |  We can now call the matcher on our documents. The patterns will be
 | ||
|     |  matched in the order they occur in the text.
 | ||
| 
 | ||
| +code.
 | ||
|     doc = nlp(LOTS_OF_TEXT)
 | ||
|     matcher(doc)
 | ||
| 
 | ||
| +h(3, "on_match-callback") The callback function
 | ||
| 
 | ||
| p
 | ||
|     |  The matcher will first collect all matches over the document. It will
 | ||
|     |  then iterate over the matches, look up the callback for the entity ID
 | ||
|     |  that was matched, and invoke it. When the callback is invoked, it is
 | ||
|     |  passed four arguments: the matcher itself, the document, the position of
 | ||
|     |  the current match, and the total list of matches. This allows you to
 | ||
|     |  write callbacks that consider the entire set of matched phrases, so that
 | ||
|     |  you can resolve overlaps and other conflicts in whatever way you prefer.
 | ||
| 
 | ||
| +table(["Argument", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code matcher]
 | ||
|         +cell #[code Matcher]
 | ||
|         +cell The matcher instance.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code doc]
 | ||
|         +cell #[code Doc]
 | ||
|         +cell The document the matcher was used on.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code i]
 | ||
|         +cell int
 | ||
|         +cell Index of the current match (#[code matches[i]]).
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code matches]
 | ||
|         +cell list
 | ||
|         +cell
 | ||
|             |  A list of #[code (match_id, start, end)] tuples, describing the
 | ||
|             |  matches. A match tuple describes a span #[code doc[start:end]].
 | ||
|             |  The #[code match_id] is the ID of the added match pattern.
 | ||
| 
 | ||
| +h(2, "quantifiers") Using quantifiers
 | ||
| 
 | ||
| +table([ "Name", "Description", "Example"])
 | ||
|     +row
 | ||
|         +cell #[code !]
 | ||
|         +cell match exactly 0 times
 | ||
|         +cell negation
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code *]
 | ||
|         +cell match 0 or more times
 | ||
|         +cell optional, variable number
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code +]
 | ||
|         +cell match 1 or more times
 | ||
|         +cell mandatory, variable number
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[code ?]
 | ||
|         +cell match 0 or 1 times
 | ||
|         +cell optional, max one
 | ||
| 
 | ||
| p
 | ||
|     |  There are no nested or scoped quantifiers. You can build those
 | ||
|     |  behaviours with #[code on_match] callbacks.
 |