//- 💫 DOCS > API > PHRASEMATCHER

include ../_includes/_mixins

p
    |  The #[code PhraseMatcher] lets you efficiently match large terminology
    |  lists. While the #[+api("matcher") #[code Matcher]] lets you match
    |  sequences based on lists of token descriptions, the #[code PhraseMatcher]
    |  accepts match patterns in the form of #[code Doc] objects.

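p
    |  As a rough end-to-end sketch, the pattern #[code Doc] objects can be
    |  created with the same #[code nlp] object that processes the text to be
    |  matched, so the vocabulary is shared. The terminology list and example
    |  sentence below are purely illustrative.

+code.
    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.load('en')
    terminology_list = [u"Barack Obama", u"Angela Merkel", u"Washington, D.C."]
    matcher = PhraseMatcher(nlp.vocab)
    # one pattern Doc per phrase, all registered under one rule ID
    matcher.add('TerminologyList', None, *[nlp(text) for text in terminology_list])

    doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama "
              u"converse in the Oval Office inside the White House in Washington, D.C.")
    matches = matcher(doc)
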
+h(2, "init") PhraseMatcher.__init__
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Create the rule-based #[code PhraseMatcher].
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    from spacy.matcher import PhraseMatcher
 | 
						|
    matcher = PhraseMatcher(nlp.vocab, max_length=6)
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell
            |  The vocabulary object, which must be shared with the documents
            |  the matcher will operate on.

    +row
        +cell #[code max_length]
        +cell int
        +cell Maximum length of a phrase pattern to add.

    +row("foot")
        +cell returns
        +cell #[code PhraseMatcher]
        +cell The newly constructed object.

+h(2, "call") PhraseMatcher.__call__
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Find all token sequences matching the supplied patterns on the #[code Doc].
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    from spacy.matcher import PhraseMatcher
 | 
						|
 | 
						|
    matcher = PhraseMatcher(nlp.vocab)
 | 
						|
    matcher.add('OBAMA', None, nlp(u"Barack Obama"))
 | 
						|
    doc = nlp(u"Barack Obama lifts America one last time in emotional farewell")
 | 
						|
    matches = matcher(doc)
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
    +row
        +cell #[code doc]
        +cell #[code Doc]
        +cell The document to match over.

    +row("foot")
        +cell returns
        +cell list
        +cell
            |  A list of #[code (match_id, start, end)] tuples, describing the
            |  matches. A match tuple describes a span #[code doc[start:end]].
            |  The #[code match_id] is the ID of the added match pattern.

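p
    |  The tuples only contain integer values, so a common follow-up step, not
    |  part of the return value itself, is to slice the #[code Doc] into the
    |  matched span and look the #[code match_id] back up in the vocabulary's
    |  string store:

+code.
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]                  # the matched span
        rule_id = nlp.vocab.strings[match_id]  # e.g. 'OBAMA'
        print(rule_id, span.text)
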
+h(2, "pipe") PhraseMatcher.pipe
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Match a stream of documents, yielding them in turn.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    from spacy.matcher import PhraseMatcher
 | 
						|
    matcher = PhraseMatcher(nlp.vocab)
 | 
						|
    for doc in matcher.pipe(texts, batch_size=50, n_threads=4):
 | 
						|
        pass
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
    +row
        +cell #[code docs]
        +cell iterable
        +cell A stream of documents.

    +row
        +cell #[code batch_size]
        +cell int
        +cell The number of documents to accumulate into a working set.

    +row
        +cell #[code n_threads]
        +cell int
        +cell
            |  The number of threads with which to work on the buffer in
            |  parallel, if the #[code PhraseMatcher] implementation supports
            |  multi-threading.

    +row("foot")
        +cell yields
        +cell #[code Doc]
        +cell Documents, in order.

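p
    |  Note that #[code pipe] yields the documents themselves, not the match
    |  lists, so when matching over a stream the matches are usually handled in
    |  the #[code on_match] callback passed to #[code add]. A small sketch,
    |  assuming #[code nlp] is loaded and #[code texts] is a list of strings:

+code.
    from spacy.matcher import PhraseMatcher

    found = []

    def collect(matcher, doc, i, matches):
        # called once per match; store the matched span's text
        match_id, start, end = matches[i]
        found.append(doc[start:end].text)

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('OBAMA', collect, nlp(u"Barack Obama"))
    for doc in matcher.pipe(nlp.pipe(texts), batch_size=50):
        pass  # matches are handled by the callback
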
+h(2, "len") PhraseMatcher.__len__
 | 
						|
    +tag method
 | 
						|
 | 
						|
p
 | 
						|
    |  Get the number of rules added to the matcher. Note that this only returns
 | 
						|
    |  the number of rules (identical with the number of IDs), not the number
 | 
						|
    |  of individual patterns.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    matcher = PhraseMatcher(nlp.vocab)
 | 
						|
    assert len(matcher) == 0
 | 
						|
    matcher.add('OBAMA', None, nlp(u"Barack Obama"))
 | 
						|
    assert len(matcher) == 1
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell int
        +cell The number of rules.

+h(2, "contains") PhraseMatcher.__contains__
 | 
						|
    +tag method
 | 
						|
 | 
						|
p Check whether the matcher contains rules for a match ID.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    matcher = PhraseMatcher(nlp.vocab)
 | 
						|
    assert 'OBAMA' not in matcher
 | 
						|
    matcher.add('OBAMA', None, nlp(u"Barack Obama"))
 | 
						|
    assert 'OBAMA' in matcher
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
    +row
        +cell #[code key]
        +cell unicode
        +cell The match ID.

    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the matcher contains rules for this match ID.

+h(2, "add") PhraseMatcher.add
 | 
						|
    +tag method
 | 
						|
 | 
						|
p
 | 
						|
    |  Add a rule to the matcher, consisting of an ID key, one or more patterns, and
 | 
						|
    |  a callback function to act on the matches. The callback function will
 | 
						|
    |  receive the arguments #[code matcher], #[code doc], #[code i] and
 | 
						|
    |  #[code matches]. If a pattern already exists for the given ID, the
 | 
						|
    |  patterns will be extended. An #[code on_match] callback will be
 | 
						|
    |  overwritten.
 | 
						|
 | 
						|
+aside-code("Example").
 | 
						|
    def on_match(matcher, doc, id, matches):
 | 
						|
        print('Matched!', matches)
 | 
						|
 | 
						|
    matcher = PhraseMatcher(nlp.vocab)
 | 
						|
    matcher.add('OBAMA', on_match, nlp(u"Barack Obama"))
 | 
						|
    matcher.add('HEALTH', on_match, nlp(u"health care reform"),
 | 
						|
                                    nlp(u"healthcare reform"))
 | 
						|
    doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms")
 | 
						|
    matches = matcher(doc)
 | 
						|
 | 
						|
+table(["Name", "Type", "Description"])
    +row
        +cell #[code match_id]
        +cell unicode
        +cell An ID for the thing you're matching.

    +row
        +cell #[code on_match]
        +cell callable or #[code None]
        +cell
            |  Callback function to act on matches. Takes the arguments
            |  #[code matcher], #[code doc], #[code i] and #[code matches].

    +row
        +cell #[code *docs]
        +cell list
        +cell
            |  #[code Doc] objects of the phrases to match.
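
p
    |  Because #[code *docs] is collected as a variable-length argument, a whole
    |  list of pattern #[code Doc] objects can be registered under one ID by
    |  unpacking it, and a later call with the same ID extends those patterns.
    |  A brief sketch, with the city names chosen purely for illustration:

+code.
    cities = [u"New York", u"Los Angeles", u"San Francisco"]
    matcher = PhraseMatcher(nlp.vocab)
    # unpack the pattern Docs so they are all added under the 'CITY' rule
    matcher.add('CITY', None, *[nlp(city) for city in cities])
    # calling add() again with the same ID extends the existing patterns
    matcher.add('CITY', None, nlp(u"Chicago"))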