//- 💫 DOCS > API > PHRASEMATCHER include ../_includes/_mixins p | The #[code PhraseMatcher] lets you efficiently match large terminology | lists. While the #[+api("matcher") #[code Matcher]] lets you match | sequences based on lists of token descriptions, the #[code PhraseMatcher] | accepts match patterns in the form of #[code Doc] objects. +h(2, "init") PhraseMatcher.__init__ +tag method p Create the rule-based #[code PhraseMatcher]. +aside-code("Example"). from spacy.matcher import PhraseMatcher matcher = PhraseMatcher(nlp.vocab, max_length=6) +table(["Name", "Type", "Description"]) +row +cell #[code vocab] +cell #[code Vocab] +cell | The vocabulary object, which must be shared with the documents | the matcher will operate on. +row +cell #[code max_length] +cell int +cell Maximum length of a phrase pattern to add. +row("foot") +cell returns +cell #[code PhraseMatcher] +cell The newly constructed object. +h(2, "call") PhraseMatcher.__call__ +tag method p Find all token sequences matching the supplied patterns on the #[code Doc]. +aside-code("Example"). from spacy.matcher import PhraseMatcher matcher = PhraseMatcher(nlp.vocab) matcher.add('OBAMA', None, nlp(u"Barack Obama")) doc = nlp(u"Barack Obama lifts America one last time in emotional farewell") matches = matcher(doc) +table(["Name", "Type", "Description"]) +row +cell #[code doc] +cell #[code Doc] +cell The document to match over. +row("foot") +cell returns +cell list +cell | A list of #[code (match_id, start, end)] tuples, describing the | matches. A match tuple describes a span #[code doc[start:end]]. | The #[code match_id] is the ID of the added match pattern. +h(2, "pipe") PhraseMatcher.pipe +tag method p Match a stream of documents, yielding them in turn. +aside-code("Example"). 
from spacy.matcher import PhraseMatcher matcher = PhraseMatcher(nlp.vocab) for doc in matcher.pipe(texts, batch_size=50, n_threads=4): pass +table(["Name", "Type", "Description"]) +row +cell #[code docs] +cell iterable +cell A stream of documents. +row +cell #[code batch_size] +cell int +cell The number of documents to accumulate into a working set. +row +cell #[code n_threads] +cell int +cell | The number of threads with which to work on the buffer in | parallel, if the #[code PhraseMatcher] implementation supports | multi-threading. +row("foot") +cell yields +cell #[code Doc] +cell Documents, in order. +h(2, "len") PhraseMatcher.__len__ +tag method p | Get the number of rules added to the matcher. Note that this only returns | the number of rules (identical with the number of IDs), not the number | of individual patterns. +aside-code("Example"). matcher = PhraseMatcher(nlp.vocab) assert len(matcher) == 0 matcher.add('OBAMA', None, nlp(u"Barack Obama")) assert len(matcher) == 1 +table(["Name", "Type", "Description"]) +row("foot") +cell returns +cell int +cell The number of rules. +h(2, "contains") PhraseMatcher.__contains__ +tag method p Check whether the matcher contains rules for a match ID. +aside-code("Example"). matcher = PhraseMatcher(nlp.vocab) assert 'OBAMA' not in matcher matcher.add('OBAMA', None, nlp(u"Barack Obama")) assert 'OBAMA' in matcher +table(["Name", "Type", "Description"]) +row +cell #[code key] +cell unicode +cell The match ID. +row("foot") +cell returns +cell bool +cell Whether the matcher contains rules for this match ID. +h(2, "add") PhraseMatcher.add +tag method p | Add a rule to the matcher, consisting of an ID key, one or more patterns, and | a callback function to act on the matches. The callback function will | receive the arguments #[code matcher], #[code doc], #[code i] and | #[code matches]. If a pattern already exists for the given ID, the | patterns will be extended. An #[code on_match] callback will be | overwritten. 
+aside-code("Example"). def on_match(matcher, doc, id, matches): print('Matched!', matches) matcher = PhraseMatcher(nlp.vocab) matcher.add('OBAMA', on_match, nlp(u"Barack Obama")) matcher.add('HEALTH', on_match, nlp(u"health care reform"), nlp(u"healthcare reform")) doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms") matches = matcher(doc) +table(["Name", "Type", "Description"]) +row +cell #[code match_id] +cell unicode +cell An ID for the thing you're matching. +row +cell #[code on_match] +cell callable or #[code None] +cell | Callback function to act on matches. Takes the arguments | #[code matcher], #[code doc], #[code i] and #[code matches]. +row +cell #[code *docs] +cell list +cell | #[code Doc] objects of the phrases to match.