spaCy/website/docs/tutorials/rule-based-matcher.jade

include ../../_includes/_mixins

p.u-text-large spaCy features a rule-matching engine that operates over tokens. The rules can refer to token annotations and flags, and matches support callbacks to accept, modify and/or act on the match. The rule matcher also lets you associate patterns with entity IDs, which enables some basic entity linking or disambiguation.

+code("python", "Matcher Example").
    from spacy.matcher import Matcher
    from spacy.attrs import *
    import spacy

    # Load the English pipeline. NOTE(review): parser=False and entity=False
    # presumably skip the dependency parser and the entity recognizer --
    # confirm against the spaCy documentation.
    nlp = spacy.load('en', parser=False, entity=False)

    def merge_phrases(matcher, doc, i, matches):
        """Merge each matched phrase into a single token.

        Merging changes the token indices, so nothing happens until the
        callback is invoked for the last match; at that point every match
        is merged in one pass.
        """
        is_last_match = (i == len(matches) - 1)
        if is_last_match:
            # Build all Span objects first, before any merge alters the doc.
            phrases = []
            for ent_id, label, start, end in matches:
                phrases.append((label, doc[start : end]))
            for label, phrase in phrases:
                # Labelled matches are tagged 'NNP'; others keep the root's tag.
                new_tag = 'NNP' if label else phrase.root.tag_
                phrase.merge(label=label, tag=new_tag)
        return None

    # Build the matcher from the pipeline's vocab.
    matcher = Matcher(nlp.vocab)

    # Register the entity together with its attributes and callbacks.
    matcher.add_entity(
        "GoogleNow", # Entity ID -- Helps you act on the match.
        {"ent_type": "PRODUCT", "wiki_en": "Google_Now"}, # Arbitrary attributes (optional)
        acceptor=None, # Callback to accept or modify the match (none used here)
        on_match=merge_phrases # Callback to act on the matches
    )

    # Attach a two-token pattern to the "GoogleNow" entity.
    matcher.add_pattern(
        "GoogleNow", # Entity ID -- Created if it doesn't exist.
        [ # The pattern is a list of *Token Specifiers*, one per token.
            { # This Token Specifier matches tokens whose orth field is "Google"
              ORTH: "Google"
            },
            { # This Token Specifier matches tokens whose orth field is "Now"
              ORTH: "Now"
            }
        ],
        label=None # Optional label to associate with matches of this pattern.
    )
    # Run the pipeline and then the matcher over a sentence containing the phrase.
    doc = nlp(u"I prefer Siri to Google Now.")
    matches = matcher(doc)
    # Each match unpacks into (ent_id, label, start, end).
    for ent_id, label, start, end in matches:
        # Look both IDs up in the vocab's string table before printing
        # (presumably they are integer IDs -- confirm against the spaCy docs).
        print(nlp.vocab.strings[ent_id], nlp.vocab.strings[label], doc[start : end].text)
        # Fetch the entity for this ID from the matcher (presumably the
        # attributes passed to add_entity -- TODO confirm).
        entity = matcher.get_entity(ent_id)
        print(entity)

    # Add a second pattern under the same "GoogleNow" entity ID.
    matcher.add_pattern(
        "GoogleNow",
        [ # This pattern matches "google now", verbatim, and requires
          # "google" to have the NNP tag. This helps prevent the pattern from
          # matching cases like "I will google now to look up the time"
          {
            ORTH: "google",
            TAG: "NNP"
          },
          {
            ORTH: "now"
          }
        ]
    )

    # This sentence contains "google now" twice: once in the "I'll google now"
    # usage and once in "the google now service".
    doc = nlp(u"I'll google now to find out how the google now service works.")
    matches = matcher(doc)
    for ent_id, label, start, end in matches:
        # Print the raw match values alongside the matched text.
        print(ent_id, label, start, end, doc[start : end].text)
    # Because we specified the on_match=merge_phrases callback,
    # we should see 'google now' as a single token.
    for token in doc:
        print(token.text, token.lemma_, token.tag_, token.ent_type_)