spaCy/website/docs/api/matcher.jade

//- 💫 DOCS > API > MATCHER

include ../../_includes/_mixins

p Match sequences of tokens, based on pattern rules.

+infobox("⚠️ Deprecation note")
    |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
    |  are deprecated and have been replaced with a simpler
    |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
    |  patterns and a callback for a given match ID.
    |  #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
    |  #[code Matcher.has_entity] and #[code Matcher.get_entity] (now redundant)
    |  have been removed.

+h(2, "init") Matcher.__init__
    +tag method

p Create the rule-based #[code Matcher].

+aside-code("Example").
    from spacy.matcher import Matcher
    from spacy.attrs import LOWER

    patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
    matcher = Matcher(nlp.vocab)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell
            |  The vocabulary object, which must be shared with the documents
            |  the matcher will operate on.

    +row
        +cell #[code patterns]
        +cell dict
        +cell Patterns to add to the matcher, keyed by ID.

    +footrow
        +cell returns
        +cell #[code Matcher]
        +cell The newly constructed object.

+h(2, "call") Matcher.__call__
    +tag method

p Find all token sequences matching the supplied patterns on the #[code Doc].

+aside-code("Example").
    from spacy.matcher import Matcher
    from spacy.attrs import LOWER

    matcher = Matcher(nlp.vocab)
    pattern = [{LOWER: "hello"}, {LOWER: "world"}]
    matcher.add("HelloWorld", pattern, on_match=None)
    doc = nlp(u'hello world!')
    matches = matcher(doc)

+infobox("Important note")
    |  By default, the matcher #[strong does not perform any action] on matches,
    |  like tagging matched phrases with entity types. Instead, actions need to
    |  be specified when #[strong adding patterns], by
    |  passing in a callback function as the #[code on_match] argument on
    |  #[+api("matcher#add") #[code add]]. This allows you to define custom
    |  actions per pattern within the same matcher. For example, you might only
    |  want to merge some entity types, and set custom flags for other matched
    |  patterns. For more details and examples, see the usage workflow on
    |  #[+a("/docs/usage/rule-based-matching") rule-based matching].

+table(["Name", "Type", "Description"])
    +row
        +cell #[code doc]
        +cell #[code Doc]
        +cell The document to match over.

    +footrow
        +cell returns
        +cell list
        +cell
            |  A list of #[code (match_id, start, end)] tuples, describing the
            |  matches. A match tuple describes a span #[code doc[start:end]].
            |  The #[code match_id] is the ID of the added match pattern.

+h(2, "pipe") Matcher.pipe
    +tag method

p Match a stream of documents, yielding them in turn.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code docs]
        +cell iterable
        +cell A stream of documents.

    +row
        +cell #[code batch_size]
        +cell int
        +cell The number of documents to accumulate into a working set.

    +row
        +cell #[code n_threads]
        +cell int
        +cell
            |  The number of threads with which to work on the buffer in
            |  parallel, if the #[code Matcher] implementation supports
            |  multi-threading.

    +footrow
        +cell yields
        +cell #[code Doc]
        +cell Documents, in order.

+h(2, "add") Matcher.add
    +tag method

p
    |  Add one or more patterns to the matcher, along with a callback function
    |  to handle the matches. The callback function will receive the arguments
    |  #[code matcher], #[code doc], #[code i] and #[code matches].

+aside-code("Example").
    from spacy.matcher import Matcher
    from spacy.attrs import LOWER, ORTH

    def on_match(matcher, doc, i, matches):
        print('Matched!', matches)

    matcher = Matcher(nlp.vocab)
    matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match)
    matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match)

    doc = nlp(u'HELLO WORLD on Google Maps.')
    matches = matcher(doc)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code match_id]
        +cell unicode
        +cell An ID for the patterns being added.

    +row
        +cell #[code *patterns]
        +cell list
        +cell
            |  One or more match patterns. Each pattern consists of a list of
            |  dicts, where each dict describes a token.

    +row
        +cell #[code on_match]
        +cell function
        +cell
            |  Callback function to act on matches. Takes the arguments
            |  #[code matcher], #[code doc], #[code i] and #[code matches].