From e10c48210d632bddf81f8d12556fc39de9a45571 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 20 May 2017 12:59:03 +0200 Subject: [PATCH] Update Matcher API and workflow to reflect new API on_match is now the second positional argument, to easily allow a variable number of patterns while keeping the method clean and readable. --- website/docs/api/matcher.jade | 18 +++++++-------- website/docs/usage/rule-based-matching.jade | 25 ++++++++++++--------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/website/docs/api/matcher.jade b/website/docs/api/matcher.jade index 245f32eec..523c1660b 100644 --- a/website/docs/api/matcher.jade +++ b/website/docs/api/matcher.jade @@ -128,8 +128,8 @@ p print('Matched!', matches) matcher = Matcher(nlp.vocab) - matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match) - matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match) + matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}]) + matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}]) doc = nlp(u'HELLO WORLD on Google Maps.') matches = matcher(doc) @@ -140,16 +140,16 @@ p +cell unicode +cell An ID for the thing you're matching. - +row - +cell #[code *patterns] - +cell list - +cell - | Match pattern. A pattern consists of a list of dicts, where each - | dict describes a token. - +row +cell #[code on_match] +cell function +cell | Callback function to act on matches. Takes the arguments | #[code matcher], #[code doc], #[code i] and #[code matches]. + + +row + +cell #[code *patterns] + +cell list + +cell + | Match pattern. A pattern consists of a list of dicts, where each + | dict describes a token. 
diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index 077c0f9e6..2e14e12a9 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -36,7 +36,9 @@ p | First, we initialise the #[code Matcher] with a vocab. The matcher must | always share the same vocab with the documents it will operate on. We | can now call #[+api("matcher#add") #[code matcher.add()]] with an ID and - | our custom pattern: + | our custom pattern. The second argument lets you pass in an optional + | callback function to invoke on a successful match. For now, we set it + | to #[code None]. +code. import spacy @@ -45,7 +47,9 @@ p nlp = spacy.load('en') matcher = Matcher(nlp.vocab) - matcher.add('HelloWorld', [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}]) + # add match ID "HelloWorld" with no callback and one pattern + matcher.add('HelloWorld', None, + [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}]) doc = nlp(u'Hello, world! Hello world!') matches = matcher(doc) @@ -58,8 +62,9 @@ p | without punctuation between "hello" and "world": +code. - matcher.add('HelloWorld', [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}], - [{LOWER: 'hello'}, {LOWER: 'world'}]) + matcher.add('HelloWorld', None, + [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}], + [{LOWER: 'hello'}, {LOWER: 'world'}]) p | By default, the matcher will only return the matches and @@ -92,9 +97,9 @@ p nlp = spacy.load('en') matcher = Matcher(nlp.vocab) - matcher.add('GoogleIO', [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}], - [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}], - on_match=add_event_ent) + matcher.add('GoogleIO', add_event_ent, + [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}], + [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}]) # Get the ID of the 'EVENT' entity type. 
This is required to set an entity. EVENT = nlp.vocab.strings['EVENT'] @@ -114,9 +119,9 @@ p | function #[code merge_and_flag]: +code. - matcher.add('BAD_HTML', [{ORTH: '<'}, {LOWER: 'br'}, {ORTH: '>'}], - [{ORTH: '<'}, {LOWER: 'br/'}, {ORTH: '>'}] - on_match=merge_and_flag) + matcher.add('BAD_HTML', merge_and_flag, + [{ORTH: '<'}, {LOWER: 'br'}, {ORTH: '>'}], + [{ORTH: '<'}, {LOWER: 'br/'}, {ORTH: '>'}]) # Add a new custom flag to the vocab, which is always False by default. # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.