mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Update Matcher API and workflow to reflect new API
on_match is now the second positional argument, to easily allow a variable number of patterns while keeping the method clean and readable.
This commit is contained in:
parent
eb521af267
commit
e10c48210d
|
@ -128,8 +128,8 @@ p
|
|||
print('Matched!', matches)
|
||||
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match)
|
||||
matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match)
|
||||
matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
|
||||
matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])
|
||||
|
||||
doc = nlp(u'HELLO WORLD on Google Maps.')
|
||||
matches = matcher(doc)
|
||||
|
@ -140,16 +140,16 @@ p
|
|||
+cell unicode
|
||||
+cell An ID for the thing you're matching.
|
||||
|
||||
+row
|
||||
+cell #[code *patterns]
|
||||
+cell list
|
||||
+cell
|
||||
| Match pattern. A pattern consists of a list of dicts, where each
|
||||
| dict describes a token.
|
||||
|
||||
+row
|
||||
+cell #[code on_match]
|
||||
+cell function
|
||||
+cell
|
||||
| Callback function to act on matches. Takes the arguments
|
||||
| #[code matcher], #[code doc], #[code i] and #[code matches].
|
||||
|
||||
+row
|
||||
+cell #[code *patterns]
|
||||
+cell list
|
||||
+cell
|
||||
| Match pattern. A pattern consists of a list of dicts, where each
|
||||
| dict describes a token.
|
||||
|
|
|
@ -36,7 +36,9 @@ p
|
|||
| First, we initialise the #[code Matcher] with a vocab. The matcher must
|
||||
| always share the same vocab with the documents it will operate on. We
|
||||
| can now call #[+api("matcher#add") #[code matcher.add()]] with an ID and
|
||||
| our custom pattern:
|
||||
| our custom pattern. The second argument lets you pass in an optional
|
||||
| callback function to invoke on a successful match. For now, we set it
|
||||
| to #[code None].
|
||||
|
||||
+code.
|
||||
import spacy
|
||||
|
@ -45,7 +47,9 @@ p
|
|||
|
||||
nlp = spacy.load('en')
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add('HelloWorld', [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])
|
||||
# add match ID "HelloWorld" with no callback and one pattern
|
||||
matcher.add('HelloWorld', on_match=None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])
|
||||
|
||||
doc = nlp(u'Hello, world! Hello world!')
|
||||
matches = matcher(doc)
|
||||
|
@ -58,8 +62,9 @@ p
|
|||
| without punctuation between "hello" and "world":
|
||||
|
||||
+code.
|
||||
matcher.add('HelloWorld', [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
||||
matcher.add('HelloWorld', on_match=None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
||||
|
||||
p
|
||||
| By default, the matcher will only return the matches and
|
||||
|
@ -92,9 +97,9 @@ p
|
|||
nlp = spacy.load('en')
|
||||
matcher = Matcher(nlp.vocab)
|
||||
|
||||
matcher.add('GoogleIO', [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
|
||||
[{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}],
|
||||
on_match=add_event_ent)
|
||||
matcher.add('GoogleIO', on_match=add_event_ent,
|
||||
[{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
|
||||
[{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}])
|
||||
|
||||
# Get the ID of the 'EVENT' entity type. This is required to set an entity.
|
||||
EVENT = nlp.vocab.strings['EVENT']
|
||||
|
@ -114,9 +119,9 @@ p
|
|||
| function #[code merge_and_flag]:
|
||||
|
||||
+code.
|
||||
matcher.add('BAD_HTML', [{ORTH: '<'}, {LOWER: 'br'}, {ORTH: '>'}],
|
||||
[{ORTH: '<'}, {LOWER: 'br/'}, {ORTH: '>'}]
|
||||
on_match=merge_and_flag)
|
||||
matcher.add('BAD_HTML', on_match=merge_and_flag,
|
||||
[{ORTH: '<'}, {LOWER: 'br'}, {ORTH: '>'}],
|
||||
[{ORTH: '<'}, {LOWER: 'br/'}, {ORTH: '>'}])
|
||||
|
||||
# Add a new custom flag to the vocab, which is always False by default.
|
||||
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
|
||||
|
|
Loading…
Reference in New Issue
Block a user