include ../../_includes/_mixins

p.u-text-large spaCy features a rule-matching engine that operates over tokens. The rules can refer to token annotations and flags, and matches support callbacks to accept, modify and/or act on the match. The rule matcher also lets you associate patterns with entity IDs, for basic entity linking or disambiguation.
+code("python", "Matcher Example").
    from spacy.matcher import Matcher
    from spacy.attrs import *
    import spacy

    nlp = spacy.load('en', parser=False, entity=False)

    def merge_phrases(matcher, doc, i, matches):
        '''
        Merge a phrase. We have to be careful here because we'll change the token indices.
        To avoid problems, merge all the phrases once we're called on the last match.
        '''
        if i != len(matches)-1:
            return None
        # Get Span objects
        spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
        for ent_id, label, span in spans:
            span.merge(label=label, tag='NNP' if label else span.root.tag_)

    matcher = Matcher(nlp.vocab)

    matcher.add_entity(
        "GoogleNow", # Entity ID -- Helps you act on the match.
        {"ent_type": "PRODUCT", "wiki_en": "Google_Now"}, # Arbitrary attributes (optional)
        acceptor=None, # Accept or modify the match
        on_match=merge_phrases # Callback to act on the matches
    )

    matcher.add_pattern(
        "GoogleNow", # Entity ID -- Created if it doesn't exist.
        [ # The pattern is a list of *Token Specifiers*.
            { # This Token Specifier matches tokens whose orth field is "Google"
                ORTH: "Google"
            },
            { # This Token Specifier matches tokens whose orth field is "Now"
                ORTH: "Now"
            }
        ],
        label=None # Can associate a label with the pattern-match, to handle it better.
    )

    doc = nlp(u"I prefer Siri to Google Now.")
    matches = matcher(doc)
    for ent_id, label, start, end in matches:
        print(nlp.vocab.strings[ent_id], nlp.vocab.strings[label], doc[start : end].text)
        entity = matcher.get_entity(ent_id)
        print(entity)

    matcher.add_pattern(
        "GoogleNow",
        [ # This surface form matches "google now", verbatim, and requires
          # "google" to have the NNP tag. This helps prevent the pattern from
          # matching cases like "I will google now to look up the time".
            {
                ORTH: "google",
                TAG: "NNP"
            },
            {
                ORTH: "now"
            }
        ]
    )

    doc = nlp(u"I'll google now to find out how the google now service works.")
    matches = matcher(doc)
    for ent_id, label, start, end in matches:
        print(ent_id, label, start, end, doc[start : end].text)
    # Because we specified the on_match=merge_phrases callback,
    # we should see 'google now' as a single token.
    for token in doc:
        print(token.text, token.lemma_, token.tag_, token.ent_type_)
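
p The token specifiers above match on the #[code ORTH] and #[code TAG] attributes, but other token attributes and flags can be used as well. The snippet below is a minimal illustrative sketch, not part of the example above: it assumes the #[code nlp] and #[code matcher] objects created earlier and uses the #[code LOWER] attribute to write a case-insensitive variant of the pattern.

+code("python", "Case-insensitive pattern (sketch)").
    # A minimal sketch, assuming the nlp and matcher objects created above.
    # LOWER matches on the lowercased form of the token, so this pattern also
    # matches "GOOGLE NOW" or "google now".
    from spacy.attrs import LOWER

    matcher.add_pattern(
        "GoogleNow",
        [
            {LOWER: "google"},
            {LOWER: "now"}
        ]
    )

    doc = nlp(u"I asked GOOGLE NOW for directions.")
    matches = matcher(doc)
    # The on_match=merge_phrases callback has merged the matched span,
    # so "GOOGLE NOW" should now appear as a single token.
    for token in doc:
        print(token.text, token.tag_, token.ent_type_)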