mirror of https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Add examples for Matcher, to answer Issue #105. TODO: Integrate into docs properly.
This commit is contained in:
parent 60fbbfcaa2
commit c17e2f2f20
examples/matcher_example.py (new file, 133 lines)

@@ -0,0 +1,133 @@
from __future__ import unicode_literals, print_function

import spacy.en
import spacy.matcher
from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63

def main():
    nlp = spacy.en.English()

    example = u"I prefer Siri to Google Now. I'll google now to find out how the google now service works."
    before = nlp(example)
    print("Before")
    for ent in before.ents:
        print(ent.text, ent.label_, [w.tag_ for w in ent])

    nlp.matcher.add(
        "GoogleNow",                # Entity ID: not really used at the moment.
        "PRODUCT",                  # Entity type: should be one of the types in the NER data.
        {"wiki_en": "Google_Now"},  # Arbitrary attributes. Currently unused.
        [  # List of patterns that can be Surface Forms of the entity.
            # This Surface Form matches "Google Now", verbatim.
            [  # Each Surface Form is a list of Token Specifiers.
                {  # This Token Specifier matches tokens whose orth field is "Google".
                    ORTH: "Google"
                },
                {  # This Token Specifier matches tokens whose orth field is "Now".
                    ORTH: "Now"
                }
            ],
            # This Surface Form matches "google now", verbatim, and requires
            # "google" to have the NNP tag. This helps prevent the pattern from
            # matching cases like "I will google now to look up the time".
            [
                {
                    ORTH: "google",
                    TAG: "NNP"
                },
                {
                    ORTH: "now"
                }
            ]
        ]
    )
    after = nlp(example)
    print("After")
    for ent in after.ents:
        print(ent.text, ent.label_, [w.tag_ for w in ent])

    # You can customize attribute values in the lexicon, and then refer to the
    # new attributes in your Token Specifiers.
    # This is particularly good for word-set membership.
    australian_capitals = ['Brisbane', 'Sydney', 'Canberra', 'Melbourne', 'Hobart',
                           'Darwin', 'Adelaide', 'Perth']
    # Internally, the tokenizer immediately maps each token to a pointer to a
    # LexemeC struct. These structs hold various features, e.g. the integer IDs
    # of the normalized string forms.
    # For our purposes, the key attribute is a 64-bit integer, used as a bit field.
    # spaCy currently only uses 12 of the bits for its built-in features, so
    # the others are available for use. It's best to use the higher bits, as
    # future versions of spaCy may add more flags. For instance, we might add
    # a built-in IS_MONTH flag, taking up FLAG13. So, we bind our user field to
    # FLAG63 here.
    is_australian_capital = FLAG63
    # Now we need to set the flag value. It's False on all tokens by default,
    # so we just need to set it to True for the tokens we want.
    # Here we iterate over the strings, and set it only on the literal matches.
    for string in australian_capitals:
        lexeme = nlp.vocab[string]
        lexeme.set_flag(is_australian_capital, True)
    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
    # If we want case-insensitive matching, we have to be a little more
    # roundabout, as there's no case-insensitive index to the vocabulary. So
    # we have to iterate over the vocabulary.
    # We'll be looking up attribute IDs in this set a lot, so it's good to
    # pre-build it.
    target_ids = {nlp.vocab.strings[s.lower()] for s in australian_capitals}
    for lexeme in nlp.vocab:
        if lexeme.lower in target_ids:
            lexeme.set_flag(is_australian_capital, True)
    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
    print('SYDNEY', nlp.vocab[u'SYDNEY'].check_flag(is_australian_capital))
    # Now, let's use this flag in a pattern.
    nlp.matcher.add("AuCitySportsTeam", "ORG", {},
        [
            [
                {LOWER: "the"},
                {is_australian_capital: True},
                {TAG: "NNS"}
            ],
            [
                {LOWER: "the"},
                {is_australian_capital: True},
                {TAG: "NNPS"}
            ],
            [
                {LOWER: "the"},
                {IS_ALPHA: True},  # Allow a word in between, e.g. "the Western Sydney ..."
                {is_australian_capital: True},
                {TAG: "NNS"}
            ],
            [
                {LOWER: "the"},
                {IS_ALPHA: True},  # Allow a word in between, e.g. "the Western Sydney ..."
                {is_australian_capital: True},
                {TAG: "NNPS"}
            ]
        ])
    doc = nlp(u'The pattern should match the Brisbane Broncos and the South Darwin Spiders, but not the Colorado Boulders')
    for ent in doc.ents:
        print(ent.text, ent.label_)
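    # The matcher can also be called on a document directly when the raw match
    # offsets are wanted instead of the merged doc.ents. A minimal sketch, left
    # commented out, and assuming this era's Matcher.__call__ returns
    # (entity_id, label, start, end) token-offset tuples:
    #
    #     for entity_id, label, start, end in nlp.matcher(doc):
    #         print(label, doc[start:end].text)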

# Output
# Before
# Google ORG [u'NNP']
# google ORG [u'VB']
# google ORG [u'NNP']
# After
# Google Now PRODUCT [u'NNP', u'RB']
# google ORG [u'VB']
# google now PRODUCT [u'NNP', u'RB']
# Sydney True
# sydney False
# Sydney True
# sydney True
# SYDNEY True
# the Brisbane Broncos ORG
# the South Darwin Spiders ORG


if __name__ == '__main__':
    main()
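To try the example, the English model data for this era of spaCy needs to be installed first; historically that was done with the bundled downloader (something like "python -m spacy.en.download", though the exact command varies by version). The script then runs as a plain Python program:

    python examples/matcher_example.py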