Merge branch 'master' of ssh://github.com/explosion/spaCy

This commit is contained in:
Matthew Honnibal 2016-10-23 14:33:54 +02:00
commit 7638f439e5
2 changed files with 19 additions and 1 deletion


@@ -179,6 +179,11 @@ Install a version of Visual Studio Express or higher that matches the version
that was used to compile your Python interpreter. For official distributions
these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and VS 2015 (Python 3.5).
If you don't want to install the entire Visual Studio, you can install a
stand-alone compiler. Make sure that you install the correct version for
your version of Python. See https://wiki.python.org/moin/WindowsCompilers for
links to download these.
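
If you're not sure which compiler your interpreter was built with, the
standard library will tell you; a minimal sketch (the MSC-to-VS mapping in
the comment is an assumption based on common CPython builds)::

    # Prints the compiler used to build this interpreter, e.g.
    # 'MSC v.1900 64 bit (AMD64)'. Assumed mapping: MSC v.1500 = VS 2008,
    # v.1600 = VS 2010, v.1900 = VS 2015.
    import platform
    print(platform.python_compiler())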
Run tests
=========


@@ -9,6 +9,18 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens.
nlp = spacy.load('en', parser=False, entity=False)
def merge_phrases(matcher, doc, i, matches):
    '''
    Merge a phrase. We have to be careful here because we'll change the token indices.
    To avoid problems, merge all the phrases once we're called on the last match.
    '''
    if i != len(matches)-1:
        return None
    # Get Span objects
    spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
    for ent_id, label, span in spans:
        span.merge(label=label, tag='NNP' if label else span.root.tag_)
matcher = Matcher(nlp.vocab)
matcher.add_entity(
@@ -17,6 +29,7 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens.
    acceptor=None,          # Accept or modify the match
    on_match=merge_phrases  # Callback to act on the matches
)
matcher.add_pattern(
"GoogleNow", # Entity ID -- Created if doesn't exist.
[ # The pattern is a list of *Token Specifiers*.
@@ -32,7 +45,7 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens.
doc = nlp(u"I prefer Siri to Google Now.")
matches = matcher(doc)
for ent_id, label, start, end in matches:
-   print(nlp.strings[ent_id], nlp.strings[label], doc[start : end].text)
+   print(nlp.vocab.strings[ent_id], nlp.vocab.strings[label], doc[start : end].text)
    entity = matcher.get_entity(ent_id)
    print(entity)
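
For reference, here is the example above assembled into one runnable snippet,
based on the spaCy 1.x Matcher API. The entity ID passed to add_entity, the two
ORTH token specifiers, and the label argument are filled in as assumptions
consistent with the surrounding text; they are not part of this commit's diff.

import spacy
from spacy.attrs import ORTH
from spacy.matcher import Matcher

nlp = spacy.load('en', parser=False, entity=False)

def merge_phrases(matcher, doc, i, matches):
    '''Merge all matched phrases once the callback fires on the last match,
    so earlier merges can't invalidate later token indices.'''
    if i != len(matches)-1:
        return None
    spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
    for ent_id, label, span in spans:
        span.merge(label=label, tag='NNP' if label else span.root.tag_)

matcher = Matcher(nlp.vocab)
matcher.add_entity(
    "GoogleNow",            # Entity ID -- created if it doesn't exist (assumed).
    acceptor=None,          # Accept or modify the match
    on_match=merge_phrases  # Callback to act on the matches
)
matcher.add_pattern(
    "GoogleNow",
    [{ORTH: "Google"}, {ORTH: "Now"}],  # Assumed token specifiers for "Google Now"
    label=None              # Optional label for the pattern-match pair
)

doc = nlp(u"I prefer Siri to Google Now.")
matches = matcher(doc)
for ent_id, label, start, end in matches:
    print(nlp.vocab.strings[ent_id], nlp.vocab.strings[label], doc[start : end].text)
    print(matcher.get_entity(ent_id))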