From 6b30cbaf0b1d2d16c920400ba50c93bef109a75a Mon Sep 17 00:00:00 2001 From: chssch Date: Sat, 22 Oct 2016 15:05:41 +0200 Subject: [PATCH 1/2] Strings has to be on vocab object --- website/docs/tutorials/rule-based-matcher.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/tutorials/rule-based-matcher.jade b/website/docs/tutorials/rule-based-matcher.jade index 8c8949631..53d76e145 100644 --- a/website/docs/tutorials/rule-based-matcher.jade +++ b/website/docs/tutorials/rule-based-matcher.jade @@ -32,7 +32,7 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens. doc = nlp(u"I prefer Siri to Google Now.") matches = matcher(doc) for ent_id, label, start, end in matches: - print(nlp.strings[ent_id], nlp.strings[label], doc[start : end].text) + print(nlp.vocab.strings[ent_id], nlp.vocab.strings[label], doc[start : end].text) entity = matcher.get_entity(ent_id) print(entity) From cf7b6f7a9db1756135280a5c274fb53642d3c34c Mon Sep 17 00:00:00 2001 From: chssch Date: Sat, 22 Oct 2016 15:07:56 +0200 Subject: [PATCH 2/2] Add merge phrases from https://github.com/explosion/spaCy/issues/523#issuecomment-255172782 --- website/docs/tutorials/rule-based-matcher.jade | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/website/docs/tutorials/rule-based-matcher.jade b/website/docs/tutorials/rule-based-matcher.jade index 53d76e145..900a86a63 100644 --- a/website/docs/tutorials/rule-based-matcher.jade +++ b/website/docs/tutorials/rule-based-matcher.jade @@ -9,6 +9,18 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens. nlp = spacy.load('en', parser=False, entity=False) + def merge_phrases(matcher, doc, i, matches): + ''' + Merge a phrase. We have to be careful here because we'll change the token indices. + To avoid problems, merge all the phrases once we're called on the last match. 
+ ''' + if i != len(matches)-1: + return None + # Get Span objects + spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches] + for ent_id, label, span in spans: + span.merge(label=label, tag='NNP' if label else span.root.tag_) + matcher = Matcher(nlp.vocab) matcher.add_entity( @@ -17,6 +29,7 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens. acceptor=None, # Accept or modify the match on_match=merge_phrases # Callback to act on the matches ) + matcher.add_pattern( "GoogleNow", # Entity ID -- Created if doesn't exist. [ # The pattern is a list of *Token Specifiers*.