Merge pull request #561 from chssch/master

Working version of rule matcher tutorial
This commit is contained in:
Matthew Honnibal 2016-10-23 02:52:20 +11:00 committed by GitHub
commit 30ebb84e73

View File

@ -9,6 +9,18 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens.
# Load the English pipeline without the parser or entity recognizer --
# this tutorial drives matching with the rule-based Matcher instead.
nlp = spacy.load('en', parser=False, entity=False)
def merge_phrases(matcher, doc, i, matches):
    '''
    Callback that merges every matched phrase into a single token.

    Merging mutates the document and shifts token indices, so we do
    nothing until the callback fires for the final match; at that point
    all spans are resolved up front and merged in one pass.
    '''
    if i != len(matches) - 1:
        return None
    # Resolve Span objects before any merge invalidates the offsets.
    resolved = []
    for ent_id, label, start, end in matches:
        resolved.append((ent_id, label, doc[start : end]))
    for ent_id, label, span in resolved:
        span.merge(label=label, tag='NNP' if label else span.root.tag_)
matcher = Matcher(nlp.vocab)
matcher.add_entity(
@ -17,6 +29,7 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens.
acceptor=None, # Accept or modify the match
on_match=merge_phrases # Callback to act on the matches
)
matcher.add_pattern(
"GoogleNow", # Entity ID -- Created if doesn't exist.
[ # The pattern is a list of *Token Specifiers*.
@ -32,7 +45,7 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens.
# Run the matcher over a sample document and print each match.
doc = nlp(u"I prefer Siri to Google Now.")
matches = matcher(doc)
for ent_id, label, start, end in matches:
    # The StringStore lives on the vocab: decode the entity-ID and label
    # hashes back to their text form before printing.
    print(nlp.vocab.strings[ent_id], nlp.vocab.strings[label], doc[start : end].text)
    entity = matcher.get_entity(ent_id)
    print(entity)