From 5ee645d742193293f8933de1ed63cd44a08aba70 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Fri, 14 Aug 2015 18:59:08 +0200
Subject: [PATCH] * Add Displacy mixin. Needs to pull the data from the web

---
 docs/redesign/online_demo.jade |  18 +++++
 examples/twitter_filter.py     | 141 +++++++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 examples/twitter_filter.py

diff --git a/docs/redesign/online_demo.jade b/docs/redesign/online_demo.jade
index e69de29bb..0e2bbb331 100644
--- a/docs/redesign/online_demo.jade
+++ b/docs/redesign/online_demo.jade
@@ -0,0 +1,18 @@
+mixin Displacy(sentence, caption_text, height)
+  //- Percent-encode the sentence: replace(" ", "%20") only swaps the first space.
+  - var url = "http://ines.io/displacy/?full=" + encodeURIComponent(sentence)
+
+  .displacy
+    iframe.displacy(src="displacy/displacy_demo.html" height=height)
+    a.view-displacy(href=url)
+      | View on displaCy
+
+    p.caption.
+      #{caption_text}
+
+//- Render the demo: example sentence, caption text, and an iframe height of 275.
++Displacy(
+  "Click the button to see this sentence in displaCy.",
+  "The best parse-tree visualizer and annotation tool in all the land.",
+  275
+)
diff --git a/examples/twitter_filter.py b/examples/twitter_filter.py
new file mode 100644
index 000000000..f842acdd4
--- /dev/null
+++ b/examples/twitter_filter.py
@@ -0,0 +1,141 @@
+from __future__ import unicode_literals, print_function
+import plac
+import codecs
+import sys
+import math
+
+import spacy.en
+from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
+
+from termcolor import colored
+from twython import TwythonStreamer
+
+from os import path
+from math import sqrt
+
+from numpy import dot
+from numpy.linalg import norm
+
+
+class Meaning(object):
+    """Mean of a set of word vectors, with its norm cached for cosine similarity."""
+    def __init__(self, vectors):
+        # An empty input has no direction: a zero norm marks "no meaning".
+        self.vector, self.norm = None, 0
+        if vectors:
+            self.vector = sum(vectors) / len(vectors)
+            self.norm = norm(self.vector)
+
+    @classmethod
+    def from_path(cls, nlp, loc):
+        """Build from the whitespace-separated terms in the UTF-8 file at loc."""
+        with codecs.open(loc, 'r', 'utf8') as term_file:
+            contents = term_file.read()
+        return cls.from_terms(nlp, contents.strip().split())
+
+    @classmethod
+    def from_tokens(cls, nlp, tokens):
+        """Build from the repvec of each token; nlp is not used here."""
+        return cls([token.repvec for token in tokens])
+
+    @classmethod
+    def from_terms(cls, nlp, examples):
+        """Build from the repvec of each term's nlp.vocab entry."""
+        return cls([nlp.vocab[term].repvec for term in examples])
+
+    def similarity(self, other):
+        """Cosine similarity with other, or -1 if either side has a zero norm."""
+        degenerate = not (self.norm and other.norm)
+        return -1 if degenerate else dot(self.vector, other.vector) / (self.norm * other.norm)
+
+
+def print_colored(model, stream=sys.stdout):
+    """Print the scores and text of a tweet that is a clean, non-rare match.
+
+    model is a dict with the keys that TextMatcher.__call__ returns; tweets
+    that are rare, rejected or unmatched print nothing.
+    NOTE: stream is accepted but never used -- output always goes to stdout.
+    """
+    if model['is_rare'] or model['is_reject'] or not model['is_match']:
+        return
+    match_score = colored('%.3f' % model['match_score'], 'green')
+    reject_score = colored('%.3f' % model['reject_score'], 'red')
+    print(match_score, reject_score, '%.5f' % model['prob'])
+    # Everything printed here is a match, so the text is shown in green: apply
+    # the colour to the text rather than printing the colour's name after it.
+    print(colored(repr(model['text']), 'green'))
+    print('')
+
+
+class TextMatcher(object):
+    """Score a text against target and reject Meanings and flag the outcome."""
+    def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
+        self.nlp, self.get_target, self.get_reject = nlp, get_target, get_reject
+        self.min_prob, self.min_match, self.max_reject = min_prob, min_match, max_reject
+
+    def __call__(self, text):
+        """Return a dict of the parsed text, its scores and the is_* flags."""
+        tweet = self.nlp(text)
+        # Fetch each reference Meaning once per call instead of twice.
+        target, reject = self.get_target(), self.get_reject()
+        # prob averages exp(w.prob). An empty parse would divide by zero, so use 0.0.
+        # NOTE(review): min_prob defaults to -20 in main(), which looks log-scale -- confirm.
+        prob = sum(math.exp(w.prob) for w in tweet) / len(tweet) if len(tweet) else 0.0
+
+        meaning = Meaning.from_tokens(self.nlp, tweet)
+        match_score = meaning.similarity(target)
+        reject_score = meaning.similarity(reject)
+        likely = prob >= self.min_prob
+        return {
+            'text': tweet.string,
+            'prob': prob,
+            'match_score': match_score,
+            'reject_score': reject_score,
+            'is_rare': prob < self.min_prob,
+            'is_match': likely and match_score >= self.min_match,
+            'is_reject': likely and reject_score >= self.max_reject
+        }
+
+
+class Connection(TwythonStreamer):
+    """Twitter stream that runs handler on each status text and shows it via view."""
+    def __init__(self, keys_dir, handler, view):
+        keys = Secrets(keys_dir)
+        TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
+        self.handler = handler
+        self.view = view
+
+    def on_success(self, data):
+        """Run the handler on data's 'text' (default u'') and pass the result to view."""
+        text = data.get('text', u'')
+        # Twython returns either bytes or unicode, depending on tweet, so decode
+        # up front rather than retrying on a TypeError the handler itself may raise.
+        if isinstance(text, bytes):
+            text = text.decode('utf8')
+        self.view(self.handler(text), sys.stdout)  # write to stdout, not stdin
+
+    def on_error(self, status_code, data):
+        print(status_code)
+
+
+class Secrets(object):
+    """Twitter credentials, each read (and stripped) from a .txt file in key_dir."""
+    def __init__(self, key_dir):
+        for name in ('key', 'secret', 'token', 'token_secret'):
+            with open(path.join(key_dir, name + '.txt')) as file_:  # close promptly
+                setattr(self, name, file_.read().strip())
+
+
+def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
+    """Filter the Twitter stream on term, scoring tweets against two term files."""
+    nlp = spacy.en.English(Parser=None)  # the demo needs no parser; skip its load time
+    def get_target():
+        return Meaning.from_path(nlp, target_loc)
+    def get_reject():
+        return Meaning.from_path(nlp, reject_loc)
+    matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
+    Connection(keys_dir, matcher, print_colored).statuses.filter(track=term)
+
+# Script entry point: hand main() to plac when run directly.
+if __name__ == '__main__':
+    plac.call(main)