From 5ee645d742193293f8933de1ed63cd44a08aba70 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 14 Aug 2015 18:59:08 +0200
Subject: [PATCH] * Add Displacy mixin. Needs to pull the data from the web

---
 docs/redesign/online_demo.jade |  20 +++++
 examples/twitter_filter.py     | 189 +++++++++++++++++++++++++++++++++
 2 files changed, 209 insertions(+)
 create mode 100644 examples/twitter_filter.py

diff --git a/docs/redesign/online_demo.jade b/docs/redesign/online_demo.jade
index e69de29bb..0e2bbb331 100644
--- a/docs/redesign/online_demo.jade
+++ b/docs/redesign/online_demo.jade
@@ -0,0 +1,20 @@
+//- Embed the displaCy demo with a link to displaCy for `sentence` and a caption.
+mixin Displacy(sentence, caption_text, height)
+    //- encodeURIComponent escapes every space and reserved character.
+    - var url = "http://ines.io/displacy/?full=" + encodeURIComponent(sentence)
+
+    .displacy
+        iframe.displacy(src="displacy/displacy_demo.html" height=height)
+
+        a.view-displacy(href=url)
+            | View on displaCy
+
+        p.caption.
+            #{caption_text}
+
+
++Displacy(
+    "Click the button to see this sentence in displaCy.",
+    "The best parse-tree visualizer and annotation tool in all the land.",
+    275
+)
diff --git a/examples/twitter_filter.py b/examples/twitter_filter.py
new file mode 100644
index 000000000..f842acdd4
--- /dev/null
+++ b/examples/twitter_filter.py
@@ -0,0 +1,189 @@
+from __future__ import unicode_literals, print_function
+import plac
+import codecs
+import sys
+import math
+
+import spacy.en
+from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
+
+from termcolor import colored
+from twython import TwythonStreamer
+
+from os import path
+from math import sqrt
+
+from numpy import dot
+from numpy.linalg import norm
+
+
+class Meaning(object):
+    """Mean of a list of word vectors, compared by cosine similarity."""
+
+    def __init__(self, vectors):
+        """Average `vectors`; an empty list gives a null meaning (norm 0)."""
+        if vectors:
+            self.vector = sum(vectors) / len(vectors)
+            self.norm = norm(self.vector)
+        else:
+            self.vector = None
+            self.norm = 0
+
+    @classmethod
+    def from_path(cls, nlp, loc):
+        """Build a Meaning from the whitespace-separated terms in file `loc`."""
+        with codecs.open(loc, 'r', 'utf8') as file_:
+            terms = file_.read().strip().split()
+        return cls.from_terms(nlp, terms)
+
+    @classmethod
+    def from_tokens(cls, nlp, tokens):
+        """Build a Meaning from the `repvec` of each token; `nlp` is unused."""
+        vectors = [t.repvec for t in tokens]
+        return cls(vectors)
+
+    @classmethod
+    def from_terms(cls, nlp, examples):
+        """Build a Meaning from the vocabulary entries for `examples`."""
+        vectors = [nlp.vocab[eg].repvec for eg in examples]
+        return cls(vectors)
+
+    def similarity(self, other):
+        """Return the cosine of the two mean vectors, or -1 if either is null."""
+        if not self.norm or not other.norm:
+            return -1
+        return dot(self.vector, other.vector) / (self.norm * other.norm)
+
+
+def print_colored(model, stream=sys.stdout):
+    """Write a matching tweet and its scores to `stream`.
+
+    `model` is a dict as returned by TextMatcher.__call__. Nothing is written
+    unless the tweet is a match and is neither rare nor a reject.
+    """
+    if model['is_rare'] or not model['is_match'] or model['is_reject']:
+        return
+
+    if model['is_match']:
+        color = 'green'
+    elif model['is_reject']:
+        color = 'red'
+    else:
+        color = 'grey'
+
+    match_score = colored('%.3f' % model['match_score'], 'green')
+    reject_score = colored('%.3f' % model['reject_score'], 'red')
+    prob = '%.5f' % model['prob']
+
+    print(match_score, reject_score, prob, file=stream)
+    # Colour the text itself rather than printing the colour's name after it.
+    print(colored(repr(model['text']), color), file=stream)
+    print('', file=stream)
+
+
+class TextMatcher(object):
+    """Score a text against a target Meaning and a reject Meaning."""
+
+    def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
+        """Store the pipeline, the two Meaning getters and the thresholds."""
+        self.nlp = nlp
+        self.get_target = get_target
+        self.get_reject = get_reject
+        self.min_prob = min_prob
+        self.min_match = min_match
+        self.max_reject = max_reject
+
+    def __call__(self, text):
+        """Return a dict of the text, its scores and is_rare/is_match/is_reject."""
+        tweet = self.nlp(text)
+        # Call each getter once per text; the results are reused below.
+        target = self.get_target()
+        reject = self.get_reject()
+
+        # An empty document has no tokens to average over.
+        n_tokens = len(tweet)
+        if n_tokens:
+            prob = sum(math.exp(w.prob) for w in tweet) / n_tokens
+        else:
+            prob = 0.0
+        meaning = Meaning.from_tokens(self.nlp, tweet)
+
+        match_score = meaning.similarity(target)
+        reject_score = meaning.similarity(reject)
+        # NOTE(review): prob is a mean of exp(w.prob) but main() defaults
+        # min_prob to -20, which looks like a log scale -- confirm the intent.
+        is_common = prob >= self.min_prob
+        return {
+            'text': tweet.string,
+            'prob': prob,
+            'match_score': match_score,
+            'reject_score': reject_score,
+            'is_rare': prob < self.min_prob,
+            'is_match': is_common and match_score >= self.min_match,
+            'is_reject': is_common and reject_score >= self.max_reject
+        }
+
+
+class Connection(TwythonStreamer):
+    """Twitter stream that runs `handler` on each tweet and shows it via `view`."""
+
+    def __init__(self, keys_dir, handler, view):
+        """Open the stream with the credentials stored under `keys_dir`."""
+        keys = Secrets(keys_dir)
+        TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
+        self.handler = handler
+        self.view = view
+
+    def on_success(self, data):
+        """Handle one stream message; messages without text are skipped."""
+        text = data.get('text', u'')
+        # Twython returns either bytes or unicode, depending on tweet.
+        # #APIshaming
+        if isinstance(text, bytes):
+            text = text.decode('utf8')
+        if not text:
+            return
+        self.view(self.handler(text), sys.stdout)
+
+    def on_error(self, status_code, data):
+        """Print the status code passed in by the streamer."""
+        print(status_code)
+
+
+class Secrets(object):
+    """Twitter API credentials, read from text files in one directory."""
+
+    def __init__(self, key_dir):
+        """Read key.txt, secret.txt, token.txt and token_secret.txt."""
+        self.key = self._read(key_dir, 'key.txt')
+        self.secret = self._read(key_dir, 'secret.txt')
+        self.token = self._read(key_dir, 'token.txt')
+        self.token_secret = self._read(key_dir, 'token_secret.txt')
+
+    @staticmethod
+    def _read(key_dir, filename):
+        """Return the stripped contents of `filename`, closing the file."""
+        with open(path.join(key_dir, filename)) as file_:
+            return file_.read().strip()
+
+
+def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
+    """Stream tweets tracking `term` and print those that match the target."""
+    # plac hands over unannotated command-line values as strings.
+    min_prob = float(min_prob)
+    min_match = float(min_match)
+    max_reject = float(max_reject)
+
+    # We don't need the parser for this demo, so may as well save the loading time
+    nlp = spacy.en.English(Parser=None)
+    # The term files are re-read on every call to the getters.
+    get_target = lambda: Meaning.from_path(nlp, target_loc)
+    get_reject = lambda: Meaning.from_path(nlp, reject_loc)
+    matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
+
+    twitter = Connection(keys_dir, matcher, print_colored)
+    twitter.statuses.filter(track=term)
+
+
+if __name__ == '__main__':
+    plac.call(main)