doctype html
//-
  Blog post: "Finding Relevant Tweets".
  The Python demo is embedded as piped text inside collapsible <details>
  sections; each code block is wrapped in pre.language-python > code and
  js/prism.js is loaded at the end of the body.
html(lang='en')
  head
    meta(charset='utf-8')
    title spaCy Blog
    meta(name='description', content='')
    meta(name='author', content='Matthew Honnibal')
    link(rel='stylesheet', href='css/style.css')
    //if lt IE 9
      script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
  body#blog
    header(role='banner')
      h1.logo spaCy Blog
      .slogan Blog
    main#content(role='main')
      article.post
        header
          h2 Finding Relevant Tweets
          .subhead
            | by
            a(href='#', rel='author') Matthew Honnibal
            | on
            //- NOTE(review): the datetime attribute says 2015-08-14 but the visible text says "December" -- confirm which is correct.
            time(datetime='2015-08-14') December
        details
          summary: h4 Imports
          pre.language-python: code
            | from __future__ import unicode_literals, print_function
            | import plac
            | import codecs
            | import sys
            | import math
            |
            | import spacy.en
            | from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
            |
            | from termcolor import colored
            | from twython import TwythonStreamer
            |
            | from os import path
            | from math import sqrt
            |
            | from numpy import dot
            | from numpy.linalg import norm
            |
            |
        details
          summary: h4 Simple vector-averaging similarity
          pre.language-python: code
            | class Meaning(object):
            |     """The average of a list of vectors, compared by cosine similarity."""
            |     def __init__(self, vectors):
            |         if vectors:
            |             self.vector = sum(vectors) / len(vectors)
            |             self.norm = norm(self.vector)
            |         else:
            |             # No vectors to average: similarity() reports -1 for this case.
            |             self.vector = None
            |             self.norm = 0
            |
            |     @classmethod
            |     def from_path(cls, nlp, loc):
            |         """Build a Meaning from a UTF-8 file of whitespace-separated terms."""
            |         with codecs.open(loc, 'r', 'utf8') as file_:
            |             terms = file_.read().strip().split()
            |         return cls.from_terms(nlp, terms)
            |
            |     @classmethod
            |     def from_tokens(cls, nlp, tokens):
            |         """Build a Meaning from the `repvec` of each token."""
            |         vectors = [t.repvec for t in tokens]
            |         return cls(vectors)
            |
            |     @classmethod
            |     def from_terms(cls, nlp, examples):
            |         """Build a Meaning by looking each term up in `nlp.vocab`."""
            |         lexemes = [nlp.vocab[eg] for eg in examples]
            |         vectors = [eg.repvec for eg in lexemes]
            |         return cls(vectors)
            |
            |     def similarity(self, other):
            |         """Return the cosine of the two vectors, or -1 if either norm is zero."""
            |         if not self.norm or not other.norm:
            |             return -1
            |         return dot(self.vector, other.vector) / (self.norm * other.norm)
            |
        details
          summary: h4 Print matches
          pre.language-python: code
            |
            | def print_colored(model, stream=sys.stdout):
            |     """Write a tweet and its scores to `stream`.
            |
            |     Only tweets that match and are neither rare nor rejected are written.
            |     """
            |     if model['is_match']:
            |         color = 'green'
            |     elif model['is_reject']:
            |         color = 'red'
            |     else:
            |         color = 'grey'
            |
            |     if not model['is_rare'] and model['is_match'] and not model['is_reject']:
            |         match_score = colored('%.3f' % model['match_score'], 'green')
            |         reject_score = colored('%.3f' % model['reject_score'], 'red')
            |         prob = '%.5f' % model['prob']
            |
            |         print(match_score, reject_score, prob, file=stream)
            |         print(colored(repr(model['text']), color), file=stream)
            |         print('', file=stream)
            |
            |
        //-
          NOTE(review): 'prob' below is a mean of math.exp(...) values, so it is
          never negative, while main() defaults min_prob to -20. With the default,
          'is_rare' is therefore always False -- confirm which scale is intended.
        details
          summary: h4 TextMatcher: Process the tweets using spaCy
          pre.language-python: code
            | class TextMatcher(object):
            |     """Score a tweet against the target and reject term sets."""
            |     def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
            |         self.nlp = nlp
            |         self.get_target = get_target
            |         self.get_reject = get_reject
            |         self.min_prob = min_prob
            |         self.min_match = min_match
            |         self.max_reject = max_reject
            |
            |     def __call__(self, text):
            |         tweet = self.nlp(text)
            |         # Fetch each term set once per tweet.
            |         target = self.get_target()
            |         reject = self.get_reject()
            |
            |         # An empty document has no tokens to average over.
            |         n_tokens = len(tweet)
            |         if n_tokens:
            |             prob = sum(math.exp(w.prob) for w in tweet) / n_tokens
            |         else:
            |             prob = 0.0
            |         meaning = Meaning.from_tokens(self.nlp, tweet)
            |
            |         match_score = meaning.similarity(target)
            |         reject_score = meaning.similarity(reject)
            |         return {
            |             'text': tweet.string,
            |             'prob': prob,
            |             'match_score': match_score,
            |             'reject_score': reject_score,
            |             'is_rare': prob < self.min_prob,
            |             'is_match': prob >= self.min_prob and match_score >= self.min_match,
            |             'is_reject': prob >= self.min_prob and reject_score >= self.max_reject
            |         }
            |
            |
        details
          summary: h4 Connect to Twitter and stream tweets
          pre.language-python: code
            | class Connection(TwythonStreamer):
            |     """Pass each streamed tweet through `handler`, then hand the result to `view`."""
            |     def __init__(self, keys_dir, handler, view):
            |         keys = Secrets(keys_dir)
            |         TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
            |         self.handler = handler
            |         self.view = view
            |
            |     def on_success(self, data):
            |         text = data.get('text', u'')
            |         # Messages with no text have nothing to score.
            |         if not text:
            |             return
            |         # Twython returns either bytes or unicode, depending on tweet.
            |         # #APIshaming
            |         if isinstance(text, bytes):
            |             text = text.decode('utf8')
            |         model = self.handler(text)
            |         self.view(model, sys.stdout)
            |
            |     def on_error(self, status_code, data):
            |         print(status_code)
            |
            |
            | class Secrets(object):
            |     """Twitter API credentials, one value per file inside `key_dir`."""
            |     def __init__(self, key_dir):
            |         self.key = self._read(key_dir, 'key.txt')
            |         self.secret = self._read(key_dir, 'secret.txt')
            |         self.token = self._read(key_dir, 'token.txt')
            |         self.token_secret = self._read(key_dir, 'token_secret.txt')
            |
            |     @staticmethod
            |     def _read(key_dir, filename):
            |         # Close the file as soon as its contents have been read.
            |         with open(path.join(key_dir, filename)) as file_:
            |             return file_.read().strip()
            |
            |
        details
          summary: h4 Command-line interface
          pre.language-python: code
            | def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
            |     # Values given on the command line arrive as strings, so make the thresholds numeric
            |     min_prob = float(min_prob)
            |     min_match = float(min_match)
            |     max_reject = float(max_reject)
            |     # We don't need the parser for this demo, so may as well save the loading time
            |     nlp = spacy.en.English(Parser=None)
            |     # The term files are re-read on every call to these lambdas
            |     get_target = lambda: Meaning.from_path(nlp, target_loc)
            |     get_reject = lambda: Meaning.from_path(nlp, reject_loc)
            |     matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
            |
            |     twitter = Connection(keys_dir, matcher, print_colored)
            |     twitter.statuses.filter(track=term)
            |
            |
            | if __name__ == '__main__':
            |     plac.call(main)
            |
    footer(role='contentinfo')
    script(src='js/prism.js')