doctype html
html(lang='en')
  head
    meta(charset='utf-8')
    title spaCy Blog
    meta(name='description', content='')
    meta(name='author', content='Matthew Honnibal')
    link(rel='stylesheet', href='css/style.css')
    //if lt IE 9
      script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
  body#blog
    header(role='banner')
      h1.logo spaCy Blog
      .slogan Blog
    main#content(role='main')
      article.post
        header
          h2 Finding Relevant Tweets
          .subhead
            | by 
            a(href='#', rel='author') Matthew Honnibal
            |  on 
            time(datetime='2015-08-14') August 14, 2015
          
        details
          summary: h4 Imports
          pre.language-python: code

            | from __future__ import unicode_literals, print_function
            | import plac
            | import codecs
            | import sys
            | import math
            | 
            | import spacy.en
            | from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
            | 
            | from termcolor import colored
            | from twython import TwythonStreamer
            | 
            | from os import path
            | from math import sqrt
            | 
            | from numpy import dot
            | from numpy.linalg import norm
            | 
            | 

        details
          summary: h4 Simple vector-averaging similarity

          pre.language-python: code

            | class Meaning(object):
            |     """Averaged vector for a collection of tokens or terms.
            | 
            |     `vector` is the mean of the supplied vectors and `norm` is its
            |     length. When no vectors are supplied, `vector` is None and
            |     `norm` is 0.
            |     """
            |     def __init__(self, vectors):
            |         if not vectors:
            |             self.vector = None
            |             self.norm = 0
            |         else:
            |             self.vector = sum(vectors) / len(vectors)
            |             self.norm = norm(self.vector)
            | 
            |     @classmethod
            |     def from_path(cls, nlp, loc):
            |         """Build a Meaning from the whitespace-separated terms in a UTF-8 file."""
            |         with codecs.open(loc, 'r', 'utf8') as term_file:
            |             contents = term_file.read()
            |         return cls.from_terms(nlp, contents.strip().split())
            | 
            |     @classmethod
            |     def from_tokens(cls, nlp, tokens):
            |         """Build a Meaning from the `repvec` of each token."""
            |         return cls([token.repvec for token in tokens])
            | 
            |     @classmethod
            |     def from_terms(cls, nlp, examples):
            |         """Build a Meaning by looking each term up in `nlp.vocab`."""
            |         return cls([nlp.vocab[term].repvec for term in examples])
            | 
            |     def similarity(self, other):
            |         """Return the cosine of the two vectors, or -1 if either norm is zero."""
            |         if self.norm and other.norm:
            |             return dot(self.vector, other.vector) / (self.norm * other.norm)
            |         return -1
            | 

        details
          summary: h4 Print matches
              
          pre.language-python: code

            | 
            | def print_colored(model, stream=sys.stdout):
            |     """Print the scores and text of a confidently matched tweet.
            | 
            |     `model` is a dict with the keys 'is_match', 'is_reject', 'is_rare',
            |     'match_score', 'reject_score', 'prob' and 'text'. Nothing is
            |     printed unless the tweet is a match that is neither rare nor
            |     rejected.
            | 
            |     Output always goes to standard output; `stream` is accepted but
            |     not used.
            |     """
            |     if model['is_match']:
            |         color = 'green'
            |     elif model['is_reject']:
            |         color = 'red'
            |     else:
            |         color = 'grey'
            | 
            |     if model['is_rare'] or not model['is_match'] or model['is_reject']:
            |         return
            | 
            |     match_score = colored('%.3f' % model['match_score'], 'green')
            |     reject_score = colored('%.3f' % model['reject_score'], 'red')
            |     prob = '%.5f' % model['prob']
            | 
            |     print(match_score, reject_score, prob)
            |     # Colour the tweet itself rather than printing the colour's name after it.
            |     print(colored(repr(model['text']), color))
            |     print('')
            | 
            | 

        details
          summary: h4 TextMatcher: Process the tweets using spaCy

          pre.language-python: code

            | class TextMatcher(object):
            |     """Score a piece of text against a target and a reject Meaning.
            | 
            |     `get_target` and `get_reject` are zero-argument callables that
            |     return the Meaning to compare with; they are called once per text.
            |     `min_prob`, `min_match` and `max_reject` are the thresholds used
            |     for the 'is_rare', 'is_match' and 'is_reject' flags.
            |     """
            |     def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
            |         self.nlp = nlp
            |         self.get_target = get_target
            |         self.get_reject = get_reject
            |         self.min_prob = min_prob
            |         self.min_match = min_match
            |         self.max_reject = max_reject
            | 
            |     def __call__(self, text):
            |         """Return a dict of scores and flags describing `text`."""
            |         tweet = self.nlp(text)
            |         # Fetch each Meaning once and reuse it for the comparison below.
            |         target = self.get_target()
            |         reject = self.get_reject()
            | 
            |         # `prob` is the mean of exp(w.prob) over the tokens. Text with no
            |         # tokens would divide by zero, so it scores 0.0 instead.
            |         if len(tweet):
            |             prob = sum(math.exp(w.prob) for w in tweet) / len(tweet)
            |         else:
            |             prob = 0.0
            |         meaning = Meaning.from_tokens(self.nlp, tweet)
            | 
            |         match_score = meaning.similarity(target)
            |         reject_score = meaning.similarity(reject)
            |         return {
            |             'text': tweet.string,
            |             'prob': prob,
            |             'match_score': match_score,
            |             'reject_score': reject_score,
            |             'is_rare': prob < self.min_prob,
            |             'is_match': prob >= self.min_prob and match_score >= self.min_match,
            |             'is_reject': prob >= self.min_prob and reject_score >= self.max_reject
            |         }
            | 
            | 

        details
          summary: h4 Connect to Twitter and stream tweets

          pre.language-python: code

            | class Connection(TwythonStreamer):
            |     """Twitter stream that passes each message to `handler` and its result to `view`.
            | 
            |     Credentials are loaded from `keys_dir` via Secrets.
            |     """
            |     def __init__(self, keys_dir, handler, view):
            |         keys = Secrets(keys_dir)
            |         TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
            |         self.handler = handler
            |         self.view = view
            | 
            |     def on_success(self, data):
            |         """Handle one stream message; messages without 'text' are treated as empty."""
            |         text = data.get('text', u'')
            |         # Twython returns either bytes or unicode, depending on tweet.
            |         # #APIshaming
            |         try:
            |             model = self.handler(text)
            |         except TypeError:
            |             model = self.handler(text.decode('utf8'))
            |         # The view's second argument is its output stream, so pass a writable one.
            |         self.view(model, sys.stdout)
            | 
            |     def on_error(self, status_code, data):
            |         """Print the status code of a failed request."""
            |         print(status_code)
            | 
            | 
            | class Secrets(object):
            |     """Twitter API credentials read from text files in `key_dir`.
            | 
            |     Reads key.txt, secret.txt, token.txt and token_secret.txt, and
            |     strips surrounding whitespace from each.
            |     """
            |     def __init__(self, key_dir):
            |         self.key = self._read(key_dir, 'key.txt')
            |         self.secret = self._read(key_dir, 'secret.txt')
            |         self.token = self._read(key_dir, 'token.txt')
            |         self.token_secret = self._read(key_dir, 'token_secret.txt')
            | 
            |     @staticmethod
            |     def _read(key_dir, filename):
            |         """Return the stripped contents of `filename` inside `key_dir`."""
            |         # Use a context manager so the file is closed as soon as it is read.
            |         with open(path.join(key_dir, filename)) as file_:
            |             return file_.read().strip()
            | 
            | 

        details
          summary: h4 Command-line interface

          pre.language-python: code

            | def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
            |     """Stream tweets that track `term` and print the ones that match.
            | 
            |     `keys_dir` holds the Twitter credential files. `target_loc` and
            |     `reject_loc` are files of whitespace-separated terms; they are
            |     re-read for every tweet. The three thresholds are passed to
            |     TextMatcher.
            |     """
            |     # Thresholds supplied on the command line may arrive as strings, but
            |     # they are compared with floats, so convert them up front.
            |     min_prob = float(min_prob)
            |     min_match = float(min_match)
            |     max_reject = float(max_reject)
            | 
            |     # We don't need the parser for this demo, so may as well save the loading time
            |     nlp = spacy.en.English(Parser=None)
            |     get_target = lambda: Meaning.from_path(nlp, target_loc)
            |     get_reject = lambda: Meaning.from_path(nlp, reject_loc)
            |     matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
            | 
            |     twitter = Connection(keys_dir, matcher, print_colored)
            |     twitter.statuses.filter(track=term)
            | 
            | 
            | if __name__ == '__main__':
            |     plac.call(main)
            |   

    footer(role='contentinfo')
    script(src='js/prism.js')