* Add Displacy mixin. Needs to pull the data from the web

2025-07-15 10:42:34 +03:00 · 2015-08-14 18:59:08 +02:00 · 2015-08-14 18:59:08 +02:00 · 5ee645d742
commit 5ee645d742
parent 005074c31e
2 changed files with 159 additions and 0 deletions
--- a/docs/redesign/online_demo.jade
+++ b/docs/redesign/online_demo.jade
@ -0,0 +1,18 @@
 mixin Displacy(sentence, caption_text, height)
  //- Renders the displaCy demo iframe at the given height, a link that opens
  //- `sentence` in the hosted visualizer, and `caption_text` underneath.
  //- encodeURIComponent escapes every space and reserved character in the
  //- query value; String.replace(" ", "%20") only rewrote the first space.
  - var url = "http://ines.io/displacy/?full=" + encodeURIComponent(sentence)
  .displacy
    iframe.displacy(src="displacy/displacy_demo.html" height=height)
    a.view-displacy(href=url)
      | View on displaCy
    p.caption.
      #{caption_text}
 //- Demo invocation; arguments are the sentence, the caption text and the iframe height.
 +Displacy(
  "Click the button to see this sentence in displaCy.",
  "The best parse-tree visualizer and annotation tool in all the land.",
  275
 )
--- a/examples/twitter_filter.py
+++ b/examples/twitter_filter.py
@ -0,0 +1,141 @@
 from __future__ import unicode_literals, print_function
 import plac
 import codecs
 import sys
 import math
 import spacy.en
 from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
 from termcolor import colored
 from twython import TwythonStreamer
 from os import path
 from math import sqrt
 from numpy import dot
 from numpy.linalg import norm
 class Meaning(object):
    """Mean of a collection of vectors, with its norm cached.

    ``vector`` is the element-wise average of the input vectors, or ``None``
    when no vectors were given. ``norm`` is ``norm(vector)``, or ``0`` when
    no vectors were given.
    """

    def __init__(self, vectors):
        """Average ``vectors`` and remember the norm of the result."""
        if not vectors:
            # Nothing to average: record an "empty" meaning.
            self.vector = None
            self.norm = 0
            return
        self.vector = sum(vectors) / len(vectors)
        self.norm = norm(self.vector)

    @classmethod
    def from_path(cls, nlp, loc):
        """Build a Meaning from the whitespace-separated terms stored in ``loc``.

        The file is read as UTF-8 and the terms are looked up via ``from_terms``.
        """
        with codecs.open(loc, 'r', 'utf8') as handle:
            contents = handle.read()
        return cls.from_terms(nlp, contents.strip().split())

    @classmethod
    def from_tokens(cls, nlp, tokens):
        """Build a Meaning from the ``repvec`` of every item in ``tokens``.

        ``nlp`` is accepted for symmetry with the other constructors; it is
        not used here.
        """
        return cls([token.repvec for token in tokens])

    @classmethod
    def from_terms(cls, nlp, examples):
        """Build a Meaning from the ``repvec`` of ``nlp.vocab[term]`` for each term."""
        return cls([nlp.vocab[term].repvec for term in examples])

    def similarity(self, other):
        """Return the cosine similarity with ``other``.

        Returns ``-1`` when either meaning has a zero norm.
        """
        if self.norm and other.norm:
            return dot(self.vector, other.vector) / (self.norm * other.norm)
        return -1
 def print_colored(model, stream=sys.stdout):
    """Print the scores and text of a confident, unambiguous match.

    ``model`` is a dict providing the keys ``is_match``, ``is_reject``,
    ``is_rare``, ``match_score``, ``reject_score``, ``prob`` and ``text``.
    Nothing is printed unless the entry is a match, is not rare and is not
    a reject.

    NOTE: ``stream`` is kept for interface compatibility but is not used;
    output always goes through ``print`` to standard output.
    """
    if model['is_match']:
        color = 'green'
    elif model['is_reject']:
        color = 'red'
    else:
        color = 'grey'
    if model['is_rare'] or not model['is_match'] or model['is_reject']:
        return
    match_score = colored('%.3f' % model['match_score'], 'green')
    reject_score = colored('%.3f' % model['reject_score'], 'red')
    prob = '%.5f' % model['prob']
    print(match_score, reject_score, prob)
    # Apply the colour to the text itself; passing ``color`` as a second
    # print argument only emitted the colour's name after the text.
    print(colored(repr(model['text']), color))
    print('')
 class TextMatcher(object):
    """Callable that scores a text against a target and a reject meaning.

    ``get_target`` and ``get_reject`` are zero-argument callables returning
    the meanings to compare against; they are invoked on every call.
    ``min_prob``, ``min_match`` and ``max_reject`` are the thresholds used
    for the ``is_rare`` / ``is_match`` / ``is_reject`` flags.
    """

    def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
        self.nlp = nlp
        self.get_target = get_target
        self.get_reject = get_reject
        self.min_prob = min_prob
        self.min_match = min_match
        self.max_reject = max_reject

    def __call__(self, text):
        """Analyse ``text`` and return a dict of scores and flags.

        The result has the keys ``text``, ``prob``, ``match_score``,
        ``reject_score``, ``is_rare``, ``is_match`` and ``is_reject``.
        """
        tweet = self.nlp(text)
        # Fetch each reference meaning exactly once per text; the getters
        # used to be called twice with the first result thrown away.
        target = self.get_target()
        reject = self.get_reject()
        n_tokens = len(tweet)
        if n_tokens:
            prob = sum(math.exp(w.prob) for w in tweet) / n_tokens
        else:
            # An empty text produces no tokens; avoid dividing by zero.
            prob = 0.0
        # NOTE(review): prob is a mean of exp(w.prob) values, so it is never
        # negative; a negative min_prob (main defaults to -20) means is_rare
        # never fires. Confirm which scale min_prob is meant to use.
        meaning = Meaning.from_tokens(self.nlp, tweet)
        match_score = meaning.similarity(target)
        reject_score = meaning.similarity(reject)
        frequent_enough = prob >= self.min_prob
        return {
            'text': tweet.string,
            'prob': prob,
            'match_score': match_score,
            'reject_score': reject_score,
            'is_rare': prob < self.min_prob,
            'is_match': frequent_enough and match_score >= self.min_match,
            'is_reject': frequent_enough and reject_score >= self.max_reject
        }
 class Connection(TwythonStreamer):
    """Twitter stream that runs ``handler`` on each message and shows the result.

    ``keys_dir`` is the directory the API credentials are loaded from,
    ``handler`` maps a message text to a model, and ``view`` is called with
    that model and an output stream.
    """

    def __init__(self, keys_dir, handler, view):
        keys = Secrets(keys_dir)
        TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
        self.handler = handler
        self.view = view

    def on_success(self, data):
        """Score the text of an incoming message and hand it to the view."""
        text = data.get('text', u'')
        # Twython returns either bytes or unicode, depending on tweet.
        # Decode up front rather than retrying on TypeError: the retry also
        # swallowed unrelated TypeErrors raised inside the handler and ran
        # the handler a second time.
        if isinstance(text, bytes):
            text = text.decode('utf8')
        model = self.handler(text)
        # The view's second argument is an output stream, so pass stdout
        # (stdin was passed here before).
        self.view(model, sys.stdout)

    def on_error(self, status_code, data):
        """Report a stream error by printing its status code."""
        print(status_code)
 class Secrets(object):
    """Credentials loaded from text files inside ``key_dir``.

    Reads ``key.txt``, ``secret.txt``, ``token.txt`` and ``token_secret.txt``
    and stores their contents, stripped of surrounding whitespace, on the
    attributes ``key``, ``secret``, ``token`` and ``token_secret``.
    """

    def __init__(self, key_dir):
        self.key = self._read(key_dir, 'key.txt')
        self.secret = self._read(key_dir, 'secret.txt')
        self.token = self._read(key_dir, 'token.txt')
        self.token_secret = self._read(key_dir, 'token_secret.txt')

    @staticmethod
    def _read(key_dir, filename):
        """Return the stripped contents of ``filename`` inside ``key_dir``."""
        # The context manager closes each handle as soon as it has been read;
        # the bare open(...).read() calls never closed their files.
        with open(path.join(key_dir, filename)) as file_:
            return file_.read().strip()
 def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
    """Filter the Twitter stream for ``term`` and print matching tweets.

    ``keys_dir`` is the directory holding the API credential files.
    ``target_loc`` and ``reject_loc`` are files of whitespace-separated terms
    describing what to match and what to reject. ``min_prob``, ``min_match``
    and ``max_reject`` are the thresholds handed to ``TextMatcher``.
    """
    # No plac type annotations are declared, so thresholds given on the
    # command line arrive as strings; coerce them before they are compared
    # with floats. Numeric defaults pass through unchanged in value.
    min_prob = float(min_prob)
    min_match = float(min_match)
    max_reject = float(max_reject)

    # We don't need the parser for this demo, so may as well save the loading time
    nlp = spacy.en.English(Parser=None)

    def get_target():
        # Re-reads the term file each time it is called.
        return Meaning.from_path(nlp, target_loc)

    def get_reject():
        # Re-reads the term file each time it is called.
        return Meaning.from_path(nlp, reject_loc)

    matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
    twitter = Connection(keys_dir, matcher, print_colored)
    twitter.statuses.filter(track=term)
 # Script entry point: hand main to plac, which supplies its arguments from the command line.
 if __name__ == '__main__':
    plac.call(main)