* Add Displacy mixin. Needs to pull the data from the web

Matthew Honnibal 2015-08-14 18:59:08 +02:00
parent 005074c31e
commit 5ee645d742
2 changed files with 159 additions and 0 deletions


@@ -0,0 +1,18 @@
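//- Displacy mixin: embeds the static displaCy demo in an iframe and links to the live visualisation of the given sentence on ines.io.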
mixin Displacy(sentence, caption_text, height)
    - var url = "http://ines.io/displacy/?full=" + sentence.replace(/ /g, "%20")

    .displacy
        iframe.displacy(src="displacy/displacy_demo.html" height=height)

        a.view-displacy(href=url)
            | View on displaCy

        p.caption.
            #{caption_text}


+Displacy(
    "Click the button to see this sentence in displaCy.",
    "The best parse-tree visualizer and annotation tool in all the land.",
    275
)

examples/twitter_filter.py (new file, 141 additions)

@@ -0,0 +1,141 @@
from __future__ import unicode_literals, print_function
import plac
import codecs
import sys
import math
import spacy.en
from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
from termcolor import colored
from twython import TwythonStreamer
from os import path
from math import sqrt
from numpy import dot
from numpy.linalg import norm
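
# A Meaning is the average of a set of word vectors; two Meanings are compared by cosine similarity.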
class Meaning(object):
    def __init__(self, vectors):
        if vectors:
            self.vector = sum(vectors) / len(vectors)
            self.norm = norm(self.vector)
        else:
            self.vector = None
            self.norm = 0

    @classmethod
    def from_path(cls, nlp, loc):
        with codecs.open(loc, 'r', 'utf8') as file_:
            terms = file_.read().strip().split()
        return cls.from_terms(nlp, terms)

    @classmethod
    def from_tokens(cls, nlp, tokens):
        vectors = [t.repvec for t in tokens]
        return cls(vectors)

    @classmethod
    def from_terms(cls, nlp, examples):
        lexemes = [nlp.vocab[eg] for eg in examples]
        vectors = [eg.repvec for eg in lexemes]
        return cls(vectors)

    def similarity(self, other):
        if not self.norm or not other.norm:
            return -1
        return dot(self.vector, other.vector) / (self.norm * other.norm)
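
# Print a scored tweet to the given stream, using termcolor to highlight the match and reject scores.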
def print_colored(model, stream=sys.stdout):
    if model['is_match']:
        color = 'green'
    elif model['is_reject']:
        color = 'red'
    else:
        color = 'grey'

    if not model['is_rare'] and model['is_match'] and not model['is_reject']:
        match_score = colored('%.3f' % model['match_score'], 'green')
        reject_score = colored('%.3f' % model['reject_score'], 'red')
        prob = '%.5f' % model['prob']
        print(match_score, reject_score, prob, file=stream)
        print(colored(repr(model['text']), color), file=stream)
        print('', file=stream)
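
# TextMatcher scores incoming text against the target and reject term lists, and uses the
# average word log-probability to flag very rare (likely junk) text.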
class TextMatcher(object):
    def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
        self.nlp = nlp
        self.get_target = get_target
        self.get_reject = get_reject
        self.min_prob = min_prob
        self.min_match = min_match
        self.max_reject = max_reject

    def __call__(self, text):
        tweet = self.nlp(text)
        target_terms = self.get_target()
        reject_terms = self.get_reject()
        # Average word log-probability, on the same (log) scale as the min_prob threshold.
        prob = sum(w.prob for w in tweet) / len(tweet)
        meaning = Meaning.from_tokens(self.nlp, tweet)
        match_score = meaning.similarity(target_terms)
        reject_score = meaning.similarity(reject_terms)
        return {
            'text': tweet.string,
            'prob': prob,
            'match_score': match_score,
            'reject_score': reject_score,
            'is_rare': prob < self.min_prob,
            'is_match': prob >= self.min_prob and match_score >= self.min_match,
            'is_reject': prob >= self.min_prob and reject_score >= self.max_reject
        }
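
# Connection wraps TwythonStreamer: each incoming status is scored by the handler (TextMatcher)
# and passed to the view callback for display.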
class Connection(TwythonStreamer):
    def __init__(self, keys_dir, handler, view):
        keys = Secrets(keys_dir)
        TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
        self.handler = handler
        self.view = view

    def on_success(self, data):
        text = data.get('text', u'')
        # Twython returns either bytes or unicode, depending on tweet.
        # #APIshaming
        try:
            model = self.handler(text)
        except TypeError:
            model = self.handler(text.decode('utf8'))
        self.view(model, sys.stdout)

    def on_error(self, status_code, data):
        print(status_code)
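
# Secrets loads the four Twitter API credentials from plain-text files in the given key directory.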
class Secrets(object):
    def __init__(self, key_dir):
        self.key = open(path.join(key_dir, 'key.txt')).read().strip()
        self.secret = open(path.join(key_dir, 'secret.txt')).read().strip()
        self.token = open(path.join(key_dir, 'token.txt')).read().strip()
        self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip()
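
# Example invocation via plac's CLI mapping (paths and search term here are hypothetical):
#   python examples/twitter_filter.py ~/.twitter_keys nlp target_terms.txt reject_terms.txt
# target_terms.txt and reject_terms.txt each hold whitespace-separated example words.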
def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
    # We don't need the parser for this demo, so may as well save the loading time
    nlp = spacy.en.English(Parser=None)
    get_target = lambda: Meaning.from_path(nlp, target_loc)
    get_reject = lambda: Meaning.from_path(nlp, reject_loc)
    matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)

    twitter = Connection(keys_dir, matcher, print_colored)
    twitter.statuses.filter(track=term)


if __name__ == '__main__':
    plac.call(main)