mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 20:28:20 +03:00
38 lines
1.1 KiB
Python
38 lines
1.1 KiB
Python
|
# encoding: utf8
|
||
|
from __future__ import unicode_literals, print_function
|
||
|
|
||
|
from math import sqrt
|
||
|
from numpy import dot
|
||
|
from numpy.linalg import norm
|
||
|
|
||
|
|
||
|
def handle_tweet(spacy, tweet_data, query):
    """Pull the text out of one tweet and hand it to ``match_tweet``.

    Args:
        spacy: NLP pipeline object, forwarded unchanged to ``match_tweet``.
        tweet_data: Mapping describing a single tweet. Only its ``'text'``
            entry is read; a missing or ``None`` entry is treated as an
            empty string.
        query: Search term, forwarded unchanged to ``match_tweet``.
    """
    text = tweet_data.get('text') or u''
    # Twython returns either bytes or unicode, depending on the tweet.
    # Normalise to unicode up front instead of wrapping the whole call in
    # ``except TypeError``: that handler also caught unrelated TypeErrors
    # raised while matching, and then failed on ``.decode`` for text that
    # was already unicode.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    match_tweet(spacy, text, query)
|
||
|
|
||
|
|
||
|
def match_tweet(spacy, text, query, threshold=0.5):
    """Print ``text`` if its words lean towards the accept terms.

    Every alphabetic token of the tweet (except the query word itself) is
    compared, via ``cos``, with the vectors of three "accept" words and three
    "reject" words. Negative similarities are clamped to zero. The tweet is
    printed when the accept share of the summed similarity reaches
    ``threshold``.

    Args:
        spacy: Callable that tokenises ``text``; its tokens must expose
            ``repvec``, ``is_alpha`` and ``lower_``, and ``spacy.vocab[word]``
            must expose ``repvec``.
        text: Unicode text of the tweet.
        query: Lower-cased search term; tokens equal to it are ignored.
        threshold: Minimum accept share ``y / (y + n)`` required to print.
            Defaults to the previously hard-coded 0.5.
    """
    def get_vector(word):
        return spacy.vocab[word].repvec

    tweet = [w.repvec for w in spacy(text)
             if w.is_alpha and w.lower_ != query]
    if not tweet:
        return

    # Build real lists: under Python 3 ``map`` yields a one-shot iterator,
    # which the nested loops below would exhaust after the first tweet word.
    accept = [get_vector(word) for word in 'child classroom teach'.split()]
    reject = [get_vector(word) for word in 'mouth hands giveaway'.split()]

    y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept)
    n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject)

    # With no positive similarity on either side there is no evidence to
    # accept the tweet; the guard also avoids dividing by zero. The former
    # ``or True`` override, which made the test always pass, is removed so
    # the filter actually filters.
    total = y + n
    if total > 0 and (y / total) >= threshold:
        print(text)
|
||
|
|
||
|
|
||
|
def cos(v1, v2):
    """Return the cosine similarity of vectors ``v1`` and ``v2``.

    Args:
        v1: First vector (any sequence accepted by ``numpy.dot``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either
        vector has zero norm.
    """
    denominator = norm(v1) * norm(v2)
    # A zero-length vector has no direction; report "no similarity" instead
    # of dividing by zero, which would produce NaN and a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator
|