mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 07:27:28 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			38 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			38 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # encoding: utf8
 | |
| from __future__ import unicode_literals, print_function
 | |
| 
 | |
| from math import sqrt
 | |
| from numpy import dot
 | |
| from numpy.linalg import norm
 | |
| 
 | |
| 
 | |
def handle_tweet(spacy, tweet_data, query):
    """Extract the text of one tweet and hand it to ``match_tweet``.

    Args:
        spacy: Passed through unchanged to ``match_tweet``.
        tweet_data: Mapping describing a tweet. Only its ``'text'`` entry is
            read; a missing entry is treated as an empty string.
        query: Passed through unchanged to ``match_tweet``.

    Raises:
        UnicodeDecodeError: If the text arrives as bytes that are not valid
            UTF-8.
    """
    text = tweet_data.get('text', u'')
    # Twython returns either bytes or unicode, depending on tweet.
    # ಠ_ಠ #APIshaming
    # Decode explicitly instead of catching TypeError around the whole match:
    # a blanket except would also swallow unrelated TypeErrors raised inside
    # the pipeline and then retry with .decode() on text that is already
    # unicode, hiding the real failure.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    match_tweet(spacy, text, query)
 | |
| 
 | |
| 
 | |
def match_tweet(spacy, text, query):
    """Print ``text`` if its words resemble the accept terms more than the reject terms.

    Every alphabetic token of ``text`` other than ``query`` contributes its
    ``repvec``. Each such vector is compared (cosine similarity, negative
    values clipped to 0) against the vectors of a fixed list of accept words
    and a fixed list of reject words. The tweet is printed when the accept
    side accounts for at least half of the total similarity.

    Args:
        spacy: Callable that turns ``text`` into an iterable of tokens exposing
            ``repvec``, ``is_alpha`` and ``lower_``; it must also provide
            ``vocab[word].repvec`` for word lookup.
        text: Tweet text to score.
        query: Search term; tokens whose ``lower_`` equals it are ignored.

    Returns:
        None. The only effect is printing ``text`` on a match.
    """
    accept_terms = ('child', 'classroom', 'teach')
    reject_terms = ('mouth', 'hands', 'giveaway')

    token_vectors = [
        token.repvec
        for token in spacy(text)
        if token.is_alpha and token.lower_ != query
    ]
    if not token_vectors:
        return

    # Build real lists: on Python 3 map() yields a one-shot iterator, which
    # would be exhausted after the first token vector in the nested loops
    # below, silently ignoring every later token.
    accept = [spacy.vocab[term].repvec for term in accept_terms]
    reject = [spacy.vocab[term].repvec for term in reject_terms]

    y = sum(max(cos(w1, w2), 0) for w1 in token_vectors for w2 in accept)
    n = sum(max(cos(w1, w2), 0) for w1 in token_vectors for w2 in reject)

    # With no positive similarity on either side the ratio is undefined, so
    # treat it as "no match" rather than dividing by zero. The comparison is
    # also False for NaN totals, so those are skipped too.
    total = y + n
    if total > 0 and y / total >= 0.5:
        print(text)
 | |
| 
 | |
| 
 | |
def cos(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: First vector (anything ``numpy.dot`` / ``numpy.linalg.norm`` accept).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero length.
    """
    denominator = norm(v1) * norm(v2)
    # A zero-length vector has no direction; report "no similarity" instead of
    # dividing 0 by 0, which would produce NaN and a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator
 |