mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			85 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			85 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # -*- coding: utf-8 -*-
 | ||
| """Sphinx doctest is just too hard. Manually paste doctest examples here"""
 | ||
| from spacy.en.attrs import IS_LOWER
 | ||
| 
 | ||
| 
 | ||
def test_1():
    """Check the adverb upper-casing example and three vocabulary ``prob`` values.

    The text is processed with tagging enabled and parsing disabled; every
    token whose ``pos`` equals ``ADV`` is upper-cased and the pieces are
    joined back into one string, which is compared with the documented output.
    """
    import spacy.en
    from spacy.parts_of_speech import ADV
    # Load the pipeline, and call it with some text.
    nlp = spacy.en.English()
    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
                 tag=True, parse=False)
    o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
    # Compare the result itself: asserting the bare (non-empty) string literal
    # is always true and would never detect a regression.
    assert o == u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’"

    # Values pasted from the documentation example.
    o = nlp.vocab[u'back'].prob
    assert o == -7.403977394104004
    o = nlp.vocab[u'not'].prob
    assert o == -5.407193660736084
    o = nlp.vocab[u'quietly'].prob
    assert o == -11.07155704498291
 | ||
| 
 | ||
| 
 | ||
def test2():
    """Check the "rare adverb" upper-casing example and three vocabulary ``prob`` values.

    A token is upper-cased only when its ``pos`` equals ``ADV`` and its
    ``prob`` is below the 1000th-largest ``prob`` found in the vocabulary.
    """
    import spacy.en
    from spacy.parts_of_speech import ADV
    nlp = spacy.en.English()
    # Find log probability of Nth most frequent word
    probs = [lex.prob for lex in nlp.vocab]
    probs.sort()
    # After the ascending sort, probs[-1000] is the 1000th-largest value.
    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
    # The pasted doctest lines were bare expressions; without ``assert`` they
    # are evaluated and discarded, so the test could never fail.
    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'

    assert nlp.vocab[u'back'].prob == -7.403977394104004
    assert nlp.vocab[u'not'].prob == -5.407193660736084
    assert nlp.vocab[u'quietly'].prob == -11.07155704498291
 | ||
| 
 | ||
| 
 | ||
def test3():
    """Check the rare-adverb example and the words nearest to "pleaded" by cosine.

    The first part upper-cases adverbs whose ``prob`` is below the
    1000th-largest vocabulary ``prob``. The second part ranks every vocabulary
    entry that passes ``check(IS_LOWER)`` and has a ``repvec`` by cosine
    similarity to the ``repvec`` of token 7 ("pleaded"), then compares three
    slices of that ranking with the documented word lists.
    """
    import spacy.en
    from spacy.parts_of_speech import ADV
    from numpy import dot
    from numpy.linalg import norm

    nlp = spacy.en.English()

    # Find log probability of Nth most frequent word
    probs = sorted(lex.prob for lex in nlp.vocab)

    def is_adverb(tok):
        return tok.pos == ADV and tok.prob < probs[-1000]

    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
    pieces = []
    for tok in tokens:
        pieces.append(tok.string.upper() if is_adverb(tok) else tok.string)
    assert u''.join(pieces) == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'

    pleaded = tokens[7]
    assert pleaded.repvec.shape == (300,)
    assert sum(pleaded.repvec[:5]) != 0

    def cosine(v1, v2):
        return dot(v1, v2) / (norm(v1) * norm(v2))

    def similarity_to_pleaded(lexeme):
        return cosine(lexeme.repvec, pleaded.repvec)

    # Ascending sort followed by an in-place reverse (rather than
    # reverse=True) so that entries with equal similarity keep the same
    # relative order as before.
    words = sorted(
        (w for w in nlp.vocab if w.check(IS_LOWER) and w.has_repvec),
        key=similarity_to_pleaded,
    )
    words.reverse()

    top_20 = [w.orth_ for w in words[0:20]]
    assert top_20 == [
        u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
        u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
        u'countersued', u'remonstrated', u'begged', u'apologised',
        u'consented', u'acquiesced', u'petitioned', u'quarreled',
        u'appealed', u'pleading',
    ]

    ranks_50_to_59 = [w.orth_ for w in words[50:60]]
    assert ranks_50_to_59 == [
        u'counselled', u'bragged', u'backtracked', u'caucused', u'refiled',
        u'dueled', u'mused', u'dissented', u'yearned', u'confesses',
    ]

    ranks_100_to_109 = [w.orth_ for w in words[100:110]]
    assert ranks_100_to_109 == [
        u'cabled', u'ducked', u'sentenced', u'perjured', u'absconded',
        u'bargained', u'overstayed', u'clerked', u'confided', u'sympathizes',
    ]

    # Checks further down the ranking are currently disabled:
    #o = [w.orth_ for w in words[1000:1010]]
    #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
    #             u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
    #o = [w.orth_ for w in words[50000:50010]]
    #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
    #             u'dirty', u'rims', u'artists']
 |