mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
Remove old unused files

This commit is contained in:
parent 8e962de39f
commit 3a9c6a9563
@@ -1,81 +0,0 @@
# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
import pytest


#@pytest.mark.models
#def test_1():
#    import spacy.en
#    from spacy.parts_of_speech import ADV
#    # Load the pipeline, and call it with some text.
#    nlp = spacy.en.English()
#    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
#                tag=True, parse=False)
#    o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
#    assert o == u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’"
#
#    o = nlp.vocab[u'back'].prob
#    assert o == -7.033305644989014
#    o = nlp.vocab[u'not'].prob
#    assert o == -5.332601070404053
#    o = nlp.vocab[u'quietly'].prob
#    assert o == -11.994928359985352
#
#
#@pytest.mark.models
#def test2():
#    import spacy.en
#    from spacy.parts_of_speech import ADV
#    nlp = spacy.en.English()
#    # Find log probability of Nth most frequent word
#    probs = [lex.prob for lex in nlp.vocab]
#    probs.sort()
#    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
#    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
#    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
#    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
#
#@pytest.mark.models
#def test3():
#    import spacy.en
#    from spacy.parts_of_speech import ADV
#    nlp = spacy.en.English()
#    # Find log probability of Nth most frequent word
#    probs = [lex.prob for lex in nlp.vocab]
#    probs.sort()
#    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
#    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
#    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
#    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
#
#    pleaded = tokens[7]
#    assert pleaded.repvec.shape == (300,)
#    o = pleaded.repvec[:5]
#    assert sum(o) != 0
#    from numpy import dot
#    from numpy.linalg import norm
#
#    cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
#    words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
#    words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
#    words.reverse()
#    o = [w.orth_ for w in words[0:20]]
#    assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
#                 u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
#                 u'countersued', u'remonstrated', u'begged', u'apologised',
#                 u'consented', u'acquiesced', u'petitioned', u'quarreled',
#                 u'appealed', u'pleading']
#    o = [w.orth_ for w in words[50:60]]
#    assert o == [u'martialed', u'counselled', u'bragged',
#                 u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
#                 u'dissented', u'yearned']
#    o = [w.orth_ for w in words[100:110]]
#    assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
#                 u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
#                 u'clerked']
#
#    #o = [w.orth_ for w in words[1000:1010]]
#    #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
#    #             u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
#    #o = [w.orth_ for w in words[50000:50010]]
#    #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
#    #             u'dirty', u'rims', u'artists']
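The removed examples target the long-gone 0.x API (spacy.en.English, tok.string, tok.repvec). For reference, a minimal sketch of the same adverb-uppercasing check against the modern spaCy API, assuming an installed en_core_web_sm model; Token.vector replaces repvec, and lexeme probabilities now require the optional lookups data, so they are omitted here:

import spacy

# assumes: pip install spacy && python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
doc = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")

# tok.string -> Token.text_with_ws; tok.pos == ADV -> Token.pos_ == "ADV"
o = "".join(t.text_with_ws.upper() if t.pos_ == "ADV" else t.text_with_ws
            for t in doc)
print(o)  # e.g. "‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’" (tags are model-dependent)

# tok.repvec -> Token.vector; the vector length depends on the model
pleaded = next(t for t in doc if t.text == "pleaded")
assert pleaded.vector.shape[0] > 0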
@@ -1,82 +0,0 @@
#!/bin/sed -f

# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
# Yeah, sure.

# expected input: raw text with ONE SENTENCE TOKEN PER LINE

# by Robert MacIntyre, University of Pennsylvania, late 1995.

# If this wasn't such a trivial program, I'd include all that stuff about
# no warrantee, free use, etc. from the GNU General Public License.  If you
# want to be picky, assume that all of its terms apply.  Okay?

# attempt to get correct directional quotes
s=^"=`` =g
s=\([ ([{<]\)"=\1 `` =g
# close quotes handled at end

s=\.\.\.= ... =g
s=[,;:@#$%&]= & =g

# Assume sentence tokenization has been done first, so split FINAL periods
# only.
s=\([^.]\)\([.]\)\([])}>"']*\)[ 	]*$=\1 \2\3 =g
# however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem
s=[?!]= & =g

# parentheses, brackets, etc.
s=[][(){}<>]= & =g
# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
# version of these symbols.
# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
# s/(/-LRB-/g
# s/)/-RRB-/g
# s/\[/-LSB-/g
# s/\]/-RSB-/g
# s/{/-LCB-/g
# s/}/-RCB-/g

s=--= -- =g

# NOTE THAT SPLIT WORDS ARE NOT MARKED.  Obviously this isn't great, since
# you might someday want to know how the words originally fit together --
# but it's too late to make a better system now, given the millions of
# words we've already done "wrong".

# First off, add a space to the beginning and end of each line, to reduce
# necessary number of regexps.
s=$= =
s=^= =

s="= '' =g
# possessive or close-single-quote
s=\([^']\)' =\1 ' =g
# as in it's, I'm, we'd
s='\([sSmMdD]\) = '\1 =g
s='ll = 'll =g
s='re = 're =g
s='ve = 've =g
s=n't = n't =g
s='LL = 'LL =g
s='RE = 'RE =g
s='VE = 'VE =g
s=N'T = N'T =g

s= \([Cc]\)annot = \1an not =g
s= \([Dd]\)'ye = \1' ye =g
s= \([Gg]\)imme = \1im me =g
s= \([Gg]\)onna = \1on na =g
s= \([Gg]\)otta = \1ot ta =g
s= \([Ll]\)emme = \1em me =g
s= \([Mm]\)ore'n = \1ore 'n =g
s= '\([Tt]\)is = '\1 is =g
s= '\([Tt]\)was = '\1 was =g
s= \([Ww]\)anna = \1an na =g
# s= \([Ww]\)haddya = \1ha dd ya =g
# s= \([Ww]\)hatcha = \1ha t cha =g

# clean out extra spaces
s=  *= =g
s=^ *==g
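The script above is Robert MacIntyre's original Penn Treebank tokenizer, and NLTK's TreebankWordTokenizer is a well-known Python port of the same rules, which makes for a convenient way to try them out. A minimal usage sketch, assuming NLTK is installed:

from nltk.tokenize import TreebankWordTokenizer

# NLTK reimplements the sed rules above in Python; as with the sed
# script, each call expects a single, already-split sentence.
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize('"Give it back," he pleaded, "it\'s mine."'))
# -> ['``', 'Give', 'it', 'back', ',', "''", 'he', 'pleaded', ',',
#     '``', 'it', "'s", 'mine', '.', "''"]

Directional quotes, clitics such as 's, and the sentence-final period come out split exactly as the sed rules specify.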