mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Remove old unused files
This commit is contained in:
parent 8e962de39f
commit 3a9c6a9563
@@ -1,81 +0,0 @@
# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
import pytest


#@pytest.mark.models
#def test_1():
#    import spacy.en
#    from spacy.parts_of_speech import ADV
#    # Load the pipeline, and call it with some text.
#    nlp = spacy.en.English()
#    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
#                 tag=True, parse=False)
#    o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
#    assert o == u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’"
#
#    o = nlp.vocab[u'back'].prob
#    assert o == -7.033305644989014
#    o = nlp.vocab[u'not'].prob
#    assert o == -5.332601070404053
#    o = nlp.vocab[u'quietly'].prob
#    assert o == -11.994928359985352
#
#
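For reference, the adverb-uppercasing trick in test_1 maps onto the modern spaCy API roughly as below. This is a sketch only: it assumes an installed en_core_web_sm model and is not part of this commit.

# Sketch: modern-API analogue of test_1's ADV check; assumes the
# en_core_web_sm model is installed (not from this commit).
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")

# Uppercase every token the tagger marks as an adverb, preserving
# trailing whitespace like the old tok.string did.
out = "".join(
    tok.text_with_ws.upper() if tok.pos_ == "ADV" else tok.text_with_ws
    for tok in doc
)
print(out)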
#@pytest.mark.models
#def test2():
#    import spacy.en
#    from spacy.parts_of_speech import ADV
#    nlp = spacy.en.English()
#    # Find log probability of Nth most frequent word
#    probs = [lex.prob for lex in nlp.vocab]
#    probs.sort()
#    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
#    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
#    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
#    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
#
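The interesting part of test2 is the rarity threshold: a token counts as a rare adverb only if its log probability falls below that of the 1000th most frequent vocabulary entry. The same idea, API-free, with made-up toy numbers:

# Sketch of test2's rarity threshold: take the log probability of the
# N-th most frequent entry, then flag anything rarer than it.
def rarity_cutoff(log_probs, n=1000):
    return sorted(log_probs)[-n]  # n-th highest log probability

toy_probs = [-3.2, -4.4, -5.1, -7.8, -11.9]  # made-up log probabilities
cutoff = rarity_cutoff(toy_probs, n=2)       # -4.4
print([p for p in toy_probs if p < cutoff])  # [-5.1, -7.8, -11.9]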
#@pytest.mark.models
#def test3():
#    import spacy.en
#    from spacy.parts_of_speech import ADV
#    nlp = spacy.en.English()
#    # Find log probability of Nth most frequent word
#    probs = [lex.prob for lex in nlp.vocab]
#    probs.sort()
#    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
#    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
#    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
#    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
#
#    pleaded = tokens[7]
#    assert pleaded.repvec.shape == (300,)
#    o = pleaded.repvec[:5]
#    assert sum(o) != 0
#    from numpy import dot
#    from numpy.linalg import norm
#
#    cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
#    words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
#    words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
#    words.reverse()
#    o = [w.orth_ for w in words[0:20]]
#    assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
#                 u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
#                 u'countersued', u'remonstrated', u'begged', u'apologised',
#                 u'consented', u'acquiesced', u'petitioned', u'quarreled',
#                 u'appealed', u'pleading']
#    o = [w.orth_ for w in words[50:60]]
#    assert o == [u'martialed', u'counselled', u'bragged',
#                 u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
#                 u'dissented', u'yearned']
#    o = [w.orth_ for w in words[100:110]]
#    assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
#                 u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
#                 u'clerked']
#
#    #o = [w.orth_ for w in words[1000:1010]]
#    #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
#    #             u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
#    #o = [w.orth_ for w in words[50000:50010]]
#    #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
#    #             u'dirty', u'rims', u'artists']
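test3's ranking step is ordinary cosine similarity over word vectors. The same computation with toy numpy data standing in for the old .repvec attribute (nothing below is from the commit):

# Sketch of test3's similarity ranking, with random toy vectors in
# place of the old .repvec word vectors.
import numpy as np

def cosine(v1, v2):
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

rng = np.random.default_rng(0)
target = rng.standard_normal(300)  # stands in for pleaded.repvec
vocab = {w: rng.standard_normal(300) for w in ["pled", "plead", "puck"]}

# Most similar first, mirroring words.sort(...) followed by words.reverse().
ranked = sorted(vocab, key=lambda w: cosine(vocab[w], target), reverse=True)
print(ranked)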
@@ -1,82 +0,0 @@
#!/bin/sed -f

# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
# Yeah, sure.

# expected input: raw text with ONE SENTENCE TOKEN PER LINE

# by Robert MacIntyre, University of Pennsylvania, late 1995.

# If this wasn't such a trivial program, I'd include all that stuff about
# no warrantee, free use, etc. from the GNU General Public License.  If you
# want to be picky, assume that all of its terms apply.  Okay?

# attempt to get correct directional quotes
s=^"=`` =g
s=\([ ([{<]\)"=\1 `` =g
# close quotes handled at end
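The two quote rules above turn opening double quotes into `` and leave the rest for the close-quote rule further down. The same logic in Python, for illustration only:

# Sketch of the directional-quote rules s=^"=`` =g and
# s=\([ ([{<]\)"=\1 `` =g (closers become '' later in the script).
import re

def open_quotes(line):
    line = re.sub(r'^"', '`` ', line)               # quote at line start
    return re.sub(r'([ (\[{<])"', r'\1 `` ', line)  # quote after space/bracket

print(open_quotes('"Give it back," he said ("really").'))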
s=\.\.\.= ... =g
s=[,;:@#$%&]= & =g

# Assume sentence tokenization has been done first, so split FINAL periods
# only.
s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
# however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem
s=[?!]= & =g

# parentheses, brackets, etc.
s=[][(){}<>]= & =g
# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
# version of these symbols.
# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
# s/(/-LRB-/g
# s/)/-RRB-/g
# s/\[/-LSB-/g
# s/\]/-RSB-/g
# s/{/-LCB-/g
# s/}/-RCB-/g

s=--= -- =g

# NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since
# you might someday want to know how the words originally fit together --
# but it's too late to make a better system now, given the millions of
# words we've already done "wrong".

# First off, add a space to the beginning and end of each line, to reduce
# necessary number of regexps.
s=$= =
s=^= =

s="= '' =g
# possessive or close-single-quote
s=\([^']\)' =\1 ' =g
# as in it's, I'm, we'd
s='\([sSmMdD]\) = '\1 =g
s='ll = 'll =g
s='re = 're =g
s='ve = 've =g
s=n't = n't =g
s='LL = 'LL =g
s='RE = 'RE =g
s='VE = 'VE =g
s=N'T = N'T =g

s= \([Cc]\)annot = \1an not =g
s= \([Dd]\)'ye = \1' ye =g
s= \([Gg]\)imme = \1im me =g
s= \([Gg]\)onna = \1on na =g
s= \([Gg]\)otta = \1ot ta =g
s= \([Ll]\)emme = \1em me =g
s= \([Mm]\)ore'n = \1ore 'n =g
s= '\([Tt]\)is = '\1 is =g
s= '\([Tt]\)was = '\1 was =g
s= \([Ww]\)anna = \1an na =g
# s= \([Ww]\)haddya = \1ha dd ya =g
# s= \([Ww]\)hatcha = \1ha t cha =g

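The clitic and multiword rules above are straightforward to mirror. A short Python sketch of a few of them (illustrative, not exhaustive):

# Sketch of a few contraction rules from the script, padding the line
# with spaces first as the script does.
import re

def split_contractions(line):
    line = f" {line} "
    line = re.sub(r"n't ", " n't ", line)           # s=n't = n't =g
    line = re.sub(r"'(ll|re|ve) ", r" '\1 ", line)  # s='ll = 'll =g etc.
    line = re.sub(r"'([sSmMdD]) ", r" '\1 ", line)  # it's, I'm, we'd
    line = re.sub(r" ([Cc])annot ", r" \1an not ", line)
    return re.sub(r"  +", " ", line).strip()

print(split_contractions("I cannot believe it's done, don't worry"))
# -> I can not believe it 's done, do n't worry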
# clean out extra spaces
s=  *= =g
s=^ *==g
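The spaces padded onto each line mid-script are what let every later rule anchor on surrounding whitespace; these two cleanup rules then squeeze the padding back out. The whole pad-apply-squeeze pattern in one illustrative Python helper:

# Sketch of the script's pad-apply-squeeze pattern: pad with spaces so
# rules can anchor on " token ", then collapse the extra spaces.
import re

def pad_apply_squeeze(line, rules):
    line = f" {line} "
    for pattern, repl in rules:
        line = re.sub(pattern, repl, line)
    line = re.sub(r"  +", " ", line)  # s=  *= =g
    return line.strip()               # s=^ *==g plus trailing pad

print(pad_apply_squeeze("it's mine", [(r"'([sS]) ", r" '\1 ")]))
# -> it 's mine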