mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Tests passing for new Word object version
This commit is contained in:
		
							parent
							
								
									9815c7649e
								
							
						
					
					
						commit
						3b793cf4f7
					
				
							
								
								
									
										146129
									
								
								data/en_ptb/case
									
									
									
									
									
								
							
							
						
						
									
										146129
									
								
								data/en_ptb/case
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										316709
									
								
								data/en_ptb/clusters
									
									
									
									
									
								
							
							
						
						
									
										316709
									
								
								data/en_ptb/clusters
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							|  | @ -1,104 +0,0 @@ | |||
| # https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions | ||||
| #  21:09, 25 June 2014 | ||||
| #*--*  -- | ||||
| #*---* --- | ||||
| #*'s  's | ||||
| 
 | ||||
| cannot  can not | ||||
| d'ye    d' ye | ||||
| gimme   gim me | ||||
| gonna   gon na | ||||
| lemme   lem me | ||||
| more'n  more 'n | ||||
| 'tis    't is | ||||
| 'twas   't was | ||||
| wanna   wan na | ||||
| whaddya wha dd ya | ||||
| whatcha wha t cha | ||||
| ain't   ai n't | ||||
| aren't  are n't | ||||
| can't   can n't | ||||
| could've    could 've | ||||
| couldn't    could n't | ||||
| couldn't've could n't 've | ||||
| didn't  did n't | ||||
| doesn't does n't | ||||
| don't   do n't | ||||
| hadn't  had n't | ||||
| hadn't've   had n't 've | ||||
| hasn't  has n't | ||||
| haven't have n't | ||||
| he'd    he 'd | ||||
| he'd've he 'd 've | ||||
| he'll   he 'll | ||||
| he's    he 's | ||||
| how'd   how 'd | ||||
| how'll  how 'll | ||||
| how's   how 's | ||||
| I'd I 'd | ||||
| I'd've  I 'd 've | ||||
| I'll    I 'll | ||||
| I'm I 'm | ||||
| I've    I 've | ||||
| isn't   is n't | ||||
| it'd    it 'd | ||||
| it'd've it 'd 've | ||||
| it'll   it 'll | ||||
| it's    it 's | ||||
| let's   let 's | ||||
| mightn't    might n't | ||||
| mightn't've might n't 've | ||||
| might've    might 've | ||||
| mustn't must n't | ||||
| must've must 've | ||||
| needn't need n't | ||||
| not've  not h've | ||||
| shan't  sha n't | ||||
| she'd   she 'd | ||||
| she'd've    she 'd 've | ||||
| she'll  she 'll | ||||
| she's   she 's | ||||
| should've   should 've | ||||
| shouldn't   should n't | ||||
| shouldn't've    should n't 've | ||||
| that's  that 's | ||||
| there'd there 'd | ||||
| there'd've  there 'd 've | ||||
| there's there 's | ||||
| they'd  they 'd | ||||
| they'd've   they 'd 've | ||||
| they'll they 'll | ||||
| they're they 're | ||||
| they've they 've | ||||
| wasn't  was n't | ||||
| we'd    we 'd | ||||
| we'd've we 'd 've | ||||
| we'll   we 'll | ||||
| we're   we 're | ||||
| we've   we 've | ||||
| weren't were n't | ||||
| what'll what 'll | ||||
| what're what 're | ||||
| what's  what 's | ||||
| what've what 've | ||||
| when's  when 's | ||||
| where'd where 'd | ||||
| where's where 's | ||||
| where've    where 've | ||||
| who'd   who 'd | ||||
| who'll  who 'll | ||||
| who're  who 're | ||||
| who's   who 's | ||||
| who've  who 've | ||||
| why'll  why 'll | ||||
| why're  why 're | ||||
| why's   why 's | ||||
| won't   will n't | ||||
| would've    would 've | ||||
| wouldn't    would n't | ||||
| wouldn't've would n't 've | ||||
| you'd   you 'd | ||||
| you'd've    you 'd 've | ||||
| you'll  you 'll | ||||
| you're  you 're | ||||
| you've  you 've | ||||
|  | @ -2,103 +2,4 @@ | |||
| #  21:09, 25 June 2014 | ||||
| #*--*  -- | ||||
| #*---* --- | ||||
| #*'s  's | ||||
| 
 | ||||
| cannot  can not | ||||
| d'ye    d' ye | ||||
| gimme   gim me | ||||
| gonna   gon na | ||||
| lemme   lem me | ||||
| more'n  more 'n | ||||
| 'tis    't is | ||||
| 'twas   't was | ||||
| wanna   wan na | ||||
| whaddya wha dd ya | ||||
| whatcha wha t cha | ||||
| ain't   ai n't | ||||
| aren't  are n't | ||||
| can't   can n't | ||||
| could've    could 've | ||||
| couldn't    could n't | ||||
| couldn't've could n't 've | ||||
| didn't  did n't | ||||
| doesn't does n't | ||||
| don't   do n't | ||||
| hadn't  had n't | ||||
| hadn't've   had n't 've | ||||
| hasn't  has n't | ||||
| haven't have n't | ||||
| he'd    he 'd | ||||
| he'd've he 'd 've | ||||
| he'll   he 'll | ||||
| he's    he 's | ||||
| how'd   how 'd | ||||
| how'll  how 'll | ||||
| how's   how 's | ||||
| I'd I 'd | ||||
| I'd've  I 'd 've | ||||
| I'll    I 'll | ||||
| I'm I 'm | ||||
| I've    I 've | ||||
| isn't   is n't | ||||
| it'd    it 'd | ||||
| it'd've it 'd 've | ||||
| it'll   it 'll | ||||
| it's    it 's | ||||
| let's   let 's | ||||
| mightn't    might n't | ||||
| mightn't've might n't 've | ||||
| might've    might 've | ||||
| mustn't must n't | ||||
| must've must 've | ||||
| needn't need n't | ||||
| not've  not h've | ||||
| shan't  sha n't | ||||
| she'd   she 'd | ||||
| she'd've    she 'd 've | ||||
| she'll  she 'll | ||||
| she's   she 's | ||||
| should've   should 've | ||||
| shouldn't   should n't | ||||
| shouldn't've    should n't 've | ||||
| that's  that 's | ||||
| there'd there 'd | ||||
| there'd've  there 'd 've | ||||
| there's there 's | ||||
| they'd  they 'd | ||||
| they'd've   they 'd 've | ||||
| they'll they 'll | ||||
| they're they 're | ||||
| they've they 've | ||||
| wasn't  was n't | ||||
| we'd    we 'd | ||||
| we'd've we 'd 've | ||||
| we'll   we 'll | ||||
| we're   we 're | ||||
| we've   we 've | ||||
| weren't were n't | ||||
| what'll what 'll | ||||
| what're what 're | ||||
| what's  what 's | ||||
| what've what 've | ||||
| when's  when 's | ||||
| where'd where 'd | ||||
| where's where 's | ||||
| where've    where 've | ||||
| who'd   who 'd | ||||
| who'll  who 'll | ||||
| who're  who 're | ||||
| who's   who 's | ||||
| who've  who 've | ||||
| why'll  why 'll | ||||
| why're  why 're | ||||
| why's   why 's | ||||
| won't   will n't | ||||
| would've    would 've | ||||
| wouldn't    would n't | ||||
| wouldn't've would n't 've | ||||
| you'd   you 'd | ||||
| you'd've    you 'd 've | ||||
| you'll  you 'll | ||||
| you're  you 're | ||||
| you've  you 've | ||||
|  |  | |||
|  | @ -11,8 +11,7 @@ spaCy NLP Tokenizer and Lexicon | |||
|      | ||||
|     guide/overview | ||||
|     guide/install | ||||
|     api/languages/index.rst | ||||
|     api/modules/index.rst | ||||
|     api/index.rst | ||||
| 
 | ||||
| Source (GitHub) | ||||
| ---------------- | ||||
|  |  | |||
							
								
								
									
										1
									
								
								fabfile.py
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								fabfile.py
									
									
									
									
										vendored
									
									
								
							|  | @ -9,7 +9,6 @@ def clean(): | |||
| def docs(): | ||||
|     with lcd('docs'): | ||||
|         local('sphinx-build -b html . ./_build') | ||||
|         local('open _build/index.html') | ||||
| 
 | ||||
| def test(): | ||||
|     local('py.test -x') | ||||
|  |  | |||
							
								
								
									
										7
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										7
									
								
								setup.py
									
									
									
									
									
								
							|  | @ -45,16 +45,13 @@ else: | |||
| 
 | ||||
| 
 | ||||
| exts = [ | ||||
|     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), | ||||
|     #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), | ||||
|     Extension("spacy.en", ["spacy/en.pyx"], language="c++", | ||||
|               include_dirs=includes), | ||||
|     Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes), | ||||
|     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), | ||||
|     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes), | ||||
|     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++", | ||||
|     Extension("spacy.word", ["spacy/word.pyx"], language="c++", | ||||
|               include_dirs=includes), | ||||
|     Extension("spacy.orthography.latin", ["spacy/orthography/latin.pyx"], language="c++", | ||||
|               include_dirs=includes) | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										20
									
								
								spacy/en.pyx
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								spacy/en.pyx
									
									
									
									
									
								
							|  | @ -5,17 +5,17 @@ scheme in several important respects: | |||
| 
 | ||||
| * Whitespace is added as tokens, except for single spaces. e.g., | ||||
| 
 | ||||
|     >>> tokenize(u'\\nHello  \\tThere').strings | ||||
|     >>> [w.string for w in tokenize(u'\\nHello  \\tThere')] | ||||
|     [u'\\n', u'Hello', u' ', u'\\t', u'There'] | ||||
| 
 | ||||
| * Contractions are normalized, e.g. | ||||
| 
 | ||||
|     >>> tokenize(u"isn't ain't won't he's").strings | ||||
|     >>> [w.string for w in tokenize(u"isn't ain't won't he's")] | ||||
|     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"] | ||||
|    | ||||
| * Hyphenated words are split, with the hyphen preserved, e.g.: | ||||
|      | ||||
|     >>> tokenize(u'New York-based').strings | ||||
|     >>> [w.string for w in tokenize(u'New York-based')] | ||||
|     [u'New', u'York', u'-', u'based'] | ||||
| 
 | ||||
| Other improvements: | ||||
|  | @ -39,25 +39,11 @@ from __future__ import unicode_literals | |||
| 
 | ||||
| from libc.stdlib cimport malloc, calloc, free | ||||
| from libc.stdint cimport uint64_t | ||||
| from libcpp.vector cimport vector | ||||
| 
 | ||||
| cimport spacy | ||||
| 
 | ||||
| 
 | ||||
| from spacy.orthography.latin cimport * | ||||
| 
 | ||||
| from .orthography.latin import * | ||||
| from .lexeme import * | ||||
| 
 | ||||
| 
 | ||||
| cdef class English(spacy.Language): | ||||
|     # How to ensure the order here aligns with orthography.latin? | ||||
|     view_funcs = [ | ||||
|         get_normalized, | ||||
|         get_word_shape, | ||||
|         get_last3 | ||||
|     ] | ||||
| 
 | ||||
|     cdef int find_split(self, unicode word): | ||||
|         cdef size_t length = len(word) | ||||
|         cdef int i = 0 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user