mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* Tests passing for new Word object version
This commit is contained in:
		
							parent
							
								
									9815c7649e
								
							
						
					
					
						commit
						3b793cf4f7
					
				
							
								
								
									
										146129
									
								
								data/en_ptb/case
									
									
									
									
									
								
							
							
						
						
									
										146129
									
								
								data/en_ptb/case
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										316709
									
								
								data/en_ptb/clusters
									
									
									
									
									
								
							
							
						
						
									
										316709
									
								
								data/en_ptb/clusters
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							|  | @ -1,104 +0,0 @@ | ||||||
| # https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions |  | ||||||
| #  21:09, 25 June 2014 |  | ||||||
| #*--*  -- |  | ||||||
| #*---* --- |  | ||||||
| #*'s  's |  | ||||||
| 
 |  | ||||||
| cannot  can not |  | ||||||
| d'ye    d' ye |  | ||||||
| gimme   gim me |  | ||||||
| gonna   gon na |  | ||||||
| lemme   lem me |  | ||||||
| more'n  more 'n |  | ||||||
| 'tis    't is |  | ||||||
| 'twas   't was |  | ||||||
| wanna   wan na |  | ||||||
| whaddya wha dd ya |  | ||||||
| whatcha wha t cha |  | ||||||
| ain't   ai n't |  | ||||||
| aren't  are n't |  | ||||||
| can't   can n't |  | ||||||
| could've    could 've |  | ||||||
| couldn't    could n't |  | ||||||
| couldn't've could n't 've |  | ||||||
| didn't  did n't |  | ||||||
| doesn't does n't |  | ||||||
| don't   do n't |  | ||||||
| hadn't  had n't |  | ||||||
| hadn't've   had n't 've |  | ||||||
| hasn't  has n't |  | ||||||
| haven't have n't |  | ||||||
| he'd    he 'd |  | ||||||
| he'd've he 'd 've |  | ||||||
| he'll   he 'll |  | ||||||
| he's    he 's |  | ||||||
| how'd   how 'd |  | ||||||
| how'll  how 'll |  | ||||||
| how's   how 's |  | ||||||
| I'd I 'd |  | ||||||
| I'd've  I 'd 've |  | ||||||
| I'll    I 'll |  | ||||||
| I'm I 'm |  | ||||||
| I've    I 've |  | ||||||
| isn't   is n't |  | ||||||
| it'd    it 'd |  | ||||||
| it'd've it 'd 've |  | ||||||
| it'll   it 'll |  | ||||||
| it's    it 's |  | ||||||
| let's   let 's |  | ||||||
| mightn't    might n't |  | ||||||
| mightn't've might n't 've |  | ||||||
| might've    might 've |  | ||||||
| mustn't must n't |  | ||||||
| must've must 've |  | ||||||
| needn't need n't |  | ||||||
| not've  not h've |  | ||||||
| shan't  sha n't |  | ||||||
| she'd   she 'd |  | ||||||
| she'd've    she 'd 've |  | ||||||
| she'll  she 'll |  | ||||||
| she's   she 's |  | ||||||
| should've   should 've |  | ||||||
| shouldn't   should n't |  | ||||||
| shouldn't've    should n't 've |  | ||||||
| that's  that 's |  | ||||||
| there'd there 'd |  | ||||||
| there'd've  there 'd 've |  | ||||||
| there's there 's |  | ||||||
| they'd  they 'd |  | ||||||
| they'd've   they 'd 've |  | ||||||
| they'll they 'll |  | ||||||
| they're they 're |  | ||||||
| they've they 've |  | ||||||
| wasn't  was n't |  | ||||||
| we'd    we 'd |  | ||||||
| we'd've we 'd h've |  | ||||||
| we'll   we 'll |  | ||||||
| we're   we 're |  | ||||||
| we've   we h've |  | ||||||
| weren't were n't |  | ||||||
| what'll what 'll |  | ||||||
| what're what 're |  | ||||||
| what's  what 's |  | ||||||
| what've what 've |  | ||||||
| when's  when 's |  | ||||||
| where'd where 'd |  | ||||||
| where's where 's |  | ||||||
| where've    where 've |  | ||||||
| who'd   who 'd |  | ||||||
| who'll  who 'll |  | ||||||
| who're  who 're |  | ||||||
| who's   who 's |  | ||||||
| who've  who 've |  | ||||||
| why'll  why 'll |  | ||||||
| why're  why 're |  | ||||||
| why's   why 's |  | ||||||
| won't   will n't |  | ||||||
| would've    would 've |  | ||||||
| wouldn't    would n't |  | ||||||
| wouldn't've would n't 've |  | ||||||
| you'd   you 'd |  | ||||||
| you'd've    you 'd 've |  | ||||||
| you'll  you 'll |  | ||||||
| you're  you 're |  | ||||||
| you've  you 've |  | ||||||
|  | @ -2,103 +2,4 @@ | ||||||
| #  21:09, 25 June 2014 | #  21:09, 25 June 2014 | ||||||
| #*--*  -- | #*--*  -- | ||||||
| #*---* --- | #*---* --- | ||||||
| #*'s  's |  | ||||||
| 
 | 
 | ||||||
| cannot  can not |  | ||||||
| d'ye    d' ye |  | ||||||
| gimme   gim me |  | ||||||
| gonna   gon na |  | ||||||
| lemme   lem me |  | ||||||
| more'n  more 'n |  | ||||||
| 'tis    't is |  | ||||||
| 'twas   't was |  | ||||||
| wanna   wan na |  | ||||||
| whaddya wha dd ya |  | ||||||
| whatcha wha t cha |  | ||||||
| ain't   ai n't |  | ||||||
| aren't  are n't |  | ||||||
| can't   can n't |  | ||||||
| could've    could 've |  | ||||||
| couldn't    could n't |  | ||||||
| couldn't've could n't 've |  | ||||||
| didn't  did n't |  | ||||||
| doesn't does n't |  | ||||||
| don't   do n't |  | ||||||
| hadn't  had n't |  | ||||||
| hadn't've   had n't 've |  | ||||||
| hasn't  has n't |  | ||||||
| haven't have n't |  | ||||||
| he'd    he 'd |  | ||||||
| he'd've he 'd 've |  | ||||||
| he'll   he 'll |  | ||||||
| he's    he 's |  | ||||||
| how'd   how 'd |  | ||||||
| how'll  how 'll |  | ||||||
| how's   how 's |  | ||||||
| I'd I 'd |  | ||||||
| I'd've  I 'd 've |  | ||||||
| I'll    I 'll |  | ||||||
| I'm I 'm |  | ||||||
| I've    I 've |  | ||||||
| isn't   is n't |  | ||||||
| it'd    it 'd |  | ||||||
| it'd've it 'd 've |  | ||||||
| it'll   it 'll |  | ||||||
| it's    it 's |  | ||||||
| let's   let 's |  | ||||||
| mightn't    might n't |  | ||||||
| mightn't've might n't 've |  | ||||||
| might've    might 've |  | ||||||
| mustn't must n't |  | ||||||
| must've must 've |  | ||||||
| needn't need n't |  | ||||||
| not've  not h've |  | ||||||
| shan't  sha n't |  | ||||||
| she'd   she 'd |  | ||||||
| she'd've    she 'd 've |  | ||||||
| she'll  she 'll |  | ||||||
| she's   she 's |  | ||||||
| should've   should 've |  | ||||||
| shouldn't   should n't |  | ||||||
| shouldn't've    should n't 've |  | ||||||
| that's  that 's |  | ||||||
| there'd there 'd |  | ||||||
| there'd've  there 'd 've |  | ||||||
| there's there 's |  | ||||||
| they'd  they 'd |  | ||||||
| they'd've   they 'd 've |  | ||||||
| they'll they 'll |  | ||||||
| they're they 're |  | ||||||
| they've they 've |  | ||||||
| wasn't  was n't |  | ||||||
| we'd    we 'd |  | ||||||
| we'd've we 'd h've |  | ||||||
| we'll   we 'll |  | ||||||
| we're   we 're |  | ||||||
| we've   we h've |  | ||||||
| weren't were n't |  | ||||||
| what'll what 'll |  | ||||||
| what're what 're |  | ||||||
| what's  what 's |  | ||||||
| what've what 've |  | ||||||
| when's  when 's |  | ||||||
| where'd where 'd |  | ||||||
| where's where 's |  | ||||||
| where've    where 've |  | ||||||
| who'd   who 'd |  | ||||||
| who'll  who 'll |  | ||||||
| who're  who 're |  | ||||||
| who's   who 's |  | ||||||
| who've  who 've |  | ||||||
| why'll  why 'll |  | ||||||
| why're  why 're |  | ||||||
| why's   why 's |  | ||||||
| won't   will n't |  | ||||||
| would've    would 've |  | ||||||
| wouldn't    would n't |  | ||||||
| wouldn't've would n't 've |  | ||||||
| you'd   you 'd |  | ||||||
| you'd've    you 'd 've |  | ||||||
| you'll  you 'll |  | ||||||
| you're  you 're |  | ||||||
| you've  you 've |  | ||||||
|  |  | ||||||
|  | @ -11,8 +11,7 @@ spaCy NLP Tokenizer and Lexicon | ||||||
|      |      | ||||||
|     guide/overview |     guide/overview | ||||||
|     guide/install |     guide/install | ||||||
|     api/languages/index.rst |     api/index.rst | ||||||
|     api/modules/index.rst |  | ||||||
| 
 | 
 | ||||||
| Source (GitHub) | Source (GitHub) | ||||||
| ---------------- | ---------------- | ||||||
|  |  | ||||||
							
								
								
									
										1
									
								
								fabfile.py
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								fabfile.py
									
									
									
									
										vendored
									
									
								
							|  | @ -9,7 +9,6 @@ def clean(): | ||||||
| def docs(): | def docs(): | ||||||
|     with lcd('docs'): |     with lcd('docs'): | ||||||
|         local('sphinx-build -b html . ./_build') |         local('sphinx-build -b html . ./_build') | ||||||
|         local('open _build/index.html') |  | ||||||
| 
 | 
 | ||||||
| def test(): | def test(): | ||||||
|     local('py.test -x') |     local('py.test -x') | ||||||
|  |  | ||||||
							
								
								
									
										7
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										7
									
								
								setup.py
									
									
									
									
									
								
							|  | @ -45,16 +45,13 @@ else: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| exts = [ | exts = [ | ||||||
|     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), |     #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), | ||||||
|     Extension("spacy.en", ["spacy/en.pyx"], language="c++", |     Extension("spacy.en", ["spacy/en.pyx"], language="c++", | ||||||
|               include_dirs=includes), |               include_dirs=includes), | ||||||
|     Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes), |     Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes), | ||||||
|     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), |  | ||||||
|     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes), |     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes), | ||||||
|     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++", |     Extension("spacy.word", ["spacy/word.pyx"], language="c++", | ||||||
|               include_dirs=includes), |               include_dirs=includes), | ||||||
|     Extension("spacy.orthography.latin", ["spacy/orthography/latin.pyx"], language="c++", |  | ||||||
|               include_dirs=includes) |  | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										20
									
								
								spacy/en.pyx
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								spacy/en.pyx
									
									
									
									
									
								
							|  | @ -5,17 +5,17 @@ scheme in several important respects: | ||||||
| 
 | 
 | ||||||
| * Whitespace is added as tokens, except for single spaces. e.g., | * Whitespace is added as tokens, except for single spaces. e.g., | ||||||
| 
 | 
 | ||||||
|     >>> tokenize(u'\\nHello  \\tThere').strings |     >>> [w.string for w in tokenize(u'\\nHello  \\tThere')] | ||||||
|     [u'\\n', u'Hello', u' ', u'\\t', u'There'] |     [u'\\n', u'Hello', u' ', u'\\t', u'There'] | ||||||
| 
 | 
 | ||||||
| * Contractions are normalized, e.g. | * Contractions are normalized, e.g. | ||||||
| 
 | 
 | ||||||
|     >>> tokenize(u"isn't ain't won't he's").strings |     >>> [w.string for w in tokenize(u"isn't ain't won't he's")] | ||||||
|     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"] |     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"] | ||||||
|    |    | ||||||
| * Hyphenated words are split, with the hyphen preserved, e.g.: | * Hyphenated words are split, with the hyphen preserved, e.g.: | ||||||
|      |      | ||||||
|     >>> tokenize(u'New York-based').strings |     >>> [w.string for w in tokenize(u'New York-based')] | ||||||
|     [u'New', u'York', u'-', u'based'] |     [u'New', u'York', u'-', u'based'] | ||||||
| 
 | 
 | ||||||
| Other improvements: | Other improvements: | ||||||
|  | @ -39,25 +39,11 @@ from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from libc.stdlib cimport malloc, calloc, free | from libc.stdlib cimport malloc, calloc, free | ||||||
| from libc.stdint cimport uint64_t | from libc.stdint cimport uint64_t | ||||||
| from libcpp.vector cimport vector |  | ||||||
| 
 | 
 | ||||||
| cimport spacy | cimport spacy | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| from spacy.orthography.latin cimport * |  | ||||||
| 
 |  | ||||||
| from .orthography.latin import * |  | ||||||
| from .lexeme import * |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class English(spacy.Language): | cdef class English(spacy.Language): | ||||||
|     # How to ensure the order here aligns with orthography.latin? |  | ||||||
|     view_funcs = [ |  | ||||||
|         get_normalized, |  | ||||||
|         get_word_shape, |  | ||||||
|         get_last3 |  | ||||||
|     ] |  | ||||||
| 
 |  | ||||||
|     cdef int find_split(self, unicode word): |     cdef int find_split(self, unicode word): | ||||||
|         cdef size_t length = len(word) |         cdef size_t length = len(word) | ||||||
|         cdef int i = 0 |         cdef int i = 0 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user