Mirror of https://github.com/explosion/spaCy.git

* Begin work on full PTB-compatible English tokenization

commit df0458001d (parent 0c1be7effe)
							
								
								
									
data/en_ptb/case (new file, 146129 lines): diff suppressed because it is too large
							
								
								
									
data/en_ptb/clusters (new file, 316709 lines): diff suppressed because it is too large
							
								
								
									
data/en_ptb/tokenization (new file, 93 lines):

@@ -0,0 +1,93 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
#  21:09, 25 June 2014
#*--*  --
#*---* ---
#*'s  's

ain't   are not
aren't  are not
can't   can not
could've    could have
couldn't    could not
couldn't've could not have
didn't  did not
doesn't does not
don't   do not
hadn't  had not
hadn't've   had not have
hasn't  has not
haven't have not
he'd    he would
he'd've he would have
he'll   he will
he's    he 's
how'd   how would
how'll  how will
how's   how 's
I'd I would
I'd've  I would have
I'll    I will
I'm I am
I've    I have
isn't   is not
it'd    it would
it'd've it would have
it'll   it will
it's    it 's
let's   let 's
mightn't    might not
mightn't've might not have
might've    might have
mustn't must not
must've must have
needn't need not
not've  not have
shan't  shall not
she'd   she would
she'd've    she would have
she'll  she will
she's   she 's
should've   should have
shouldn't   should not
shouldn't've    should not have
that's  that 's
there'd there would
there'd've  there would have
there's there is
they'd  they would
they'd've   they would have
they'll they will
they're they are
they've they have
wasn't  was not
we'd    we would
we'd've we would have
we'll   we will
we're   we are
we've   we have
weren't were not
what'll what will
what're what are
what's  what 's
what've what have
when's  when 's
where'd where would
where's where 's
where've    where have
who'd   who would
who'll  who will
who're  who are
who's   who 's
who've  who have
why'll  why will
why're  why are
why's   why is
won't   will not
would've    would have
wouldn't    would not
wouldn't've would not have
you'd   you would
you'd've    you would have
you'll  you will
you're  you are
you've  you have
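
For orientation, here is a minimal sketch of how a rules file in this format can be read. The real loader is util.read_tokenization (imported in en_ptb.pyx below); this parser and the name read_tokenization_rules are assumptions, not code from this commit.

# Hypothetical parser for the whitespace-separated rules format above;
# the repo's actual util.read_tokenization may differ in detail.
from __future__ import unicode_literals
import io


def read_tokenization_rules(path):
    '''Return (chunk, tokens) pairs, e.g. ("don't", ["do", "not"]).'''
    entries = []
    seen = set()
    with io.open(path, encoding='utf8') as file_:
        for line in file_:
            line = line.strip()
            if not line or line.startswith('#'):
                continue  # skip blank lines and commented-out rules
            pieces = line.split()
            chunk, tokens = pieces[0], pieces[1:]
            assert chunk not in seen, chunk  # rule keys must be unique
            seen.add(chunk)
            entries.append((chunk, tokens))
    return entries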
							
								
								
									
spacy/en_ptb.pxd (new file, 15 lines):

@@ -0,0 +1,15 @@
from libcpp.vector cimport vector

from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr


cdef Vocab VOCAB
cdef dict BACOV


cpdef Lexeme_addr lookup(unicode word) except 0
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cpdef unicode unhash(StringHash hash_value)
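
Assuming the extension builds, these cpdef functions are callable from plain Python; Cython converts vector[Lexeme_addr] to a list and Lexeme_addr to an int automatically. A hedged usage sketch (the import path follows from the file location, but nothing here is from the commit itself):

# Usage sketch of the compiled module, not code from this commit.
from spacy import en_ptb

addrs = en_ptb.tokenize("isn't it?")  # vector[Lexeme_addr] -> list of ints
first = en_ptb.lookup("isn't")        # raw address of the Lexeme struct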
							
								
								
									
spacy/en_ptb.pyx (new file, 74 lines):

@@ -0,0 +1,74 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes.  Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals

from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector

from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from . import util

cimport spacy

BACOV = {}
VOCAB = Vocab()
VOCAB.set_empty_key(0)


spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en_ptb'))


cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
    return spacy.tokenize(VOCAB, BACOV, find_split, string)


cpdef Lexeme_addr lookup(unicode string) except 0:
    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)


cpdef unicode unhash(StringHash hash_value):
    return spacy.unhash(BACOV, hash_value)


cdef vector[StringHash] make_string_views(unicode word):
    cdef unicode s
    # Stub: no alternative string views yet; return an empty vector.
    return vector[StringHash]()
    #if word.isdigit() and len(word) == 4:
    #    return '!YEAR'
    #elif word[0].isdigit():
    #    return '!DIGITS'
    #else:
    #    return word.lower()


cdef int find_split(unicode word, size_t length):
    cdef int i = 0
    # Contractions: split off a trailing 's
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation comes off as a single character
    if is_punct(word, 0, length):
        return 1
    elif length >= 1:
        # Split at the first punctuation character, so trailing
        # punctuation comes off
        i = 0
        while i < length and not is_punct(word, i, length):
            i += 1
    return i


cdef bint is_punct(unicode word, size_t i, size_t length):
    # Don't count apostrophes as punct if the next char is a letter
    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
        return False
    # Don't count commas as punct if the next char is a number
    if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
        return False
    # Don't count periods as punct if the next char is a number
    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
        return False
    return not word[i].isalnum()
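
To make the split behaviour concrete, here is a pure-Python transcription of find_split/is_punct with a few worked cases. It mirrors the Cython code above for experimentation but is not part of the commit.

# Pure-Python mirror of the Cython find_split/is_punct above.

def is_punct(word, i, length):
    if word[i] == "'" and i < (length - 1) and word[i + 1].isalpha():
        return False
    if word[i] == "," and i < (length - 1) and word[i + 1].isdigit():
        return False
    if word[i] == "." and i < (length - 1) and word[i + 1].isdigit():
        return False
    return not word[i].isalnum()


def find_split(word, length):
    if word.endswith("'s"):
        return length - 2
    if is_punct(word, 0, length):
        return 1
    i = 0
    while i < length and not is_punct(word, i, length):
        i += 1
    return i


assert find_split("dog's", 5) == 3    # split off the 's
assert find_split("(hello", 6) == 1   # leading punctuation comes off alone
assert find_split("hello!!", 7) == 5  # split before trailing punctuation
assert find_split("3,000", 5) == 5    # comma inside a number is kept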