Mirror of https://github.com/explosion/spaCy.git
* Reorganized, moving language-independent code into spacy. The functions in spacy take the dictionaries and the split function as input, while the language-specific modules are curried versions that use module-level globals.
This commit is contained in:
parent a62c38e1ef
commit d5bef02c72
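The commit message describes a currying pattern: the shared spacy module receives the vocabulary, the reverse index, and the split function as explicit arguments, while each language module binds its own globals and re-exports thin wrappers. The sketch below is illustrative commentary on that pattern in plain Python with made-up names; it is not part of the commit.

# Illustrative sketch of the currying pattern described in the commit message.
# All names here are invented; the vocab/bacov arguments are unused in this toy.

def tokenize(vocab, bacov, find_split, string):
    # Language-independent core: every piece of state arrives as an argument.
    tokens = []
    for chunk in string.split():
        split = find_split(chunk, len(chunk))
        tokens.extend([chunk[:split], chunk[split:]] if split else [chunk])
    return tokens

# Language-specific module: binds its own globals and re-exports a thin wrapper.
EN_VOCAB, EN_BACOV = {}, {}

def en_find_split(word, length):
    return length - 3 if word.endswith(u"n't") else 0

def en_tokenize(string):
    return tokenize(EN_VOCAB, EN_BACOV, en_find_split, string)

assert en_tokenize(u"do not don't") == [u"do", u"not", u"do", u"n't"]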
							
								
								
									
data/en/case (new file, 146129 lines): file diff suppressed because it is too large
							
								
								
									
data/en/clusters (new file, 316709 lines): file diff suppressed because it is too large
							
								
								
									
data/en/tokenization (new file, 93 lines):
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
#  21:09, 25 June 2014
#*--*  --
#*---* ---
#*'s  's

ain't   are not
aren't  are not
can't   can not
could've    could have
couldn't    could not
couldn't've could not have
didn't  did not
doesn't does not
don't   do not
hadn't  had not
hadn't've   had not have
hasn't  has not
haven't have not
he'd    he would
he'd've he would have
he'll   he will
he's    he 's
how'd   how would
how'll  how will
how's   how 's
I'd I would
I'd've  I would have
I'll    I will
I'm I am
I've    I have
isn't   is not
it'd    it would
it'd've it would have
it'll   it will
it's    it 's
let's   let 's
mightn't    might not
mightn't've might not have
might've    might have
mustn't must not
must've must have
needn't need not
not've  not have
shan't  shall not
she'd   she would
she'd've    she would have
she'll  she will
she's   she 's
should've   should have
shouldn't   should not
shouldn't've    should not have
that's  that 's
there'd there would
there'd've  there would have
there's there is
they'd  they would
they'd've   they would have
they'll they will
they're they are
they've they have
wasn't  was not
we'd    we would
we'd've we would have
we'll   we will
we're   we are
we've   we have
weren't were not
what'll what will
what're what are
what's  what 's
what've what have
when's  when 's
where'd where would
where's where 's
where've    where have
who'd   who would
who'll  who will
who're  who are
who's   who 's
who've  who have
why'll  why will
why're  why are
why's   why is
won't   will not
would've    would have
wouldn't    would not
wouldn't've would not have
you'd   you would
you'd've    you would have
you'll  you will
you're  you are
you've  you have
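Each non-comment line above maps a surface chunk to the whitespace-separated tokens it should be split into. The sketch below shows one plausible way such a rule file could be parsed into the (chunk, lex, tokens) triples consumed by load_tokenization; it is illustrative only, and the actual util.read_tokenization in this commit may behave differently.

# Illustrative sketch: parse contraction rules into (chunk, first_token, remaining_tokens).
# The real util.read_tokenization may differ; the path is the data file shown above.
def read_tokenization_rules(path='data/en/tokenization'):
    rules = []
    with open(path, encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            pieces = line.split()
            chunk, tokens = pieces[0], pieces[1:]
            rules.append((chunk, tokens[0], tokens[1:]))
    return rules

# e.g. one plausible triple: ("can't", "can", ["not"])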
							
								
								
									
spacy/en.cpp (4001 lines changed): file diff suppressed because it is too large
								
								
									
spacy/en.pxd (18 lines changed):
@@ -1,17 +1,15 @@
-from ext.sparsehash cimport dense_hash_map
-from spacy.lexeme cimport StringHash
+from libcpp.vector cimport vector
+
+from spacy.spacy cimport StringHash
+from spacy.spacy cimport Vocab
 from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport Lexeme_addr
 
 
-ctypedef Py_UNICODE* string_ptr
-ctypedef size_t Lexeme_addr # For python interop
-ctypedef Lexeme* Lexeme_ptr
-
-
-cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES
+cdef Vocab VOCAB
+cdef dict BACOV
 
 
 cpdef Lexeme_addr lookup(unicode word) except 0
-cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0
-cdef StringHash hash_string(unicode s, size_t length) except 0
+cpdef vector[Lexeme_addr] tokenize(unicode string) except *
 cpdef unicode unhash(StringHash hash_value)
							
								
								
									
spacy/en.pyx (206 lines changed):
@@ -9,211 +9,43 @@ from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
 
 from spacy.lexeme cimport Lexeme
-from ext.murmurhash cimport MurmurHash64A
-from ext.murmurhash cimport MurmurHash64B
+from spacy.string_tools cimport substr
 from . import util
 
+cimport spacy
 
 
-STRINGS = {}
-LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
-LEXEMES.set_empty_key(0)
+BACOV = {}
+VOCAB = Vocab()
+VOCAB.set_empty_key(0)
 
 
-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
-
-
-def load_tokenization(token_rules):
-    cdef Lexeme* word
-    cdef StringHash hashed
-    for chunk, lex, tokens in token_rules:
-        hashed = hash_string(chunk, len(chunk))
-        assert LEXEMES[hashed] == NULL
-        word = _add(hashed, lex, len(lex), len(lex))
-        for i, lex in enumerate(tokens):
-            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-            length = len(token_string)
-            hashed = hash_string(token_string, length)
-            word.tail = _add(hashed, lex, 0, len(lex))
-            word = word.tail
-
-
-load_tokenization(util.read_tokenization('en'))
+spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
 
 
 cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
-    cdef size_t length = len(string)
-    cdef Py_UNICODE* characters = <Py_UNICODE*>string
-
-    cdef size_t i
-    cdef Py_UNICODE c
-
-    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
-    cdef unicode current = u''
-    cdef Lexeme* token
-    for i in range(length):
-        c = characters[i]
-        if is_whitespace(c):
-            if current:
-                token = <Lexeme*>lookup(current)
-                while token != NULL:
-                    tokens.push_back(<Lexeme_addr>token)
-                    token = token.tail
-            current = u''
-        else:
-            current += c
-    if current:
-        token = <Lexeme*>lookup(current)
-        while token != NULL:
-            tokens.push_back(<Lexeme_addr>token)
-            token = token.tail
-    return tokens
-
-cdef inline bint is_whitespace(Py_UNICODE c):
-    # TODO: Support other unicode spaces
-    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
-    if c == u' ':
-        return True
-    elif c == u'\n':
-        return True
-    elif c == u'\t':
-        return True
-    else:
-        return False
+    return spacy.tokenize(VOCAB, BACOV, find_split, string)
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    '''.. function:: enumerate(sequence[, start=0])
-    Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, splitting off any attached punctuation or clitics.  A
-    reference to BLANK_WORD is returned for the empty string.
-
-    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = LEXEMES[hashed]
-    cdef size_t n
-    if word_ptr == NULL:
-        word_ptr = _add(hashed, string, _find_split(string, length), length)
-    return <Lexeme_addr>word_ptr
-
-
-cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
-    '''Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, given the specified start and end indices.  A negative index
-    significes 0 for start, and the string length for end --- i.e. the string
-    will not be sliced if start == -1 and end == -1.
-
-    A reference to BLANK_WORD is returned for the empty string.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* chunk_ptr = LEXEMES[hashed]
-    if chunk_ptr == NULL:
-        chunk_ptr = _add(hashed, string, start, length)
-    return <Lexeme_addr>chunk_ptr
-
-
-cdef StringHash hash_string(unicode s, size_t length) except 0:
-    '''Hash unicode with MurmurHash64A'''
-    assert length
-    return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
 
 
 cpdef unicode unhash(StringHash hash_value):
-    '''Fetch a string from the reverse index, given its hash value.'''
-    cdef string_ptr string = STRINGS[hash_value]
-    if string == NULL:
-        raise ValueError(hash_value)
-
-    return string
+    return spacy.unhash(BACOV, hash_value)
 
 
-cdef unicode normalize_word_string(unicode word):
-    '''Return a normalized version of the word, mapping:
-    - 4 digit strings into !YEAR
-    - Other digit strings into !DIGITS
-    - All other strings into lower-case
-    '''
+cdef vector[StringHash] make_string_views(unicode word):
     cdef unicode s
-    if word.isdigit() and len(word) == 4:
-        return '!YEAR'
-    elif word[0].isdigit():
-        return '!DIGITS'
-    else:
-        return word.lower()
+    return vector[StringHash]()
+    #if word.isdigit() and len(word) == 4:
+    #    return '!YEAR'
+    #elif word[0].isdigit():
+    #    return '!DIGITS'
+    #else:
+    #    return word.lower()
 
 
-cpdef unicode _substr(unicode string, int start, int end, size_t length):
-    if end >= length:
-        end = -1
-    if start >= length:
-        start = 0
-    if start <= 0 and end < 0:
-        return string
-    elif start < 0:
-        start = 0
-    elif end < 0:
-        end = length
-    return string[start:end]
-
-
-cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
-    assert string
-    assert split <= length
-    word = _init_lexeme(string, hashed, split, length)
-    LEXEMES[hashed] = word
-    STRINGS[hashed] = string
-    return word
-
-
-cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
-                          int split, size_t length) except NULL:
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-
-    cdef unicode tail_string
-    cdef unicode lex
-    if split != 0 and split < length:
-        lex = _substr(string, 0, split, length)
-        tail_string = _substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    cdef unicode normed = normalize_word_string(lex)
-    cdef unicode last3 = _substr(string, length - 3, length, length)
-
-    assert normed
-    assert len(normed)
-
-    word.lex = hash_string(lex, len(lex))
-    word.normed = hash_string(normed, len(normed))
-    word.last3 = hash_string(last3, len(last3))
-
-    STRINGS[word.lex] = lex
-    STRINGS[word.normed] = normed
-    STRINGS[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lookup(tail_string)
-    return word
-
-
-cdef size_t _find_split(unicode word, size_t length):
+cdef int find_split(unicode word, size_t length):
     cdef int i = 0
     # Contractions
     if word.endswith("'s"):
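After the refactor the English module keeps its old public surface (tokenize, lookup, unhash), but each function is now a one-line delegation to the shared spacy module with the English globals bound. A hedged usage sketch, assuming the extension modules compile as in this commit; it is illustrative and not taken from the repository.

# Illustrative only: how the curried English API might be called once built.
from spacy import en

addrs = en.tokenize(u"I can't wait")   # vector of Lexeme addresses; a list of ints from Python
first = en.lookup(u"can't")            # address of the Lexeme for the chunk "can't"
# unhash() maps a StringHash back to the string recorded in the reverse index (BACOV):
# text = en.unhash(some_hash_value)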
							
								
								
									
spacy/lexeme.cpp (915 lines changed): file diff suppressed because it is too large
spacy/lexeme.pxd:
@@ -1,9 +1,12 @@
 from libc.stdint cimport uint64_t
 
 
+# Put these above import to avoid circular import problem
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
+ctypedef size_t Lexeme_addr
 
 
+from spacy.spacy cimport Vocab
+from spacy.spacy cimport Splitter
 
 cdef struct Lexeme:
     StringHash sic # Hash of the original string
@@ -20,6 +23,12 @@ cdef struct Lexeme:
     Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
 
 
+cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL
+
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item.  This allows safe iteration
 # over the Lexeme, via:
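The struct comment above notes that Lexemes form linked lists through tail so that one vocabulary entry can expand into several sub-tokens. The sketch below is a plain-Python model of that expansion, illustrative only; the real code walks raw Lexeme* pointers in Cython.

# Illustrative Python model of the Lexeme.tail linked list; the real struct is C-level.
class LexemeNode:
    def __init__(self, text, tail=None):
        self.text = text
        self.tail = tail  # next sub-token in the chain, or None

def expand(node):
    # Mirrors how the tokenizer walks the tail chain to emit sub-tokens.
    out = []
    while node is not None:
        out.append(node.text)
        node = node.tail
    return out

cant = LexemeNode("can", LexemeNode("not"))
assert expand(cant) == ["can", "not"]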
spacy/lexeme.pyx:
@@ -2,6 +2,60 @@
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
+from __future__ import unicode_literals
+
+from spacy.string_tools cimport substr
+from spacy.spacy cimport hash_string
+from spacy.spacy cimport lookup
+
+from libc.stdlib cimport malloc, calloc, free
+from libc.stdint cimport uint64_t
+from libcpp.vector cimport vector
+
+
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL:
+    assert split <= length
+    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+
+    word.first = <Py_UNICODE>(string[0] if string else 0)
+    word.sic = hashed
+
+    cdef unicode tail_string
+    cdef unicode lex
+    if split != 0 and split < length:
+        lex = substr(string, 0, split, length)
+        tail_string = substr(string, split, length, length)
+    else:
+        lex = string
+        tail_string = ''
+    assert lex
+    #cdef unicode normed = normalize_word_string(lex)
+    cdef unicode normed = '?'
+    cdef unicode last3 = substr(string, length - 3, length, length)
+
+    assert normed
+    assert len(normed)
+
+    word.lex = hash_string(lex, len(lex))
+    word.normed = hash_string(normed, len(normed))
+    word.last3 = hash_string(last3, len(last3))
+
+    bacov[word.lex] = lex
+    bacov[word.normed] = normed
+    bacov[word.last3] = last3
+
+    # These are loaded later
+    word.prob = 0
+    word.cluster = 0
+    word.oft_upper = False
+    word.oft_title = False
+
+    # Now recurse, and deal with the tail
+    if tail_string:
+        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string)
+    return word
 
 
 cpdef StringHash sic_of(size_t lex_id) except 0:
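init_lexeme hashes several views of a chunk (the lex prefix up to the split point, a normalised form, the last three characters), records each hash-to-string pair in the bacov dict (the vocabulary "backwards"), and then recurses on the remainder through lookup. The sketch below is a rough Python model of the string views and the reverse index, with the built-in hash() standing in for MurmurHash; it is illustrative only.

# Illustrative Python model of init_lexeme's string views and the bacov reverse index.
# hash() stands in for MurmurHash64A; names only loosely mirror the Lexeme fields.
def init_views(string, split, bacov):
    lex = string[:split] if 0 < split < len(string) else string
    tail = string[split:] if 0 < split < len(string) else u''
    last3 = string[-3:]
    views = {'lex': hash(lex), 'last3': hash(last3)}
    bacov[views['lex']] = lex      # reverse index: hash -> original string
    bacov[views['last3']] = last3
    return views, tail

bacov = {}
views, tail = init_views(u"can't", 3, bacov)
assert bacov[views['lex']] == u"can"
assert tail == u"'t"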
							
								
								
									
spacy/spacy.cpp (1652 lines changed): file diff suppressed because it is too large
spacy/spacy.pxd:
@@ -1,5 +1,24 @@
 from libcpp.vector cimport vector
+from libc.stdint cimport uint64_t
+
+from ext.sparsehash cimport dense_hash_map
+
+# Circular import problems here
+ctypedef size_t Lexeme_addr
+ctypedef uint64_t StringHash
+ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
+ctypedef int (*Splitter)(unicode word, size_t length)
+
+
 from spacy.lexeme cimport Lexeme
 
+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
+                        unicode string) except 0
+cdef StringHash hash_string(unicode s, size_t length) except 0
+cdef unicode unhash(dict bacov, StringHash hash_value)
+
+
 cpdef vector[size_t] expand_chunk(size_t addr) except *
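Splitter is declared above as a C function pointer, which is how the language-specific boundary finder is handed to the shared lookup and tokenize routines. In Python terms it is simply a callable; the sketch below is illustrative only, and the return value of the real find_split is not shown in this diff.

# Illustrative sketch: the Splitter typedef as a plain Python callable.
def english_find_split(word, length):
    # Loosely modelled on find_split's endswith("'s") check; the actual
    # return value in the commit is not visible here.
    return length - 2 if word.endswith(u"'s") else 0

def split_word(splitter, word):
    split = splitter(word, len(word))
    return (word[:split], word[split:]) if split else (word, u'')

assert split_word(english_find_split, u"dog's") == (u"dog", u"'s")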
spacy/spacy.pyx:
@@ -1,5 +1,78 @@
 from __future__ import unicode_literals
-from spacy.lexeme cimport Lexeme
+
+from ext.murmurhash cimport MurmurHash64A
+from ext.murmurhash cimport MurmurHash64B
+
+from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport BLANK_WORD
+
+from spacy.string_tools cimport is_whitespace
+
+from . import util
+
+
+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
+    cdef Lexeme* word
+    cdef StringHash hashed
+    for chunk, lex, tokens in token_rules:
+        hashed = hash_string(chunk, len(chunk))
+        assert vocab[hashed] == 0
+        word = _add(vocab, bacov, <Splitter>NULL, hashed, lex, len(lex), len(lex))
+        for i, lex in enumerate(tokens):
+            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
+            length = len(token_string)
+            hashed = hash_string(token_string, length)
+            word.tail = _add(vocab, bacov, <Splitter>NULL, hashed, lex, 0, len(lex))
+            word = word.tail
+
+
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *:
+    cdef size_t length = len(string)
+    cdef Py_UNICODE* characters = <Py_UNICODE*>string
+
+    cdef size_t i
+    cdef Py_UNICODE c
+
+    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
+    cdef unicode current = u''
+    cdef Lexeme* token
+    for i in range(length):
+        c = characters[i]
+        if is_whitespace(c):
+            if current:
+                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+                while token != NULL:
+                    tokens.push_back(<Lexeme_addr>token)
+                    token = token.tail
+            current = u''
+        else:
+            current += c
+    if current:
+        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+        while token != NULL:
+            tokens.push_back(<Lexeme_addr>token)
+            token = token.tail
+    return tokens
+
+
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
+                        unicode string) except 0:
+    '''Fetch a Lexeme representing a word string. If the word has not been seen,
+    construct one, splitting off any attached punctuation or clitics.  A
+    reference to BLANK_WORD is returned for the empty string.
+
+    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
+    '''
+    if string == '':
+        return <Lexeme_addr>&BLANK_WORD
+    cdef size_t length = len(string)
+    cdef StringHash hashed = hash_string(string, length)
+    cdef Lexeme* word_ptr = <Lexeme*>vocab[hashed]
+    if word_ptr == NULL:
+        start = find_split(string, length) if start == -1 else start
+        word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
+    return <Lexeme_addr>word_ptr
 
 
 cpdef vector[size_t] expand_chunk(size_t addr) except *:
@@ -11,3 +84,22 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
     return tokens
 
 
+cdef StringHash hash_string(unicode s, size_t length) except 0:
+    '''Hash unicode with MurmurHash64A'''
+    assert length
+    return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
+
+
+cdef unicode unhash(dict bacov, StringHash hash_value):
+    '''Fetch a string from the reverse index, given its hash value.'''
+    return bacov[hash_value]
+
+
+cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
+                  unicode string, int split, size_t length) except NULL:
+    assert string
+    assert split <= length
+    word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
+    vocab[hashed] = <Lexeme_addr>word
+    bacov[hashed] = string
+    return word
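The shared tokenize above is a whitespace scanner: it accumulates characters, looks up each accumulated chunk, then follows the Lexeme tail chain to emit sub-tokens. The sketch below is a pure-Python model of that control flow, illustrative only; the real routine works with Lexeme addresses rather than strings.

# Illustrative pure-Python model of the whitespace loop in spacy.tokenize above.
# Known chunks expand into their sub-tokens; unknown chunks pass through whole.
def model_tokenize(expansions, string):
    tokens, current = [], u''
    for c in string:
        if c in (u' ', u'\n', u'\t'):
            if current:
                tokens.extend(expansions.get(current, [current]))
            current = u''
        else:
            current += c
    if current:
        tokens.extend(expansions.get(current, [current]))
    return tokens

rules = {u"can't": [u"can", u"not"]}
assert model_tokenize(rules, u"I can't wait") == [u"I", u"can", u"not", u"wait"]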
							
								
								
									
spacy/string_tools.pxd (new file, 3 lines):
cpdef unicode substr(unicode string, int start, int end, size_t length)

cdef bint is_whitespace(Py_UNICODE c)
							
								
								
									
spacy/string_tools.pyx (new file, 25 lines):
cpdef unicode substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]


cdef bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    if c == u' ':
        return True
    elif c == u'\n':
        return True
    elif c == u'\t':
        return True
    else:
        return False
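substr clamps out-of-range indices instead of raising, which init_lexeme relies on when slicing chunks near their boundaries. Below are a few worked examples against a pure-Python copy of the function above; the copy exists only so the assertions run outside Cython.

# Pure-Python copy of substr from string_tools.pyx, for worked examples only.
def substr(string, start, end, length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]

assert substr(u"hello", 0, 5, 5) == u"hello"   # end >= length: whole string
assert substr(u"hello", 2, 5, 5) == u"llo"     # slice runs to the end
assert substr(u"hello", 0, 3, 5) == u"hel"     # ordinary slice
assert substr(u"hi", -1, 2, 2) == u"hi"        # negative start, end clamped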