Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 23:47:31 +03:00

* Add merge() method to Tokens, with fairly brittle/hacky implementation, but quite easy to test. Passing minimal tests. Still need to fix left/right deps in C data

This commit is contained in:
parent 557856e84c
commit e70b87efeb
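For orientation, the new Tokens.merge() method takes character offsets into the original string rather than token indices, plus the tag, lemma and entity type to assign to the merged token. A minimal usage sketch follows; the nlp pipeline object, the example text and its offsets are illustrative assumptions, not part of this commit.

    # Usage sketch (assumptions: `nlp` is an already-loaded spaCy pipeline,
    # and the offsets below match the example text).
    text = u'I went to New York last week'
    tokens = nlp(text)
    # "New York" occupies characters 10..18 of `text`.  merge() takes these
    # character offsets plus the tag, lemma and entity type for the new token.
    new_york = tokens.merge(10, 18, u'NNP', u'New York', u'GPE')
    # merge() returns the merged Token, or None if the offsets do not line up
    # with existing token boundaries.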
					
@@ -1,8 +1,10 @@
 # cython: embedsignature=True
+from libc.string cimport memset
 
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
 
+from .strings cimport slice_unicode
 from .vocab cimport EMPTY_LEXEME
 from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
@@ -11,6 +13,7 @@ from .typedefs cimport POS, LEMMA
 from .parts_of_speech import UNIV_POS_NAMES
 from .lexeme cimport check_flag
 from .spans import Span
+from .structs cimport UniStr
 
 from unidecode import unidecode
 
@@ -253,6 +256,88 @@ cdef class Tokens:
         for i in range(self.length):
             self.data[i] = parsed[i]
 
+    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
+              unicode ent_type):
+        cdef int i
+        cdef int start = -1
+        cdef int end = -1
+        for i in range(self.length):
+            if self.data[i].idx == start_idx:
+                start = i
+            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
+                end = i + 1
+                break
+        else:
+            return None
+        # Get LexemeC for newly merged token
+        cdef UniStr new_orth_c
+        slice_unicode(&new_orth_c, self._string, start_idx, end_idx)
+        cdef const LexemeC* lex = self.vocab.get(self.mem, &new_orth_c)
+        # House the new merged token where it starts
+        cdef TokenC* token = &self.data[start]
+        # Update fields
+        token.lex = lex
+        # What to do about morphology??
+        # TODO: token.morph = ???
+        token.tag = self.vocab.strings[tag]
+        token.lemma = self.vocab.strings[lemma]
+        if ent_type == 'O':
+            token.ent_iob = 2
+            token.ent_type = 0
+        else:
+            token.ent_iob = 3
+            token.ent_type = self.vocab.strings[ent_type]
+        # Fix dependencies
+        # Begin by setting all the head indices to absolute token positions
+        # This is easier to work with for now than the offsets
+        for i in range(self.length):
+            self.data[i].head += i
+        # Find the head of the merged token, and its dep relation
+        outer_heads = {}
+        for i in range(start, end):
+            head_idx = self.data[i].head
+            if head_idx == i or head_idx < start or head_idx >= end:
+                # Don't consider "heads" which are actually dominated by a word
+                # in the region we're merging
+                gp = head_idx
+                while self.data[gp].head != gp:
+                    if start <= gp < end:
+                        break
+                    gp = self.data[gp].head
+                else:
+                    # If we have multiple words attaching to the same head,
+                    # but with different dep labels, we're preferring the last
+                    # occurring dep label. Shrug. What else could we do, I guess?
+                    outer_heads[head_idx] = self.data[i].dep
+
+        token.head, token.dep = max(outer_heads.items())
+        # Adjust deps before shrinking tokens
+        # Tokens which point into the merged token should now point to it
+        # Subtract the offset from all tokens which point to >= end
+        offset = (end - start) - 1
+        for i in range(self.length):
+            head_idx = self.data[i].head
+            if start <= head_idx < end:
+                self.data[i].head = start
+            elif head_idx >= end:
+                self.data[i].head -= offset
+        # TODO: Fix left and right deps
+        # Now compress the token array
+        for i in range(end, self.length):
+            self.data[i - offset] = self.data[i]
+        for i in range(self.length - offset, self.length):
+            memset(&self.data[i], 0, sizeof(TokenC))
+            self.data[i].lex = &EMPTY_LEXEME
+        self.length -= offset
+        for i in range(self.length):
+            # ...And, set heads back to a relative position
+            self.data[i].head -= i
+
+        # Clear cached Python objects
+        self._py_tokens = [None] * self.length
+        # Return the merged Python object
+        return self[start]
+
 
 cdef class Token:
     """An individual token --- i.e. a word, a punctuation symbol, etc.  Created
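The trickiest part of the patch is the dependency bookkeeping: TokenC stores each head as an offset relative to the token's own position, so the method first converts heads to absolute indices, picks a head for the merged token from outside the span, re-points and compresses the array, and finally converts back to relative offsets. Below is a pure-Python sketch of that bookkeeping on a plain list of offsets; the function name is made up here, and the head-selection step is simplified (the real code also walks up grandparents and keys candidates by head index).

    def merge_heads(heads, start, end):
        # `heads` holds relative offsets (0 = root, -1 = previous token, ...);
        # `start`/`end` is the half-open token span being merged.
        heads = [h + i for i, h in enumerate(heads)]    # relative -> absolute
        # Choose a head for the merged token from outside the span.
        outer = [h for i, h in enumerate(heads[start:end], start)
                 if h == i or h < start or h >= end]
        heads[start] = max(outer) if outer else start
        offset = (end - start) - 1                      # tokens removed by the merge
        kept = []
        for i, h in enumerate(heads):
            if start < i < end:
                continue                                # folded into the merged token
            if start <= h < end:
                h = start                               # heads inside the span now point at it
            elif h >= end:
                h -= offset                             # shift for the removed slots
            kept.append(h)
        return [h - i for i, h in enumerate(kept)]      # absolute -> relative again

    # Merging "New York" (tokens 3 and 4) in "I went to New York last week":
    # merge_heads([1, 0, -1, 1, -2, 1, -5], 3, 5) == [1, 0, -1, -1, 1, -4]

Storing heads as relative offsets is what makes the in-place merge delicate: every structural change has to be bracketed by this absolute/relative round-trip, and as the commit message notes, the left/right dependency fields in the C data still need the same fix-up.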