mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* merge add lex last - add index finder funcs
This commit is contained in:
		
							parent
							
								
									a06e3c8963
								
							
						
					
					
						commit
						562db6d2d0
					
				|  | @ -438,11 +438,25 @@ cdef class Doc: | ||||||
|                 keep_reading = False |                 keep_reading = False | ||||||
|             yield n_bytes_str + data |             yield n_bytes_str + data | ||||||
| 
 | 
 | ||||||
|     # This function is terrible --- need to fix this. |     def token_index_start(self, int start_idx): | ||||||
|     def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, |         """ Get index of token in doc that has character index start_idx """ | ||||||
|               unicode ent_type): |         cdef int i | ||||||
|         """Merge a multi-word expression into a single token.  Currently |         for i in range(self.length): | ||||||
|         experimental; API is likely to change.""" |             if self.c[i].idx == start_idx: | ||||||
|  |                 return i | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  |     def token_index_end(self, int end_idx): | ||||||
|  |         """ Get index+1 of token in doc ending with character index end_idx """ | ||||||
|  |         cdef int i | ||||||
|  |         for i in range(self.length): | ||||||
|  |             if (self.c[i].idx + self.c[i].lex.length) == end_idx: | ||||||
|  |                 return i + 1 | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  |     def range_from_indices(self, int start_idx, int end_idx): | ||||||
|  |         """ Get tuple - span of token indices which correspond to | ||||||
|  |             character indices (start_idx, end_idx) if such a span exists""" | ||||||
|         cdef int i |         cdef int i | ||||||
|         cdef int start = -1 |         cdef int start = -1 | ||||||
|         cdef int end = -1 |         cdef int end = -1 | ||||||
|  | @ -453,10 +467,18 @@ cdef class Doc: | ||||||
|                 if start == -1: |                 if start == -1: | ||||||
|                     return None |                     return None | ||||||
|                 end = i + 1 |                 end = i + 1 | ||||||
|                 break |                 return (start, end) | ||||||
|         else: |  | ||||||
|         return None |         return None | ||||||
| 
 | 
 | ||||||
|  |     # This function is terrible --- need to fix this. | ||||||
|  |     def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, | ||||||
|  |               unicode ent_type): | ||||||
|  |         """Merge a multi-word expression into a single token.  Currently | ||||||
|  |         experimental; API is likely to change.""" | ||||||
|  |         start_end = self.range_from_indices(start_idx, end_idx) | ||||||
|  |         if start_end is None: | ||||||
|  |             return None | ||||||
|  |         start, end = start_end | ||||||
|         cdef Span span = self[start:end] |         cdef Span span = self[start:end] | ||||||
|         # Get LexemeC for newly merged token |         # Get LexemeC for newly merged token | ||||||
|         new_orth = ''.join([t.text_with_ws for t in span]) |         new_orth = ''.join([t.text_with_ws for t in span]) | ||||||
|  | @ -465,8 +487,6 @@ cdef class Doc: | ||||||
|         cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) |         cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) | ||||||
|         # House the new merged token where it starts |         # House the new merged token where it starts | ||||||
|         cdef TokenC* token = &self.c[start] |         cdef TokenC* token = &self.c[start] | ||||||
|         # Update fields |  | ||||||
|         token.lex = lex |  | ||||||
|         token.spacy = self.c[end-1].spacy |         token.spacy = self.c[end-1].spacy | ||||||
|         if tag in self.vocab.morphology.tag_map: |         if tag in self.vocab.morphology.tag_map: | ||||||
|             self.vocab.morphology.assign_tag(token, tag) |             self.vocab.morphology.assign_tag(token, tag) | ||||||
|  | @ -485,6 +505,10 @@ cdef class Doc: | ||||||
|         # bridges over the entity. Here the alignment of the tokens changes. |         # bridges over the entity. Here the alignment of the tokens changes. | ||||||
|         span_root = span.root.i |         span_root = span.root.i | ||||||
|         token.dep = span.root.dep |         token.dep = span.root.dep | ||||||
|  |         # We update token.lex after keeping span root and dep, since | ||||||
|  |         # setting token.lex will change span.start and span.end properties | ||||||
|  |         # as it modifies the character offsets in the doc | ||||||
|  |         token.lex = lex | ||||||
|         for i in range(self.length): |         for i in range(self.length): | ||||||
|             self.c[i].head += i |             self.c[i].head += i | ||||||
|         # Set the head of the merged token, and its dep relation, from the Span |         # Set the head of the merged token, and its dep relation, from the Span | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user