mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* Add dependency post-process rule to ensure spaces are attached to neighbouring tokens, so that they can't be sentence boundaries
This commit is contained in:
		
							parent
							
								
									1521cf25c9
								
							
						
					
					
						commit
						8b39feefbe
					
				|  | @ -9,7 +9,8 @@ from .transition_system cimport do_func_t, get_cost_func_t | ||||||
| from .transition_system cimport move_cost_func_t, label_cost_func_t | from .transition_system cimport move_cost_func_t, label_cost_func_t | ||||||
| from ..gold cimport GoldParse | from ..gold cimport GoldParse | ||||||
| from ..gold cimport GoldParseC | from ..gold cimport GoldParseC | ||||||
| from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE | from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE | ||||||
|  | from ..lexeme cimport Lexeme | ||||||
| 
 | 
 | ||||||
| from libc.stdint cimport uint32_t | from libc.stdint cimport uint32_t | ||||||
| from libc.string cimport memcpy | from libc.string cimport memcpy | ||||||
|  | @ -380,7 +381,10 @@ cdef class ArcEager(TransitionSystem): | ||||||
| 
 | 
 | ||||||
|     cdef int finalize_state(self, StateClass st) nogil: |     cdef int finalize_state(self, StateClass st) nogil: | ||||||
|         for i in range(st.length): |         for i in range(st.length): | ||||||
|             if st._sent[i].head == 0 and st._sent[i].dep == 0: |             # Attach spaces to the previous word (or to the next word, for a leading space) | ||||||
|  |             if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): | ||||||
|  |                 st._sent[i].head = -1 if (i >= 1) else 1 | ||||||
|  |             elif st._sent[i].head == 0 and st._sent[i].dep == 0: | ||||||
|                 st._sent[i].dep = self.root_label |                 st._sent[i].dep = self.root_label | ||||||
|             # If we're not using the Break transition, we segment via root-labelled |             # If we're not using the Break transition, we segment via root-labelled | ||||||
|             # arcs between the root words. |             # arcs between the root words. | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user