mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Fix Issue #131: Force whitespace characters to attach syntactically to the previous token, and ensure they cannot serve as stand-alone 'sentence' units.
This commit is contained in:
		
							parent
							
								
									8b39feefbe
								
							
						
					
					
						commit
						9dd2f25c74
					
				|  | @ -380,10 +380,17 @@ cdef class ArcEager(TransitionSystem): | ||||||
|         st.fast_forward() |         st.fast_forward() | ||||||
| 
 | 
 | ||||||
|     cdef int finalize_state(self, StateClass st) nogil: |     cdef int finalize_state(self, StateClass st) nogil: | ||||||
|  |         cdef int i | ||||||
|         for i in range(st.length): |         for i in range(st.length): | ||||||
|             # Always attach spaces to the previous word |             # Always attach spaces to the previous word | ||||||
|             if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): |             if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): | ||||||
|                 st._sent[i].head = -1 if (i >= 1) else 1 |                 st._sent[i].head = -1 if (i >= 1) else 1 | ||||||
|  |                 if st._sent[i].sent_start and st._sent[i].head == -1: | ||||||
|  |                     st._sent[i].sent_start = False | ||||||
|  |                     # If we had this space token as the start of a sentence, | ||||||
|  |                     # move that sentence start forward one | ||||||
|  |                     if (i + 1) < st.length and not st._sent[i+1].sent_start: | ||||||
|  |                         st._sent[i+1].sent_start = True | ||||||
|             elif st._sent[i].head == 0 and st._sent[i].dep == 0: |             elif st._sent[i].head == 0 and st._sent[i].dep == 0: | ||||||
|                 st._sent[i].dep = self.root_label |                 st._sent[i].dep = self.root_label | ||||||
|             # If we're not using the Break transition, we segment via root-labelled |             # If we're not using the Break transition, we segment via root-labelled | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user