mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* Disprefer punctuation and spaces as heads of spans
This commit is contained in:
		
							parent
							
								
									bed36ab0ff
								
							
						
					
					
						commit
						334c4b2b57
					
				|  | @ -12,6 +12,8 @@ from ..attrs cimport attr_id_t | |||
| from ..parts_of_speech cimport univ_pos_t | ||||
| from ..util import normalize_slice | ||||
| from .doc cimport token_by_start, token_by_end | ||||
| from ..attrs cimport IS_PUNCT, IS_SPACE | ||||
| from ..lexeme cimport Lexeme | ||||
| 
 | ||||
| 
 | ||||
| cdef class Span: | ||||
|  | @ -183,12 +185,17 @@ cdef class Span: | |||
|             # For each word, we count the path length, and arg min this measure. | ||||
|             # We could use better tree logic to save steps here...But I think this | ||||
|             # should be okay. | ||||
| -            cdef int current_best = _count_words_to_root(&self.doc.c[self.start], | ||||
| -                                                         self.doc.length) | ||||
| -            cdef int root = self.start | ||||
| +            cdef int current_best = self.doc.length | ||||
| +            cdef int root = -1 | ||||
|             for i in range(self.start, self.end): | ||||
|                 if self.start <= (i+self.doc.c[i].head) < self.end: | ||||
|                     continue | ||||
|                 # Don't allow punctuation or spaces to be the root, if there are | ||||
|                 # better candidates | ||||
|                 if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_PUNCT): | ||||
|                     continue | ||||
|                 if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_SPACE): | ||||
|                     continue | ||||
|                 words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length) | ||||
|                 if words_to_root < current_best: | ||||
|                     current_best = words_to_root | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user