mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-24 20:51:30 +03:00 
			
		
		
		
	Iterate over lr_edges until sents are correct (#4702)
Iterate over lr_edges until all heads are within the current sentence. Instead of iterating over them for a fixed number of iterations, check whether the sentence boundaries are correct for the heads and stop when all are correct. Stop after a maximum of 10 iterations, providing a warning in this case since the sentence boundaries may not be correct.
This commit is contained in:
		
							parent
							
								
									cbacb0f1a4
								
							
						
					
					
						commit
						2d8c6e1124
					
				|  | @ -101,6 +101,7 @@ class Warnings(object): | ||||||
|             "the Knowledge Base.") |             "the Knowledge Base.") | ||||||
|     W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " |     W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " | ||||||
|             "previous components in the pipeline declare that they assign it.") |             "previous components in the pipeline declare that they assign it.") | ||||||
|  |     W026 = ("Unable to set all sentence boundaries from dependency parses.") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @add_codes | @add_codes | ||||||
|  |  | ||||||
|  | @ -148,3 +148,20 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): | ||||||
|     assert tokens[4].left_edge.i == 0 |     assert tokens[4].left_edge.i == 0 | ||||||
|     assert tokens[4].right_edge.i == 4 |     assert tokens[4].right_edge.i == 4 | ||||||
|     assert tokens[4].head.i == 4 |     assert tokens[4].head.i == 4 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def test_parser_set_sent_starts(en_vocab):
    """Regression test for sentence starts set from dependency heads (#4702).

    Builds a doc from explicit words, heads and dependency labels and checks
    that only tokens 0 and 3 are flagged as sentence starts, and that every
    token's head lies inside the token's own sentence.
    """
    words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
    heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
    deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
    doc = get_doc(en_vocab, words=words, deps=deps, heads=heads)
    # Guard the index-free loop below: one token per input word.
    assert len(doc) == len(words)
    expected_sent_starts = {0, 3}
    for i, token in enumerate(doc):
        # Identity checks: the flag must be exactly True or exactly None,
        # not merely something that compares equal to them.
        if i in expected_sent_starts:
            assert token.is_sent_start is True
        else:
            assert token.is_sent_start is None
    for sent in doc.sents:
        for token in sent:
            assert token.head in sent
|  |  | ||||||
|  | @ -21,6 +21,9 @@ ctypedef fused LexemeOrToken: | ||||||
| cdef int set_children_from_heads(TokenC* tokens, int length) except -1 | cdef int set_children_from_heads(TokenC* tokens, int length) except -1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2 | cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1153,35 +1153,69 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: | ||||||
|         tokens[i].r_kids = 0 |         tokens[i].r_kids = 0 | ||||||
|         tokens[i].l_edge = i |         tokens[i].l_edge = i | ||||||
|         tokens[i].r_edge = i |         tokens[i].r_edge = i | ||||||
|     # Three times, for non-projectivity. See issue #3170. This isn't a very |     cdef int loop_count = 0 | ||||||
|     # satisfying fix, but I think it's sufficient. |     cdef bint heads_within_sents = False | ||||||
|     for loop_count in range(3): |     # Try up to 10 iterations of adjusting lr_kids and lr_edges in order to | ||||||
|         # Set left edges |     # handle non-projective dependency parses, stopping when all heads are | ||||||
|         for i in range(length): |     # within their respective sentence boundaries. We have documented cases | ||||||
|             child = &tokens[i] |     # that need at least 4 iterations, so this is to be on the safe side | ||||||
|             head = &tokens[i + child.head] |     # without risking getting stuck in an infinite loop if something is | ||||||
|             if child < head and loop_count == 0: |     # terribly malformed. | ||||||
|                 head.l_kids += 1 |     while not heads_within_sents: | ||||||
|             if child.l_edge < head.l_edge: |         heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) | ||||||
|                 head.l_edge = child.l_edge |         loop_count += 1 | ||||||
|             if child.r_edge > head.r_edge: |         if not heads_within_sents and loop_count >= 10: | ||||||
|                 head.r_edge = child.r_edge |             # Still inconsistent after 10 passes: warn and stop rather than | ||||||
|         # Set right edges - same as above, but iterate in reverse |             # spinning forever on a malformed parse. | ||||||
|         for i in range(length-1, -1, -1): |             user_warning(Warnings.W026) | ||||||
|             child = &tokens[i] |             break | ||||||
|             head = &tokens[i + child.head] |  | ||||||
|             if child > head and loop_count == 0: |  | ||||||
|                 head.r_kids += 1 |  | ||||||
|             if child.r_edge > head.r_edge: |  | ||||||
|                 head.r_edge = child.r_edge |  | ||||||
|             if child.l_edge < head.l_edge: |  | ||||||
|                 head.l_edge = child.l_edge |  | ||||||
|     # Set sentence starts |     # Set sentence starts | ||||||
|     for i in range(length): |     for i in range(length): | ||||||
|         if tokens[i].head == 0 and tokens[i].dep != 0: |         if tokens[i].head == 0 and tokens[i].dep != 0: | ||||||
|             tokens[tokens[i].l_edge].sent_start = True |             tokens[tokens[i].l_edge].sent_start = True | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
    """Run one pass of child counting and edge propagation over the tokens,
    then report whether the resulting sentence spans contain their heads.

    May be called multiple times due to non-projectivity. See issues #3170
    and #4688.

    tokens (TokenC*): Token array; l_kids, r_kids, l_edge and r_edge are
        updated in place.
    length (int): Number of entries in `tokens`.
    loop_count (int): Index of the current pass. The l_kids / r_kids counters
        are only incremented when this is 0, so repeated passes do not count
        the same child twice.
    RETURNS (int): True if no head fell outside the span checked for its
        token, False as soon as one does.
    """
    cdef TokenC* parent
    cdef TokenC* kid
    cdef int idx, pos, head_idx
    cdef int span_start = 0
    cdef int span_end = 0
    cdef bint first_pass = loop_count == 0
    # Forward sweep: count left children and widen each head's edges to
    # cover the edges of its child.
    for idx in range(length):
        kid = &tokens[idx]
        parent = &tokens[idx + kid.head]
        if first_pass and kid < parent:
            parent.l_kids += 1
        if kid.l_edge < parent.l_edge:
            parent.l_edge = kid.l_edge
        if kid.r_edge > parent.r_edge:
            parent.r_edge = kid.r_edge
    # Backward sweep: same propagation in reverse order, counting right
    # children instead.
    for idx in range(length - 1, -1, -1):
        kid = &tokens[idx]
        parent = &tokens[idx + kid.head]
        if first_pass and kid > parent:
            parent.r_kids += 1
        if kid.r_edge > parent.r_edge:
            parent.r_edge = kid.r_edge
        if kid.l_edge < parent.l_edge:
            parent.l_edge = kid.l_edge
    # Sentence starts implied by the current edges: the left edge of every
    # token that is its own head and carries a dependency label.
    root_starts = set()
    for idx in range(length):
        if tokens[idx].head == 0 and tokens[idx].dep != 0:
            root_starts.add(tokens[idx].l_edge)
    # Walk the spans delimited by those starts (and by the final token) and
    # verify the head of each token inside.
    for idx in range(length):
        if idx != length - 1 and (idx <= 0 or idx not in root_starts):
            continue
        span_end = idx
        # NOTE(review): the token at span_end is not visited here, and a head
        # sitting exactly at span_end is accepted - confirm this is intended.
        for pos in range(span_start, span_end):
            head_idx = pos + tokens[pos].head
            if head_idx < span_start or head_idx > span_end:
                return False
        span_start = idx
    return True
|  | 
 | ||||||
|  | 
 | ||||||
| cdef int _get_tokens_lca(Token token_j, Token token_k): | cdef int _get_tokens_lca(Token token_j, Token token_k): | ||||||
|     """Given two tokens, returns the index of the lowest common ancestor |     """Given two tokens, returns the index of the lowest common ancestor | ||||||
|     (LCA) among the two. If they have no common ancestor, -1 is returned. |     (LCA) among the two. If they have no common ancestor, -1 is returned. | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user