mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Fix ent_iob tags in doc.merge to avoid inconsistent sequences
This commit is contained in:
		
							parent
							
								
									7d4687162f
								
							
						
					
					
						commit
						a3d0cb15d3
					
				| 
						 | 
					@ -955,6 +955,13 @@ cdef class Doc:
 | 
				
			||||||
                self.vocab.morphology.assign_tag(token, attr_value)
 | 
					                self.vocab.morphology.assign_tag(token, attr_value)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                Token.set_struct_attr(token, attr_name, attr_value)
 | 
					                Token.set_struct_attr(token, attr_name, attr_value)
 | 
				
			||||||
 | 
					        # Make sure ent_iob remains consistent
 | 
				
			||||||
 | 
					        if self.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
 | 
				
			||||||
 | 
					            if token.ent_type == self.c[end].ent_type:
 | 
				
			||||||
 | 
					                token.ent_iob = 3
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                # If they're not the same entity type, let them be two entities
 | 
				
			||||||
 | 
					                self.c[end].ent_iob = 3
 | 
				
			||||||
        # Begin by setting all the head indices to absolute token positions
 | 
					        # Begin by setting all the head indices to absolute token positions
 | 
				
			||||||
        # This is easier to work with for now than the offsets
 | 
					        # This is easier to work with for now than the offsets
 | 
				
			||||||
        # Before thinking of something simpler, beware the case where a
 | 
					        # Before thinking of something simpler, beware the case where a
 | 
				
			||||||
| 
						 | 
					@ -980,8 +987,6 @@ cdef class Doc:
 | 
				
			||||||
                self.c[i].head = start
 | 
					                self.c[i].head = start
 | 
				
			||||||
            elif head_idx >= end:
 | 
					            elif head_idx >= end:
 | 
				
			||||||
                self.c[i].head -= offset
 | 
					                self.c[i].head -= offset
 | 
				
			||||||
        token.ent_iob = span[0].ent_iob
 | 
					 | 
				
			||||||
        token.ent_type = span[0].ent_type
 | 
					 | 
				
			||||||
        # Now compress the token array
 | 
					        # Now compress the token array
 | 
				
			||||||
        for i in range(end, self.length):
 | 
					        for i in range(end, self.length):
 | 
				
			||||||
            self.c[i - offset] = self.c[i]
 | 
					            self.c[i - offset] = self.c[i]
 | 
				
			||||||
| 
						 | 
					@ -992,7 +997,6 @@ cdef class Doc:
 | 
				
			||||||
        for i in range(self.length):
 | 
					        for i in range(self.length):
 | 
				
			||||||
            # ...And, set heads back to a relative position
 | 
					            # ...And, set heads back to a relative position
 | 
				
			||||||
            self.c[i].head -= i
 | 
					            self.c[i].head -= i
 | 
				
			||||||
        # TODO: Fix entity IOB
 | 
					 | 
				
			||||||
        # Set the left/right children, left/right edges
 | 
					        # Set the left/right children, left/right edges
 | 
				
			||||||
        set_children_from_heads(self.c, self.length)
 | 
					        set_children_from_heads(self.c, self.length)
 | 
				
			||||||
        # Clear the cached Python objects
 | 
					        # Clear the cached Python objects
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user