mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Fix problems with lower and whitespace in variants
Port relevant changes from #5361:
* Initialize lower flag explicitly
* Handle whitespace words from GoldParse correctly when creating raw text with orth variants
This commit is contained in:
		
							parent
							
								
									10d938f221
								
							
						
					
					
						commit
						1d8168d1fd
					
				| 
						 | 
					@ -362,6 +362,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
 | 
				
			||||||
    if not example.token_annotation:
 | 
					    if not example.token_annotation:
 | 
				
			||||||
        return example
 | 
					        return example
 | 
				
			||||||
    raw = example.text
 | 
					    raw = example.text
 | 
				
			||||||
 | 
					    lower = False
 | 
				
			||||||
    if random.random() >= 0.5:
 | 
					    if random.random() >= 0.5:
 | 
				
			||||||
        lower = True
 | 
					        lower = True
 | 
				
			||||||
        if raw is not None:
 | 
					        if raw is not None:
 | 
				
			||||||
| 
						 | 
					@ -429,8 +430,11 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
 | 
				
			||||||
            raw_idx += 1
 | 
					            raw_idx += 1
 | 
				
			||||||
        for word in variant_example.token_annotation.words:
 | 
					        for word in variant_example.token_annotation.words:
 | 
				
			||||||
            match_found = False
 | 
					            match_found = False
 | 
				
			||||||
 | 
					            # skip whitespace words
 | 
				
			||||||
 | 
					            if word.isspace():
 | 
				
			||||||
 | 
					                match_found = True
 | 
				
			||||||
            # add identical word
 | 
					            # add identical word
 | 
				
			||||||
            if word not in variants and raw[raw_idx:].startswith(word):
 | 
					            elif word not in variants and raw[raw_idx:].startswith(word):
 | 
				
			||||||
                variant_raw += word
 | 
					                variant_raw += word
 | 
				
			||||||
                raw_idx += len(word)
 | 
					                raw_idx += len(word)
 | 
				
			||||||
                match_found = True
 | 
					                match_found = True
 | 
				
			||||||
| 
						 | 
					@ -445,6 +449,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
 | 
				
			||||||
            # something went wrong, abort
 | 
					            # something went wrong, abort
 | 
				
			||||||
            # (add a warning message?)
 | 
					            # (add a warning message?)
 | 
				
			||||||
            if not match_found:
 | 
					            if not match_found:
 | 
				
			||||||
 | 
					                print("aborting")
 | 
				
			||||||
                return example
 | 
					                return example
 | 
				
			||||||
            # add following whitespace
 | 
					            # add following whitespace
 | 
				
			||||||
            while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
 | 
					            while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user