mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	fix augment (needs further testing)
This commit is contained in:
		
							parent
							
								
									4ed399c848
								
							
						
					
					
						commit
						3c4f9e4cc4
					
				|  | @ -1,14 +1,14 @@ | ||||||
| import random | import random | ||||||
| import itertools | import itertools | ||||||
| from .example import Example |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def make_orth_variants(nlp, example, orth_variant_level=0.0): | def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): | ||||||
|     if random.random() >= orth_variant_level: |     if random.random() >= orth_variant_level: | ||||||
|         return example |         return raw_text, orig_token_dict | ||||||
|     if not example.token_annotation: |     if not orig_token_dict: | ||||||
|         return example |         return raw_text, orig_token_dict | ||||||
|     raw = example.text |     raw = raw_text | ||||||
|  |     token_dict = orig_token_dict | ||||||
|     lower = False |     lower = False | ||||||
|     if random.random() >= 0.5: |     if random.random() >= 0.5: | ||||||
|         lower = True |         lower = True | ||||||
|  | @ -16,16 +16,10 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): | ||||||
|             raw = raw.lower() |             raw = raw.lower() | ||||||
|     ndsv = nlp.Defaults.single_orth_variants |     ndsv = nlp.Defaults.single_orth_variants | ||||||
|     ndpv = nlp.Defaults.paired_orth_variants |     ndpv = nlp.Defaults.paired_orth_variants | ||||||
|     # modify words in paragraph_tuples |     words = token_dict.get("words", []) | ||||||
|     variant_example = Example(doc=nlp.make_doc(raw)) |     tags = token_dict.get("tags", []) | ||||||
|     token_annotation = example.token_annotation |     # keep unmodified if words or tags are not defined | ||||||
|     words = token_annotation.words |     if words and tags: | ||||||
|     tags = token_annotation.tags |  | ||||||
|     if not words or not tags: |  | ||||||
|         # add the unmodified annotation |  | ||||||
|         token_dict = token_annotation.to_dict() |  | ||||||
|         variant_example.token_annotation = TokenAnnotation(**token_dict) |  | ||||||
|     else: |  | ||||||
|         if lower: |         if lower: | ||||||
|             words = [w.lower() for w in words] |             words = [w.lower() for w in words] | ||||||
|         # single variants |         # single variants | ||||||
|  | @ -56,12 +50,9 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): | ||||||
|                             if words[word_idx] in pair: |                             if words[word_idx] in pair: | ||||||
|                                 pair_idx = pair.index(words[word_idx]) |                                 pair_idx = pair.index(words[word_idx]) | ||||||
|                     words[word_idx] = punct_choices[punct_idx][pair_idx] |                     words[word_idx] = punct_choices[punct_idx][pair_idx] | ||||||
| 
 |  | ||||||
|         token_dict = token_annotation.to_dict() |  | ||||||
|         token_dict["words"] = words |         token_dict["words"] = words | ||||||
|         token_dict["tags"] = tags |         token_dict["tags"] = tags | ||||||
|         variant_example.token_annotation = TokenAnnotation(**token_dict) |     # modify raw | ||||||
|     # modify raw to match variant_paragraph_tuples |  | ||||||
|     if raw is not None: |     if raw is not None: | ||||||
|         variants = [] |         variants = [] | ||||||
|         for single_variants in ndsv: |         for single_variants in ndsv: | ||||||
|  | @ -80,7 +71,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): | ||||||
|         while raw_idx < len(raw) and raw[raw_idx].isspace(): |         while raw_idx < len(raw) and raw[raw_idx].isspace(): | ||||||
|             variant_raw += raw[raw_idx] |             variant_raw += raw[raw_idx] | ||||||
|             raw_idx += 1 |             raw_idx += 1 | ||||||
|         for word in variant_example.token_annotation.words: |         for word in words: | ||||||
|             match_found = False |             match_found = False | ||||||
|             # skip whitespace words |             # skip whitespace words | ||||||
|             if word.isspace(): |             if word.isspace(): | ||||||
|  | @ -100,14 +91,13 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): | ||||||
|             # something went wrong, abort |             # something went wrong, abort | ||||||
|             # (add a warning message?) |             # (add a warning message?) | ||||||
|             if not match_found: |             if not match_found: | ||||||
|                 return example |                 return raw_text, orig_token_dict | ||||||
|             # add following whitespace |             # add following whitespace | ||||||
|             while raw_idx < len(raw) and raw[raw_idx].isspace(): |             while raw_idx < len(raw) and raw[raw_idx].isspace(): | ||||||
|                 variant_raw += raw[raw_idx] |                 variant_raw += raw[raw_idx] | ||||||
|                 raw_idx += 1 |                 raw_idx += 1 | ||||||
|         variant_example.doc = variant_raw |         raw = variant_raw | ||||||
|         return variant_example |     return raw, token_dict | ||||||
|     return variant_example |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def add_noise(orig, noise_level): | def add_noise(orig, noise_level): | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user