mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 08:27:44 +03:00 
			
		
		
		
	cleanup
This commit is contained in:
		
							parent
							
								
									64b57204e2
								
							
						
					
					
						commit
						c09d99a069
					
				|  | @ -676,6 +676,19 @@ def debug_data( | ||||||
|         trees_train: Set[str] = gold_train_data["lemmatizer_trees"] |         trees_train: Set[str] = gold_train_data["lemmatizer_trees"] | ||||||
|         trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"] |         trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"] | ||||||
| 
 | 
 | ||||||
|  |         # This is necessary context if someone is attempting to interpret whether the | ||||||
|  |         # number of trees exclusively in the dev set is meaningful. | ||||||
|  |         msg.info(f"{len(trees_train)} lemmatizer trees generated from training data.") | ||||||
|  |         msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data.") | ||||||
|  |         dev_not_train = trees_dev - trees_train | ||||||
|  |         if len(dev_not_train) != 0: | ||||||
|  |             msg.warn( | ||||||
|  |                 f"{len(dev_not_train)} lemmatizer trees were found exclusively in the dev data." | ||||||
|  |             ) | ||||||
|  |         else: | ||||||
|  |             # Would we ever expect this case? It seems like it would be pretty rare. | ||||||
|  |             msg.good("All trees in dev data present in training data.") | ||||||
|  | 
 | ||||||
|         if gold_train_data["n_low_cardinality_lemmas"] > 0: |         if gold_train_data["n_low_cardinality_lemmas"] > 0: | ||||||
|             msg.warn( |             msg.warn( | ||||||
|                 f"{gold_train_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas." |                 f"{gold_train_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas." | ||||||
|  | @ -683,12 +696,6 @@ def debug_data( | ||||||
|         else: |         else: | ||||||
|             msg.good("Training docs have sufficient unique lemmas") |             msg.good("Training docs have sufficient unique lemmas") | ||||||
| 
 | 
 | ||||||
|         train_not_dev = trees_train - trees_dev |  | ||||||
|         if len(train_not_dev) != 0: |  | ||||||
|             msg.warn(f"{len(train_not_dev)} labels were found only in the train data.") |  | ||||||
|         else: |  | ||||||
|             msg.good("Training data contains all lemmatizer trees in dev set.") |  | ||||||
| 
 |  | ||||||
|         if gold_train_data["n_low_cardinality_lemmas"] > 0: |         if gold_train_data["n_low_cardinality_lemmas"] > 0: | ||||||
|             msg.warn( |             msg.warn( | ||||||
|                 f"{gold_dev_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas." |                 f"{gold_dev_data['n_low_cardinality_lemmas']} docs with 1 or 0 unique lemmas." | ||||||
|  | @ -696,12 +703,6 @@ def debug_data( | ||||||
|         else: |         else: | ||||||
|             msg.good("Dev docs have sufficient unique lemmas") |             msg.good("Dev docs have sufficient unique lemmas") | ||||||
| 
 | 
 | ||||||
|         dev_not_train = trees_dev - trees_train |  | ||||||
|         if len(dev_not_train) != 0: |  | ||||||
|             msg.warn(f"{len(dev_not_train)} labels were found only in the dev data.") |  | ||||||
|         else: |  | ||||||
|             msg.good("Trees in dev data present in training data.") |  | ||||||
| 
 |  | ||||||
|     msg.divider("Summary") |     msg.divider("Summary") | ||||||
|     good_counts = msg.counts[MESSAGES.GOOD] |     good_counts = msg.counts[MESSAGES.GOOD] | ||||||
|     warn_counts = msg.counts[MESSAGES.WARN] |     warn_counts = msg.counts[MESSAGES.WARN] | ||||||
|  | @ -906,7 +907,10 @@ def _compile_gold( | ||||||
|                     tree_id = trees.add(token.text, token.lemma_) |                     tree_id = trees.add(token.text, token.lemma_) | ||||||
|                     tree_str = trees.tree_to_str(tree_id) |                     tree_str = trees.tree_to_str(tree_id) | ||||||
|                     data["lemmatizer_trees"].add(tree_str) |                     data["lemmatizer_trees"].add(tree_str) | ||||||
|             if len(lemma_set) < 2: |             # We want to identify cases where lemmas aren't assigned | ||||||
|  |             # or are all assigned the same value, as this would indicate | ||||||
|  |             # an issue since we're expecting a large set of lemmas | ||||||
|  |             if len(lemma_set) < 2 and len(gold) > 1: | ||||||
|                 data["n_low_cardinality_lemmas"] += 1 |                 data["n_low_cardinality_lemmas"] += 1 | ||||||
|     return data |     return data | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user