Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-01 00:17:44 +03:00)
			
		
		
		
	Remove the unclear lemma uniqueness (low-cardinality) check from `debug data`
This commit is contained in:
		
							parent
							
								
									4c45dfabae
								
							
						
					
					
						commit
						6ea2b3524b
					
				|  | @ -693,18 +693,6 @@ def debug_data( | |||
|             # and we might actually want a warning? | ||||
|             msg.info("All trees in dev data present in training data.") | ||||
| 
 | ||||
|         if gold_train_data["n_low_cardinality_lemmas"] > 0: | ||||
|             n = gold_train_data["n_low_cardinality_lemmas"] | ||||
|             msg.warn(f"{n} docs with 1 or 0 unique lemmas.") | ||||
|         else: | ||||
|             msg.good("All training docs meet lemma uniqueness requirements.") | ||||
| 
 | ||||
|         if gold_dev_data["n_low_cardinality_lemmas"] > 0: | ||||
|             n = gold_dev_data["n_low_cardinality_lemmas"] | ||||
|             msg.warn(f"{n} docs with 1 or 0 unique lemmas.") | ||||
|         else: | ||||
|             msg.good("All dev docs meet lemma uniqueness requirements.") | ||||
| 
 | ||||
|         if gold_train_data["no_lemma_annotations"] > 0: | ||||
|             n = gold_train_data["no_lemma_annotations"] | ||||
|             msg.warn(f"{n} docs with no lemma annotations.") | ||||
|  | @ -793,7 +781,6 @@ def _compile_gold( | |||
|         "lemmatizer_trees": set(), | ||||
|         "no_lemma_annotations": 0, | ||||
|         "partial_lemma_annotations": 0, | ||||
|         "n_low_cardinality_lemmas": 0, | ||||
|     } | ||||
|     if "trainable_lemmatizer" in factory_names: | ||||
|         trees = EditTrees(nlp.vocab.strings) | ||||
|  | @ -933,18 +920,11 @@ def _compile_gold( | |||
|                 continue | ||||
|             if any(token.lemma == 0 for token in gold): | ||||
|                 data["partial_lemma_annotations"] += 1 | ||||
|             lemma_set = set() | ||||
|             for token in gold: | ||||
|                 if token.lemma != 0: | ||||
|                     lemma_set.add(token.lemma) | ||||
|                     tree_id = trees.add(token.text, token.lemma_) | ||||
|                     tree_str = trees.tree_to_str(tree_id) | ||||
|                     data["lemmatizer_trees"].add(tree_str) | ||||
|             # We want to identify cases where lemmas aren't assigned | ||||
|             # or are all assigned the same value, as this would indicate | ||||
|             # an issue since we're expecting a large set of lemmas | ||||
|             if len(lemma_set) < 2 and len(gold) > 1: | ||||
|                 data["n_low_cardinality_lemmas"] += 1 | ||||
|     return data | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1244,19 +1244,6 @@ def test_debug_data_trainable_lemmatizer_partial(): | |||
|     data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) | ||||
|     assert data["partial_lemma_annotations"] == 2 | ||||
| 
 | ||||
| def test_debug_data_trainable_lemmatizer_low_cardinality(): | ||||
|     low_cardinality_examples = [ | ||||
|         ("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}), | ||||
|         ("Eat blue ham", {"lemmas": ["no", "no", "no"]}), | ||||
|     ] | ||||
|     nlp = Language() | ||||
|     train_examples = [] | ||||
|     for t in low_cardinality_examples: | ||||
|         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) | ||||
| 
 | ||||
|     data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) | ||||
|     assert data["n_low_cardinality_lemmas"] == 2 | ||||
| 
 | ||||
| def test_debug_data_trainable_lemmatizer_not_annotated(): | ||||
|     unannotated_examples = [ | ||||
|         ("She likes green eggs", {}), | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user