mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Ignore misaligned annotation in Morphologizer.get_loss (#6363)
Fix bug where `Morphologizer.get_loss` treated misaligned annotation as `EMPTY_MORPH` rather than ignoring it. Remove unneeded default `EMPTY_MORPH` mappings.
This commit is contained in:
		
							parent
							
								
									a0c899a0ff
								
							
						
					
					
						commit
						a7e7d6c6c9
					
				| 
						 | 
				
			
			@ -92,9 +92,6 @@ class Morphologizer(Tagger):
 | 
			
		|||
        # 2) labels_pos stores a mapping from morph+POS->POS
 | 
			
		||||
        cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
 | 
			
		||||
        self.cfg = dict(sorted(cfg.items()))
 | 
			
		||||
        # add mappings for empty morph
 | 
			
		||||
        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
 | 
			
		||||
        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def labels(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -201,8 +198,8 @@ class Morphologizer(Tagger):
 | 
			
		|||
                doc_tag_ids = doc_tag_ids.get()
 | 
			
		||||
            for j, tag_id in enumerate(doc_tag_ids):
 | 
			
		||||
                morph = self.labels[tag_id]
 | 
			
		||||
                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
 | 
			
		||||
                doc.c[j].pos = self.cfg["labels_pos"][morph]
 | 
			
		||||
                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
 | 
			
		||||
                doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
 | 
			
		||||
 | 
			
		||||
    def get_loss(self, examples, scores):
 | 
			
		||||
        """Find the loss and gradient of loss for the batch of documents and
 | 
			
		||||
| 
						 | 
				
			
			@ -228,8 +225,8 @@ class Morphologizer(Tagger):
 | 
			
		|||
                # doesn't, so if either is None, treat both as None here so that
 | 
			
		||||
                # truths doesn't end up with an unknown morph+POS combination
 | 
			
		||||
                if pos is None or morph is None:
 | 
			
		||||
                    pos = None
 | 
			
		||||
                    morph = None
 | 
			
		||||
                    label = None
 | 
			
		||||
                else:
 | 
			
		||||
                    label_dict = Morphology.feats_to_dict(morph)
 | 
			
		||||
                    if pos:
 | 
			
		||||
                        label_dict[self.POS_FEAT] = pos
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -116,3 +116,23 @@ def test_overfitting_IO():
 | 
			
		|||
    no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
 | 
			
		||||
    assert_equal(batch_deps_1, batch_deps_2)
 | 
			
		||||
    assert_equal(batch_deps_1, no_batch_deps)
 | 
			
		||||
 | 
			
		||||
    # Test without POS
 | 
			
		||||
    nlp.remove_pipe("morphologizer")
 | 
			
		||||
    nlp.add_pipe("morphologizer")
 | 
			
		||||
    for example in train_examples:
 | 
			
		||||
        for token in example.reference:
 | 
			
		||||
            token.pos_ = ""
 | 
			
		||||
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
 | 
			
		||||
    for i in range(50):
 | 
			
		||||
        losses = {}
 | 
			
		||||
        nlp.update(train_examples, sgd=optimizer, losses=losses)
 | 
			
		||||
    assert losses["morphologizer"] < 0.00001
 | 
			
		||||
 | 
			
		||||
    # Test the trained model
 | 
			
		||||
    test_text = "I like blue ham"
 | 
			
		||||
    doc = nlp(test_text)
 | 
			
		||||
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
 | 
			
		||||
    gold_pos_tags = ["", "", "", ""]
 | 
			
		||||
    assert [str(t.morph) for t in doc] == gold_morphs
 | 
			
		||||
    assert [t.pos_ for t in doc] == gold_pos_tags
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user