mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-20 18:54:21 +03:00 
			
		
		
		
	💫 Fix interaction of lemmatizer and tokenizer exceptions (#3388)
Closes #2203. Closes #3268. Lemmas set from outside the `Morphology` class were being overwritten. The result was especially confusing when deserialising, as it meant some lemmas could change when storing and retrieving a `Doc` object. This PR applies two fixes: 1) When we go to set the lemma in the `Morphology` class, first check whether a lemma is already set. If so, don't overwrite. 2) When we load with `doc.from_array()`, take care to apply the `TAG` field first. This allows other fields to overwrite the `TAG` implied properties, if they're provided explicitly (e.g. the `LEMMA`). ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
		
							parent
							
								
									04ca710da7
								
							
						
					
					
						commit
						80b94313b6
					
				|  | @ -110,7 +110,8 @@ cdef class Morphology: | |||
|             analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, | ||||
|                                             self.tag_map.get(tag_str, {})) | ||||
|             self._cache.set(tag_id, token.lex.orth, analysis) | ||||
|         token.lemma = analysis.lemma | ||||
|         if token.lemma == 0: | ||||
|             token.lemma = analysis.lemma | ||||
|         token.pos = analysis.tag.pos | ||||
|         token.tag = analysis.tag.name | ||||
|         token.morph = analysis.tag.morph | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| import numpy | ||||
| from spacy.tokens import Doc | ||||
| from spacy.displacy import render | ||||
| from spacy.gold import iob_to_biluo | ||||
|  | @ -39,6 +40,26 @@ def test_issue2179(): | |||
|     assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) | ||||
| 
 | ||||
| 
 | ||||
| def test_issue2203(en_vocab): | ||||
|     """Test that lemmas are set correctly in doc.from_array.""" | ||||
|     words = ["I", "'ll", "survive"] | ||||
|     tags = ["PRP", "MD", "VB"] | ||||
|     lemmas = ["-PRON-", "will", "survive"] | ||||
|     tag_ids = [en_vocab.strings.add(tag) for tag in tags] | ||||
|     lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] | ||||
|     doc = Doc(en_vocab, words=words) | ||||
|     # Work around lemma corruption problem and set lemmas after tags | ||||
|     doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) | ||||
|     doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) | ||||
|     assert [t.tag_ for t in doc] == tags | ||||
|     assert [t.lemma_ for t in doc] == lemmas | ||||
|     # We need to serialize both tag and lemma, since this is what causes the bug | ||||
|     doc_array = doc.to_array(["TAG", "LEMMA"]) | ||||
|     new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array) | ||||
|     assert [t.tag_ for t in new_doc] == tags | ||||
|     assert [t.lemma_ for t in new_doc] == lemmas | ||||
| 
 | ||||
| 
 | ||||
| def test_issue2219(en_vocab): | ||||
|     vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] | ||||
|     add_vecs_to_vocab(en_vocab, vectors) | ||||
|  |  | |||
|  | @ -763,17 +763,18 @@ cdef class Doc: | |||
|             attr_ids[i] = attr_id | ||||
|         if len(array.shape) == 1: | ||||
|             array = array.reshape((array.size, 1)) | ||||
|         # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA | ||||
|         if TAG in attrs: | ||||
|             col = attrs.index(TAG) | ||||
|             for i in range(length): | ||||
|                 if array[i, col] != 0: | ||||
|                     self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) | ||||
|         # Now load the data | ||||
|         for i in range(self.length): | ||||
|             token = &self.c[i] | ||||
|             for j in range(n_attrs): | ||||
|                 Token.set_struct_attr(token, attr_ids[j], array[i, j]) | ||||
|         # Auxiliary loading logic | ||||
|         for col, attr_id in enumerate(attrs): | ||||
|             if attr_id == TAG: | ||||
|                 for i in range(length): | ||||
|                     if array[i, col] != 0: | ||||
|                         self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) | ||||
|                 if attr_ids[j] != TAG: | ||||
|                     Token.set_struct_attr(token, attr_ids[j], array[i, j]) | ||||
|         # Set flags | ||||
|         self.is_parsed = bool(self.is_parsed or HEAD in attrs or DEP in attrs) | ||||
|         self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user