mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Improve tag map initialization and updating (#5764)
* Improve tag map initialization and updating

  Generalize tag map initialization and updating so that the tag map can be loaded correctly prior to loading a `Corpus` with `spacy debug-data` and `spacy train`.

  * Normalize provided tag map as necessary
  * Use the same method for initializing and updating the tag map

* Replace rather than update tag map

  Replace rather than update tag map when loading a custom tag map. Updating the tag map is problematic due to the sorted list of tag names and the fact that the tag map will contain lingering/unwanted tags from the default tag map.

* Update CLI scripts

* Reinitialize cache after loading new tag map

  Reinitialize the cache with the right size after loading a new tag map.
This commit is contained in:
		
							parent
							
								
									b81a89f0a9
								
							
						
					
					
						commit
						9ee1c54f40
					
				| 
						 | 
					@ -131,8 +131,8 @@ def debug_data(
 | 
				
			||||||
    tag_map = {}
 | 
					    tag_map = {}
 | 
				
			||||||
    if tag_map_path is not None:
 | 
					    if tag_map_path is not None:
 | 
				
			||||||
        tag_map = srsly.read_json(tag_map_path)
 | 
					        tag_map = srsly.read_json(tag_map_path)
 | 
				
			||||||
    # Update tag map with provided mapping
 | 
					    # Replace tag map with provided mapping
 | 
				
			||||||
    nlp.vocab.morphology.tag_map.update(tag_map)
 | 
					    nlp.vocab.morphology.load_tag_map(tag_map)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    msg.divider("Data file validation")
 | 
					    msg.divider("Data file validation")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -124,8 +124,8 @@ def train(
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        nlp.begin_training(lambda: train_examples)
 | 
					        nlp.begin_training(lambda: train_examples)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Update tag map with provided mapping
 | 
					    # Replace tag map with provided mapping
 | 
				
			||||||
    nlp.vocab.morphology.tag_map.update(tag_map)
 | 
					    nlp.vocab.morphology.load_tag_map(tag_map)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Create empty extra lexeme tables so the data from spacy-lookups-data
 | 
					    # Create empty extra lexeme tables so the data from spacy-lookups-data
 | 
				
			||||||
    # isn't loaded if these features are accessed
 | 
					    # isn't loaded if these features are accessed
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -64,6 +64,20 @@ cdef class Morphology:
 | 
				
			||||||
        self.mem = Pool()
 | 
					        self.mem = Pool()
 | 
				
			||||||
        self.strings = strings
 | 
					        self.strings = strings
 | 
				
			||||||
        self.tags = PreshMap()
 | 
					        self.tags = PreshMap()
 | 
				
			||||||
 | 
					        self.load_tag_map(tag_map)
 | 
				
			||||||
 | 
					        self.lemmatizer = lemmatizer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self._cache = PreshMapArray(self.n_tags)
 | 
				
			||||||
 | 
					        self.exc = {}
 | 
				
			||||||
 | 
					        if exc is not None:
 | 
				
			||||||
 | 
					            for (tag, orth), attrs in exc.items():
 | 
				
			||||||
 | 
					                attrs = _normalize_props(attrs)
 | 
				
			||||||
 | 
					                self.add_special_case(
 | 
				
			||||||
 | 
					                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def load_tag_map(self, tag_map):
 | 
				
			||||||
 | 
					        self.tag_map = {}
 | 
				
			||||||
 | 
					        self.reverse_index = {}
 | 
				
			||||||
        # Add special space symbol. We prefix with underscore, to make sure it
 | 
					        # Add special space symbol. We prefix with underscore, to make sure it
 | 
				
			||||||
        # always sorts to the end.
 | 
					        # always sorts to the end.
 | 
				
			||||||
        if '_SP' in tag_map:
 | 
					        if '_SP' in tag_map:
 | 
				
			||||||
| 
						 | 
					@ -74,27 +88,14 @@ cdef class Morphology:
 | 
				
			||||||
            self.strings.add('_SP')
 | 
					            self.strings.add('_SP')
 | 
				
			||||||
            tag_map = dict(tag_map)
 | 
					            tag_map = dict(tag_map)
 | 
				
			||||||
            tag_map['_SP'] = space_attrs
 | 
					            tag_map['_SP'] = space_attrs
 | 
				
			||||||
        self.tag_names = tuple(sorted(tag_map.keys()))
 | 
					 | 
				
			||||||
        self.tag_map = {}
 | 
					 | 
				
			||||||
        self.lemmatizer = lemmatizer
 | 
					 | 
				
			||||||
        self.n_tags = len(tag_map)
 | 
					 | 
				
			||||||
        self.reverse_index = {}
 | 
					 | 
				
			||||||
        self._load_from_tag_map(tag_map)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        self._cache = PreshMapArray(self.n_tags)
 | 
					 | 
				
			||||||
        self.exc = {}
 | 
					 | 
				
			||||||
        if exc is not None:
 | 
					 | 
				
			||||||
            for (tag, orth), attrs in exc.items():
 | 
					 | 
				
			||||||
                attrs = _normalize_props(attrs)
 | 
					 | 
				
			||||||
                self.add_special_case(
 | 
					 | 
				
			||||||
                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def _load_from_tag_map(self, tag_map):
 | 
					 | 
				
			||||||
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
 | 
					        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
 | 
				
			||||||
            attrs = _normalize_props(attrs)
 | 
					            attrs = _normalize_props(attrs)
 | 
				
			||||||
            self.add(attrs)
 | 
					            self.add(attrs)
 | 
				
			||||||
            self.tag_map[tag_str] = dict(attrs)
 | 
					            self.tag_map[tag_str] = dict(attrs)
 | 
				
			||||||
            self.reverse_index[self.strings.add(tag_str)] = i
 | 
					            self.reverse_index[self.strings.add(tag_str)] = i
 | 
				
			||||||
 | 
					        self.tag_names = tuple(sorted(self.tag_map.keys()))
 | 
				
			||||||
 | 
					        self.n_tags = len(self.tag_map)
 | 
				
			||||||
 | 
					        self._cache = PreshMapArray(self.n_tags)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __reduce__(self):
 | 
					    def __reduce__(self):
 | 
				
			||||||
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
 | 
					        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -27,8 +27,7 @@ def test_overfitting_IO():
 | 
				
			||||||
    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
 | 
					    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
    tagger = nlp.create_pipe("tagger")
 | 
					    tagger = nlp.create_pipe("tagger")
 | 
				
			||||||
    for tag, values in TAG_MAP.items():
 | 
					    nlp.vocab.morphology.load_tag_map(TAG_MAP)
 | 
				
			||||||
        tagger.add_label(tag, values)
 | 
					 | 
				
			||||||
    train_examples = []
 | 
					    train_examples = []
 | 
				
			||||||
    for t in TRAIN_DATA:
 | 
					    for t in TRAIN_DATA:
 | 
				
			||||||
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
 | 
					        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user