From 9ee1c54f40e901533ef16cd148556cbf83cca6a7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 19 Jul 2020 13:13:57 +0200
Subject: [PATCH] Improve tag map initialization and updating (#5764)

* Improve tag map initialization and updating

Generalize tag map initialization and updating so that the tag map can
be loaded correctly prior to loading a `Corpus` with `spacy debug-data`
and `spacy train`.

* normalize provided tag map as necessary
* use the same method for initializing and updating the tag map

* Replace rather than update tag map

Replace rather than update tag map when loading a custom tag map.
Updating the tag map is problematic due to the sorted list of tag names
and the fact that the tag map will contain lingering/unwanted tags from
the default tag map.

* Update CLI scripts

* Reinitialize cache after loading new tag map

Reinitialize the cache with the right size after loading a new tag map.
---
 spacy/cli/debug_data.py             |  4 ++--
 spacy/cli/train.py                  |  4 ++--
 spacy/morphology.pyx                | 33 +++++++++++++++++----------------
 spacy/tests/pipeline/test_tagger.py |  3 +--
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 9d1986d8a..49bfa9e82 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -131,8 +131,8 @@ def debug_data(
     tag_map = {}
     if tag_map_path is not None:
         tag_map = srsly.read_json(tag_map_path)
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     msg.divider("Data file validation")
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index feebc30d4..f69ad5b60 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -124,8 +124,8 @@ def train(
         )
         nlp.begin_training(lambda: train_examples)
 
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 0852418f2..dac10137b 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -64,6 +64,20 @@ cdef class Morphology:
         self.mem = Pool()
         self.strings = strings
         self.tags = PreshMap()
+        self.load_tag_map(tag_map)
+        self.lemmatizer = lemmatizer
+
+        self._cache = PreshMapArray(self.n_tags)
+        self.exc = {}
+        if exc is not None:
+            for (tag, orth), attrs in exc.items():
+                attrs = _normalize_props(attrs)
+                self.add_special_case(
+                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
+
+    def load_tag_map(self, tag_map):
+        self.tag_map = {}
+        self.reverse_index = {}
         # Add special space symbol. We prefix with underscore, to make sure it
         # always sorts to the end.
         if '_SP' in tag_map:
@@ -74,27 +88,14 @@ cdef class Morphology:
             self.strings.add('_SP')
             tag_map = dict(tag_map)
             tag_map['_SP'] = space_attrs
-        self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_map = {}
-        self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map)
-        self.reverse_index = {}
-        self._load_from_tag_map(tag_map)
-
-        self._cache = PreshMapArray(self.n_tags)
-        self.exc = {}
-        if exc is not None:
-            for (tag, orth), attrs in exc.items():
-                attrs = _normalize_props(attrs)
-                self.add_special_case(
-                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
-    def _load_from_tag_map(self, tag_map):
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
             self.add(attrs)
             self.tag_map[tag_str] = dict(attrs)
             self.reverse_index[self.strings.add(tag_str)] = i
+        self.tag_names = tuple(sorted(self.tag_map.keys()))
+        self.n_tags = len(self.tag_map)
+        self._cache = PreshMapArray(self.n_tags)
 
     def __reduce__(self):
         return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index aedf8e2b3..ec7a15115 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -27,8 +27,7 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
     nlp = English()
     tagger = nlp.create_pipe("tagger")
-    for tag, values in TAG_MAP.items():
-        tagger.add_label(tag, values)
+    nlp.vocab.morphology.load_tag_map(TAG_MAP)
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
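
Usage note (not part of the patch): a minimal sketch of how the new
`Morphology.load_tag_map` call replaces the old `tag_map.update` pattern used
by the CLI scripts above. The tag map values and the commented-out JSON path
below are illustrative assumptions, not taken from the diff.

    import srsly
    from spacy.lang.en import English

    nlp = English()

    # A custom tag map could also be loaded from JSON, as debug-data/train do:
    # custom_tag_map = srsly.read_json("tag_map.json")  # hypothetical path
    custom_tag_map = {"NN": {"pos": "NOUN"}, "VB": {"pos": "VERB"}}

    # Replace (rather than update) the vocab's tag map; this rebuilds
    # tag_names, reverse_index, n_tags and the cache in one step.
    nlp.vocab.morphology.load_tag_map(custom_tag_map)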