From 597bcc629e173dfd87422188dc76a2f1053a9bba Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 19 Jul 2020 11:13:39 +0200
Subject: [PATCH] Improve tag map initialization and updating (#5768)

* Improve tag map initialization and updating

Generalize tag map initialization and updating so that a provided tag
map can be loaded correctly in the CLI.

* normalize provided tag map as necessary
* use the same method for initializing and overwriting the tag map

* Reinitialize cache after loading new tag map

Reinitialize the cache with the right size after loading a new tag map.
---
 spacy/cli/debug_data.py |  4 ++--
 spacy/cli/train.py      |  4 ++--
 spacy/morphology.pyx    | 31 ++++++++++++++++---------------
 3 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 7a4a093e2..22540c779 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -70,8 +70,8 @@ def debug_data(
     else:
         lang_cls = get_lang_class(lang)
         nlp = lang_cls()
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     msg.divider("Data format validation")
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index b81214b95..e24aa8a95 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -250,8 +250,8 @@ def train(
                 pipe_cfg = {}
             nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
 
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index a9bab38ed..18bba0124 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -150,6 +150,19 @@ cdef class Morphology:
         self.mem = Pool()
         self.strings = string_store
         self.tags = PreshMap()
+        self._feat_map = MorphologyClassMap(FEATURES)
+        self.load_tag_map(tag_map)
+        self.lemmatizer = lemmatizer
+
+        self._cache = PreshMapArray(self.n_tags)
+        self.exc = {}
+        if exc is not None:
+            for (tag, orth), attrs in exc.items():
+                attrs = _normalize_props(attrs)
+                self.add_special_case(
+                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
+
+    def load_tag_map(self, tag_map):
         # Add special space symbol. We prefix with underscore, to make sure it
         # always sorts to the end.
         if '_SP' in tag_map:
@@ -160,29 +173,17 @@ cdef class Morphology:
             self.strings.add('_SP')
             tag_map = dict(tag_map)
             tag_map['_SP'] = space_attrs
-        self.tag_names = tuple(sorted(tag_map.keys()))
         self.tag_map = {}
-        self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map)
         self.reverse_index = {}
-        self._feat_map = MorphologyClassMap(FEATURES)
-        self._load_from_tag_map(tag_map)
-
-        self._cache = PreshMapArray(self.n_tags)
-        self.exc = {}
-        if exc is not None:
-            for (tag, orth), attrs in exc.items():
-                attrs = _normalize_props(attrs)
-                self.add_special_case(
-                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
-    def _load_from_tag_map(self, tag_map):
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
             self.add({self._feat_map.id2feat[feat] for feat in attrs
                       if feat in self._feat_map.id2feat})
             self.tag_map[tag_str] = dict(attrs)
             self.reverse_index[self.strings.add(tag_str)] = i
+        self.tag_names = tuple(sorted(self.tag_map.keys()))
+        self.n_tags = len(self.tag_map)
+        self._cache = PreshMapArray(self.n_tags)
 
     def __reduce__(self):
         return (Morphology, (self.strings, self.tag_map, self.lemmatizer,