Improve tag map initialization and updating (#5768)

* Improve tag map initialization and updating

Generalize tag map initialization and updating so that a provided tag
map can be loaded correctly in the CLI.

* Normalize the provided tag map as necessary
* Use the same method for initializing and overwriting the tag map

* Reinitialize cache after loading new tag map

Reinitialize the cache after loading a new tag map so that its size matches the number of tags in the new map.
This commit is contained in:
Adriane Boyd 2020-07-19 11:13:39 +02:00 committed by GitHub
parent 7e14272096
commit 597bcc629e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 19 deletions

View File

@ -70,8 +70,8 @@ def debug_data(
else: else:
lang_cls = get_lang_class(lang) lang_cls = get_lang_class(lang)
nlp = lang_cls() nlp = lang_cls()
# Update tag map with provided mapping # Replace tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map) nlp.vocab.morphology.load_tag_map(tag_map)
msg.divider("Data format validation") msg.divider("Data format validation")

View File

@ -250,8 +250,8 @@ def train(
pipe_cfg = {} pipe_cfg = {}
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
# Update tag map with provided mapping # Replace tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map) nlp.vocab.morphology.load_tag_map(tag_map)
# Create empty extra lexeme tables so the data from spacy-lookups-data # Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed # isn't loaded if these features are accessed

View File

@ -150,6 +150,19 @@ cdef class Morphology:
self.mem = Pool() self.mem = Pool()
self.strings = string_store self.strings = string_store
self.tags = PreshMap() self.tags = PreshMap()
self._feat_map = MorphologyClassMap(FEATURES)
self.load_tag_map(tag_map)
self.lemmatizer = lemmatizer
self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
for (tag, orth), attrs in exc.items():
attrs = _normalize_props(attrs)
self.add_special_case(
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
def load_tag_map(self, tag_map):
# Add special space symbol. We prefix with underscore, to make sure it # Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end. # always sorts to the end.
if '_SP' in tag_map: if '_SP' in tag_map:
@ -160,29 +173,17 @@ cdef class Morphology:
self.strings.add('_SP') self.strings.add('_SP')
tag_map = dict(tag_map) tag_map = dict(tag_map)
tag_map['_SP'] = space_attrs tag_map['_SP'] = space_attrs
self.tag_names = tuple(sorted(tag_map.keys()))
self.tag_map = {} self.tag_map = {}
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map)
self.reverse_index = {} self.reverse_index = {}
self._feat_map = MorphologyClassMap(FEATURES)
self._load_from_tag_map(tag_map)
self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
for (tag, orth), attrs in exc.items():
attrs = _normalize_props(attrs)
self.add_special_case(
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
def _load_from_tag_map(self, tag_map):
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs) attrs = _normalize_props(attrs)
self.add({self._feat_map.id2feat[feat] for feat in attrs self.add({self._feat_map.id2feat[feat] for feat in attrs
if feat in self._feat_map.id2feat}) if feat in self._feat_map.id2feat})
self.tag_map[tag_str] = dict(attrs) self.tag_map[tag_str] = dict(attrs)
self.reverse_index[self.strings.add(tag_str)] = i self.reverse_index[self.strings.add(tag_str)] = i
self.tag_names = tuple(sorted(self.tag_map.keys()))
self.n_tags = len(self.tag_map)
self._cache = PreshMapArray(self.n_tags)
def __reduce__(self): def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer, return (Morphology, (self.strings, self.tag_map, self.lemmatizer,