mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Improve tag map initialization and updating (#5768)
* Improve tag map initialization and updating

Generalize tag map initialization and updating so that a provided tag map can be loaded correctly in the CLI.

* normalize provided tag map as necessary
* use the same method for initializing and overwriting the tag map

* Reinitialize cache after loading new tag map

Reinitialize the cache with the right size after loading a new tag map.
This commit is contained in:
parent
7e14272096
commit
597bcc629e
|
@ -70,8 +70,8 @@ def debug_data(
|
|||
else:
|
||||
lang_cls = get_lang_class(lang)
|
||||
nlp = lang_cls()
|
||||
# Update tag map with provided mapping
|
||||
nlp.vocab.morphology.tag_map.update(tag_map)
|
||||
# Replace tag map with provided mapping
|
||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
||||
|
||||
msg.divider("Data format validation")
|
||||
|
||||
|
|
|
@ -250,8 +250,8 @@ def train(
|
|||
pipe_cfg = {}
|
||||
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
||||
|
||||
# Update tag map with provided mapping
|
||||
nlp.vocab.morphology.tag_map.update(tag_map)
|
||||
# Replace tag map with provided mapping
|
||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
||||
|
||||
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
||||
# isn't loaded if these features are accessed
|
||||
|
|
|
@ -150,6 +150,19 @@ cdef class Morphology:
|
|||
self.mem = Pool()
|
||||
self.strings = string_store
|
||||
self.tags = PreshMap()
|
||||
self._feat_map = MorphologyClassMap(FEATURES)
|
||||
self.load_tag_map(tag_map)
|
||||
self.lemmatizer = lemmatizer
|
||||
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
self.exc = {}
|
||||
if exc is not None:
|
||||
for (tag, orth), attrs in exc.items():
|
||||
attrs = _normalize_props(attrs)
|
||||
self.add_special_case(
|
||||
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
||||
|
||||
def load_tag_map(self, tag_map):
|
||||
# Add special space symbol. We prefix with underscore, to make sure it
|
||||
# always sorts to the end.
|
||||
if '_SP' in tag_map:
|
||||
|
@ -160,29 +173,17 @@ cdef class Morphology:
|
|||
self.strings.add('_SP')
|
||||
tag_map = dict(tag_map)
|
||||
tag_map['_SP'] = space_attrs
|
||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.tag_map = {}
|
||||
self.lemmatizer = lemmatizer
|
||||
self.n_tags = len(tag_map)
|
||||
self.reverse_index = {}
|
||||
self._feat_map = MorphologyClassMap(FEATURES)
|
||||
self._load_from_tag_map(tag_map)
|
||||
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
self.exc = {}
|
||||
if exc is not None:
|
||||
for (tag, orth), attrs in exc.items():
|
||||
attrs = _normalize_props(attrs)
|
||||
self.add_special_case(
|
||||
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
||||
|
||||
def _load_from_tag_map(self, tag_map):
|
||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||
attrs = _normalize_props(attrs)
|
||||
self.add({self._feat_map.id2feat[feat] for feat in attrs
|
||||
if feat in self._feat_map.id2feat})
|
||||
self.tag_map[tag_str] = dict(attrs)
|
||||
self.reverse_index[self.strings.add(tag_str)] = i
|
||||
self.tag_names = tuple(sorted(self.tag_map.keys()))
|
||||
self.n_tags = len(self.tag_map)
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
|
||||
def __reduce__(self):
|
||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||
|
|
Loading…
Reference in New Issue
Block a user