mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Improve tag map initialization and updating (#5768)
* Improve tag map initialization and updating: generalize tag map initialization and updating so that a provided tag map can be loaded correctly in the CLI.
  - Normalize the provided tag map as necessary.
  - Use the same method for initializing and overwriting the tag map.
* Reinitialize cache after loading new tag map: reinitialize the cache with the right size after loading a new tag map.
This commit is contained in:
parent
7e14272096
commit
597bcc629e
|
@ -70,8 +70,8 @@ def debug_data(
|
||||||
else:
|
else:
|
||||||
lang_cls = get_lang_class(lang)
|
lang_cls = get_lang_class(lang)
|
||||||
nlp = lang_cls()
|
nlp = lang_cls()
|
||||||
# Update tag map with provided mapping
|
# Replace tag map with provided mapping
|
||||||
nlp.vocab.morphology.tag_map.update(tag_map)
|
nlp.vocab.morphology.load_tag_map(tag_map)
|
||||||
|
|
||||||
msg.divider("Data format validation")
|
msg.divider("Data format validation")
|
||||||
|
|
||||||
|
|
|
@ -250,8 +250,8 @@ def train(
|
||||||
pipe_cfg = {}
|
pipe_cfg = {}
|
||||||
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
||||||
|
|
||||||
# Update tag map with provided mapping
|
# Replace tag map with provided mapping
|
||||||
nlp.vocab.morphology.tag_map.update(tag_map)
|
nlp.vocab.morphology.load_tag_map(tag_map)
|
||||||
|
|
||||||
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
||||||
# isn't loaded if these features are accessed
|
# isn't loaded if these features are accessed
|
||||||
|
|
|
@ -150,6 +150,19 @@ cdef class Morphology:
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.strings = string_store
|
self.strings = string_store
|
||||||
self.tags = PreshMap()
|
self.tags = PreshMap()
|
||||||
|
self._feat_map = MorphologyClassMap(FEATURES)
|
||||||
|
self.load_tag_map(tag_map)
|
||||||
|
self.lemmatizer = lemmatizer
|
||||||
|
|
||||||
|
self._cache = PreshMapArray(self.n_tags)
|
||||||
|
self.exc = {}
|
||||||
|
if exc is not None:
|
||||||
|
for (tag, orth), attrs in exc.items():
|
||||||
|
attrs = _normalize_props(attrs)
|
||||||
|
self.add_special_case(
|
||||||
|
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
||||||
|
|
||||||
|
def load_tag_map(self, tag_map):
|
||||||
# Add special space symbol. We prefix with underscore, to make sure it
|
# Add special space symbol. We prefix with underscore, to make sure it
|
||||||
# always sorts to the end.
|
# always sorts to the end.
|
||||||
if '_SP' in tag_map:
|
if '_SP' in tag_map:
|
||||||
|
@ -160,29 +173,17 @@ cdef class Morphology:
|
||||||
self.strings.add('_SP')
|
self.strings.add('_SP')
|
||||||
tag_map = dict(tag_map)
|
tag_map = dict(tag_map)
|
||||||
tag_map['_SP'] = space_attrs
|
tag_map['_SP'] = space_attrs
|
||||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
|
||||||
self.tag_map = {}
|
self.tag_map = {}
|
||||||
self.lemmatizer = lemmatizer
|
|
||||||
self.n_tags = len(tag_map)
|
|
||||||
self.reverse_index = {}
|
self.reverse_index = {}
|
||||||
self._feat_map = MorphologyClassMap(FEATURES)
|
|
||||||
self._load_from_tag_map(tag_map)
|
|
||||||
|
|
||||||
self._cache = PreshMapArray(self.n_tags)
|
|
||||||
self.exc = {}
|
|
||||||
if exc is not None:
|
|
||||||
for (tag, orth), attrs in exc.items():
|
|
||||||
attrs = _normalize_props(attrs)
|
|
||||||
self.add_special_case(
|
|
||||||
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
|
||||||
|
|
||||||
def _load_from_tag_map(self, tag_map):
|
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
self.add({self._feat_map.id2feat[feat] for feat in attrs
|
self.add({self._feat_map.id2feat[feat] for feat in attrs
|
||||||
if feat in self._feat_map.id2feat})
|
if feat in self._feat_map.id2feat})
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
self.reverse_index[self.strings.add(tag_str)] = i
|
self.reverse_index[self.strings.add(tag_str)] = i
|
||||||
|
self.tag_names = tuple(sorted(self.tag_map.keys()))
|
||||||
|
self.n_tags = len(self.tag_map)
|
||||||
|
self._cache = PreshMapArray(self.n_tags)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user