Improve tag map initialization and updating (#5764)

* Improve tag map initialization and updating

Generalize tag map initialization and updating so that the tag map can
be loaded correctly prior to loading a `Corpus` with `spacy debug-data`
and `spacy train`.

* normalize provided tag map as necessary
* use the same method for initializing and updating the tag map

* Replace rather than update tag map

Replace rather than update the tag map when loading a custom tag map.
Updating the tag map in place is problematic because the sorted list of
tag names is not refreshed and because the tag map will still contain
lingering/unwanted tags from the default tag map.

* Update CLI scripts

* Reinitialize cache after loading new tag map

Reinitialize the cache with the right size after loading a new tag map.
This commit is contained in:
Adriane Boyd 2020-07-19 13:13:57 +02:00 committed by GitHub
parent b81a89f0a9
commit 9ee1c54f40
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 22 additions and 22 deletions

View File

@ -131,8 +131,8 @@ def debug_data(
tag_map = {}
if tag_map_path is not None:
tag_map = srsly.read_json(tag_map_path)
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
# Replace tag map with provided mapping
nlp.vocab.morphology.load_tag_map(tag_map)
msg.divider("Data file validation")

View File

@ -124,8 +124,8 @@ def train(
)
nlp.begin_training(lambda: train_examples)
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
# Replace tag map with provided mapping
nlp.vocab.morphology.load_tag_map(tag_map)
# Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed

View File

@ -64,6 +64,20 @@ cdef class Morphology:
self.mem = Pool()
self.strings = strings
self.tags = PreshMap()
self.load_tag_map(tag_map)
self.lemmatizer = lemmatizer
self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
for (tag, orth), attrs in exc.items():
attrs = _normalize_props(attrs)
self.add_special_case(
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
def load_tag_map(self, tag_map):
self.tag_map = {}
self.reverse_index = {}
# Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end.
if '_SP' in tag_map:
@ -74,27 +88,14 @@ cdef class Morphology:
self.strings.add('_SP')
tag_map = dict(tag_map)
tag_map['_SP'] = space_attrs
self.tag_names = tuple(sorted(tag_map.keys()))
self.tag_map = {}
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map)
self.reverse_index = {}
self._load_from_tag_map(tag_map)
self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
for (tag, orth), attrs in exc.items():
attrs = _normalize_props(attrs)
self.add_special_case(
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
def _load_from_tag_map(self, tag_map):
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs)
self.add(attrs)
self.tag_map[tag_str] = dict(attrs)
self.reverse_index[self.strings.add(tag_str)] = i
self.tag_names = tuple(sorted(self.tag_map.keys()))
self.n_tags = len(self.tag_map)
self._cache = PreshMapArray(self.n_tags)
def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,

View File

@ -27,8 +27,7 @@ def test_overfitting_IO():
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
nlp = English()
tagger = nlp.create_pipe("tagger")
for tag, values in TAG_MAP.items():
tagger.add_label(tag, values)
nlp.vocab.morphology.load_tag_map(TAG_MAP)
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))