From 68fade8f76d6e00d29df807f3090098d5ee37580 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 19 Jul 2020 00:02:29 +0200
Subject: [PATCH 1/4] Add Plausible [ci skip]

---
 website/gatsby-config.js  |  5 +++++
 website/meta/site.json    |  2 ++
 website/package-lock.json | 35 +++++++++++++++++++++++++++++++++++
 website/package.json      |  1 +
 4 files changed, 43 insertions(+)

diff --git a/website/gatsby-config.js b/website/gatsby-config.js
index d08c574c6..752628749 100644
--- a/website/gatsby-config.js
+++ b/website/gatsby-config.js
@@ -19,6 +19,7 @@ const isNightly = !!+process.env.SPACY_NIGHTLY || site.nightlyBranches.includes(
 const favicon = isNightly ? `src/images/icon_nightly.png` : `src/images/icon.png`
 const binderBranch = isNightly ? 'nightly' : site.binderBranch
 const siteUrl = isNightly ? site.siteUrlNightly : site.siteUrl
+const domain = isNightly ? site.domainNightly : site.domain
 
 module.exports = {
     siteMetadata: {
@@ -148,6 +149,10 @@ module.exports = {
                 respectDNT: true,
             },
         },
+        {
+            resolve: `gatsby-plugin-plausible`,
+            options: { domain },
+        },
         {
             resolve: 'gatsby-plugin-robots-txt',
             options: {
diff --git a/website/meta/site.json b/website/meta/site.json
index 5fb1a4533..7e6f4c692 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -3,7 +3,9 @@
     "description": "spaCy is a free open-source library for Natural Language Processing in Python. It features NER, POS tagging, dependency parsing, word vectors and more.",
     "slogan": "Industrial-strength Natural Language Processing in Python",
     "siteUrl": "https://spacy.io",
+    "domain": "spacy.io",
     "siteUrlNightly": "https://nightly.spacy.io",
+    "domainNightly": "nightly.spacy.io",
     "nightlyBranches": ["nightly.spacy.io"],
     "email": "contact@explosion.ai",
     "company": "Explosion",
diff --git a/website/package-lock.json b/website/package-lock.json
index 96a10a8af..02bc3a27b 100644
--- a/website/package-lock.json
+++ b/website/package-lock.json
@@ -12915,6 +12915,41 @@
         }
       }
     },
+    "gatsby-plugin-plausible": {
+      "version": "0.0.6",
+      "resolved": "https://registry.npmjs.org/gatsby-plugin-plausible/-/gatsby-plugin-plausible-0.0.6.tgz",
+      "integrity": "sha512-qUdPQ3haeX2DIywGZ2boMpmFAnSbWzqS9cG9/OO0mWLigA0sDLWwGkpHIAvrfepgbB9U/roLtXflctBwOIxtcQ==",
+      "requires": {
+        "@babel/runtime": "^7.9.2",
+        "minimatch": "3.0.4",
+        "react": "^16.13.1"
+      },
+      "dependencies": {
+        "@babel/runtime": {
+          "version": "7.10.5",
+          "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.10.5.tgz",
+          "integrity": "sha512-otddXKhdNn7d0ptoFRHtMLa8LqDxLYwTjB4nYgM1yy5N6gU/MUf8zqyyLltCH3yAVitBzmwK4us+DD0l/MauAg==",
+          "requires": {
+            "regenerator-runtime": "^0.13.4"
+          }
+        },
+        "react": {
+          "version": "16.13.1",
+          "resolved": "https://registry.npmjs.org/react/-/react-16.13.1.tgz",
+          "integrity": "sha512-YMZQQq32xHLX0bz5Mnibv1/LHb3Sqzngu7xstSM+vrkE5Kzr9xE0yMByK5kMoTK30YVJE61WfbxIFFvfeDKT1w==",
+          "requires": {
+            "loose-envify": "^1.1.0",
+            "object-assign": "^4.1.1",
+            "prop-types": "^15.6.2"
+          }
+        },
+        "regenerator-runtime": {
+          "version": "0.13.5",
+          "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.5.tgz",
+          "integrity": "sha512-ZS5w8CpKFinUzOwW3c83oPeVXoNsrLsaCoLtJvAClH135j/R77RuymhiSErhm2lKcwSCIpmvIWSbDkIfAqKQlA=="
+        }
+      }
+    },
     "gatsby-plugin-react-helmet": {
       "version": "3.0.6",
       "resolved": "https://registry.npmjs.org/gatsby-plugin-react-helmet/-/gatsby-plugin-react-helmet-3.0.6.tgz",
diff --git a/website/package.json b/website/package.json
index 3c76014b3..12702692d 100644
--- a/website/package.json
+++ b/website/package.json
@@ -23,6 +23,7 @@
     "gatsby-plugin-google-analytics": "^2.0.14",
     "gatsby-plugin-manifest": "^2.0.17",
     "gatsby-plugin-offline": "^2.0.24",
+    "gatsby-plugin-plausible": "0.0.6",
     "gatsby-plugin-react-helmet": "^3.0.6",
     "gatsby-plugin-react-svg": "^2.0.0",
     "gatsby-plugin-robots-txt": "^1.5.1",
From 38b59d728d8c77d1a0767b765eea958683a85533 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Sun, 19 Jul 2020 11:10:31 +0200
Subject: [PATCH 2/4] Upgrade of UD eval script (#5776)

* new morph feature format

* add new languages with tokenization

* update with all new pretrained models
---
 bin/ud/run_eval.py | 71 ++++++++++++++++++++++++----------------------
 bin/ud/ud_train.py |  4 ++-
 2 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/bin/ud/run_eval.py b/bin/ud/run_eval.py
index 2da476721..3a30c0ee9 100644
--- a/bin/ud/run_eval.py
+++ b/bin/ud/run_eval.py
@@ -12,11 +12,11 @@ from ud_train import write_conllu
 from spacy.lang.lex_attrs import word_shape
 from spacy.util import get_lang_class
 
-# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
-ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, fa, fi, fr,"
-                 "ga, he, hi, hr, hu, id, is, it, ja, kn, ko, lt, lv, mr, no,"
+# All languages in spaCy format (note that Norwegian is 'no' in UD - gets remapped later)
+ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, eu, fa, fi, fr,"
+                 "ga, gu, he, hi, hr, hu, hy, id, is, it, ja, kn, ko, lb, lij, lt, lv, ml, mr, nb,"
                  "nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl,"
-                 "tr, tt, uk, ur, vi, zh")
+                 "tr, tt, uk, ur, vi, yo, zh")
 
 # Non-parsing tasks that will be evaluated (works for default models)
 EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']
@@ -251,39 +251,42 @@ def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_train
 
     # initialize all models with the multi-lang model
     for lang in languages:
-        models[lang] = [multi] if multi else []
-        # add default models if we don't want to evaluate parsing info
-        if not check_parse:
-            # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
-            if lang == 'no':
-                models['no'].append(load_default_model_sentencizer('nb'))
-            else:
-                models[lang].append(load_default_model_sentencizer(lang))
+        UD_lang = lang
+        # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
+        if lang == "nb":
+            UD_lang = "no"
+        try:
+            models[UD_lang] = [multi] if multi else []
+            # add default models if we don't want to evaluate parsing info
+            if not check_parse:
+                models[UD_lang].append(load_default_model_sentencizer(lang))
+        except:
+            print(f"Exception initializing lang {lang} - skipping")
 
     # language-specific trained models
     if not exclude_trained_models:
-        if 'de' in models:
-            models['de'].append(load_model('de_core_news_sm'))
-            models['de'].append(load_model('de_core_news_md'))
-        if 'el' in models:
-            models['el'].append(load_model('el_core_news_sm'))
-            models['el'].append(load_model('el_core_news_md'))
-        if 'en' in models:
-            models['en'].append(load_model('en_core_web_sm'))
-            models['en'].append(load_model('en_core_web_md'))
-            models['en'].append(load_model('en_core_web_lg'))
-        if 'es' in models:
-            models['es'].append(load_model('es_core_news_sm'))
-            models['es'].append(load_model('es_core_news_md'))
-        if 'fr' in models:
-            models['fr'].append(load_model('fr_core_news_sm'))
-            models['fr'].append(load_model('fr_core_news_md'))
-        if 'it' in models:
-            models['it'].append(load_model('it_core_news_sm'))
-        if 'nl' in models:
-            models['nl'].append(load_model('nl_core_news_sm'))
-        if 'pt' in models:
-            models['pt'].append(load_model('pt_core_news_sm'))
+        news_languages = ["da", "de", "el", "es", "fr", "it", "ja", "lt", "nb", "nl", "pl", "pt", "ro"]
+        web_languages = ["en", "zh"]
+        sizes = ["sm", "md", "lg"]
+        for lang in web_languages:
+            UD_lang = lang
+            for size in sizes:
+                model_name = f'{lang}_core_web_{size}'
+                try:
+                    models[UD_lang].append(load_model(model_name))
+                except Exception as e:
+                    print(f"Error loading {model_name}: {e}")
+
+        for lang in news_languages:
+            UD_lang = lang
+            if lang == "nb":
+                UD_lang = "no"
+            for size in sizes:
+                model_name = f'{lang}_core_news_{size}'
+                try:
+                    models[UD_lang].append(load_model(model_name))
+                except Exception as e:
+                    print(f"Error loading {model_name}: {e}")
 
     with out_path.open(mode='w', encoding='utf-8') as out_file:
         run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)
diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index 88c534d0a..ac5987aa4 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -303,7 +303,9 @@ def get_token_conllu(token, i):
         feat_str = []
         replacements = {"one": "1", "two": "2", "three": "3"}
         for feat in features:
-            if not feat.startswith("begin") and not feat.startswith("end"):
+            if "=" in feat:
+                feat_str.append(feat)
+            elif not feat.startswith("begin") and not feat.startswith("end"):
                 key, value = feat.split("_", 1)
                 value = replacements.get(value, value)
                 feat_str.append("%s=%s" % (key, value.title()))
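Note: the `get_token_conllu` change above is the "new morph feature format" from the commit message - features that already contain "=" (new-style UFEATS) pass through untouched, while legacy "field_value" features are still converted. A minimal standalone sketch of that rule; the `features` list here is hypothetical, real values come from spaCy's morphology table:

```python
# Hypothetical mix of new-style, legacy, and sentence-boundary features
features = ["Case=Nom", "Number_sing", "begin_quote", "end_quote"]

feat_str = []
replacements = {"one": "1", "two": "2", "three": "3"}
for feat in features:
    if "=" in feat:
        # new-style feature, already "Field=Value" - keep as-is
        feat_str.append(feat)
    elif not feat.startswith("begin") and not feat.startswith("end"):
        # legacy "field_value" feature - convert to "Field=Value"
        key, value = feat.split("_", 1)
        value = replacements.get(value, value)
        feat_str.append("%s=%s" % (key, value.title()))

print("|".join(sorted(feat_str)))  # Case=Nom|Number=Sing
```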
From b81a89f0a94ce5a191720ba0eccff43667da6ba9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 19 Jul 2020 11:10:51 +0200
Subject: [PATCH 3/4] Update morphologizer (#5766)

* update `Morphologizer.begin_training` for use with `Example`

* make init and begin_training more consistent

* add `Morphology.normalize_features` to normalize outside of `Morphology.add`

* make sure `get_loss` doesn't create unknown labels when the POS and morph alignments differ
---
 spacy/morphology.pyx                       | 29 ++++++---
 spacy/pipeline/morphologizer.pyx           | 71 ++++++++++++++--------
 spacy/tests/pipeline/test_morphologizer.py | 31 +++++++---
 3 files changed, 91 insertions(+), 40 deletions(-)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index a3aa8be22..0852418f2 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -58,7 +58,7 @@ cdef class Morphology:
     FEATURE_SEP = "|"
     FIELD_SEP = "="
     VALUE_SEP = ","
-    EMPTY_MORPH = "_"
+    EMPTY_MORPH = "_"  # not an empty string so that the PreshMap key is not 0
 
     def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
         self.mem = Pool()
@@ -117,13 +117,7 @@ cdef class Morphology:
         if not isinstance(features, dict):
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        features = _normalize_props(features)
         string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # normalized UFEATS string with sorted fields and values
-        norm_feats_string = self.FEATURE_SEP.join(sorted([
-            self.FIELD_SEP.join([field, values])
-            for field, values in string_features.items()
-        ]))
         # intified ("Field", "Field=Value") pairs
         field_feature_pairs = []
         for field in sorted(string_features):
@@ -137,6 +131,7 @@
         # the hash key for the tag is either the hash of the normalized UFEATS
         # string or the hash of an empty placeholder (using the empty string
         # would give a hash key of 0, which is not good for PreshMap)
+        norm_feats_string = self.normalize_features(features)
         if norm_feats_string:
             tag.key = self.strings.add(norm_feats_string)
         else:
@@ -144,6 +139,26 @@
         self.insert(tag)
         return tag.key
 
+    def normalize_features(self, features):
+        """Create a normalized UFEATS string from a features string or dict.
+
+        features (Union[dict, str]): Features as dict or UFEATS string.
+        RETURNS (str): Features as normalized UFEATS string.
+        """
+        if isinstance(features, str):
+            features = self.feats_to_dict(features)
+        if not isinstance(features, dict):
+            warnings.warn(Warnings.W100.format(feature=features))
+            features = {}
+        features = _normalize_props(features)
+        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
+        # normalized UFEATS string with sorted fields and values
+        norm_feats_string = self.FEATURE_SEP.join(sorted([
+            self.FIELD_SEP.join([field, values])
+            for field, values in string_features.items()
+        ]))
+        return norm_feats_string or self.EMPTY_MORPH
+
     cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
         """Creates a MorphAnalysisC from a list of intified
         ("Field", "Field=Value") tuples where fields with multiple values have
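Note: `Morphology.normalize_features` is the piece the rest of this patch builds on. A simplified pure-Python sketch of what it produces for string input - the real method also accepts dicts, interns strings in the `StringStore`, and warns on bad input:

```python
FEATURE_SEP, FIELD_SEP, VALUE_SEP, EMPTY_MORPH = "|", "=", ",", "_"

def normalize_features(feats: str) -> str:
    """Simplified: parse a UFEATS string, re-emit with sorted fields/values."""
    fields = {}
    for feat in [f for f in feats.split(FEATURE_SEP) if f]:
        field, values = feat.split(FIELD_SEP, 1)
        # sort multiple values within a field, e.g. "PronType=Rel,Int"
        fields[field] = VALUE_SEP.join(sorted(values.split(VALUE_SEP)))
    norm = FEATURE_SEP.join(
        sorted(FIELD_SEP.join([f, v]) for f, v in fields.items())
    )
    # never return "" - an empty string would give a PreshMap hash key of 0
    return norm or EMPTY_MORPH

assert normalize_features("Number=Sing|Case=Nom") == "Case=Nom|Number=Sing"
assert normalize_features("PronType=Rel,Int") == "PronType=Int,Rel"
assert normalize_features("") == "_"
```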
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 57b778434..bc77dda47 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -23,29 +23,45 @@ from .defaults import default_morphologizer
 @component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer)
 class Morphologizer(Tagger):
 
+    POS_FEAT = "POS"
+
     def __init__(self, vocab, model, **cfg):
         self.vocab = vocab
         self.model = model
         self._rehearsal_model = None
         self.cfg = dict(sorted(cfg.items()))
-        self.cfg.setdefault("labels", {})
-        self.cfg.setdefault("morph_pos", {})
+        # to be able to set annotations without string operations on labels,
+        # store mappings from morph+POS labels to token-level annotations:
+        # 1) labels_morph stores a mapping from morph+POS->morph
+        self.cfg.setdefault("labels_morph", {})
+        # 2) labels_pos stores a mapping from morph+POS->POS
+        self.cfg.setdefault("labels_pos", {})
+        # add mappings for empty morph
+        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
+        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
 
     @property
     def labels(self):
-        return tuple(self.cfg["labels"].keys())
+        return tuple(self.cfg["labels_morph"].keys())
 
     def add_label(self, label):
         if not isinstance(label, str):
             raise ValueError(Errors.E187)
         if label in self.labels:
             return 0
-        morph = Morphology.feats_to_dict(label)
-        norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
-        pos = morph.get("POS", "")
-        if norm_morph_pos not in self.cfg["labels"]:
-            self.cfg["labels"][norm_morph_pos] = norm_morph_pos
-            self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+        # normalize label
+        norm_label = self.vocab.morphology.normalize_features(label)
+        # extract separate POS and morph tags
+        label_dict = Morphology.feats_to_dict(label)
+        pos = label_dict.get(self.POS_FEAT, "")
+        if self.POS_FEAT in label_dict:
+            label_dict.pop(self.POS_FEAT)
+        # normalize morph string and add to morphology table
+        norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+        # add label mappings
+        if norm_label not in self.cfg["labels_morph"]:
+            self.cfg["labels_morph"][norm_label] = norm_morph
+            self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1
 
     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
@@ -53,14 +69,16 @@ class Morphologizer(Tagger):
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph
-                norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
+                morph = token.morph_
+                # create and add the combined morph+POS label
+                morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
-                    morph["POS"] = pos
-                norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
-                if norm_morph_pos not in self.cfg["labels"]:
-                    self.cfg["labels"][norm_morph_pos] = norm_morph
-                    self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+                    morph_dict[self.POS_FEAT] = pos
+                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+                # add label->morph and label->POS mappings
+                if norm_label not in self.cfg["labels_morph"]:
+                    self.cfg["labels_morph"][norm_label] = morph
+                    self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         self.set_output(len(self.labels))
         self.model.initialize()
         link_vectors_to_models(self.vocab)
@@ -79,8 +97,8 @@ class Morphologizer(Tagger):
             doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
                 morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph])
-                doc.c[j].pos = self.cfg["morph_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
+                doc.c[j].pos = self.cfg["labels_pos"][morph]
 
         doc.is_morphed = True
 
@@ -94,14 +112,17 @@ class Morphologizer(Tagger):
             for i in range(len(morphs)):
                 pos = pos_tags[i]
                 morph = morphs[i]
-                feats = Morphology.feats_to_dict(morph)
+                # POS may align (same value for multiple tokens) when morph
+                # doesn't, so if either is None, treat both as None here so that
+                # truths doesn't end up with an unknown morph+POS combination
+                if pos is None or morph is None:
+                    pos = None
+                    morph = None
+                label_dict = Morphology.feats_to_dict(morph)
                 if pos:
-                    feats["POS"] = pos
-                if len(feats) > 0:
-                    morph = self.vocab.strings[self.vocab.morphology.add(feats)]
-                if morph == "":
-                    morph = Morphology.EMPTY_MORPH
-                eg_truths.append(morph)
+                    label_dict[self.POS_FEAT] = pos
+                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
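Note: the core idea of the `labels_morph`/`labels_pos` split is that each combined morph+POS label maps to the two token-level annotations it sets, so `set_annotations` never has to parse label strings. A sketch of that bookkeeping with plain dicts - no `StringStore`, and `POS_IDS` here is an illustrative subset, not spaCy's real table:

```python
POS_FEAT = "POS"
POS_IDS = {"": 0, "NOUN": 92}  # illustrative ids only

def add_label(label: str, labels_morph: dict, labels_pos: dict) -> None:
    # split "Field=Value" features out of the combined label
    feats = dict(f.split("=", 1) for f in label.split("|") if f)
    pos = feats.pop(POS_FEAT, "")
    # the morph part (without POS) is what gets written to token.morph,
    # the POS id is what gets written to token.pos
    morph = "|".join(sorted(f"{k}={v}" for k, v in feats.items())) or "_"
    labels_morph[label] = morph
    labels_pos[label] = POS_IDS[pos]

labels_morph, labels_pos = {}, {}
add_label("Case=Nom|POS=NOUN", labels_morph, labels_pos)
print(labels_morph)  # {'Case=Nom|POS=NOUN': 'Case=Nom'}
print(labels_pos)    # {'Case=Nom|POS=NOUN': 92}
```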
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 9b7e2788d..757c9214c 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -5,6 +5,7 @@ from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir
+from spacy.morphology import Morphology
 
 
 def test_label_types():
@@ -23,9 +24,10 @@ TRAIN_DATA = [
             "pos": ["NOUN", "VERB", "ADJ", "NOUN"],
         },
     ),
+    # test combinations of morph+POS
     (
         "Eat blue ham",
-        {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
+        {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},
     ),
 ]
 
@@ -38,7 +40,12 @@ def test_overfitting_IO():
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
         for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
-            morphologizer.add_label(morph + "|POS=" + pos)
+            if morph and pos:
+                morphologizer.add_label(morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos)
+            elif pos:
+                morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos)
+            elif morph:
+                morphologizer.add_label(morph)
     nlp.add_pipe(morphologizer)
     optimizer = nlp.begin_training()
 
@@ -48,19 +55,27 @@ def test_overfitting_IO():
     assert losses["morphologizer"] < 0.00001
 
     # test the trained model
-    test_text = "I like blue eggs"
+    test_text = "I like blue ham"
     doc = nlp(test_text)
     gold_morphs = [
-        "Feat=N|POS=NOUN",
-        "Feat=V|POS=VERB",
-        "Feat=J|POS=ADJ",
-        "Feat=N|POS=NOUN",
+        "Feat=N",
+        "Feat=V",
+        "",
+        "",
+    ]
+    gold_pos_tags = [
+        "NOUN",
+        "VERB",
+        "ADJ",
+        "",
     ]
     assert [t.morph_ for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
 
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
-        assert gold_morphs == [t.morph_ for t in doc2]
+        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [t.pos_ for t in doc2] == gold_pos_tags
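Note: with this patch applied, morph features and POS are predicted as one combined label but written back to the token separately, which is what the updated test asserts. A minimal usage sketch, assuming the trained `nlp` pipeline built in the test above:

```python
# assumes `nlp` is the pipeline trained in test_overfitting_IO above
doc = nlp("Eat blue ham")
for token in doc:
    # a token may now have a POS with no morph features, or vice versa
    print(token.text, repr(token.morph_), repr(token.pos_))
```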
From 9ee1c54f40e901533ef16cd148556cbf83cca6a7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 19 Jul 2020 13:13:57 +0200
Subject: [PATCH 4/4] Improve tag map initialization and updating (#5764)

* Improve tag map initialization and updating

Generalize tag map initialization and updating so that the tag map can
be loaded correctly prior to loading a `Corpus` with `spacy debug-data`
and `spacy train`.

* normalize provided tag map as necessary

* use the same method for initializing and updating the tag map

* Replace rather than update tag map

Replace rather than update tag map when loading a custom tag map.
Updating the tag map is problematic due to the sorted list of tag names
and the fact that the tag map will contain lingering/unwanted tags from
the default tag map.

* Update CLI scripts

* Reinitialize cache after loading new tag map

Reinitialize the cache with the right size after loading a new tag map.
---
 spacy/cli/debug_data.py             |  4 ++--
 spacy/cli/train.py                  |  4 ++--
 spacy/morphology.pyx                | 33 +++++++++++++++--------------
 spacy/tests/pipeline/test_tagger.py |  3 +--
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 9d1986d8a..49bfa9e82 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -131,8 +131,8 @@ def debug_data(
     tag_map = {}
     if tag_map_path is not None:
         tag_map = srsly.read_json(tag_map_path)
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     msg.divider("Data file validation")
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index feebc30d4..f69ad5b60 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -124,8 +124,8 @@ def train(
         )
     nlp.begin_training(lambda: train_examples)
 
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
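Note: the reason the CLI scripts switch from `tag_map.update(...)` to `load_tag_map(...)` is the replace-vs-update semantics named in the commit message. Sketched with plain dicts (illustrative tag maps, not spaCy's real defaults):

```python
# illustrative tag maps only
default_tag_map = {"NN": {"pos": "NOUN"}, "VBZ": {"pos": "VERB"}}
custom_tag_map = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}}

updated = dict(default_tag_map)
updated.update(custom_tag_map)
print(sorted(updated))   # ['N', 'NN', 'V', 'VBZ'] - stale default tags linger

replaced = dict(custom_tag_map)
print(sorted(replaced))  # ['N', 'V'] - only the provided tags remain
```

In spaCy itself, `load_tag_map` additionally ensures the special `_SP` space tag is present and rebuilds `tag_names`, `n_tags`, and the cache, as the morphology.pyx diff below shows.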
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 0852418f2..dac10137b 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -64,6 +64,20 @@ cdef class Morphology:
         self.mem = Pool()
         self.strings = strings
         self.tags = PreshMap()
+        self.load_tag_map(tag_map)
+        self.lemmatizer = lemmatizer
+
+        self._cache = PreshMapArray(self.n_tags)
+        self.exc = {}
+        if exc is not None:
+            for (tag, orth), attrs in exc.items():
+                attrs = _normalize_props(attrs)
+                self.add_special_case(
+                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
+
+    def load_tag_map(self, tag_map):
+        self.tag_map = {}
+        self.reverse_index = {}
         # Add special space symbol. We prefix with underscore, to make sure it
         # always sorts to the end.
         if '_SP' in tag_map:
@@ -74,27 +88,14 @@
             self.strings.add('_SP')
             tag_map = dict(tag_map)
             tag_map['_SP'] = space_attrs
-        self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_map = {}
-        self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map)
-        self.reverse_index = {}
-        self._load_from_tag_map(tag_map)
-
-        self._cache = PreshMapArray(self.n_tags)
-        self.exc = {}
-        if exc is not None:
-            for (tag, orth), attrs in exc.items():
-                attrs = _normalize_props(attrs)
-                self.add_special_case(
-                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
-    def _load_from_tag_map(self, tag_map):
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
             self.add(attrs)
             self.tag_map[tag_str] = dict(attrs)
             self.reverse_index[self.strings.add(tag_str)] = i
+        self.tag_names = tuple(sorted(self.tag_map.keys()))
+        self.n_tags = len(self.tag_map)
+        self._cache = PreshMapArray(self.n_tags)
 
     def __reduce__(self):
         return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index aedf8e2b3..ec7a15115 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -27,8 +27,7 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
     nlp = English()
     tagger = nlp.create_pipe("tagger")
-    for tag, values in TAG_MAP.items():
-        tagger.add_label(tag, values)
+    nlp.vocab.morphology.load_tag_map(TAG_MAP)
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
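Note: a minimal usage sketch of the new API at this commit, mirroring what the CLI scripts and the tagger test now do. The tag map path is hypothetical:

```python
import srsly
from spacy.lang.en import English

nlp = English()
# "my_tag_map.json" is a hypothetical path to a custom tag map
tag_map = srsly.read_json("my_tag_map.json")
# replaces the old tag map entirely and reinitializes the cache
nlp.vocab.morphology.load_tag_map(tag_map)
print(nlp.vocab.morphology.tag_names)  # custom tags (plus the '_SP' space tag)
```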