From ecbb9c4b9f89120ba04642852780d592c024b6ef Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 11:50:42 +0100
Subject: [PATCH 01/14] load Underscore state when multiprocessing

---
 spacy/language.py          | 11 ++++++++---
 spacy/tokens/underscore.py |  8 ++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 5544b6341..71180a65d 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -15,6 +15,7 @@ import multiprocessing as mp
 from itertools import chain, cycle
 
 from .tokenizer import Tokenizer
+from .tokens.underscore import Underscore
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .lookups import Lookups
@@ -852,7 +853,10 @@ class Language(object):
             sender.send()
 
         procs = [
-            mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch))
+            mp.Process(
+                target=_apply_pipes,
+                args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
+            )
             for rch, sch in zip(texts_q, bytedocs_send_ch)
         ]
         for proc in procs:
@@ -1107,7 +1111,7 @@ def _pipe(docs, proc, kwargs):
         yield doc
 
 
-def _apply_pipes(make_doc, pipes, reciever, sender):
+def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
     """Worker for Language.pipe
 
     receiver (multiprocessing.Connection): Pipe to receive text. Usually
         created by `multiprocessing.Pipe()`
     sender (multiprocessing.Connection): Pipe to send doc. Usually created by
        `multiprocessing.Pipe()`
     """
+    Underscore.load_state(underscore_state)
     while True:
-        texts = reciever.get()
+        texts = receiver.get()
         docs = (make_doc(text) for text in texts)
         for pipe in pipes:
             docs = pipe(docs)
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index b36fe9294..8dac8526e 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -79,6 +79,14 @@ class Underscore(object):
     def _get_key(self, name):
         return ("._.", name, self._start, self._end)
 
+    @classmethod
+    def get_state(cls):
+        return cls.token_extensions, cls.span_extensions, cls.doc_extensions
+
+    @classmethod
+    def load_state(cls, state):
+        cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
+
 
 def get_ext_args(**kwargs):
     """Validate and convert arguments. Reused in Doc, Token and Span."""

From 05dedaa2cf2e57469ac860fbd0af638c27c02148 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:00:13 +0100
Subject: [PATCH 02/14] add unit test

---
 spacy/tests/regression/test_issue4903.py | 40 ++++++++++++++++++++++++
 spacy/tests/regression/test_issue4924.py |  2 +-
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/regression/test_issue4903.py

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
new file mode 100644
index 000000000..97293aec7
--- /dev/null
+++ b/spacy/tests/regression/test_issue4903.py
@@ -0,0 +1,40 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy
+from spacy.tokens import Span, Doc
+
+
+class CustomPipe:
+    name = "my_pipe"
+
+    def __init__(self):
+        Span.set_extension("my_ext", getter=self._get_my_ext)
+        Doc.set_extension("my_ext", default=None)
+
+    def __call__(self, doc):
+        gathered_ext = []
+        for sent in doc.sents:
+            sent_ext = self._get_my_ext(sent)
+            sent._.set("my_ext", sent_ext)
+            gathered_ext.append(sent_ext)
+
+        doc._.set("my_ext", "\n".join(gathered_ext))
+
+        return doc
+
+    @staticmethod
+    def _get_my_ext(span):
+        return str(span.end)
+
+
+def test_issue4903():
+    # ensures that this runs correctly and doesn't hang or crash on Windows / macOS
+    nlp = spacy.load("en_core_web_sm")
+    custom_component = CustomPipe()
+    nlp.add_pipe(custom_component, after="parser")
+
+    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
+    # works without 'n_process'
+    for doc in nlp.pipe(text, n_process=2):
+        print(doc)
diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py
index 8aea2c3d5..0e45291a9 100644
--- a/spacy/tests/regression/test_issue4924.py
+++ b/spacy/tests/regression/test_issue4924.py
@@ -11,6 +11,6 @@ def nlp():
     return spacy.blank("en")
 
 
-def test_evaluate(nlp):
+def test_issue4924(nlp):
     docs_golds = [("", {})]
     nlp.evaluate(docs_golds)

From 65f5b48b5db0e8e11e73e505469ccdb38e8f07af Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:06:27 +0100
Subject: [PATCH 03/14] add comment

---
 spacy/language.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/language.py b/spacy/language.py
index 71180a65d..737e0bf3c 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1118,6 +1118,7 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
         created by `multiprocessing.Pipe()`
     sender (multiprocessing.Connection): Pipe to send doc. Usually created by
         `multiprocessing.Pipe()`
+    underscore_state (tuple): The data in the Underscore class of the parent
     """
     Underscore.load_state(underscore_state)
     while True:

From 51d37033c8b2f280cfc0ddf2b1ecf0537f347532 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:10:05 +0100
Subject: [PATCH 04/14] remove old comment

---
 spacy/tests/regression/test_issue4903.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index 97293aec7..d09b32849 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -35,6 +35,5 @@ def test_issue4903():
     nlp.add_pipe(custom_component, after="parser")
 
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
-    # works without 'n_process'
     for doc in nlp.pipe(text, n_process=2):
         print(doc)

From 46628d88903edaa2c3614339a0d464b9fcdcc690 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:12:52 +0100
Subject: [PATCH 05/14] add some asserts

---
 spacy/tests/regression/test_issue4903.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index d09b32849..0a255d9a8 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -35,5 +35,7 @@ def test_issue4903():
     nlp.add_pipe(custom_component, after="parser")
 
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
-    for doc in nlp.pipe(text, n_process=2):
-        print(doc)
+    docs = list(nlp.pipe(text, n_process=2))
+    assert docs[0].text == "I like bananas."
+    assert docs[1].text == "Do you like them?"
+    assert docs[2].text == "No, I prefer wasabi."

From 7939c6388656e1abb932b2deb1af90928c297aa2 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:26:27 +0100
Subject: [PATCH 06/14] use English instead of model

---
 spacy/tests/regression/test_issue4903.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index 0a255d9a8..82e21b79f 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import spacy
+from spacy.lang.en import English
 from spacy.tokens import Span, Doc
 
 
@@ -30,9 +31,10 @@ class CustomPipe:
 
 def test_issue4903():
     # ensures that this runs correctly and doesn't hang or crash on Windows / macOS
-    nlp = spacy.load("en_core_web_sm")
+    nlp = English()
     custom_component = CustomPipe()
-    nlp.add_pipe(custom_component, after="parser")
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+    nlp.add_pipe(custom_component, after="sentencizer")
 
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
     docs = list(nlp.pipe(text, n_process=2))

From 6e717c62ed2d0407b37ae0e19c033964425419cc Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 13:21:31 +0100
Subject: [PATCH 07/14] avoid the tests interacting with each other through
 the global Underscore variable

---
 spacy/tests/regression/test_issue4849.py | 6 ++++++
 spacy/tests/regression/test_issue4903.py | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py
index 834219773..7e58243bc 100644
--- a/spacy/tests/regression/test_issue4849.py
+++ b/spacy/tests/regression/test_issue4849.py
@@ -3,11 +3,17 @@ from __future__ import unicode_literals
 
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
+from spacy.tokens.underscore import Underscore
 
 
 def test_issue4849():
     nlp = English()
 
+    # reset the Underscore object because test_underscore has a lambda function that can't be pickled
+    Underscore.doc_extensions = {}
+    Underscore.span_extensions = {}
+    Underscore.token_extensions = {}
+
     ruler = EntityRuler(
         nlp,
         patterns=[
             {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index 82e21b79f..156845558 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Span, Doc
+from spacy.tokens.underscore import Underscore
 
 
 class CustomPipe:
@@ -31,6 +32,12 @@ class CustomPipe:
 
 def test_issue4903():
     # ensures that this runs correctly and doesn't hang or crash on Windows / macOS
+
+    # reset the Underscore object because test_underscore has a lambda function that can't be pickled
+    Underscore.doc_extensions = {}
+    Underscore.span_extensions = {}
+    Underscore.token_extensions = {}
+
     nlp = English()
     custom_component = CustomPipe()
     nlp.add_pipe(nlp.create_pipe("sentencizer"))

From d1f0b397b5a8cba5e59dd5448a831932055c7f45 Mon Sep 17 00:00:00 2001
From: questoph
Date: Thu, 13 Feb 2020 22:18:51 +0100
Subject: [PATCH 08/14] Update punctuation.py

---
 spacy/lang/lb/punctuation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py
index 1571e13d7..2a4587856 100644
--- a/spacy/lang/lb/punctuation.py
+++ b/spacy/lang/lb/punctuation.py
@@ -5,11 +5,13 @@ from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_
 
 ELISION = " ' ’ ".strip().replace(" ", "")
 
+abbrev = ("d", "D")
+
 _infixes = (
     LIST_ELLIPSES
     + LIST_ICONS
     + [
-        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+        r"(?<=^[{ab}][{el}])(?=[{a}])".format(ab=abbrev, a=ALPHA, el=ELISION),
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),

From 5352fc8fc3f06e99a2d5159da7d4c226be1b82c1 Mon Sep 17 00:00:00 2001
From: questoph
Date: Fri, 14 Feb 2020 12:02:15 +0100
Subject: [PATCH 09/14] Update tokenizer_exceptions.py

---
 spacy/lang/lb/tokenizer_exceptions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py
index b32daa58c..1c9b2dde3 100644
--- a/spacy/lang/lb/tokenizer_exceptions.py
+++ b/spacy/lang/lb/tokenizer_exceptions.py
@@ -10,6 +10,8 @@ _exc = {}
 # translate / delete what is not necessary
 for exc_data in [
+    {ORTH: "’t", LEMMA: "et", NORM: "et"},
+    {ORTH: "’T", LEMMA: "et", NORM: "et"},
     {ORTH: "'t", LEMMA: "et", NORM: "et"},
     {ORTH: "'T", LEMMA: "et", NORM: "et"},
     {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},

From 3853d385faad420f94f39223da148265113149e1 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 20 Feb 2020 13:41:24 +0100
Subject: [PATCH 10/14] Fix formatting in Token API

---
 website/docs/api/token.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 68402d1b4..c30c01c20 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -437,8 +437,8 @@ The L2 norm of the token's vector representation.
 | `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
 | `lower` | int | Lowercase form of the token. |
 | `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. |
-| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
-| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
+| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
+| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
 | `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. |
 | `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. |
 | `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. |

From b49a3afd0cde67debd2128b2cf2c816322c6d0d7 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sun, 23 Feb 2020 15:49:20 +0100
Subject: [PATCH 11/14] use clean_underscore fixture

---
 spacy/tests/doc/test_underscore.py       | 9 +++++++++
 spacy/tests/matcher/test_matcher_api.py  | 2 ++
 spacy/tests/regression/test_issue4849.py | 5 -----
 spacy/tests/regression/test_issue4903.py | 5 -----
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index 2877bfeea..c1eff2c20 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -7,6 +7,15 @@ from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
 
+@pytest.fixture(scope="function", autouse=True)
+def clean_underscore():
+    # reset the Underscore object after the test, to avoid having state copied across tests
+    yield
+    Underscore.doc_extensions = {}
+    Underscore.span_extensions = {}
+    Underscore.token_extensions = {}
+
+
 def test_create_doc_underscore():
     doc = Mock()
     doc.doc = doc
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index e4584d03a..a826a0a0e 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -6,6 +6,7 @@ import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
 from spacy.tokens import Doc, Token
+from ..doc.test_underscore import clean_underscore
 
 
 @pytest.fixture
@@ -200,6 +201,7 @@ def test_matcher_any_token_operator(en_vocab):
     assert matches[2] == "test hello world"
 
 
+@pytest.mark.usefixtures("clean_underscore")
 def test_matcher_extension_attribute(en_vocab):
     matcher = Matcher(en_vocab)
     get_is_fruit = lambda token: token.text in ("apple", "banana")
diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py
index 7e58243bc..85d03fe9a 100644
--- a/spacy/tests/regression/test_issue4849.py
+++ b/spacy/tests/regression/test_issue4849.py
@@ -9,11 +9,6 @@ from spacy.tokens.underscore import Underscore
 
 def test_issue4849():
     nlp = English()
 
-    # reset the Underscore object because test_underscore has a lambda function that can't be pickled
-    Underscore.doc_extensions = {}
-    Underscore.span_extensions = {}
-    Underscore.token_extensions = {}
-
     ruler = EntityRuler(
         nlp,
         patterns=[
             {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index 156845558..9a3c10d61 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -33,11 +33,6 @@ class CustomPipe:
 
 def test_issue4903():
     # ensures that this runs correctly and doesn't hang or crash on Windows / macOS
 
-    # reset the Underscore object because test_underscore has a lambda function that can't be pickled
-    Underscore.doc_extensions = {}
-    Underscore.span_extensions = {}
-    Underscore.token_extensions = {}
-
     nlp = English()
     custom_component = CustomPipe()
     nlp.add_pipe(nlp.create_pipe("sentencizer"))

From 54d8665ff74239c42a0fb6f457c26a50bc269079 Mon Sep 17 00:00:00 2001
From: Santiago Castro
Date: Mon, 24 Feb 2020 16:15:28 -0500
Subject: [PATCH 12/14] Add missing comma in a dependency specification

Conda is complaining that it can't parse that line otherwise.
---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 55396e011..12d7a2e63 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -59,7 +59,7 @@ install_requires =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=0.0.5<0.2.0
+    spacy_lookups_data>=0.0.5,<0.2.0
 cuda =
     cupy>=5.0.0b4
 cuda80 =

From dc36ec98a4f57b6f0e9e7d508b3152cb53e67da7 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 25 Feb 2020 16:46:14 +0100
Subject: [PATCH 13/14] Update pyproject.toml

---
 pyproject.toml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index fed528d4a..8a6ababf3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,11 @@
 [build-system]
-requires = ["setuptools"]
+requires = [
+    "setuptools",
+    "wheel",
+    "cython>=0.25",
+    "cymem>=2.0.2,<2.1.0",
+    "preshed>=3.0.2,<3.1.0",
+    "murmurhash>=0.28.0,<1.1.0",
+    "thinc==7.4.0.dev0",
+]
 build-backend = "setuptools.build_meta"

From ff184b7a9c64d954a7c7445e00c7505ed1d930f0 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 26 Feb 2020 12:10:38 +0100
Subject: [PATCH 14/14] Add tag_map argument to CLI debug-data and train
 (#4750) (#5038)

Add an argument for a path to a JSON-formatted tag map, which is used to
update and extend the default language tag map.
---
 spacy/cli/debug_data.py | 10 +++++++++-
 spacy/cli/train.py      |  8 ++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 4b12052c3..0e12a594c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -26,6 +26,7 @@ BLANK_MODEL_THRESHOLD = 2000
     lang=("model language", "positional", None, str),
     train_path=("location of JSON-formatted training data", "positional", None, Path),
     dev_path=("location of JSON-formatted development data", "positional", None, Path),
+    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
     base_model=("name of model to update (optional)", "option", "b", str),
     pipeline=(
         "Comma-separated names of pipeline components to train",
@@ -41,6 +42,7 @@ def debug_data(
     lang,
     train_path,
     dev_path,
+    tag_map_path=None,
     base_model=None,
     pipeline="tagger,parser,ner",
     ignore_warnings=False,
@@ -60,6 +62,10 @@ def debug_data(
     if not dev_path.exists():
         msg.fail("Development data not found", dev_path, exits=1)
 
+    tag_map = {}
+    if tag_map_path is not None:
+        tag_map = srsly.read_json(tag_map_path)
+
     # Initialize the model and pipeline
     pipeline = [p.strip() for p in pipeline.split(",")]
     if base_model:
@@ -67,6 +73,8 @@ def debug_data(
     else:
         lang_cls = get_lang_class(lang)
         nlp = lang_cls()
+    # Update tag map with provided mapping
+    nlp.vocab.morphology.tag_map.update(tag_map)
 
     msg.divider("Data format validation")
 
@@ -344,7 +352,7 @@ def debug_data(
     if "tagger" in pipeline:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
-        tag_map = nlp.Defaults.tag_map
+        tag_map = nlp.vocab.morphology.tag_map
         msg.info(
             "{} {} in data ({} {} in tag map)".format(
                 len(labels),
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 5af93a8f3..968a009f6 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -57,6 +57,7 @@ from .. import about
     textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
     textcat_arch=("Textcat model architecture", "option", "ta", str),
     textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
+    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
     verbose=("Display more information for debug", "flag", "VV", bool),
     debug=("Run data diagnostics before training", "flag", "D", bool),
     # fmt: on
@@ -95,6 +96,7 @@ def train(
     textcat_multilabel=False,
     textcat_arch="bow",
     textcat_positive_label=None,
+    tag_map_path=None,
     verbose=False,
     debug=False,
 ):
@@ -132,6 +134,9 @@ def train(
         output_path.mkdir()
         msg.good("Created output directory: {}".format(output_path))
 
+    tag_map = {}
+    if tag_map_path is not None:
+        tag_map = srsly.read_json(tag_map_path)
     # Take dropout and batch size as generators of values -- dropout
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
@@ -238,6 +243,9 @@ def train(
             pipe_cfg = {}
         nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
 
+    # Update tag map with provided mapping
+    nlp.vocab.morphology.tag_map.update(tag_map)
+
    if vectors:
         msg.text("Loading vector from model '{}'".format(vectors))
         _load_vectors(nlp, vectors)
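
A usage sketch for the tag_map_path option introduced in PATCH 14/14 follows. This is
a minimal illustration, not part of the patch series: the file name tag_map.json and
the example tags below are assumptions, and the tag map format shown (a fine-grained
tag mapped to a dict carrying a coarse-grained "pos" value) follows spaCy's JSON tag
map convention. The srsly.read_json call and the tag_map.update call mirror what the
patched debug_data() and train() functions do internally.

    # tag_map.json (assumed contents, for illustration only):
    #     {"NR": {"pos": "PROPN"}, "VV": {"pos": "VERB"}}
    #
    # Via the CLI, using the short flag "tm" defined in the patch:
    #     python -m spacy train en ./output train.json dev.json -tm tag_map.json

    import srsly
    from spacy.lang.en import English

    nlp = English()
    # read the user-provided tag map and overlay it on the language defaults,
    # exactly as the patched debug_data() and train() commands do
    tag_map = srsly.read_json("tag_map.json")
    nlp.vocab.morphology.tag_map.update(tag_map)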