From a0c899a0fff08e09f7ebabb8e0e50baa4f4b0897 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 10 Nov 2020 13:14:47 +0100 Subject: [PATCH 1/7] Fix textcat + transformer architecture (#6371) * add pooling to textcat TransformerListener * maybe_get_dim in case it's null --- spacy/cli/templates/quickstart_training.jinja | 3 +++ spacy/ml/models/textcat.py | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 1194438de..37983cb1a 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -143,6 +143,9 @@ nO = null @architectures = "spacy-transformers.TransformerListener.v1" grad_factor = 1.0 +[components.textcat.model.tok2vec.pooling] +@layers = "reduce_mean.v1" + [components.textcat.model.linear_model] @architectures = "spacy.TextCatBOW.v1" exclusive_classes = false diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index d4aed2839..2ec036810 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -61,14 +61,14 @@ def build_bow_text_classifier( @registry.architectures.register("spacy.TextCatEnsemble.v2") -def build_text_classifier( +def build_text_classifier_v2( tok2vec: Model[List[Doc], List[Floats2d]], linear_model: Model[List[Doc], Floats2d], nO: Optional[int] = None, ) -> Model[List[Doc], Floats2d]: exclusive_classes = not linear_model.attrs["multi_label"] with Model.define_operators({">>": chain, "|": concatenate}): - width = tok2vec.get_dim("nO") + width = tok2vec.maybe_get_dim("nO") cnn_model = ( tok2vec >> list2ragged() @@ -94,7 +94,7 @@ def build_text_classifier( # TODO: move to legacy @registry.architectures.register("spacy.TextCatEnsemble.v1") -def build_text_classifier( +def build_text_classifier_v1( width: int, embed_size: int, pretrained_vectors: Optional[bool], From a7e7d6c6c902055b66b208c299cfe8b578a497d8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 10 Nov 2020 13:15:09 +0100 Subject: [PATCH 2/7] Ignore misaligned in Morphologizer.get_loss (#6363) Fix bug where `Morphologizer.get_loss` treated misaligned annotation as `EMPTY_MORPH` rather than ignoring it. Remove unneeded default `EMPTY_MORPH` mappings. 
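For illustration, a minimal standalone sketch of the corrected truth-building logic (not part of this patch; `build_truths`, `feats_to_dict` and `add_label` are hypothetical stand-ins for the inline loop, `Morphology.feats_to_dict` and `vocab.morphology.add`):

```python
def build_truths(aligned_pos, aligned_morph, feats_to_dict, add_label):
    truths = []
    for pos, morph in zip(aligned_pos, aligned_morph):
        if pos is None or morph is None:
            # Misaligned token: record None so the loss ignores it,
            # rather than mapping it to EMPTY_MORPH as before.
            truths.append(None)
        else:
            label_dict = feats_to_dict(morph)
            if pos:
                label_dict["POS"] = pos
            truths.append(add_label(label_dict))
    return truths
```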
--- spacy/pipeline/morphologizer.pyx | 19 ++++++++----------- spacy/tests/pipeline/test_morphologizer.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a03c7daf0..305f8f5df 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -92,9 +92,6 @@ class Morphologizer(Tagger): # 2) labels_pos stores a mapping from morph+POS->POS cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}} self.cfg = dict(sorted(cfg.items())) - # add mappings for empty morph - self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH - self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""] @property def labels(self): @@ -201,8 +198,8 @@ class Morphologizer(Tagger): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): morph = self.labels[tag_id] - doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph]) - doc.c[j].pos = self.cfg["labels_pos"][morph] + doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0)) + doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0) def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and @@ -228,12 +225,12 @@ class Morphologizer(Tagger): # doesn't, so if either is None, treat both as None here so that # truths doesn't end up with an unknown morph+POS combination if pos is None or morph is None: - pos = None - morph = None - label_dict = Morphology.feats_to_dict(morph) - if pos: - label_dict[self.POS_FEAT] = pos - label = self.vocab.strings[self.vocab.morphology.add(label_dict)] + label = None + else: + label_dict = Morphology.feats_to_dict(morph) + if pos: + label_dict[self.POS_FEAT] = pos + label = self.vocab.strings[self.vocab.morphology.add(label_dict)] eg_truths.append(label) truths.append(eg_truths) d_scores, loss = loss_func(scores, truths) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 85d1d6c8b..add42e00a 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -116,3 +116,23 @@ def test_overfitting_IO(): no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]] assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + + # Test without POS + nlp.remove_pipe("morphologizer") + nlp.add_pipe("morphologizer") + for example in train_examples: + for token in example.reference: + token.pos_ = "" + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["morphologizer"] < 0.00001 + + # Test the trained model + test_text = "I like blue ham" + doc = nlp(test_text) + gold_morphs = ["Feat=N", "Feat=V", "", ""] + gold_pos_tags = ["", "", "", ""] + assert [str(t.morph) for t in doc] == gold_morphs + assert [t.pos_ for t in doc] == gold_pos_tags From 96726ec1f62ade72b7904b2b21b4a893f93d0ca8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 17 Nov 2020 14:36:44 +0100 Subject: [PATCH 3/7] Fix DocBin init in training example (#6396) --- website/docs/usage/training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 274ea5989..58c846e9d 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -969,7 +969,7 @@ 
import spacy from spacy.tokens import Doc, DocBin nlp = spacy.blank("en") -docbin = DocBin(nlp.vocab) +docbin = DocBin() words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."] spaces = [True, True, True, True, True, True, True, False] ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"] From 165993d8e57f2bd0ea35f4792f414951dc6c4787 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 19 Nov 2020 14:11:38 +0100 Subject: [PATCH 4/7] fix typo in transformer docs (#6404) --- website/docs/api/transformer.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 5754d2238..e31c8ad2c 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters. > nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Description | -| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | -| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ | +| Setting | Description | +| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | +| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py From 26296ab223b809cef5bd1fd997a4112119815864 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 27 Nov 2020 07:39:49 +0100 Subject: [PATCH 5/7] Add error message if DocBin zlib decompress fails (#6394) Add a better error message if DocBin zlib decompress fails, indicating that the data is not in `DocBin` format. 
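A quick sketch of the new behaviour (assuming a spaCy build with this patch applied): bytes that are not zlib-compressed `DocBin` data now raise a `ValueError` carrying the E1014 message instead of an opaque `zlib.error`:

```python
from spacy.tokens import DocBin

docbin = DocBin()
try:
    # Not a .spacy file, e.g. v2 JSON training data read as raw bytes.
    docbin.from_bytes(b'[{"id": 0, "paragraphs": []}]')
except ValueError as err:
    print(err)  # E1014: Error loading DocBin data. It doesn't look like ...
```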
--- spacy/errors.py | 4 ++++ spacy/tokens/_serialize.py | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index f4fd3731f..c2bb36b93 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -712,6 +712,10 @@ class Errors: E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the " "token itself. To set the morph from this MorphAnalysis, set from " "the string value with: `token.set_morph(str(other_morph))`.") + E1014 = ("Error loading DocBin data. It doesn't look like the data is in " + "DocBin (.spacy) format. If your data is in spaCy v2's JSON " + "training format, convert it using `python -m spacy convert " + "file.json .`.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 11eb75821..821f55eb6 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -198,7 +198,10 @@ class DocBin: DOCS: https://nightly.spacy.io/api/docbin#from_bytes """ - msg = srsly.msgpack_loads(zlib.decompress(bytes_data)) + try: + msg = srsly.msgpack_loads(zlib.decompress(bytes_data)) + except zlib.error: + raise ValueError(Errors.E1014) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) lengths = numpy.frombuffer(msg["lengths"], dtype="int32") From d21d2c2e59fd2347fc0280e4388fa7fa50786f8d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Nov 2020 15:15:51 +0800 Subject: [PATCH 6/7] Don't multiply accuracy by 100 --- website/src/templates/models.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 17140b072..b9658dacd 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -120,7 +120,7 @@ function formatAccuracy(data) { ? null : { label, - value: (value * 100).toFixed(2), + value: value.toFixed(2), help: MODEL_META[label], } }) From 9beba7164f5aae9797243bcdb8bd963a95a6f557 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Nov 2020 15:17:14 +0800 Subject: [PATCH 7/7] Make jinja2 top-level import No problem anymore since it's now an official dependency --- requirements.txt | 2 +- spacy/cli/init_config.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3a777f163..074775ae8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.5.0,<1.7.0 +jinja2 # Official Python utilities setuptools packaging>=20.0 @@ -26,4 +27,3 @@ pytest>=4.6.5 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.5.0,<3.6.0 -jinja2 diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 9f73b17ae..ff11f97f6 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -5,6 +5,7 @@ from wasabi import Printer, diff_strings from thinc.api import Config import srsly import re +from jinja2 import Template from .. import util from ..language import DEFAULT_CONFIG_PRETRAIN_PATH @@ -127,10 +128,6 @@ def init_config( ) -> None: is_stdout = str(output_file) == "-" msg = Printer(no_print=is_stdout) - try: - from jinja2 import Template - except ImportError: - msg.fail("This command requires jinja2", "pip install jinja2", exits=1) with TEMPLATE_PATH.open("r") as f: template = Template(f.read()) # Filter out duplicates since tok2vec and transformer are added by template
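For context on this last patch, a minimal sketch of the top-level `Template` usage that `init_config` now relies on (the template string below is illustrative, not the real quickstart template):

```python
from jinja2 import Template

# Render an illustrative stand-in for the quickstart training template.
template = Template("pipeline = {{ names | join(', ') }}")
print(template.render(names=["tok2vec", "textcat"]))
# pipeline = tok2vec, textcat
```

Since jinja2 is now a hard dependency in requirements.txt, the import no longer needs the try/except guard that previously told users to `pip install jinja2`.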