From 853edace37af044e21b0631d8d35ede18d16a482 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 14:11:06 +0200 Subject: [PATCH 1/3] fix MultiHashEmbed example in documentation --- spacy/ml/models/tok2vec.py | 2 +- website/docs/usage/embeddings-transformers.md | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 23cfe883b..1a78cf75e 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -110,7 +110,7 @@ def MultiHashEmbed( The features used can be configured with the 'attrs' argument. The suggested attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into - account some subword information, without construction a fully character-based + account some subword information, without constructing a fully character-based representation. If pretrained vectors are available, they can be included in the representation as well, with the vectors table will be kept static (i.e. it's not updated). diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 73540b3d3..856685dad 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -516,16 +516,14 @@ Many neural network models are able to use word vector tables as additional features, which sometimes results in significant improvements in accuracy. spaCy's built-in embedding layer, [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use -word vector tables using the `also_use_static_vectors` flag. This setting is -also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) -layer, which builds the default token-to-vector encoding architecture. +word vector tables using the `include_static_vectors` flag. 
```ini [tagger.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" width = 128 -rows = 7000 -also_embed_subwords = true +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +rows = [7000, 3500, 3500, 3500] -also_use_static_vectors = true +include_static_vectors = true ``` From 040c7c054125d32da2af9c73f604b811e6ae0d97 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 15:40:58 +0200 Subject: [PATCH 2/3] fix get_dim calls in build_simple_cnn_text_classifier --- spacy/ml/models/textcat.py | 4 ++-- spacy/util.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 1117b4fde..ec8998e2d 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -24,11 +24,11 @@ def build_simple_cnn_text_classifier( """ with Model.define_operators({">>": chain}): if exclusive_classes: - output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO")) + output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer model.set_ref("output_layer", output_layer) else: - linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO")) + linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) model = ( tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() ) diff --git a/spacy/util.py b/spacy/util.py index 3d567a425..47fbcce1c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -622,7 +622,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]: if not path.parent.exists(): raise IOError(Errors.E052.format(path=path.parent)) if not path.exists() or not path.is_file(): - raise IOError(Errors.E053.format(path=path, name="meta.json")) + raise IOError(Errors.E053.format(path=path.parent, name="meta.json")) meta = srsly.read_json(path) for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: From e972ecba727a35d59080dc0e217faa02044abb4e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 16:03:14 +0200 Subject: 
[PATCH 3/3] add utf8 encoding for opening file --- spacy/cli/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e4559929e..8413c639b 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -253,7 +253,7 @@ def _get_converter(msg, converter, input_path): if converter == "auto": converter = input_path.suffix[1:] if converter == "ner" or converter == "iob": - with input_path.open() as file_: + with input_path.open(encoding="utf8") as file_: input_data = file_.read() converter_autodetect = autodetect_ner_format(input_data) if converter_autodetect == "ner":