diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index e4559929e..8413c639b 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -253,7 +253,7 @@ def _get_converter(msg, converter, input_path):
     if converter == "auto":
         converter = input_path.suffix[1:]
     if converter == "ner" or converter == "iob":
-        with input_path.open() as file_:
+        with input_path.open(encoding="utf8") as file_:
             input_data = file_.read()
         converter_autodetect = autodetect_ner_format(input_data)
         if converter_autodetect == "ner":
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 1117b4fde..ec8998e2d 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -24,11 +24,11 @@ def build_simple_cnn_text_classifier(
     """
     with Model.define_operators({">>": chain}):
         if exclusive_classes:
-            output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO"))
+            output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
             model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
-            linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
+            linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
             model = (
                 tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
             )
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 6ef7b2325..95e200927 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -110,7 +110,7 @@ def MultiHashEmbed(
 
     The features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
-    account some subword information, without construction a fully character-based
+    account some subword information, without constructing a fully character-based
     representation. If pretrained vectors are available, they can be included in
     the representation as well, with the vectors table will be kept static
     (i.e. it's not updated).
diff --git a/spacy/util.py b/spacy/util.py
index 3d567a425..47fbcce1c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -622,7 +622,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
     if not path.parent.exists():
        raise IOError(Errors.E052.format(path=path.parent))
     if not path.exists() or not path.is_file():
-        raise IOError(Errors.E053.format(path=path, name="meta.json"))
+        raise IOError(Errors.E053.format(path=path.parent, name="meta.json"))
     meta = srsly.read_json(path)
     for setting in ["lang", "name", "version"]:
         if setting not in meta or not meta[setting]:
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index d104db86a..093b0c137 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -516,9 +516,7 @@ Many neural network models are able to use word vector tables as additional
 features, which sometimes results in significant improvements in accuracy.
 spaCy's built-in embedding layer,
 [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
-word vector tables using the `include_static_vectors` flag. This setting is
-also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
-layer, which builds the default token-to-vector encoding architecture.
+word vector tables using the `include_static_vectors` flag.
 
 ```ini
 [tagger.model.tok2vec.embed]
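
Note on the textcat.py change (not part of the patch): the switch from `get_dim` to `maybe_get_dim` matters because Thinc's `Model.get_dim` raises for a dimension that has not been set yet, while `maybe_get_dim` returns None, which lets layer construction defer width inference until initialization. A minimal sketch of that Thinc v8 behavior, using a `Linear` layer whose "nI" is deliberately left unset:

    # Assumes Thinc v8 is installed; "nI" is intentionally unset here.
    from thinc.api import Linear

    layer = Linear(nO=5)
    print(layer.maybe_get_dim("nI"))  # -> None: safe before initialization
    try:
        layer.get_dim("nI")  # raises ValueError because "nI" is unset
    except ValueError as err:
        print(err)

With `maybe_get_dim`, build_simple_cnn_text_classifier can therefore accept a tok2vec layer whose output width is still unknown and let it be filled in when the model is initialized on data.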