Tidy up and auto-format

Ines Montani 2020-09-21 10:59:07 +02:00
parent 9d32cac736
commit 1114219ae3
14 changed files with 69 additions and 46 deletions

View File

@@ -6,7 +6,6 @@ from wasabi import msg
 import srsly
 import hashlib
 import typer
-import subprocess
 from click import NoSuchOption
 from typer.main import get_command
 from contextlib import contextmanager
@@ -327,7 +326,7 @@ def git_checkout(
         )
     with make_tempdir() as tmp_dir:
         cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
-        ret = run_command(cmd, capture=True)
+        run_command(cmd, capture=True)
         # We need Path(name) to make sure we also support subdirectories
         shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))

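The dropped `ret =` binding was unused. For context, a minimal standalone sketch of the same clone-and-copy flow, written against plain subprocess rather than spaCy's run_command helper (the function name and defaults below are hypothetical):

import shutil
import subprocess
import tempfile
from pathlib import Path

def checkout_subdir(repo: str, subpath: str, dest: Path, branch: str = "master") -> None:
    # Clone the branch into a throwaway directory, then copy one
    # subdirectory out, mirroring git_checkout above.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # The CompletedProcess return value is not needed: check=True
        # already raises CalledProcessError on a non-zero exit.
        subprocess.run(
            ["git", "-C", tmp_dir, "clone", repo, ".", "-b", branch],
            check=True,
            capture_output=True,
        )
        shutil.copytree(str(Path(tmp_dir) / subpath), str(dest))
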
View File

@@ -156,11 +156,7 @@ class Language:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(
-                self.lang,
-                self.Defaults,
-                vectors_name=vectors_name,
-            )
+            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -1462,7 +1458,7 @@ class Language:
         # here :(
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i+1:]:
+                for name2, proc2 in self.pipeline[i + 1 :]:
                     if isinstance(getattr(proc2, "model", None), Model):
                         proc1.find_listeners(proc2.model)

View File

@@ -164,7 +164,9 @@ def MultiHashEmbed(
 @registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
+def CharacterEmbed(
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is
@@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                 ),
                 StaticVectors(width, dropout=0.0),
             ),
-            with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
         )
     else:
         model = chain(
             concatenate(
@@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
             ),
-            with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
         )
     return model

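The architecture itself is unchanged by the reformat. A hedged usage sketch of the registered factory (the registry name and signature come from the hunk above; the hyperparameter values are made up):

from spacy.lang.en import English
from spacy.util import registry

nlp = English()
docs = [nlp.make_doc("spaCy embeds characters")]
# Look up the registered architecture and build it; width/rows/nM/nC are
# illustrative values only.
char_embed = registry.architectures.get("spacy.CharacterEmbed.v1")
model = char_embed(width=128, rows=7000, nM=64, nC=8, also_use_static_vectors=False)
model.initialize(X=docs)
arrays = model.predict(docs)  # one (n_tokens, width) array per doc
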
View File

@@ -1,4 +1,4 @@
-from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
@@ -255,7 +255,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
     # TODO: use a more detailed schema for this?
     objective: Dict[str, Any] = Field(..., title="Pretraining objective")
     # fmt: on

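As a refresher on how these pydantic fields behave, a small self-contained sketch of validating a config section (field names taken from the hunk above; the example values are hypothetical):

from typing import Any, Dict
from pydantic import BaseModel, Field

class PretrainSection(BaseModel):
    # Subset of ConfigSchemaPretrain's fields, for illustration only.
    component: str = Field(..., title="Component to find the layer to pretrain")
    layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
    objective: Dict[str, Any] = Field(..., title="Pretraining objective")

section = PretrainSection(
    component="tok2vec", layer="", objective={"type": "characters", "n_characters": 4}
)
assert section.component == "tok2vec"
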
View File

@@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
 def test_spans_lca_matrix(en_tokenizer):
     """Test span's lca matrix generation"""
     tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=[2, 1, 1, 0],
+        deps=["dep"] * 4,
+    )
     lca = doc[:2].get_lca_matrix()
     assert lca.shape == (2, 2)
     assert lca[0, 0] == 0  # the & the -> the

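The expected value follows from the tree implied by heads=[2, 1, 1, 0], where each entry is an offset from a token to its head. A hedged standalone sketch of the lowest-common-ancestor logic (not spaCy's implementation):

# "the lazy dog slept" with head offsets [2, 1, 1, 0] gives absolute
# heads [2, 2, 3, 3]: "the" and "lazy" attach to "dog", "dog" to "slept",
# and "slept" heads itself (the root).
def lca(i: int, j: int, heads: list) -> int:
    def ancestors(k: int) -> list:
        chain = [k]
        while heads[k] != k:
            k = heads[k]
            chain.append(k)
        return chain

    common = set(ancestors(j))
    return next(node for node in ancestors(i) if node in common)

heads = [2, 2, 3, 3]
assert lca(0, 0, heads) == 0  # the & the -> the
assert lca(0, 1, heads) == 2  # the & lazy -> dog
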
View File

@@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=["dep"] * len(heads),
+    )
     lefts = {}
     rights = {}

View File

@@ -345,10 +345,7 @@ def test_language_factories_invalid():
             [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
             {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        (
-            [{"a": 0.5, "b": 0.5}, {"b": 1.0}],
-            {"a": 0.25, "b": 0.75},
-        ),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
     ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
@@ -363,16 +360,10 @@ def test_language_factories_scores():
     weights1 = {"a1": 0.5, "a2": 0.5}
     weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
     Language.factory(
-        f"{name}1",
-        scores=list(weights1),
-        default_score_weights=weights1,
-        func=func,
+        f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
     )
     Language.factory(
-        f"{name}2",
-        scores=list(weights2),
-        default_score_weights=weights2,
-        func=func,
+        f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
     )
     meta1 = Language.get_factory_meta(f"{name}1")
     assert meta1.default_score_weights == weights1

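The parametrized cases above pin down the combination rule: each component's weights are normalized to sum to one, then every component gets an equal share. A hedged re-implementation of that arithmetic (not spaCy's actual function):

# Each dict is normalized to sum to 1.0, then scaled by 1/n so shared
# keys accumulate across components.
def combine(weights_list):
    combined = {}
    for weights in weights_list:
        total = sum(weights.values())
        for key, value in weights.items():
            combined[key] = combined.get(key, 0.0) + value / total / len(weights_list)
    return combined

assert combine([{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}]) == {
    "a": 0.1,
    "b": 0.4,
    "c": 0.25,
    "d": 0.25,
}
assert combine([{"a": 0.5, "b": 0.5}, {"b": 1.0}]) == {"a": 0.25, "b": 0.75}
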
View File

@@ -212,9 +212,17 @@ def test_issue1834():
         heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
         deps=["dep"] * len(words),
     )
-    print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
+    print(
+        doc.has_annotation("DEP"),
+        [t.head.i for t in doc],
+        [t.is_sent_start for t in doc],
+    )
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
+    print(
+        new_doc.has_annotation("DEP"),
+        [t.head.i for t in new_doc],
+        [t.is_sent_start for t in new_doc],
+    )
     assert new_doc[6].sent_start
     assert new_doc.has_annotation("DEP")
     assert new_doc.has_annotation("TAG")

View File

@@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
-    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None)
+    textcat = TextCategorizer(
+        en_vocab,
+        model,
+        labels=["ENTITY", "ACTION", "MODIFIER"],
+        threshold=0.5,
+        positive_label=None,
+    )
     textcat.to_bytes(exclude=["vocab"])

View File

@@ -3,7 +3,6 @@ from click import NoSuchOption
 from spacy.training import docs_to_json, biluo_tags_from_offsets
 from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides

View File

@@ -291,8 +291,7 @@ def test_spacy_blank():
 @pytest.mark.parametrize(
-    "value",
-    [False, None, ["x", "y"], Language, Vocab],
+    "value", [False, None, ["x", "y"], Language, Vocab],
 )
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"

View File

@@ -95,7 +95,7 @@ def test_util_dot_section():
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
     assert en_config["nlp"]["pipeline"] == ["textcat"]
-    assert nl_config["nlp"]["pipeline"] == [] # default value []
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")

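The KeyError expectation makes sense given how dotted paths resolve: "nlp.pipeline" is a list, so the further segment "tagger" cannot be looked up. A hedged mini-version of the idea (spaCy's real dot_to_object lives in spacy.util; this is not its code):

# Walk a nested config by a dotted path; any unresolvable segment
# (including indexing a list with a string) becomes a KeyError.
def dot_to_object(config, path):
    node = config
    for segment in path.split("."):
        try:
            node = node[segment]
        except (KeyError, TypeError, IndexError):
            raise KeyError(f"Can't resolve {path!r} at segment {segment!r}")
    return node

config = {"nlp": {"pipeline": ["textcat"]}}
assert dot_to_object(config, "nlp.pipeline") == ["textcat"]
# dot_to_object(config, "nlp.pipeline.tagger") raises KeyError
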
View File

@@ -1,7 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
-from spacy import Language
 from spacy.util import load_model_from_config, registry, dot_to_object
 from spacy.training import Example
@@ -10,19 +9,19 @@ from spacy.training import Example
 def test_readers():
     config_string = """
-    [training]
+    [corpora]
     @readers = "myreader.v1"

     [nlp]
     lang = "en"
     pipeline = ["tok2vec", "textcat"]

     [components]

     [components.tok2vec]
     factory = "tok2vec"

     [components.textcat]
     factory = "textcat"
     """
@@ -69,19 +68,19 @@ def test_readers():
 def test_cat_readers(reader, additional_config):
     nlp_config_string = """
-    [training]
+    [corpora]
     @readers = "PLACEHOLDER"

     [nlp]
     lang = "en"
     pipeline = ["tok2vec", "textcat"]

     [components]

     [components.tok2vec]
     factory = "tok2vec"

     [components.textcat]
     factory = "textcat"
     """

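The "myreader.v1" and "PLACEHOLDER" names refer to readers presumably registered in the test setup. A hedged sketch of what such a registered corpus reader can look like (the body here is made up):

from typing import Callable, Iterable
from spacy.training import Example
from spacy.util import registry

@registry.readers("myreader.v1")
def myreader() -> Callable:
    # A corpus reader is a zero-argument factory returning a callable that
    # takes the nlp object and yields Example objects; this toy version
    # yields a single textcat example.
    def read(nlp) -> Iterable[Example]:
        doc = nlp.make_doc("Quick test")
        yield Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})

    return read
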
View File

@@ -34,7 +34,17 @@ def doc():
     # fmt: on
     nlp = English()
     words = [t.text for t in nlp.make_doc(text)]
-    doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
+    doc = get_doc(
+        nlp.vocab,
+        words=words,
+        tags=tags,
+        pos=pos,
+        morphs=morphs,
+        heads=heads,
+        deps=deps,
+        lemmas=lemmas,
+        ents=ents,
+    )
     doc.cats = cats
     return doc