Tidy up and auto-format

Ines Montani 2020-09-21 10:59:07 +02:00
parent 9d32cac736
commit 1114219ae3
14 changed files with 69 additions and 46 deletions

View File

@@ -6,7 +6,6 @@ from wasabi import msg
 import srsly
 import hashlib
 import typer
-import subprocess
 from click import NoSuchOption
 from typer.main import get_command
 from contextlib import contextmanager
@@ -327,7 +326,7 @@ def git_checkout(
     )
     with make_tempdir() as tmp_dir:
         cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
-        ret = run_command(cmd, capture=True)
+        run_command(cmd, capture=True)
         # We need Path(name) to make sure we also support subdirectories
         shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
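
Aside: the second hunk removes an assignment whose value was never read. As a rough sketch of why that is safe, here is a hypothetical stand-in for the run_command helper (not spaCy's actual implementation), assuming it raises on failure rather than signalling errors through its return value:

    import subprocess

    def run_command(command: str, capture: bool = False):
        # Hypothetical helper: run the command, optionally capture output,
        # and raise CalledProcessError on a non-zero exit code.
        return subprocess.run(
            command.split(), capture_output=capture, text=True, check=True
        )

    # With errors raised instead of returned, an unused "ret = ..." binding
    # adds nothing and can simply be dropped.
    run_command("git --version", capture=True)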

View File

@@ -156,11 +156,7 @@ class Language:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(
-                self.lang,
-                self.Defaults,
-                vectors_name=vectors_name,
-            )
+            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -1462,7 +1458,7 @@ class Language:
         # here :(
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i+1:]:
+                for name2, proc2 in self.pipeline[i + 1 :]:
                     if isinstance(getattr(proc2, "model", None), Model):
                         proc1.find_listeners(proc2.model)

View File

@@ -164,7 +164,9 @@ def MultiHashEmbed(
 @registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
+def CharacterEmbed(
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is
@@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                 ),
                 StaticVectors(width, dropout=0.0),
             ),
-            with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
         )
     else:
         model = chain(
             concatenate(
@@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
             ),
-            with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
         )
     return model
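
Aside: once an architecture is registered as above, it can be resolved by name and called to build a Thinc model. A minimal sketch, with made-up parameter values (these are not defaults from this commit):

    from spacy.util import registry

    # Look up the architecture registered under "spacy.CharacterEmbed.v1"
    # and instantiate it; the argument values are illustrative only.
    char_embed = registry.architectures.get("spacy.CharacterEmbed.v1")
    model = char_embed(width=128, rows=7000, nM=64, nC=8, also_use_static_vectors=False)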

View File

@@ -1,4 +1,4 @@
-from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator

View File

@@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
 def test_spans_lca_matrix(en_tokenizer):
     """Test span's lca matrix generation"""
     tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=[2, 1, 1, 0],
+        deps=["dep"] * 4,
+    )
     lca = doc[:2].get_lca_matrix()
     assert lca.shape == (2, 2)
     assert lca[0, 0] == 0  # the & the -> the

View File

@@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=["dep"] * len(heads),
+    )
     lefts = {}
     rights = {}

View File

@@ -345,10 +345,7 @@ def test_language_factories_invalid():
             [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
             {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        (
-            [{"a": 0.5, "b": 0.5}, {"b": 1.0}],
-            {"a": 0.25, "b": 0.75},
-        ),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
     ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
@@ -363,16 +360,10 @@ def test_language_factories_scores():
     weights1 = {"a1": 0.5, "a2": 0.5}
     weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
     Language.factory(
-        f"{name}1",
-        scores=list(weights1),
-        default_score_weights=weights1,
-        func=func,
+        f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
     )
     Language.factory(
-        f"{name}2",
-        scores=list(weights2),
-        default_score_weights=weights2,
-        func=func,
+        f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
     )
     meta1 = Language.get_factory_meta(f"{name}1")
     assert meta1.default_score_weights == weights1

View File

@@ -212,9 +212,17 @@ def test_issue1834():
         heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
         deps=["dep"] * len(words),
     )
-    print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
+    print(
+        doc.has_annotation("DEP"),
+        [t.head.i for t in doc],
+        [t.is_sent_start for t in doc],
+    )
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
+    print(
+        new_doc.has_annotation("DEP"),
+        [t.head.i for t in new_doc],
+        [t.is_sent_start for t in new_doc],
+    )
     assert new_doc[6].sent_start
     assert new_doc.has_annotation("DEP")
     assert new_doc.has_annotation("TAG")
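
Aside: the test above relies on the to_bytes()/from_bytes() round trip preserving annotations. A minimal standalone sketch of that serialization pattern (not taken from the commit):

    from spacy.lang.en import English
    from spacy.tokens import Doc

    nlp = English()
    doc = nlp.make_doc("This is a sentence.")
    # Serialize the Doc and restore it into a fresh Doc sharing the vocab.
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert [t.text for t in new_doc] == [t.text for t in doc]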

View File

@@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
-    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None)
+    textcat = TextCategorizer(
+        en_vocab,
+        model,
+        labels=["ENTITY", "ACTION", "MODIFIER"],
+        threshold=0.5,
+        positive_label=None,
+    )
     textcat.to_bytes(exclude=["vocab"])

View File

@@ -3,7 +3,6 @@ from click import NoSuchOption
 from spacy.training import docs_to_json, biluo_tags_from_offsets
 from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
-from spacy.lang.en import English
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides

View File

@@ -291,8 +291,7 @@ def test_spacy_blank():
 @pytest.mark.parametrize(
-    "value",
-    [False, None, ["x", "y"], Language, Vocab],
+    "value", [False, None, ["x", "y"], Language, Vocab],
 )
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"

View File

@@ -95,7 +95,7 @@ def test_util_dot_section():
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
     assert en_config["nlp"]["pipeline"] == ["textcat"]
-    assert nl_config["nlp"]["pipeline"] == [] # default value []
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
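
Aside: dot_to_object resolves a dotted path against a nested config, which is what the KeyError assertion above exercises. A quick sketch with made-up data:

    from spacy.util import dot_to_object

    config = {"nlp": {"pipeline": ["textcat"]}}
    # A valid path returns the nested value; an invalid one raises KeyError.
    assert dot_to_object(config, "nlp.pipeline") == ["textcat"]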

View File

@@ -1,7 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
 from spacy import Language
 from spacy.util import load_model_from_config, registry, dot_to_object
 from spacy.training import Example

View File

@@ -34,7 +34,17 @@ def doc():
     # fmt: on
     nlp = English()
     words = [t.text for t in nlp.make_doc(text)]
-    doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
+    doc = get_doc(
+        nlp.vocab,
+        words=words,
+        tags=tags,
+        pos=pos,
+        morphs=morphs,
+        heads=heads,
+        deps=deps,
+        lemmas=lemmas,
+        ents=ents,
+    )
     doc.cats = cats
     return doc