Tidy up and auto-format

Ines Montani 2020-09-29 21:39:28 +02:00
parent 604be54a5c
commit fa47f87924
42 changed files with 71 additions and 92 deletions

View File

@@ -9,7 +9,8 @@ import sys
from ._util import app, Arg, Opt
from ..training import docs_to_json
from ..tokens import DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
from ..training.converters import conllu_to_docs
# Converters are matched by file extension except for ner/iob, which are

View File

@@ -27,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000
@debug_cli.command(
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
@app.command(
"debug-data",

View File

@@ -134,7 +134,7 @@ def update_dvc_config(
def run_dvc_commands(
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.

View File

@@ -3,8 +3,7 @@ from ...tokens import Token
class EnglishLemmatizer(Lemmatizer):
"""English lemmatizer. Only overrides is_base_form.
"""
"""English lemmatizer. Only overrides is_base_form."""
def is_base_form(self, token: Token) -> bool:
"""

View File

@@ -58,7 +58,7 @@ def noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps
)
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
if list(filter(filter_func, doc[left_bound.i : right.i],)):
if list(filter(filter_func, doc[left_bound.i : right.i])):
break
else:
right_bound = right

View File

@@ -108,8 +108,8 @@ _num_words = [
def like_num(text):
"""
Check if text resembles a number
"""
Check if text resembles a number
"""
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")

View File

@@ -17,7 +17,7 @@ use_pyvi = true
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)

View File

@@ -1189,7 +1189,7 @@ class Language:
# These are the settings provided in the [initialize] block in the config
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
init_vocab(
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
)
pretrain_cfg = config.get("pretraining")
if pretrain_cfg:

View File

@@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
def analyze_pipes(
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
"""Print a formatted summary for the current nlp object's pipeline. Shows
a table with the pipeline components and why they assign and require, as

View File

@@ -82,8 +82,7 @@ class AttributeRuler(Pipe):
matches = self.matcher(doc, allow_missing=True)
# Sort by the attribute ID, so that later rules have precedence
matches = [
(int(self.vocab.strings[m_id]), m_id, s, e)
for m_id, s, e in matches
(int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
]
matches.sort()
for attr_id, match_id, start, end in matches:
@@ -93,7 +92,7 @@ class AttributeRuler(Pipe):
try:
# The index can be negative, which makes it annoying to do
# the boundscheck. Let Span do it instead.
token = span[index]
token = span[index] # noqa: F841
except IndexError:
# The original exception is just our conditional logic, so we
# raise from.
@@ -103,7 +102,7 @@ class AttributeRuler(Pipe):
span=[t.text for t in span],
index=index,
)
) from None
) from None
set_token_attrs(span[index], attrs)
return doc

View File

@@ -67,7 +67,7 @@ class Lemmatizer(Pipe):
return {}
@classmethod
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups:
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
"""Load and validate lookups tables. If the provided lookups is None,
load the default lookups tables according to the language and mode
settings. Confirm that all required tables for the language and mode

View File

@@ -347,7 +347,7 @@ class TextCategorizer(Pipe):
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
labels: Optional[Dict] = None
labels: Optional[Dict] = None,
):
"""Initialize the pipe for training, using a representative set
of data examples.

View File

@@ -132,7 +132,7 @@ def validate_init_settings(
block = "initialize" if not section else f"initialize.{section}"
title = f"Error validating initialization settings in [{block}]"
raise ConfigValidationError(
title=title, errors=e.errors(), config=settings, parent=name,
title=title, errors=e.errors(), config=settings, parent=name
) from None

View File

@@ -32,9 +32,7 @@ class PRFScore:
def __add__(self, other):
return PRFScore(
tp=self.tp+other.tp,
fp=self.fp+other.fp,
fn=self.fn+other.fn
tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
)
def score_set(self, cand: set, gold: set) -> None:
@@ -485,7 +483,7 @@ class Scorer:
(pred_ent.start_char, pred_ent.end_char), None
)
label = gold_span.label_
if not label in f_per_type:
if label not in f_per_type:
f_per_type[label] = PRFScore()
gold = gold_span.kb_id_
# only evaluating entities that overlap between gold and pred,
@@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
continue
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
align_x2y = eg.alignment.x2y
preds = set()
for pred_ent in eg.x.ents:
if pred_ent.label_ not in scores:
scores[pred_ent.label_] = PRFScore()

View File

@@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab):
assert [t.is_sent_start for t in doc] == [True, False, True, False]
# heads override sent_starts
doc = Doc(
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4
)
assert [t.is_sent_start for t in doc] == [True, False, True, False]

View File

@@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_de(de_tokenizer):
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
"""
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed."""
doc = de_tokenizer("Er lag auf seinem")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_el(el_tokenizer):
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
"""
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed."""
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@@ -7,8 +7,7 @@ import pytest
def test_noun_chunks_is_parsed(en_tokenizer):
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
"""
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
doc = en_tokenizer("This is a sentence")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_es(es_tokenizer):
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
"""
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
doc = es_tokenizer("en Oxford este verano")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
"""
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed."""
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
with pytest.raises(ValueError):

View File

@@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
assert len(tokens) == 1
@pytest.mark.parametrize(
"text", ["janv.", "juill.", "Dr.", "av.", "sept."],
)
@pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
tokens = fr_tokenizer(text)
assert len(tokens) == 1

View File

@@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
"""
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
doc = fr_tokenizer("trouver des travaux antérieurs")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_id(id_tokenizer):
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
"""
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed."""
doc = id_tokenizer("sebelas")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
@pytest.mark.parametrize(
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS,
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
)
def test_ja_tokenizer_sub_tokens(
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c

View File

@@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
"""
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed."""
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer):
@pytest.mark.parametrize(
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)],
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)]
)
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
tokens = ne_tokenizer(text)

View File

@@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer):
@pytest.mark.parametrize(
"text,length",
[
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,),
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9),
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
],
)

View File

@@ -3,8 +3,7 @@ from spacy.tokens import Doc
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
"""
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed."""
doc = sv_tokenizer("Studenten läste den bästa boken")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@@ -254,14 +254,12 @@ def test_vocab_serialization(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities
q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
# adding aliases
douglas_hash = mykb.add_alias(
alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
)
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
candidates = mykb.get_alias_candidates("adam")

View File

@@ -226,6 +226,7 @@ def test_positive_class_not_binary():
with pytest.raises(ValueError):
verify_textcat_config(nlp, pipe_config)
def test_textcat_evaluation():
train_examples = []
nlp = English()
@@ -241,15 +242,17 @@ def test_textcat_evaluation():
pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
train_examples.append(Example(pred2, ref2))
scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
scores = Scorer().score_cats(
train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]
)
assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2
assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1
assert scores["cats_f_per_type"]["summer"]["p"] == 0
assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
assert scores["cats_f_per_type"]["summer"]["r"] == 0 / 1
assert scores["cats_f_per_type"]["spring"]["p"] == 1 / 1
assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 2
assert scores["cats_f_per_type"]["autumn"]["p"] == 2 / 2
assert scores["cats_f_per_type"]["autumn"]["r"] == 2 / 2
assert scores["cats_micro_p"] == 4/5
assert scores["cats_micro_r"] == 4/6
assert scores["cats_micro_p"] == 4 / 5
assert scores["cats_micro_r"] == 4 / 6

View File

@@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
encode_config["width"] = width
docs = get_batch(3)
tok2vec = build_Tok2Vec_model(
embed_arch(**embed_config),
encode_arch(**encode_config)
embed_arch(**embed_config), encode_arch(**encode_config)
)
tok2vec.initialize(docs)
vectors, backprop = tok2vec.begin_update(docs)

View File

@@ -229,9 +229,7 @@ def test_issue3611():
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
def test_issue3625():
@@ -390,7 +388,7 @@ def test_issue3959():
def test_issue3962(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
"""Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
@@ -428,7 +426,7 @@ def test_issue3962(en_vocab):
def test_issue3962_long(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
"""Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
@@ -463,8 +461,7 @@ def test_issue3962_long(en_vocab):
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])

View File

@@ -19,8 +19,7 @@ from ..util import make_tempdir
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
"""
"""Test that the PhraseMatcher can match on overwritten NORM attributes."""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
@@ -72,9 +71,7 @@ def test_issue4030():
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0

View File

@@ -7,7 +7,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
from thinc.api import ConfigValidationError, Config
from thinc.api import ConfigValidationError
import srsly
import os

View File

@@ -290,9 +290,7 @@ def test_spacy_blank():
assert nlp.meta["name"] == "my_custom_model"
@pytest.mark.parametrize(
"value", [False, None, ["x", "y"], Language, Vocab],
)
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
def test_language_init_invalid_vocab(value):
err_fragment = "invalid value"
with pytest.raises(ValueError) as e:

View File

@@ -64,7 +64,7 @@ def get_tok2vec_kwargs():
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
),
"encode": MaxoutWindowEncoder(
width=32, depth=2, maxout_pieces=2, window_size=1,
width=32, depth=2, maxout_pieces=2, window_size=1
),
}

View File

@@ -137,7 +137,7 @@ def test_las_per_type(en_vocab):
examples = []
for input_, annot in test_las_apple:
doc = Doc(
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
)
gold = {"heads": annot["heads"], "deps": annot["deps"]}
example = Example.from_dict(doc, gold)

View File

@@ -496,8 +496,10 @@ def test_make_orth_variants(doc):
output_file = tmpdir / "roundtrip.spacy"
DocBin(docs=[doc]).to_disk(output_file)
# due to randomness, test only that this runs with no errors for now
reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5))
train_examples = list(reader(nlp))
reader = Corpus(
output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
)
list(reader(nlp))
@pytest.mark.skip("Outdated")

View File

@@ -23,7 +23,7 @@ def dont_augment(nlp, example):
yield example
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0):
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
if random.random() >= level:
yield example
else:
@@ -36,14 +36,14 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.
nlp,
raw_text,
orig_dict["token_annotation"],
lower=raw_text is not None and random.random() < lower
lower=raw_text is not None and random.random() < lower,
)
doc = nlp.make_doc(variant_text)
orig_dict["token_annotation"] = variant_token_annot
yield example.from_dict(doc, orig_dict)
def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False):
def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
orig_token_dict = copy.deepcopy(token_dict)
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
ndsv = orth_variants.get("single", [])

View File

@@ -188,8 +188,8 @@ def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
"""RETURNS (List[str]): All sourced components in the original config,
e.g. {"source": "en_core_web_sm"}. If the config contains a key
"factory", we assume it refers to a component factory.
e.g. {"source": "en_core_web_sm"}. If the config contains a key
"factory", we assume it refers to a component factory.
"""
return [
name

View File

@@ -94,7 +94,7 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
def _resume_model(
model: Model, resume_path: Path, epoch_resume: int, silent: bool = True,
model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
) -> None:
msg = Printer(no_print=silent)
msg.info(f"Resume training tok2vec from: {resume_path}")

View File

@@ -488,7 +488,7 @@ def load_config_from_str(
RETURNS (Config): The loaded config.
"""
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
text, overrides=overrides, interpolate=interpolate,
text, overrides=overrides, interpolate=interpolate
)