Tidy up and auto-format

This commit is contained in:
Ines Montani 2020-09-29 21:39:28 +02:00
parent 604be54a5c
commit fa47f87924
42 changed files with 71 additions and 92 deletions

View File

@ -9,7 +9,8 @@ import sys
from ._util import app, Arg, Opt from ._util import app, Arg, Opt
from ..training import docs_to_json from ..training import docs_to_json
from ..tokens import DocBin from ..tokens import DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
from ..training.converters import conllu_to_docs
# Converters are matched by file extension except for ner/iob, which are # Converters are matched by file extension except for ner/iob, which are

View File

@ -27,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000
@debug_cli.command( @debug_cli.command(
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
) )
@app.command( @app.command(
"debug-data", "debug-data",

View File

@ -134,7 +134,7 @@ def update_dvc_config(
def run_dvc_commands( def run_dvc_commands(
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}, commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
) -> None: ) -> None:
"""Run a sequence of DVC commands in a subprocess, in order. """Run a sequence of DVC commands in a subprocess, in order.

View File

@ -3,8 +3,7 @@ from ...tokens import Token
class EnglishLemmatizer(Lemmatizer): class EnglishLemmatizer(Lemmatizer):
"""English lemmatizer. Only overrides is_base_form. """English lemmatizer. Only overrides is_base_form."""
"""
def is_base_form(self, token: Token) -> bool: def is_base_form(self, token: Token) -> bool:
""" """

View File

@ -58,7 +58,7 @@ def noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps doc, token, np_left_deps, np_right_deps, stop_deps
) )
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
if list(filter(filter_func, doc[left_bound.i : right.i],)): if list(filter(filter_func, doc[left_bound.i : right.i])):
break break
else: else:
right_bound = right right_bound = right

View File

@ -17,7 +17,7 @@ use_pyvi = true
@registry.tokenizers("spacy.vi.VietnameseTokenizer") @registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True,): def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp): def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi) return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)

View File

@ -1189,7 +1189,7 @@ class Language:
# These are the settings provided in the [initialize] block in the config # These are the settings provided in the [initialize] block in the config
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
init_vocab( init_vocab(
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"], self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
) )
pretrain_cfg = config.get("pretraining") pretrain_cfg = config.get("pretraining")
if pretrain_cfg: if pretrain_cfg:

View File

@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
def analyze_pipes( def analyze_pipes(
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS, nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
"""Print a formatted summary for the current nlp object's pipeline. Shows """Print a formatted summary for the current nlp object's pipeline. Shows
a table with the pipeline components and what they assign and require, as a table with the pipeline components and what they assign and require, as

View File

@ -82,8 +82,7 @@ class AttributeRuler(Pipe):
matches = self.matcher(doc, allow_missing=True) matches = self.matcher(doc, allow_missing=True)
# Sort by the attribute ID, so that later rules have precedence # Sort by the attribute ID, so that later rules have precedence
matches = [ matches = [
(int(self.vocab.strings[m_id]), m_id, s, e) (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
for m_id, s, e in matches
] ]
matches.sort() matches.sort()
for attr_id, match_id, start, end in matches: for attr_id, match_id, start, end in matches:
@ -93,7 +92,7 @@ class AttributeRuler(Pipe):
try: try:
# The index can be negative, which makes it annoying to do # The index can be negative, which makes it annoying to do
# the boundscheck. Let Span do it instead. # the boundscheck. Let Span do it instead.
token = span[index] token = span[index] # noqa: F841
except IndexError: except IndexError:
# The original exception is just our conditional logic, so we # The original exception is just our conditional logic, so we
# raise from. # raise from.

View File

@ -67,7 +67,7 @@ class Lemmatizer(Pipe):
return {} return {}
@classmethod @classmethod
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups: def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
"""Load and validate lookups tables. If the provided lookups is None, """Load and validate lookups tables. If the provided lookups is None,
load the default lookups tables according to the language and mode load the default lookups tables according to the language and mode
settings. Confirm that all required tables for the language and mode settings. Confirm that all required tables for the language and mode

View File

@ -347,7 +347,7 @@ class TextCategorizer(Pipe):
get_examples: Callable[[], Iterable[Example]], get_examples: Callable[[], Iterable[Example]],
*, *,
nlp: Optional[Language] = None, nlp: Optional[Language] = None,
labels: Optional[Dict] = None labels: Optional[Dict] = None,
): ):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.

View File

@ -132,7 +132,7 @@ def validate_init_settings(
block = "initialize" if not section else f"initialize.{section}" block = "initialize" if not section else f"initialize.{section}"
title = f"Error validating initialization settings in [{block}]" title = f"Error validating initialization settings in [{block}]"
raise ConfigValidationError( raise ConfigValidationError(
title=title, errors=e.errors(), config=settings, parent=name, title=title, errors=e.errors(), config=settings, parent=name
) from None ) from None

View File

@ -32,9 +32,7 @@ class PRFScore:
def __add__(self, other): def __add__(self, other):
return PRFScore( return PRFScore(
tp=self.tp+other.tp, tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
fp=self.fp+other.fp,
fn=self.fn+other.fn
) )
def score_set(self, cand: set, gold: set) -> None: def score_set(self, cand: set, gold: set) -> None:
@ -485,7 +483,7 @@ class Scorer:
(pred_ent.start_char, pred_ent.end_char), None (pred_ent.start_char, pred_ent.end_char), None
) )
label = gold_span.label_ label = gold_span.label_
if not label in f_per_type: if label not in f_per_type:
f_per_type[label] = PRFScore() f_per_type[label] = PRFScore()
gold = gold_span.kb_id_ gold = gold_span.kb_id_
# only evaluating entities that overlap between gold and pred, # only evaluating entities that overlap between gold and pred,
@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
continue continue
golds = {(e.label_, e.start, e.end) for e in eg.y.ents} golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
align_x2y = eg.alignment.x2y align_x2y = eg.alignment.x2y
preds = set()
for pred_ent in eg.x.ents: for pred_ent in eg.x.ents:
if pred_ent.label_ not in scores: if pred_ent.label_ not in scores:
scores[pred_ent.label_] = PRFScore() scores[pred_ent.label_] = PRFScore()

View File

@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab):
assert [t.is_sent_start for t in doc] == [True, False, True, False] assert [t.is_sent_start for t in doc] == [True, False, True, False]
# heads override sent_starts # heads override sent_starts
doc = Doc( doc = Doc(
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4, en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4
) )
assert [t.is_sent_start for t in doc] == [True, False, True, False] assert [t.is_sent_start for t in doc] == [True, False, True, False]

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_de(de_tokenizer): def test_noun_chunks_is_parsed_de(de_tokenizer):
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed."""
"""
doc = de_tokenizer("Er lag auf seinem") doc = de_tokenizer("Er lag auf seinem")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_el(el_tokenizer): def test_noun_chunks_is_parsed_el(el_tokenizer):
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed."""
"""
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -7,8 +7,7 @@ import pytest
def test_noun_chunks_is_parsed(en_tokenizer): def test_noun_chunks_is_parsed(en_tokenizer):
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
"""
doc = en_tokenizer("This is a sentence") doc = en_tokenizer("This is a sentence")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_es(es_tokenizer): def test_noun_chunks_is_parsed_es(es_tokenizer):
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
"""
doc = es_tokenizer("en Oxford este verano") doc = es_tokenizer("en Oxford este verano")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_fa(fa_tokenizer): def test_noun_chunks_is_parsed_fa(fa_tokenizer):
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed."""
"""
doc = fa_tokenizer("این یک جمله نمونه می باشد.") doc = fa_tokenizer("این یک جمله نمونه می باشد.")
with pytest.raises(ValueError): with pytest.raises(ValueError):

View File

@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
assert len(tokens) == 1 assert len(tokens) == 1
@pytest.mark.parametrize( @pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
"text", ["janv.", "juill.", "Dr.", "av.", "sept."],
)
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text): def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
tokens = fr_tokenizer(text) tokens = fr_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_fr(fr_tokenizer): def test_noun_chunks_is_parsed_fr(fr_tokenizer):
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
"""
doc = fr_tokenizer("trouver des travaux antérieurs") doc = fr_tokenizer("trouver des travaux antérieurs")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_id(id_tokenizer): def test_noun_chunks_is_parsed_id(id_tokenizer):
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed."""
"""
doc = id_tokenizer("sebelas") doc = id_tokenizer("sebelas")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS, "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
) )
def test_ja_tokenizer_sub_tokens( def test_ja_tokenizer_sub_tokens(
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c

View File

@ -2,8 +2,7 @@ import pytest
def test_noun_chunks_is_parsed_nb(nb_tokenizer): def test_noun_chunks_is_parsed_nb(nb_tokenizer):
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed."""
"""
doc = nb_tokenizer("Smørsausen brukes bl.a. til") doc = nb_tokenizer("Smørsausen brukes bl.a. til")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)], "text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)]
) )
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length): def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
tokens = ne_tokenizer(text) tokens = ne_tokenizer(text)

View File

@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,length", "text,length",
[ [
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,), ("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9),
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6), ("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
], ],
) )

View File

@ -3,8 +3,7 @@ from spacy.tokens import Doc
def test_noun_chunks_is_parsed_sv(sv_tokenizer): def test_noun_chunks_is_parsed_sv(sv_tokenizer):
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed."""
"""
doc = sv_tokenizer("Studenten läste den bästa boken") doc = sv_tokenizer("Studenten läste den bästa boken")
with pytest.raises(ValueError): with pytest.raises(ValueError):
list(doc.noun_chunks) list(doc.noun_chunks)

View File

@ -254,14 +254,12 @@ def test_vocab_serialization(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities # adding entities
q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2]) q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3]) mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
# adding aliases # adding aliases
douglas_hash = mykb.add_alias( mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
)
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
candidates = mykb.get_alias_candidates("adam") candidates = mykb.get_alias_candidates("adam")

View File

@ -226,6 +226,7 @@ def test_positive_class_not_binary():
with pytest.raises(ValueError): with pytest.raises(ValueError):
verify_textcat_config(nlp, pipe_config) verify_textcat_config(nlp, pipe_config)
def test_textcat_evaluation(): def test_textcat_evaluation():
train_examples = [] train_examples = []
nlp = English() nlp = English()
@ -241,7 +242,9 @@ def test_textcat_evaluation():
pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0} pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
train_examples.append(Example(pred2, ref2)) train_examples.append(Example(pred2, ref2))
scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]) scores = Scorer().score_cats(
train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]
)
assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2 assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2
assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1 assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1
assert scores["cats_f_per_type"]["summer"]["p"] == 0 assert scores["cats_f_per_type"]["summer"]["p"] == 0

View File

@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
encode_config["width"] = width encode_config["width"] = width
docs = get_batch(3) docs = get_batch(3)
tok2vec = build_Tok2Vec_model( tok2vec = build_Tok2Vec_model(
embed_arch(**embed_config), embed_arch(**embed_config), encode_arch(**encode_config)
encode_arch(**encode_config)
) )
tok2vec.initialize(docs) tok2vec.initialize(docs)
vectors, backprop = tok2vec.begin_update(docs) vectors, backprop = tok2vec.begin_update(docs)

View File

@ -229,9 +229,7 @@ def test_issue3611():
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches: for batch in batches:
nlp.update( nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
def test_issue3625(): def test_issue3625():
@ -463,8 +461,7 @@ def test_issue3962_long(en_vocab):
def test_issue3972(en_vocab): def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs. """Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
"""
matcher = PhraseMatcher(en_vocab) matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) matcher.add("B", [Doc(en_vocab, words=["New", "York"])])

View File

@ -19,8 +19,7 @@ from ..util import make_tempdir
def test_issue4002(en_vocab): def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes. """Test that the PhraseMatcher can match on overwritten NORM attributes."""
"""
matcher = PhraseMatcher(en_vocab, attr="NORM") matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"]) pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"] assert [t.norm_ for t in pattern1] == ["c", "d"]
@ -72,9 +71,7 @@ def test_issue4030():
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches: for batch in batches:
nlp.update( nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories # processing of an empty doc should result in 0.0 for all categories
doc = nlp("") doc = nlp("")
assert doc.cats["offensive"] == 0.0 assert doc.cats["offensive"] == 0.0

View File

@ -7,7 +7,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
from thinc.api import ConfigValidationError, Config from thinc.api import ConfigValidationError
import srsly import srsly
import os import os

View File

@ -290,9 +290,7 @@ def test_spacy_blank():
assert nlp.meta["name"] == "my_custom_model" assert nlp.meta["name"] == "my_custom_model"
@pytest.mark.parametrize( @pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
"value", [False, None, ["x", "y"], Language, Vocab],
)
def test_language_init_invalid_vocab(value): def test_language_init_invalid_vocab(value):
err_fragment = "invalid value" err_fragment = "invalid value"
with pytest.raises(ValueError) as e: with pytest.raises(ValueError) as e:

View File

@ -64,7 +64,7 @@ def get_tok2vec_kwargs():
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
), ),
"encode": MaxoutWindowEncoder( "encode": MaxoutWindowEncoder(
width=32, depth=2, maxout_pieces=2, window_size=1, width=32, depth=2, maxout_pieces=2, window_size=1
), ),
} }

View File

@ -137,7 +137,7 @@ def test_las_per_type(en_vocab):
examples = [] examples = []
for input_, annot in test_las_apple: for input_, annot in test_las_apple:
doc = Doc( doc = Doc(
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"], en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
) )
gold = {"heads": annot["heads"], "deps": annot["deps"]} gold = {"heads": annot["heads"], "deps": annot["deps"]}
example = Example.from_dict(doc, gold) example = Example.from_dict(doc, gold)

View File

@ -496,8 +496,10 @@ def test_make_orth_variants(doc):
output_file = tmpdir / "roundtrip.spacy" output_file = tmpdir / "roundtrip.spacy"
DocBin(docs=[doc]).to_disk(output_file) DocBin(docs=[doc]).to_disk(output_file)
# due to randomness, test only that this runs with no errors for now # due to randomness, test only that this runs with no errors for now
reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)) reader = Corpus(
train_examples = list(reader(nlp)) output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
)
list(reader(nlp))
@pytest.mark.skip("Outdated") @pytest.mark.skip("Outdated")

View File

@ -36,7 +36,7 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.
nlp, nlp,
raw_text, raw_text,
orig_dict["token_annotation"], orig_dict["token_annotation"],
lower=raw_text is not None and random.random() < lower lower=raw_text is not None and random.random() < lower,
) )
doc = nlp.make_doc(variant_text) doc = nlp.make_doc(variant_text)
orig_dict["token_annotation"] = variant_token_annot orig_dict["token_annotation"] = variant_token_annot

View File

@ -94,7 +94,7 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
def _resume_model( def _resume_model(
model: Model, resume_path: Path, epoch_resume: int, silent: bool = True, model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
) -> None: ) -> None:
msg = Printer(no_print=silent) msg = Printer(no_print=silent)
msg.info(f"Resume training tok2vec from: {resume_path}") msg.info(f"Resume training tok2vec from: {resume_path}")

View File

@ -488,7 +488,7 @@ def load_config_from_str(
RETURNS (Config): The loaded config. RETURNS (Config): The loaded config.
""" """
return Config(section_order=CONFIG_SECTION_ORDER).from_str( return Config(section_order=CONFIG_SECTION_ORDER).from_str(
text, overrides=overrides, interpolate=interpolate, text, overrides=overrides, interpolate=interpolate
) )