Tidy up and auto-format

This commit is contained in:
Ines Montani 2021-01-15 11:57:36 +11:00
parent e8a97a2bd6
commit b0b743597c
23 changed files with 140 additions and 144 deletions

View File

@ -384,7 +384,10 @@ def debug_data(
# rare labels in projectivized train
rare_projectivized_labels = []
for label in gold_train_data["deps"]:
if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and DELIMITER in label:
if (
gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
and DELIMITER in label
):
rare_projectivized_labels.append(
f"{label}: {gold_train_data['deps'][label]}"
)

View File

@ -30,7 +30,11 @@ def info_cli(
def info(
model: Optional[str] = None, *, markdown: bool = False, silent: bool = True, exclude: List[str]
model: Optional[str] = None,
*,
markdown: bool = False,
silent: bool = True,
exclude: List[str],
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if model:
@ -98,7 +102,9 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
}
def get_markdown(data: Dict[str, Any], title: Optional[str] = None, exclude: List[str] = None) -> str:
def get_markdown(
data: Dict[str, Any], title: Optional[str] = None, exclude: List[str] = None
) -> str:
"""Get data in GitHub-flavoured Markdown format for issues etc.
data (dict or list of tuples): Label/value pairs.
@ -115,7 +121,7 @@ def get_markdown(data: Dict[str, Any], title: Optional[str] = None, exclude: Lis
if isinstance(value, str):
try:
existing_path = Path(value).exists()
except:
except Exception:
# invalid Path, like a URL string
existing_path = False
if existing_path:

View File

@ -36,7 +36,7 @@ _num_words = [
"ትሪሊዮን",
"ኳድሪሊዮን",
"ገጅሊዮን",
"ባዝሊዮን"
"ባዝሊዮን",
]
_ordinal_words = [
@ -60,8 +60,7 @@ _ordinal_words = [
"አስራ ስምንተኛ",
"አስራ ዘጠነኛ",
"ሃያኛ",
"ሰላሳኛ"
"አርባኛ",
"ሰላሳኛ" "አርባኛ",
"አምሳኛ",
"ስድሳኛ",
"ሰባኛ",
@ -71,8 +70,10 @@ _ordinal_words = [
"ሺኛ",
"ሚሊዮንኛ",
"ቢሊዮንኛ",
"ትሪሊዮንኛ"
"ትሪሊዮንኛ",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]

View File

@ -7,7 +7,6 @@ _exc = {}
for exc_data in [
{ORTH: "ት/ቤት"},
{ORTH: "ወ/ሮ", NORM: "ወይዘሮ"},
]:
_exc[exc_data[ORTH]] = [exc_data]

View File

@ -1,4 +1,4 @@
from typing import Union, Iterator, Optional, List, Tuple
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors

View File

@ -36,7 +36,7 @@ _num_words = [
"ትሪልዮን",
"ኳድሪልዮን",
"ገጅልዮን",
"ባዝልዮን"
"ባዝልዮን",
]
_ordinal_words = [
@ -60,8 +60,7 @@ _ordinal_words = [
"ኣሰርተ ስምንተኛ",
"ኣሰርተ ዘጠነኛ",
"ሃያኛ",
"ሰላሳኛ"
"አርባኛ",
"ሰላሳኛ" "አርባኛ",
"አምሳኛ",
"ስድሳኛ",
"ሰባኛ",
@ -71,8 +70,10 @@ _ordinal_words = [
"ሺኛ",
"ሚሊዮንኛ",
"ቢሊዮንኛ",
"ትሪሊዮንኛ"
"ትሪሊዮንኛ",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]

View File

@ -8,7 +8,6 @@ for exc_data in [
{ORTH: "ት/ቤት"},
{ORTH: "ወ/ሮ", NORM: "ወይዘሮ"},
{ORTH: "ወ/ሪ", NORM: "ወይዘሪት"},
]:
_exc[exc_data[ORTH]] = [exc_data]

View File

@ -71,7 +71,9 @@ def build_text_classifier_v2(
exclusive_classes = not linear_model.attrs["multi_label"]
with Model.define_operators({">>": chain, "|": concatenate}):
width = tok2vec.maybe_get_dim("nO")
attention_layer = ParametricAttention(width) # TODO: benchmark performance difference of this layer
attention_layer = ParametricAttention(
width
) # TODO: benchmark performance difference of this layer
maxout_layer = Maxout(nO=width, nI=width)
linear_layer = Linear(nO=nO, nI=width)
cnn_model = (

View File

@ -89,7 +89,7 @@ def build_hash_embed_cnn_tok2vec(
# TODO: archive
@registry.architectures.register("spacy.Tok2Vec.v1")
def build_Tok2Vec_model(
def _build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
@ -109,7 +109,6 @@ def build_Tok2Vec_model(
return tok2vec
@registry.architectures.register("spacy.Tok2Vec.v2")
def build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
@ -130,7 +129,6 @@ def build_Tok2Vec_model(
return tok2vec
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(
width: int,
@ -280,7 +278,7 @@ def CharacterEmbed(
# TODO: archive
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(
def _MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with maxout activation, layer
@ -310,6 +308,7 @@ def MaxoutWindowEncoder(
model.attrs["receptive_field"] = window_size * depth
return model
@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
def MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
@ -344,7 +343,7 @@ def MaxoutWindowEncoder(
# TODO: archive
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(
def _MishWindowEncoder(
width: int, window_size: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with mish activation, layer

View File

@ -388,7 +388,6 @@ class TextCategorizer(TrainablePipe):
**kwargs,
)
def _validate_categories(self, examples: List[Example]):
"""Check whether the provided examples all have single-label cats annotations."""
for ex in examples:

View File

@ -187,5 +187,5 @@ class MultiLabel_TextCategorizer(TextCategorizer):
def _validate_categories(self, examples: List[Example]):
"""This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'. """
This method overwrites the more strict one from 'textcat'."""
pass

View File

@ -28,10 +28,12 @@ def pytest_runtest_setup(item):
def tokenizer():
return get_lang_class("xx")().tokenizer
@pytest.fixture(scope="session")
def am_tokenizer():
return get_lang_class("am")().tokenizer
@pytest.fixture(scope="session")
def ar_tokenizer():
return get_lang_class("ar")().tokenizer
@ -247,10 +249,12 @@ def th_tokenizer():
pytest.importorskip("pythainlp")
return get_lang_class("th")().tokenizer
@pytest.fixture(scope="session")
def ti_tokenizer():
return get_lang_class("ti")().tokenizer
@pytest.fixture(scope="session")
def tr_tokenizer():
return get_lang_class("tr")().tokenizer

View File

@ -17,17 +17,8 @@ def test_graph_edges_and_nodes():
assert graph.get_node((0,)) == node1
node2 = graph.add_node((1, 3))
assert list(node2) == [1, 3]
graph.add_edge(
node1,
node2,
label="one",
weight=-10.5
)
assert graph.has_edge(
node1,
node2,
label="one"
)
graph.add_edge(node1, node2, label="one", weight=-10.5)
assert graph.has_edge(node1, node2, label="one")
assert node1.heads() == []
assert [tuple(h) for h in node2.heads()] == [(0,)]
assert [tuple(t) for t in node1.tails()] == [(1, 3)]
@ -42,7 +33,7 @@ def test_graph_walk():
nodes=[(0,), (1,), (2,), (3,)],
edges=[(0, 1), (0, 2), (0, 3), (3, 0)],
labels=None,
weights=None
weights=None,
)
node0, node1, node2, node3 = list(graph.nodes)
assert [tuple(h) for h in node0.heads()] == [(3,)]

View File

@ -1,5 +1,4 @@
import pytest
from spacy.lang.am.lex_attrs import like_num
def test_am_tokenizer_handles_long_text(am_tokenizer):

View File

@ -121,9 +121,7 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
assert [token.norm_ for token in tokens] == norms
@pytest.mark.parametrize(
"text,norm", [("Jan.", "January"), ("'cuz", "because")]
)
@pytest.mark.parametrize("text,norm", [("Jan.", "January"), ("'cuz", "because")])
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
tokens = en_tokenizer(text)
assert tokens[0].norm_ == norm

View File

@ -1,5 +1,4 @@
import pytest
from spacy.lang.ti.lex_attrs import like_num
def test_ti_tokenizer_handles_long_text(ti_tokenizer):

View File

@ -389,7 +389,7 @@ def test_beam_ner_scores():
for j in range(len(doc)):
for label in ner.labels:
score = entity_scores[(j, j+1, label)]
score = entity_scores[(j, j + 1, label)]
eps = 0.00001
assert 0 - eps <= score <= 1 + eps

View File

@ -146,12 +146,12 @@ def test_no_resize(name):
def test_error_with_multi_labels():
nlp = Language()
textcat = nlp.add_pipe("textcat")
nlp.add_pipe("textcat")
train_examples = []
for text, annotations in TRAIN_DATA_MULTI_LABEL:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
with pytest.raises(ValueError):
optimizer = nlp.initialize(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
@pytest.mark.parametrize(
@ -273,7 +273,7 @@ def test_overfitting_IO_multi():
assert_equal(batch_cats_1, no_batch_cats)
def test_overfitting_IO_multi():
def test_overfitting_IO_multi_2():
# Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly
fix_random_seed(0)
nlp = English()
@ -362,7 +362,9 @@ def test_positive_class():
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
get_examples = make_get_examples_multi_label(nlp)
with pytest.raises(TypeError):
textcat_multilabel.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS")
textcat_multilabel.initialize(
get_examples, labels=["POS", "NEG"], positive_label="POS"
)
textcat_multilabel.initialize(get_examples, labels=["FICTION", "DRAMA"])
assert textcat_multilabel.labels == ("FICTION", "DRAMA")
assert "positive_label" not in textcat_multilabel.cfg
@ -381,7 +383,9 @@ def test_positive_class_not_binary():
textcat = nlp.add_pipe("textcat")
get_examples = make_get_examples_multi_label(nlp)
with pytest.raises(ValueError):
textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS")
textcat.initialize(
get_examples, labels=["SOME", "THING", "POS"], positive_label="POS"
)
def test_textcat_evaluation():

View File

@ -13,7 +13,6 @@ from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict
from ..util import make_tempdir
@ -311,9 +310,7 @@ def test_issue4313():
# ensure the beam_parse still works with the new label
docs = [doc]
ner = nlp.get_pipe("beam_ner")
beams = ner.beam_parse(
docs, drop=0.0, beam_width=beam_width, beam_density=beam_density
)
ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density)
def test_issue4348():

View File

@ -251,7 +251,9 @@ def test_language_from_config_before_after_init():
nlp.initialize()
assert nlp.meta["before_init"] == "before"
assert nlp.meta["after_init"] == "after"
assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
assert all(
[ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init]
)
def test_language_from_config_before_after_init_invalid():

View File

@ -166,19 +166,11 @@ def test_Example_from_dict_with_entities(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
example = Example.from_dict(predicted, annots)
assert len(list(example.reference.ents)) == 2
assert [example.reference[i].ent_iob_ for i in range(7)] == [
"O",
"O",
"B",
"I",
"O",
"B",
"O",
]
# fmt: off
assert [example.reference[i].ent_iob_ for i in range(7)] == ["O", "O", "B", "I", "O", "B", "O"]
assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2]
# fmt: on
assert example.reference[2].ent_type_ == "LOC"
assert example.reference[3].ent_type_ == "LOC"
assert example.reference[5].ent_type_ == "LOC"
@ -299,7 +291,8 @@ def test_Example_missing_heads():
assert parsed_heads[2] == heads[2]
assert parsed_heads[4] == heads[4]
assert parsed_heads[5] == heads[5]
assert [t.has_head() for t in example.reference] == [True, True, True, False, True, True]
expected = [True, True, True, False, True, True]
assert [t.has_head() for t in example.reference] == expected
# Ensure that the missing head doesn't create an artificial new sentence start
assert example.get_aligned_sent_starts() == [True, False, False, False, False, False]
expected = [True, False, False, False, False, False]
assert example.get_aligned_sent_starts() == expected