Tidy up and auto-format

Ines Montani 2020-08-09 22:36:23 +02:00
parent 05dcab10aa
commit 3eaeb73342
15 changed files with 32 additions and 40 deletions

View File

@@ -1,4 +1,4 @@
from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable
from typing import Union, Iterable, Sequence, TypeVar, List, Callable
from typing import Optional, Any
from functools import partial
import itertools
@@ -20,7 +20,7 @@ def configure_minibatch_by_padded_size(
get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
"""Create a batcher that uses the `batch_by_padded_size` strategy.
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
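For context on the docstring above, here is a minimal sketch of the padded-size calculation it describes. The `padded_size` helper and the toy batch are illustrative only, not part of spaCy's API.

```python
# Illustrative sketch: the "padded size" of a batch is the length of its
# longest sequence multiplied by the number of sequences, i.e. the area of
# the rectangular array the batch occupies once every sequence is padded.
def padded_size(batch):
    return max(len(seq) for seq in batch) * len(batch)

batch = [[1, 2, 3], [4, 5], [6]]
assert padded_size(batch) == 9  # longest sequence is 3, batch holds 3 items
```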
@@ -92,7 +92,7 @@ def minibatch_by_padded_size(
) -> Iterable[List[ItemT]]:
"""Minibatch a sequence by the size of padded batches that would result,
with sequences binned by length within a window.
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
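As a rough illustration of the length-binning idea mentioned in this docstring, here is a sketch of grouping sequences by length window. The `bin_by_length` helper and the window size are assumptions for the sketch, not spaCy internals.

```python
from collections import defaultdict

# Illustrative sketch: group sequences whose lengths fall into the same
# window, so that padding waste within each group stays small before the
# groups are cut into size-limited batches.
def bin_by_length(seqs, window=4):
    bins = defaultdict(list)
    for seq in seqs:
        bins[len(seq) // window].append(seq)
    return list(bins.values())

seqs = [[0] * n for n in (1, 2, 9, 10, 3, 11)]
print([sorted(len(s) for s in group) for group in bin_by_length(seqs)])
# -> [[1, 2, 3], [9, 10, 11]]
```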
@@ -123,7 +123,11 @@ def minibatch_by_padded_size(
def minibatch_by_words(
seqs: Iterable[ItemT], size: Sizing, tolerance=0.2, discard_oversize=False, get_length=len
seqs: Iterable[ItemT],
size: Sizing,
tolerance=0.2,
discard_oversize=False,
get_length=len,
) -> Iterable[List[ItemT]]:
"""Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
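The docstring above is cut off by the diff context, but the behaviour it describes can be sketched in a few lines. Everything below (`toy_minibatch_by_words` and the sample data) is a simplified stand-in, not the actual spaCy implementation.

```python
# Illustrative sketch: accumulate sequences until the running word count
# would exceed the target size plus a tolerance margin, then emit a batch.
# Oversized examples either get a batch of their own or are dropped.
def toy_minibatch_by_words(seqs, size, tolerance=0.2, discard_oversize=False):
    max_words = size + size * tolerance
    batch, n_words = [], 0
    for seq in seqs:
        n = len(seq)
        if n > max_words:
            if not discard_oversize:
                yield [seq]  # oversized example appears in a batch by itself
            continue
        if batch and n_words + n > max_words:
            yield batch
            batch, n_words = [], 0
        batch.append(seq)
        n_words += n
    if batch:
        yield batch

docs = [["w"] * n for n in (4, 5, 6, 20, 2)]
print([sum(map(len, b)) for b in toy_minibatch_by_words(docs, size=10)])
# -> [9, 20, 8]
```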

View File

@@ -1,5 +1,3 @@
from typing import Optional
from ...pipeline import Lemmatizer
from ...tokens import Token

View File

@@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
from .lookups import load_lookups
from .tokenizer import Tokenizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema
@@ -1439,10 +1438,7 @@ class Language:
or lang_cls is not cls
):
raise ValueError(Errors.E943.format(value=type(lang_cls)))
nlp = lang_cls(
vocab=vocab,
create_tokenizer=create_tokenizer,
)
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):

View File

@ -34,11 +34,11 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
class Tok2Vec(Pipe):
"""Apply a "token-to-vector" model and set its outputs in the doc.tensor
attribute. This is mostly useful to share a single subnetwork between multiple
components, e.g. to have one embedding and CNN network shared between a
components, e.g. to have one embedding and CNN network shared between a
parser, tagger and NER.
In order to use the `Tok2Vec` predictions, subsequent components should use
the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
layer will read data from the `doc.tensor` attribute during prediction.
During training, the `Tok2Vec` component will save its prediction and backprop
callback for each batch, so that the subsequent components can backpropagate
@@ -46,6 +46,7 @@ class Tok2Vec(Pipe):
avoid relying on object identity within the models to achieve the parameter
sharing.
"""
def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
"""Initialize a tok2vec component.
@@ -239,6 +240,7 @@ class Tok2VecListener(Model):
from the Tok2Vec component into downstream components, and communicating
gradients back upstream.
"""
name = "tok2vec-listener"
def __init__(self, upstream_name: str, width: int) -> None:
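The docstrings in this file describe how one `Tok2Vec` forward pass is shared by several downstream components through listener layers. Below is a bare-bones sketch of that sharing pattern; the toy classes (`ToyTok2Vec`, `ToyListener`) are assumptions for illustration, not spaCy's real implementation.

```python
# Illustrative sketch: the upstream component computes a tensor once per doc
# and pushes it to its listeners; downstream components read from a listener
# instead of running their own embedding network.
class ToyListener:
    def __init__(self):
        self.tensor = None

    def receive(self, tensor):
        self.tensor = tensor

class ToyTok2Vec:
    def __init__(self):
        self.listeners = []

    def predict(self, words):
        tensor = [[float(len(w))] for w in words]  # stand-in for the real model
        for listener in self.listeners:
            listener.receive(tensor)
        return tensor

tok2vec = ToyTok2Vec()
tagger_t2v, parser_t2v = ToyListener(), ToyListener()
tok2vec.listeners += [tagger_t2v, parser_t2v]
tok2vec.predict(["New", "York"])
assert tagger_t2v.tensor is parser_t2v.tensor  # one forward pass, shared output
```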

View File

@@ -426,7 +426,7 @@ class Scorer:
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
}
if len(labels) == 2 and not multi_label and positive_label:
positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
results[f"{attr}_score"] = positive_label_f
results[f"{attr}_score_desc"] = f"F ({positive_label})"
elif not multi_label:

View File

@@ -15,5 +15,7 @@ def morphology():
def test_morphology_pickle_roundtrip(morphology):
b = pickle.dumps(morphology)
reloaded_morphology = pickle.loads(b)
assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"])
assert feat == "Feat1=Val1|Feat2=Val2"
feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"])
assert feat == "Feat3=Val3|Feat4=Val4"

View File

@@ -144,8 +144,7 @@ def test_accept_blocked_token():
# 1. test normal behaviour
nlp1 = English()
doc1 = nlp1("I live in New York")
config = {
}
config = {}
ner1 = nlp1.create_pipe("ner", config=config)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -164,8 +163,7 @@ def test_accept_blocked_token():
# 2. test blocking behaviour
nlp2 = English()
doc2 = nlp2("I live in New York")
config = {
}
config = {}
ner2 = nlp2.create_pipe("ner", config=config)
# set "New York" to a blocked entity
@@ -220,8 +218,7 @@ def test_overwrite_token():
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O
config = {
}
config = {}
ner2 = nlp.create_pipe("ner", config=config)
ner2.moves.add_action(5, "")
ner2.add_label("GPE")

View File

@@ -1,8 +1,7 @@
import pytest
from spacy import util, registry
from spacy.lang.en import English
from spacy.lookups import Lookups, load_lookups
from spacy.lookups import Lookups
from ..util import make_tempdir

View File

@@ -1,10 +1,8 @@
import pytest
from spacy import util
from spacy.gold import Example
from spacy.lang.en import English
from spacy.language import Language
from spacy.symbols import POS, NOUN
from ..util import make_tempdir

View File

@@ -117,9 +117,7 @@ def test_overfitting_IO():
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
# Test scoring
scores = nlp.evaluate(
train_examples, scorer_cfg={"positive_label": "POSITIVE"}
)
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
assert scores["cats_micro_f"] == 1.0
assert scores["cats_score"] == 1.0
assert "cats_score_desc" in scores

View File

@@ -1,11 +1,9 @@
import pytest
import random
from spacy import util
from spacy.gold import Example
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.symbols import POS, VERB
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lookups import Lookups

View File

@@ -6,8 +6,7 @@ from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lookups import Lookups
from spacy.symbols import ORTH, LEMMA, POS, VERB
from spacy.symbols import ORTH, LEMMA, POS
def test_issue1061():

View File

@@ -271,8 +271,7 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
nlp = Language()
config = {
}
config = {}
ner = nlp.create_pipe("ner", config=config)
example = Example.from_dict(
Doc(ner.vocab, words=["word"]),

View File

@@ -157,7 +157,11 @@ def test_issue3540(en_vocab):
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
attrs = {
"POS": ["PROPN", "PROPN"],
"LEMMA": ["New", "York"],
"DEP": ["pobj", "compound"],
}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]

View File

@@ -138,8 +138,7 @@ def test_issue4042_bug2():
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
}
config = {}
ner2 = nlp1.create_pipe("ner", config=config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
@@ -301,8 +300,7 @@ def test_issue4313():
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
}
config = {}
ner = nlp.create_pipe("ner", config=config)
ner.add_label("SOME_LABEL")
ner.begin_training([])