Tidy up and auto-format

Ines Montani 2020-08-09 22:36:23 +02:00
parent 05dcab10aa
commit 3eaeb73342
15 changed files with 32 additions and 40 deletions

View File

@@ -1,4 +1,4 @@
-from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable
+from typing import Union, Iterable, Sequence, TypeVar, List, Callable
 from typing import Optional, Any
 from functools import partial
 import itertools
@@ -20,7 +20,7 @@ def configure_minibatch_by_padded_size(
     get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that uses the `batch_by_padded_size` strategy.

     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
@@ -92,7 +92,7 @@ def minibatch_by_padded_size(
 ) -> Iterable[List[ItemT]]:
     """Minibatch a sequence by the size of padded batches that would result,
     with sequences binned by length within a window.

     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
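
The padded-size arithmetic in these docstrings is easy to check by hand. A minimal sketch in plain Python (the sequences are made up; this is not the spaCy implementation):

# Padded size = length of the longest sequence in the batch multiplied by
# the number of sequences, i.e. the area of the rectangle after padding.
batch = [[1, 2, 3], [4, 5], [6]]
max_len = max(len(seq) for seq in batch)  # 3
padded_size = max_len * len(batch)        # 3 * 3 = 9 cells, 3 of them padding
print(padded_size)
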
@@ -123,7 +123,11 @@ def minibatch_by_padded_size(
 def minibatch_by_words(
-    seqs: Iterable[ItemT], size: Sizing, tolerance=0.2, discard_oversize=False, get_length=len
+    seqs: Iterable[ItemT],
+    size: Sizing,
+    tolerance=0.2,
+    discard_oversize=False,
+    get_length=len,
 ) -> Iterable[List[ItemT]]:
     """Create minibatches of roughly a given number of words. If any examples
     are longer than the specified batch length, they will appear in a batch by
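
A hedged usage sketch of the reformatted signature above. The import path is an assumption and may differ on this branch; the input sequences are invented.

from spacy.util import minibatch_by_words  # location assumed for illustration

seqs = ["a short one".split(), "a somewhat longer example sentence".split(), ["tiny"]]
# Target roughly 6 words per batch; with discard_oversize=False, an oversized
# sequence is emitted in a batch by itself rather than being dropped.
for batch in minibatch_by_words(seqs, size=6, tolerance=0.2, discard_oversize=False):
    print([len(s) for s in batch])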

View File

@ -1,5 +1,3 @@
from typing import Optional
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from ...tokens import Token from ...tokens import Token

View File

@@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
-from .lookups import load_lookups
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
 from .schemas import ConfigSchema
@@ -1439,10 +1438,7 @@ class Language:
             or lang_cls is not cls
         ):
             raise ValueError(Errors.E943.format(value=type(lang_cls)))
-        nlp = lang_cls(
-            vocab=vocab,
-            create_tokenizer=create_tokenizer,
-        )
+        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
         if after_creation is not None:
             nlp = after_creation(nlp)
         if not isinstance(nlp, cls):

View File

@@ -34,11 +34,11 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
 class Tok2Vec(Pipe):
     """Apply a "token-to-vector" model and set its outputs in the doc.tensor
     attribute. This is mostly useful to share a single subnetwork between multiple
     components, e.g. to have one embedding and CNN network shared between a
     parser, tagger and NER.

     In order to use the `Tok2Vec` predictions, subsequent components should use
     the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
     layer will read data from the `doc.tensor` attribute during prediction.
     During training, the `Tok2Vec` component will save its prediction and backprop
     callback for each batch, so that the subsequent components can backpropagate
@@ -46,6 +46,7 @@ class Tok2Vec(Pipe):
     avoid relying on object identity within the models to achieve the parameter
     sharing.
     """
+
     def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
         """Initialize a tok2vec component.
@@ -239,6 +240,7 @@ class Tok2VecListener(Model):
     from the Tok2Vec component into downstream components, and communicating
     gradients back upstream.
     """
+
     name = "tok2vec-listener"

     def __init__(self, upstream_name: str, width: int) -> None:
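
The two docstrings above describe a one-to-many sharing pattern: the Tok2Vec component runs its network once per batch and broadcasts the outputs, and each Tok2VecListener in a downstream model reads the cached result (doc.tensor at prediction time, the saved batch outputs and backprop callback during training) instead of recomputing it. A toy sketch of that pattern, with all names invented and no claim to match spaCy's implementation:

class Upstream:
    def __init__(self):
        self.listeners = []

    def predict(self, docs):
        outputs = [f"vector({doc})" for doc in docs]  # stand-in for real tensors
        for listener in self.listeners:
            listener.receive(docs, outputs)           # one forward pass, shared
        return outputs

class Listener:
    def __init__(self, upstream):
        upstream.listeners.append(self)
        self.cached = {}

    def receive(self, docs, outputs):
        self.cached = dict(zip(docs, outputs))

    def __call__(self, doc):
        return self.cached[doc]                       # no recomputation downstream

upstream = Upstream()
listener = Listener(upstream)
upstream.predict(["I live in New York"])
print(listener("I live in New York"))                 # vector(I live in New York)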

View File

@@ -426,7 +426,7 @@ class Scorer:
             f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
-            positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
+            positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
             results[f"{attr}_score"] = positive_label_f
             results[f"{attr}_score_desc"] = f"F ({positive_label})"
         elif not multi_label:
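
For the two-label, single-label case handled in this branch, the overall score collapses to the F-score of the positive label. A worked example with made-up numbers:

attr = "cats"
positive_label = "POSITIVE"
results = {f"{attr}_f_per_type": {"POSITIVE": {"f": 0.9}, "NEGATIVE": {"f": 0.7}}}
results[f"{attr}_score"] = results[f"{attr}_f_per_type"][positive_label]["f"]
results[f"{attr}_score_desc"] = f"F ({positive_label})"
print(results["cats_score"], results["cats_score_desc"])  # 0.9 F (POSITIVE)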

View File

@@ -15,5 +15,7 @@ def morphology():
 def test_morphology_pickle_roundtrip(morphology):
     b = pickle.dumps(morphology)
     reloaded_morphology = pickle.loads(b)
-    assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
-    assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
+    feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"])
+    assert feat == "Feat1=Val1|Feat2=Val2"
+    feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"])
+    assert feat == "Feat3=Val3|Feat4=Val4"

View File

@@ -144,8 +144,7 @@ def test_accept_blocked_token():
     # 1. test normal behaviour
     nlp1 = English()
     doc1 = nlp1("I live in New York")
-    config = {
-    }
+    config = {}
     ner1 = nlp1.create_pipe("ner", config=config)
     assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
     assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -164,8 +163,7 @@ def test_accept_blocked_token():
     # 2. test blocking behaviour
     nlp2 = English()
     doc2 = nlp2("I live in New York")
-    config = {
-    }
+    config = {}
     ner2 = nlp2.create_pipe("ner", config=config)
     # set "New York" to a blocked entity
@@ -220,8 +218,7 @@ def test_overwrite_token():
     assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
     assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
     # Check that a new ner can overwrite O
-    config = {
-    }
+    config = {}
     ner2 = nlp.create_pipe("ner", config=config)
     ner2.moves.add_action(5, "")
     ner2.add_label("GPE")

View File

@ -1,8 +1,7 @@
import pytest import pytest
from spacy import util, registry from spacy import util, registry
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lookups import Lookups, load_lookups from spacy.lookups import Lookups
from ..util import make_tempdir from ..util import make_tempdir

View File

@@ -1,10 +1,8 @@
 import pytest
 from spacy import util
 from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.symbols import POS, NOUN
 from ..util import make_tempdir

View File

@@ -117,9 +117,7 @@ def test_overfitting_IO():
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
     # Test scoring
-    scores = nlp.evaluate(
-        train_examples, scorer_cfg={"positive_label": "POSITIVE"}
-    )
+    scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
     assert scores["cats_micro_f"] == 1.0
     assert scores["cats_score"] == 1.0
     assert "cats_score_desc" in scores

View File

@@ -1,11 +1,9 @@
 import pytest
 import random
 from spacy import util
 from spacy.gold import Example
 from spacy.matcher import Matcher
 from spacy.attrs import IS_PUNCT, ORTH, LOWER
-from spacy.symbols import POS, VERB
 from spacy.vocab import Vocab
 from spacy.lang.en import English
 from spacy.lookups import Lookups

View File

@@ -6,8 +6,7 @@ from spacy.lang.en import English
 from spacy.lang.lex_attrs import LEX_ATTRS
 from spacy.matcher import Matcher
 from spacy.tokenizer import Tokenizer
-from spacy.lookups import Lookups
-from spacy.symbols import ORTH, LEMMA, POS, VERB
+from spacy.symbols import ORTH, LEMMA, POS


 def test_issue1061():

View File

@@ -271,8 +271,7 @@ def test_issue1963(en_tokenizer):
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     nlp = Language()
-    config = {
-    }
+    config = {}
     ner = nlp.create_pipe("ner", config=config)
     example = Example.from_dict(
         Doc(ner.vocab, words=["word"]),

View File

@@ -157,7 +157,11 @@ def test_issue3540(en_vocab):
     with doc.retokenize() as retokenizer:
         heads = [(doc[3], 1), doc[2]]
-        attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
+        attrs = {
+            "POS": ["PROPN", "PROPN"],
+            "LEMMA": ["New", "York"],
+            "DEP": ["pobj", "compound"],
+        }
         retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
     gold_text = ["I", "live", "in", "New", "York", "right", "now"]

View File

@@ -138,8 +138,7 @@ def test_issue4042_bug2():
     if not output_dir.exists():
         output_dir.mkdir()
     ner1.to_disk(output_dir)
-    config = {
-    }
+    config = {}
     ner2 = nlp1.create_pipe("ner", config=config)
     ner2.from_disk(output_dir)
     assert len(ner2.labels) == 2
@@ -301,8 +300,7 @@ def test_issue4313():
     beam_width = 16
     beam_density = 0.0001
     nlp = English()
-    config = {
-    }
+    config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
     ner.begin_training([])