Mirror of https://github.com/explosion/spaCy.git
Tidy up and auto-format
This commit is contained in:
parent 05dcab10aa
commit 3eaeb73342
@@ -1,4 +1,4 @@
-from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable
+from typing import Union, Iterable, Sequence, TypeVar, List, Callable
 from typing import Optional, Any
 from functools import partial
 import itertools
@@ -20,7 +20,7 @@ def configure_minibatch_by_padded_size(
     get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that uses the `batch_by_padded_size` strategy.
-
+
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
@@ -92,7 +92,7 @@ def minibatch_by_padded_size(
 ) -> Iterable[List[ItemT]]:
     """Minibatch a sequence by the size of padded batches that would result,
     with sequences binned by length within a window.
-
+
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
@@ -123,7 +123,11 @@ def minibatch_by_padded_size(
 
 
 def minibatch_by_words(
-    seqs: Iterable[ItemT], size: Sizing, tolerance=0.2, discard_oversize=False, get_length=len
+    seqs: Iterable[ItemT],
+    size: Sizing,
+    tolerance=0.2,
+    discard_oversize=False,
+    get_length=len,
 ) -> Iterable[List[ItemT]]:
     """Create minibatches of roughly a given number of words. If any examples
     are longer than the specified batch length, they will appear in a batch by
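
The batcher docstrings above define the padded cost of a batch as the length of its longest sequence multiplied by the number of sequences in it, while minibatch_by_words groups examples by a rough word budget instead. A minimal sketch of the padded-size accounting in plain Python (illustrative only, not the spaCy implementation; it skips the length-window binning and the oversize/tolerance handling the real functions perform):

from typing import Iterable, List, Sequence

def toy_minibatch_by_padded_size(
    seqs: Iterable[Sequence], budget: int
) -> Iterable[List[Sequence]]:
    """Greedily fill batches so that max_len * batch_size stays within budget."""
    batch: List[Sequence] = []
    longest = 0
    for seq in seqs:
        new_longest = max(longest, len(seq))
        # Padded cost if this sequence joined the current batch
        if batch and new_longest * (len(batch) + 1) > budget:
            yield batch
            batch, longest = [], 0
            new_longest = len(seq)
        batch.append(seq)
        longest = new_longest
    if batch:
        yield batch

# Sequences of lengths 3, 4, 10 and 2 with a padded budget of 12:
batches = list(toy_minibatch_by_padded_size([[0] * n for n in (3, 4, 10, 2)], 12))
assert [len(b) for b in batches] == [2, 1, 1]  # 4 * 2 = 8 fits, 10 * 3 = 30 does not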
|
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from ...pipeline import Lemmatizer
 from ...tokens import Token
 
|
@@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
-from .lookups import load_lookups
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
 from .schemas import ConfigSchema
@@ -1439,10 +1438,7 @@ class Language:
             or lang_cls is not cls
         ):
             raise ValueError(Errors.E943.format(value=type(lang_cls)))
-        nlp = lang_cls(
-            vocab=vocab,
-            create_tokenizer=create_tokenizer,
-        )
+        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
         if after_creation is not None:
             nlp = after_creation(nlp)
         if not isinstance(nlp, cls):
|
@@ -34,11 +34,11 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
 class Tok2Vec(Pipe):
     """Apply a "token-to-vector" model and set its outputs in the doc.tensor
     attribute. This is mostly useful to share a single subnetwork between multiple
-    components, e.g. to have one embedding and CNN network shared between a
+    components, e.g. to have one embedding and CNN network shared between a
     parser, tagger and NER.
 
     In order to use the `Tok2Vec` predictions, subsequent components should use
-    the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
+    the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
     layer will read data from the `doc.tensor` attribute during prediction.
     During training, the `Tok2Vec` component will save its prediction and backprop
     callback for each batch, so that the subsequent components can backpropagate
@@ -46,6 +46,7 @@ class Tok2Vec(Pipe):
     avoid relying on object identity within the models to achieve the parameter
     sharing.
     """
+
     def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
         """Initialize a tok2vec component.
 
@@ -239,6 +240,7 @@ class Tok2VecListener(Model):
     from the Tok2Vec component into downstream components, and communicating
     gradients back upstream.
     """
+
     name = "tok2vec-listener"
 
     def __init__(self, upstream_name: str, width: int) -> None:
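
The Tok2Vec docstring above describes the sharing mechanism: the upstream component runs the embedding network once per batch, writes the result to doc.tensor, keeps a backprop callback during training, and downstream components read that cached output through a listener layer instead of recomputing it. A toy sketch of the caching idea in plain Python (the class names are made up for illustration; this is not the spaCy/Thinc implementation):

from typing import Callable, List

class ToyTok2Vec:
    """Computes one 'embedding' per doc and pushes it to registered listeners."""

    def __init__(self) -> None:
        self.listeners: List["ToyListener"] = []

    def predict(self, docs: List[str]) -> None:
        # Stand-in for the expensive embedding/CNN forward pass
        outputs = [[float(len(tok)) for tok in doc.split()] for doc in docs]
        backprop: Callable = lambda d_outputs: None  # no-op gradient callback
        for listener in self.listeners:
            listener.receive(outputs, backprop)

class ToyListener:
    """Stands in for the downstream component's tok2vec sublayer."""

    def __init__(self) -> None:
        self.outputs: List[List[float]] = []
        self.backprop: Callable = lambda d: None

    def receive(self, outputs, backprop) -> None:
        # Reuse the upstream predictions instead of re-embedding the docs
        self.outputs, self.backprop = outputs, backprop

tok2vec = ToyTok2Vec()
tagger_layer, parser_layer = ToyListener(), ToyListener()
tok2vec.listeners.extend([tagger_layer, parser_layer])
tok2vec.predict(["I live in New York"])
assert tagger_layer.outputs == parser_layer.outputs  # one forward pass, shared twice

In released spaCy v3 configs, the same idea is wired up by giving downstream components a Tok2VecListener architecture as the tok2vec sublayer of their model, pointing at the shared tok2vec component.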
|
@@ -426,7 +426,7 @@ class Scorer:
             f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
-            positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
+            positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
             results[f"{attr}_score"] = positive_label_f
             results[f"{attr}_score_desc"] = f"F ({positive_label})"
         elif not multi_label:
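
For context, the branch being reformatted here selects the headline textcat score: with exactly two mutually exclusive labels and a configured positive label, the overall score is that label's F-score taken from the per-type results. A small sketch of the selection with made-up numbers (not the Scorer implementation):

# Hypothetical per-type results in the shape the diff accesses them
results = {
    "cats_f_per_type": {
        "POSITIVE": {"p": 0.9, "r": 0.8, "f": 0.847},
        "NEGATIVE": {"p": 0.8, "r": 0.9, "f": 0.847},
    }
}
labels = ["POSITIVE", "NEGATIVE"]
multi_label = False
positive_label = "POSITIVE"

if len(labels) == 2 and not multi_label and positive_label:
    results["cats_score"] = results["cats_f_per_type"][positive_label]["f"]
    results["cats_score_desc"] = f"F ({positive_label})"

assert results["cats_score"] == 0.847
assert results["cats_score_desc"] == "F (POSITIVE)"

This is the behaviour the textcat test further down relies on when it calls nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"}) and then checks scores["cats_score"] and scores["cats_score_desc"].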
|
@@ -15,5 +15,7 @@ def morphology():
 def test_morphology_pickle_roundtrip(morphology):
     b = pickle.dumps(morphology)
     reloaded_morphology = pickle.loads(b)
-    assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
-    assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
+    feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"])
+    assert feat == "Feat1=Val1|Feat2=Val2"
+    feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"])
+    assert feat == "Feat3=Val3|Feat4=Val4"
|
@@ -144,8 +144,7 @@ def test_accept_blocked_token():
     # 1. test normal behaviour
     nlp1 = English()
     doc1 = nlp1("I live in New York")
-    config = {
-    }
+    config = {}
     ner1 = nlp1.create_pipe("ner", config=config)
     assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
     assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -164,8 +163,7 @@ def test_accept_blocked_token():
     # 2. test blocking behaviour
     nlp2 = English()
     doc2 = nlp2("I live in New York")
-    config = {
-    }
+    config = {}
     ner2 = nlp2.create_pipe("ner", config=config)
 
     # set "New York" to a blocked entity
@@ -220,8 +218,7 @@ def test_overwrite_token():
     assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
     assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
     # Check that a new ner can overwrite O
-    config = {
-    }
+    config = {}
     ner2 = nlp.create_pipe("ner", config=config)
     ner2.moves.add_action(5, "")
     ner2.add_label("GPE")
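
These hunks only collapse an empty config literal onto a single line; passing config={} to create_pipe leaves the component on its default settings. A short sketch of the pattern as these tests use it (mirroring the dev-time API shown in the diff):

from spacy.lang.en import English

nlp = English()
config = {}  # no overrides: the "ner" component is created with its defaults
ner = nlp.create_pipe("ner", config=config)
ner.add_label("GPE")  # labels can then be added before training, as in the tests above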
|
@@ -1,8 +1,7 @@
 import pytest
-
 from spacy import util, registry
 from spacy.lang.en import English
-from spacy.lookups import Lookups, load_lookups
+from spacy.lookups import Lookups
 
 from ..util import make_tempdir
 
|
@@ -1,10 +1,8 @@
 import pytest
-
 from spacy import util
 from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.symbols import POS, NOUN
 
 from ..util import make_tempdir
 
|
@@ -117,9 +117,7 @@ def test_overfitting_IO():
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
 
     # Test scoring
-    scores = nlp.evaluate(
-        train_examples, scorer_cfg={"positive_label": "POSITIVE"}
-    )
+    scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
     assert scores["cats_micro_f"] == 1.0
     assert scores["cats_score"] == 1.0
     assert "cats_score_desc" in scores
|
@@ -1,11 +1,9 @@
 import pytest
 import random
-
 from spacy import util
 from spacy.gold import Example
 from spacy.matcher import Matcher
 from spacy.attrs import IS_PUNCT, ORTH, LOWER
-from spacy.symbols import POS, VERB
 from spacy.vocab import Vocab
 from spacy.lang.en import English
 from spacy.lookups import Lookups
|
@@ -6,8 +6,7 @@ from spacy.lang.en import English
 from spacy.lang.lex_attrs import LEX_ATTRS
 from spacy.matcher import Matcher
 from spacy.tokenizer import Tokenizer
 from spacy.lookups import Lookups
-from spacy.symbols import ORTH, LEMMA, POS, VERB
+from spacy.symbols import ORTH, LEMMA, POS
 
 
 def test_issue1061():
|
@@ -271,8 +271,7 @@ def test_issue1963(en_tokenizer):
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     nlp = Language()
-    config = {
-    }
+    config = {}
     ner = nlp.create_pipe("ner", config=config)
     example = Example.from_dict(
         Doc(ner.vocab, words=["word"]),
|
@@ -157,7 +157,11 @@ def test_issue3540(en_vocab):
 
     with doc.retokenize() as retokenizer:
         heads = [(doc[3], 1), doc[2]]
-        attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
+        attrs = {
+            "POS": ["PROPN", "PROPN"],
+            "LEMMA": ["New", "York"],
+            "DEP": ["pobj", "compound"],
+        }
         retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
 
     gold_text = ["I", "live", "in", "New", "York", "right", "now"]
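
The call being reformatted above uses the retokenizer's split API: inside a doc.retokenize() block, retokenizer.split takes the token to split, the new orthographic pieces, a head for each piece (either a token, or a (token, subtoken_index) pair when the head is itself one of the new pieces), and per-piece attributes. A minimal sketch along the lines of the test (assuming a doc whose fourth token is the single token "NewYork"):

from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["I", "live", "in", "NewYork", "right", "now"])
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]  # "New" attaches within the split; "York" attaches to "in"
    attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

assert [t.text for t in doc] == ["I", "live", "in", "New", "York", "right", "now"]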
|
@@ -138,8 +138,7 @@ def test_issue4042_bug2():
         if not output_dir.exists():
             output_dir.mkdir()
         ner1.to_disk(output_dir)
-        config = {
-        }
+        config = {}
         ner2 = nlp1.create_pipe("ner", config=config)
         ner2.from_disk(output_dir)
         assert len(ner2.labels) == 2
@@ -301,8 +300,7 @@ def test_issue4313():
     beam_width = 16
     beam_density = 0.0001
     nlp = English()
-    config = {
-    }
+    config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
     ner.begin_training([])