Tidy up and auto-format

Ines Montani 2020-08-05 16:00:59 +02:00
parent 2a4d56e730
commit e68459296d
24 changed files with 72 additions and 107 deletions


@@ -7,8 +7,6 @@ import typer
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
from .. import util
from ..lang.en import English
from ..util import dot_to_object
@debug_cli.command("model")
@@ -130,8 +128,8 @@ def _sentences():
]
def _get_docs():
nlp = English()
def _get_docs(lang: str = "en"):
nlp = util.get_lang_class(lang)()
return list(nlp.pipe(_sentences()))
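
The hunk above swaps the hard-coded English() for util.get_lang_class(lang)(), so the debug helper can build a blank pipeline for any language code. A minimal sketch of the same lookup, assuming spaCy is installed and using placeholder sentences:

from spacy import util

def get_docs(lang: str = "en"):
    # Resolve the language code to its Language subclass, e.g. "en" -> English.
    nlp = util.get_lang_class(lang)()
    return list(nlp.pipe(["This is a sentence.", "Here is another one."]))

docs = get_docs("de")  # a blank German pipeline instead of the fixed English one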


@@ -1,5 +1,4 @@
from typing import Optional, List, Dict
from timeit import default_timer as timer
from wasabi import Printer
from pathlib import Path
import re


@@ -1,7 +1,6 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import tqdm
import re
import shutil
import requests


@@ -1,14 +1,8 @@
from .corpus import Corpus
from .example import Example
from .align import Alignment
from .iob_utils import iob_to_biluo, biluo_to_iob
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
from .iob_utils import spans_from_biluo_tags
from .iob_utils import tags_to_entities
from .gold_io import docs_to_json
from .gold_io import read_json_file
from .batchers import minibatch_by_padded_size, minibatch_by_words
from .corpus import Corpus # noqa: F401
from .example import Example # noqa: F401
from .align import Alignment # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401
from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401
from .gold_io import docs_to_json, read_json_file # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
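
The re-exports in this hunk now carry "# noqa: F401", which tells flake8 to skip its "imported but unused" warning for names that are imported only to be exposed as part of the package API. A small sketch of the pattern with hypothetical module names:

# mypackage/__init__.py
# These imports exist only to flatten the public API, so F401 is silenced per line.
from .corpus_utils import load_corpus  # noqa: F401
from .align_utils import align_tokens  # noqa: F401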


@@ -3,7 +3,6 @@ from typing import Optional, Any
from functools import partial
import itertools
from .example import Example
from ..util import registry, minibatch
@@ -41,16 +40,13 @@ def configure_minibatch_by_words(
) -> BatcherT:
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
minibatch_by_words,
size=size,
discard_oversize=discard_oversize,
**optionals
minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals
)
@registry.batchers("batch_by_sequence.v1")
def configure_minibatch(size: Sizing, get_length=None) -> BatcherT:
optionals = ({"get_length": get_length} if get_length is not None else {})
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(minibatch, size=size, **optionals)
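
Both factories return a functools.partial that freezes the batch settings, and get_length is forwarded only when the caller actually supplied it. A standalone sketch of that pattern, independent of spaCy's registry and with a placeholder batching function:

from functools import partial
from typing import Callable, Iterable, Optional

def minibatch_by_count(items: Iterable, size: int, get_length: Optional[Callable] = None):
    # Placeholder logic; spaCy's minibatch_by_words batches by word count instead.
    items = list(items)
    for i in range(0, len(items), size):
        yield items[i : i + size]

def configure_batcher(size: int, get_length: Optional[Callable] = None):
    # Only pass get_length through if it was given, mirroring the optionals dict above.
    optionals = {"get_length": get_length} if get_length is not None else {}
    return partial(minibatch_by_count, size=size, **optionals)

batcher = configure_batcher(size=3)
assert list(batcher(range(10))) == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]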


@@ -1,4 +1,4 @@
from .iob2docs import iob2docs # noqa: F401
from .conll_ner2docs import conll_ner2docs # noqa: F401
from .json2docs import json2docs
from .json2docs import json2docs # noqa: F401
from .conllu2docs import conllu2docs # noqa: F401


@@ -1,6 +1,5 @@
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable, Tuple
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
from pathlib import Path
import random
from .. import util
from .example import Example
@@ -39,7 +38,12 @@ class Corpus:
"""
def __init__(
self, path, *, limit: int = 0, gold_preproc: bool = False, max_length: bool = False,
self,
path,
*,
limit: int = 0,
gold_preproc: bool = False,
max_length: bool = False,
) -> None:
self.path = util.ensure_path(path)
self.gold_preproc = gold_preproc


@@ -80,7 +80,7 @@ def _get_transition_table(
B_start, B_end = (0, n_labels)
I_start, I_end = (B_end, B_end + n_labels)
L_start, L_end = (I_end, I_end + n_labels)
U_start, _ = (L_end, L_end + n_labels)
U_start, _ = (L_end, L_end + n_labels) # noqa: F841
# Using ranges allows us to set specific cells, which is necessary to express
# that only actions of the same label are valid continuations.
B_range = numpy.arange(B_start, B_end)
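
The "# noqa: F841" added above tells flake8 to skip its "local variable is assigned to but never used" check on that line. The warning in isolation, for reference:

def block_offsets(n_labels: int):
    b_start = 0
    b_end = n_labels
    # Computed only to document the layout; without the noqa comment flake8 reports F841.
    i_start = b_end  # noqa: F841
    return b_start, b_end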


@@ -17,9 +17,7 @@ MatcherPatternType = List[Dict[Union[int, str], Any]]
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
@Language.factory(
"attribute_ruler",
)
@Language.factory("attribute_ruler")
def make_attribute_ruler(
nlp: Language,
name: str,
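
Since no extra settings are passed to the decorator, @Language.factory("attribute_ruler") collapses onto a single line. This is spaCy v3's mechanism for registering a component factory so it can be added to a pipeline by name; a minimal sketch with a hypothetical component:

from spacy.language import Language

@Language.factory("debug_printer")  # hypothetical component name
def make_debug_printer(nlp: Language, name: str):
    def debug_printer(doc):
        # Trivial component: report the doc length and pass the doc through unchanged.
        print(f"{name}: {len(doc)} tokens")
        return doc
    return debug_printer

# Usage, assuming a blank pipeline: nlp = spacy.blank("en"); nlp.add_pipe("debug_printer")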


@@ -1,17 +1,23 @@
from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type
from typing import Iterable, TypeVar
from typing import Iterable, TypeVar, TYPE_CHECKING
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic import root_validator
from collections import defaultdict
from thinc.api import Optimizer
from pathlib import Path
from .attrs import NAMES
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
from .gold import Example # noqa: F401
ItemT = TypeVar("ItemT")
Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
Reader = Callable[["Language", str], Iterable["Example"]]
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
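
The new TYPE_CHECKING block lets this schema module use Language and Example in type hints without importing them at runtime, which would otherwise create a circular import; the quoted names are forward references that only static type checkers resolve. The general shape of the pattern, sketched with a hypothetical module:

from typing import TYPE_CHECKING, Callable, Iterable

if TYPE_CHECKING:
    # Evaluated by mypy and other type checkers only, never at runtime,
    # so it cannot introduce an import cycle.
    from .app import App  # noqa: F401

# The string is a forward reference the type checker resolves lazily.
Handler = Callable[["App", str], Iterable[str]]
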
@@ -183,7 +189,6 @@ class ModelMetaSchema(BaseModel):
# check that against this schema in the test suite to make sure it's always
# up to date.
Reader = Callable[["Language", str], Iterable["Example"]]
class ConfigSchemaTraining(BaseModel):
# fmt: off
@@ -209,7 +214,6 @@ class ConfigSchemaTraining(BaseModel):
extra = "forbid"
arbitrary_types_allowed = True
#eval_batch_size: StrictInt = Field(..., title="Evaluation batch size")
class ConfigSchemaNlp(BaseModel):
# fmt: off


@@ -291,6 +291,6 @@ def test_span_boundaries(doc):
for i in range(start, end):
assert span[i - start] == doc[i]
with pytest.raises(IndexError):
_ = span[-5]
span[-5]
with pytest.raises(IndexError):
_ = span[5]
span[5]


@@ -29,9 +29,7 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
nlp = Chinese(
meta={
"tokenizer": {
"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine",}
}
"tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
}
)
zh_tokenizer_serialize(nlp.tokenizer)


@@ -1,6 +1,6 @@
import pytest
from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle, is_nonproj_arc
from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree
from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle
from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc
from spacy.pipeline._parser_internals import nonproj
from ..util import get_doc


@@ -75,19 +75,18 @@ def test_attributeruler_init(nlp, pattern_dicts):
def test_attributeruler_init_patterns(nlp, pattern_dicts):
# initialize with patterns
a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
nlp.remove_pipe("attribute_ruler")
# initialize with patterns from asset
a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}})
nlp.add_pipe(
"attribute_ruler",
config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"


@@ -117,12 +117,15 @@ def test_kb_default(nlp):
assert len(entity_linker.kb) == 0
assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0
assert entity_linker.kb.entity_vector_length == 64 # default value from pipeline.entity_linker
# default value from pipeline.entity_linker
assert entity_linker.kb.entity_vector_length == 64
def test_kb_custom_length(nlp):
"""Test that the default (empty) KB can be configured with a custom entity length"""
entity_linker = nlp.add_pipe("entity_linker", config={"kb": {"entity_vector_length": 35}})
entity_linker = nlp.add_pipe(
"entity_linker", config={"kb": {"entity_vector_length": 35}}
)
assert len(entity_linker.kb) == 0
assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0


@@ -117,9 +117,7 @@ def test_overfitting_IO():
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
# Test scoring
scores = nlp.evaluate(
train_examples, scorer_cfg={"positive_label": "POSITIVE"}
)
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
assert scores["cats_f"] == 1.0
assert scores["cats_score"] == 1.0
assert "cats_score_desc" in scores


@@ -88,14 +88,9 @@ def my_parser():
width=321,
rows=5432,
also_embed_subwords=True,
also_use_static_vectors=False
also_use_static_vectors=False,
),
MaxoutWindowEncoder(
width=321,
window_size=3,
maxout_pieces=4,
depth=2
)
MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
)
parser = build_tb_parser_model(
tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5


@@ -1,5 +1,4 @@
import spacy
import pytest
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin


@@ -711,16 +711,18 @@ def test_alignment_different_texts():
with pytest.raises(ValueError):
Alignment.from_strings(other_tokens, spacy_tokens)
def test_retokenized_docs(doc):
a = doc.to_array(["TAG"])
doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
doc2 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
example = Example(doc1, doc2)
assert example.get_aligned("ORTH", as_string=True) == ['Sarah', "'s", 'sister', 'flew', 'to', 'Silicon', 'Valley', 'via', 'London', '.']
# fmt: off
expected1 = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
expected2 = [None, "sister", "flew", "to", None, "via", "London", "."]
# fmt: on
assert example.get_aligned("ORTH", as_string=True) == expected1
with doc1.retokenize() as retokenizer:
retokenizer.merge(doc1[0:2])
retokenizer.merge(doc1[5:7])
assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
assert example.get_aligned("ORTH", as_string=True) == expected2
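
The long expected token lists are wrapped in "# fmt: off" / "# fmt: on" so that black leaves them on single lines while the rest of the file stays auto-formatted. The markers can shield any block from reformatting, for example:

# fmt: off
EXPECTED = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
# fmt: on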


@@ -24,6 +24,7 @@ def get_textcat_kwargs():
"nO": 7,
}
def get_textcat_cnn_kwargs():
return {
"tok2vec": test_tok2vec(),
@@ -31,6 +32,7 @@ def get_textcat_cnn_kwargs():
"nO": 13,
}
def get_all_params(model):
params = []
for node in model.walk():
@@ -59,17 +61,11 @@ def get_tok2vec_kwargs():
# This actually creates models, so seems best to put it in a function.
return {
"embed": MultiHashEmbed(
width=32,
rows=500,
also_embed_subwords=True,
also_use_static_vectors=False
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
),
"encode": MaxoutWindowEncoder(
width=32,
depth=2,
maxout_pieces=2,
window_size=1,
)
width=32, depth=2, maxout_pieces=2, window_size=1,
),
}


@@ -19,14 +19,9 @@ def test_empty_doc():
width=width,
rows=embed_size,
also_use_static_vectors=False,
also_embed_subwords=True
also_embed_subwords=True,
),
MaxoutWindowEncoder(
width=width,
depth=4,
window_size=1,
maxout_pieces=3
)
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
)
tok2vec.initialize()
vectors, backprop = tok2vec.begin_update([doc])
@@ -44,14 +39,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
width=width,
rows=embed_size,
also_use_static_vectors=False,
also_embed_subwords=True
also_embed_subwords=True,
),
MaxoutWindowEncoder(
width=width,
depth=4,
window_size=1,
maxout_pieces=3,
)
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,),
)
tok2vec.initialize()
vectors, backprop = tok2vec.begin_update(batch)


@@ -85,27 +85,24 @@ def test_util_dot_section():
"""
nlp_config = Config().from_str(cfg_string)
en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True)
default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
default_config["nlp"]["lang"] = "nl"
nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True)
# Test that creation went OK
assert isinstance(en_nlp, English)
assert isinstance(nl_nlp, Dutch)
assert nl_nlp.pipe_names == []
assert en_nlp.pipe_names == ["textcat"]
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False # not exclusive_classes
# not exclusive_classes
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
# Test that default values got overwritten
assert not en_config["nlp"]["load_vocab_data"]
assert nl_config["nlp"]["load_vocab_data"] # default value True
# Test proper functioning of 'dot_to_object'
with pytest.raises(KeyError):
obj = dot_to_object(en_config, "nlp.pipeline.tagger")
dot_to_object(en_config, "nlp.pipeline.tagger")
with pytest.raises(KeyError):
obj = dot_to_object(en_config, "nlp.unknownattribute")
dot_to_object(en_config, "nlp.unknownattribute")
assert not dot_to_object(en_config, "nlp.load_vocab_data")
assert dot_to_object(nl_config, "nlp.load_vocab_data")
assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
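
dot_to_object resolves a dotted path such as "nlp.load_vocab_data" against the nested config and raises KeyError when a segment does not exist, which is what the updated pytest.raises assertions rely on. A rough stand-in for plain nested dicts (the real helper lives in spacy.util and handles more cases):

from typing import Any, Dict

def dot_to_object(config: Dict[str, Any], path: str) -> Any:
    # Walk the config one key at a time; a missing key raises KeyError.
    node: Any = config
    for key in path.split("."):
        node = node[key]
    return node

cfg = {"nlp": {"lang": "nl", "load_vocab_data": True}}
assert dot_to_object(cfg, "nlp.lang") == "nl"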


@@ -1,5 +1,5 @@
from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
from typing import Iterator, Type, Pattern, Sequence, TYPE_CHECKING
from typing import Iterator, Type, Pattern, TYPE_CHECKING
from types import ModuleType
import os
import importlib
@@ -764,7 +764,6 @@ def normalize_slice(
return start, stop
def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
"""Filter a sequence of spans and remove duplicates or overlaps. Useful for
creating named entities (where one token can only be part of one entity) or
@@ -1113,6 +1112,3 @@ def minibatch(items, size):
if len(batch) == 0:
break
yield list(batch)