Tidy up and auto-format

Ines Montani 2020-08-05 16:00:59 +02:00
parent 2a4d56e730
commit e68459296d
24 changed files with 72 additions and 107 deletions

View File

@@ -7,8 +7,6 @@ import typer
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
from .. import util
from ..lang.en import English
from ..util import dot_to_object
@debug_cli.command("model")
@@ -130,8 +128,8 @@ def _sentences():
]
def _get_docs():
nlp = English()
def _get_docs(lang: str = "en"):
nlp = util.get_lang_class(lang)()
return list(nlp.pipe(_sentences()))
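As an aside, util.get_lang_class looks up the Language subclass registered for a language code, so the helper above can build a blank pipeline for any language instead of hard-coding English. A minimal usage sketch (the "de" example is illustrative):

from spacy import util

# get_lang_class("de") returns the registered Language subclass for the code;
# calling it creates a blank pipeline with only the tokenizer.
nlp = util.get_lang_class("de")()
doc = nlp("Das ist ein Satz.")
print([token.text for token in doc])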

View File

@@ -1,5 +1,4 @@
from typing import Optional, List, Dict
from timeit import default_timer as timer
from wasabi import Printer
from pathlib import Path
import re

View File

@@ -1,7 +1,6 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import tqdm
import re
import shutil
import requests

View File

@@ -1,14 +1,8 @@
from .corpus import Corpus
from .example import Example
from .align import Alignment
from .iob_utils import iob_to_biluo, biluo_to_iob
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
from .iob_utils import spans_from_biluo_tags
from .iob_utils import tags_to_entities
from .gold_io import docs_to_json
from .gold_io import read_json_file
from .batchers import minibatch_by_padded_size, minibatch_by_words
from .corpus import Corpus # noqa: F401
from .example import Example # noqa: F401
from .align import Alignment # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401
from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401
from .gold_io import docs_to_json, read_json_file # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
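For context: F401 is flake8's "imported but unused" code, so the trailing comments mark these imports as intentional re-exports of the package API. The pattern, in an illustrative __init__.py:

# mypackage/__init__.py (illustrative)
# Re-export a name and tell flake8 the "unused" import is intentional.
from .example import Example  # noqa: F401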

View File

@@ -3,7 +3,6 @@ from typing import Optional, Any
from functools import partial
import itertools
from .example import Example
from ..util import registry, minibatch
@@ -41,16 +40,13 @@ def configure_minibatch_by_words(
) -> BatcherT:
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
minibatch_by_words,
size=size,
discard_oversize=discard_oversize,
**optionals
minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals
)
@registry.batchers("batch_by_sequence.v1")
def configure_minibatch(size: Sizing, get_length=None) -> BatcherT:
optionals = ({"get_length": get_length} if get_length is not None else {})
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(minibatch, size=size, **optionals)
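The factories above only forward get_length when the caller supplied it, so the wrapped function keeps its own default, and functools.partial freezes the configuration into a reusable batcher. A self-contained sketch of the same idiom with a toy batching function (not spaCy's code):

from functools import partial
from typing import Callable, Iterable, Iterator, List, Optional

def toy_minibatch(items: Iterable[str], size: int, get_length: Callable = len) -> Iterator[List[str]]:
    # Toy batcher: start a new batch once the summed lengths would exceed `size`.
    batch: List[str] = []
    total = 0
    for item in items:
        n = get_length(item)
        if batch and total + n > size:
            yield batch
            batch, total = [], 0
        batch.append(item)
        total += n
    if batch:
        yield batch

def configure_toy_batcher(size: int, get_length: Optional[Callable] = None):
    # Only forward optional settings that were actually provided.
    optionals = {"get_length": get_length} if get_length is not None else {}
    return partial(toy_minibatch, size=size, **optionals)

batcher = configure_toy_batcher(size=2)
assert list(batcher(["a", "b", "c"])) == [["a", "b"], ["c"]]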

View File

@@ -1,4 +1,4 @@
from .iob2docs import iob2docs # noqa: F401
from .conll_ner2docs import conll_ner2docs # noqa: F401
from .json2docs import json2docs
from .json2docs import json2docs # noqa: F401
from .conllu2docs import conllu2docs # noqa: F401

View File

@@ -1,6 +1,5 @@
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable, Tuple
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
from pathlib import Path
import random
from .. import util
from .example import Example
@@ -25,7 +24,7 @@ class Corpus:
path (Path): The directory or filename to read from.
gold_preproc (bool): Whether to set up the Example object with gold-standard
sentences and tokens for the predictions. Gold preprocessing helps
sentences and tokens for the predictions. Gold preprocessing helps
the annotations align to the tokenization, and may result in sequences
of more consistent length. However, it may reduce run-time accuracy due
to train/test skew. Defaults to False.
@@ -39,7 +38,12 @@ class Corpus:
"""
def __init__(
self, path, *, limit: int = 0, gold_preproc: bool = False, max_length: bool = False,
self,
path,
*,
limit: int = 0,
gold_preproc: bool = False,
max_length: bool = False,
) -> None:
self.path = util.ensure_path(path)
self.gold_preproc = gold_preproc
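A rough usage sketch of this reader; the data path is hypothetical, and the call-with-nlp pattern yielding Example objects is assumed from the v3 API rather than shown in this diff:

from spacy.lang.en import English
from spacy.gold import Corpus  # this module later moved to spacy.training

# gold_preproc=True aligns examples to the gold-standard tokenization, which
# can help training but risks train/test skew (see the docstring above).
corpus = Corpus("./corpus/train.spacy", gold_preproc=False, limit=0)
nlp = English()
examples = list(corpus(nlp))  # assumed: calling the corpus with nlp yields Examples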

View File

@@ -80,7 +80,7 @@ def _get_transition_table(
B_start, B_end = (0, n_labels)
I_start, I_end = (B_end, B_end + n_labels)
L_start, L_end = (I_end, I_end + n_labels)
U_start, _ = (L_end, L_end + n_labels)
U_start, _ = (L_end, L_end + n_labels) # noqa: F841
# Using ranges allows us to set specific cells, which is necessary to express
# that only actions of the same label are valid continuations.
B_range = numpy.arange(B_start, B_end)
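To make that comment concrete, a small self-contained numpy sketch with toy sizes (not the actual parser code): advanced indexing with the per-block ranges pairs the i-th B action with the i-th I action, so only same-label continuations are marked valid.

import numpy

n_labels = 3                      # toy example
n_actions = 4 * n_labels          # a B, I, L and U action for each label
table = numpy.zeros((n_actions, n_actions), dtype="f")

B_start, B_end = (0, n_labels)
I_start, I_end = (B_end, B_end + n_labels)
B_range = numpy.arange(B_start, B_end)
I_range = numpy.arange(I_start, I_end)

# table[B_range, I_range] sets (B-0, I-0), (B-1, I-1), (B-2, I-2) in one go,
# i.e. B-x may be continued by I-x but not by I-y.
table[B_range, I_range] = 1.0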

View File

@@ -17,9 +17,7 @@ MatcherPatternType = List[Dict[Union[int, str], Any]]
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
@Language.factory(
"attribute_ruler",
)
@Language.factory("attribute_ruler")
def make_attribute_ruler(
nlp: Language,
name: str,
@@ -58,7 +56,7 @@ class AttributeRuler(Pipe):
self.vocab = vocab
self.matcher = Matcher(self.vocab)
self.attrs = []
self._attrs_unnormed = [] # store for reference
self._attrs_unnormed = [] # store for reference
self.indices = []
if pattern_dicts:
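For readers unfamiliar with the decorator: @Language.factory registers a named component factory so it can be added with nlp.add_pipe or referenced from a config. A minimal sketch with a made-up component:

import spacy
from spacy.language import Language
from spacy.tokens import Doc

@Language.factory("toy_marker")
def create_toy_marker(nlp: Language, name: str):
    # The factory receives the nlp object and the component's instance name.
    def toy_marker(doc: Doc) -> Doc:
        doc.user_data["marked_by"] = name
        return doc
    return toy_marker

nlp = spacy.blank("en")
nlp.add_pipe("toy_marker")
doc = nlp("hello")
assert doc.user_data["marked_by"] == "toy_marker"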

View File

@@ -1,17 +1,23 @@
from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type
from typing import Iterable, TypeVar
from typing import Iterable, TypeVar, TYPE_CHECKING
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic import root_validator
from collections import defaultdict
from thinc.api import Optimizer
from pathlib import Path
from .attrs import NAMES
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
from .gold import Example # noqa: F401
ItemT = TypeVar("ItemT")
Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
Reader = Callable[["Language", str], Iterable["Example"]]
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
@@ -183,7 +189,6 @@ class ModelMetaSchema(BaseModel):
# check that against this schema in the test suite to make sure it's always
# up to date.
Reader = Callable[["Language", str], Iterable["Example"]]
class ConfigSchemaTraining(BaseModel):
# fmt: off
@@ -209,7 +214,6 @@ class ConfigSchemaTraining(BaseModel):
extra = "forbid"
arbitrary_types_allowed = True
#eval_batch_size: StrictInt = Field(..., title="Evaluation batch size")
class ConfigSchemaNlp(BaseModel):
# fmt: off
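The TYPE_CHECKING block added above is the standard idiom for importing names only for static type checkers: the import never runs at runtime, so circular imports are avoided, and quoted ("forward") references keep the annotations valid. A small self-contained illustration with made-up module names:

from typing import TYPE_CHECKING, Callable, Iterable, List

if TYPE_CHECKING:
    # Seen by mypy/pyright only; never executed at runtime.
    from mypackage.language import Language  # noqa: F401

# The quoted name is a forward reference, so this alias works even though
# Language is not imported at runtime.
Reader = Callable[["Language", str], Iterable[dict]]

def make_dummy_reader() -> Reader:
    def read(nlp: "Language", path: str) -> List[dict]:
        return []
    return read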

View File

@@ -291,6 +291,6 @@ def test_span_boundaries(doc):
for i in range(start, end):
assert span[i - start] == doc[i]
with pytest.raises(IndexError):
_ = span[-5]
span[-5]
with pytest.raises(IndexError):
_ = span[5]
span[5]

View File

@@ -29,9 +29,7 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
nlp = Chinese(
meta={
"tokenizer": {
"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine",}
}
"tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
}
)
zh_tokenizer_serialize(nlp.tokenizer)

View File

@@ -21,7 +21,7 @@ re_pattern5 = "B*A*B"
longest1 = "A A A A A"
longest2 = "A A A A A"
longest3 = "A A"
longest4 = "B A A A A A B" # "FIRST" would be "B B"
longest4 = "B A A A A A B" # "FIRST" would be "B B"
longest5 = "B B A A A A A B"

View File

@@ -1,6 +1,6 @@
import pytest
from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle, is_nonproj_arc
from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree
from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle
from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc
from spacy.pipeline._parser_internals import nonproj
from ..util import get_doc

View File

@@ -75,19 +75,18 @@ def test_attributeruler_init(nlp, pattern_dicts):
def test_attributeruler_init_patterns(nlp, pattern_dicts):
# initialize with patterns
a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
nlp.remove_pipe("attribute_ruler")
# initialize with patterns from asset
a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}})
nlp.add_pipe(
"attribute_ruler",
config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"

View File

@@ -117,12 +117,15 @@ def test_kb_default(nlp):
assert len(entity_linker.kb) == 0
assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0
assert entity_linker.kb.entity_vector_length == 64 # default value from pipeline.entity_linker
# default value from pipeline.entity_linker
assert entity_linker.kb.entity_vector_length == 64
def test_kb_custom_length(nlp):
"""Test that the default (empty) KB can be configured with a custom entity length"""
entity_linker = nlp.add_pipe("entity_linker", config={"kb": {"entity_vector_length": 35}})
entity_linker = nlp.add_pipe(
"entity_linker", config={"kb": {"entity_vector_length": 35}}
)
assert len(entity_linker.kb) == 0
assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0

View File

@@ -117,9 +117,7 @@ def test_overfitting_IO():
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
# Test scoring
scores = nlp.evaluate(
train_examples, scorer_cfg={"positive_label": "POSITIVE"}
)
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
assert scores["cats_f"] == 1.0
assert scores["cats_score"] == 1.0
assert "cats_score_desc" in scores

View File

@@ -88,14 +88,9 @@ def my_parser():
width=321,
rows=5432,
also_embed_subwords=True,
also_use_static_vectors=False
also_use_static_vectors=False,
),
MaxoutWindowEncoder(
width=321,
window_size=3,
maxout_pieces=4,
depth=2
)
MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
)
parser = build_tb_parser_model(
tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5

View File

@@ -1,5 +1,4 @@
import spacy
import pytest
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin

View File

@@ -711,16 +711,18 @@ def test_alignment_different_texts():
with pytest.raises(ValueError):
Alignment.from_strings(other_tokens, spacy_tokens)
def test_retokenized_docs(doc):
a = doc.to_array(["TAG"])
doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
doc2 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
example = Example(doc1, doc2)
assert example.get_aligned("ORTH", as_string=True) == ['Sarah', "'s", 'sister', 'flew', 'to', 'Silicon', 'Valley', 'via', 'London', '.']
# fmt: off
expected1 = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
expected2 = [None, "sister", "flew", "to", None, "via", "London", "."]
# fmt: on
assert example.get_aligned("ORTH", as_string=True) == expected1
with doc1.retokenize() as retokenizer:
retokenizer.merge(doc1[0:2])
retokenizer.merge(doc1[5:7])
assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
assert example.get_aligned("ORTH", as_string=True) == expected2

View File

@@ -24,6 +24,7 @@ def get_textcat_kwargs():
"nO": 7,
}
def get_textcat_cnn_kwargs():
return {
"tok2vec": test_tok2vec(),
@@ -31,6 +32,7 @@ def get_textcat_cnn_kwargs():
"nO": 13,
}
def get_all_params(model):
params = []
for node in model.walk():
@@ -59,17 +61,11 @@ def get_tok2vec_kwargs():
# This actually creates models, so seems best to put it in a function.
return {
"embed": MultiHashEmbed(
width=32,
rows=500,
also_embed_subwords=True,
also_use_static_vectors=False
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
),
"encode": MaxoutWindowEncoder(
width=32,
depth=2,
maxout_pieces=2,
window_size=1,
)
width=32, depth=2, maxout_pieces=2, window_size=1,
),
}

View File

@@ -19,14 +19,9 @@ def test_empty_doc():
width=width,
rows=embed_size,
also_use_static_vectors=False,
also_embed_subwords=True
also_embed_subwords=True,
),
MaxoutWindowEncoder(
width=width,
depth=4,
window_size=1,
maxout_pieces=3
)
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
)
tok2vec.initialize()
vectors, backprop = tok2vec.begin_update([doc])
@@ -44,14 +39,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
width=width,
rows=embed_size,
also_use_static_vectors=False,
also_embed_subwords=True
also_embed_subwords=True,
),
MaxoutWindowEncoder(
width=width,
depth=4,
window_size=1,
maxout_pieces=3,
)
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,),
)
tok2vec.initialize()
vectors, backprop = tok2vec.begin_update(batch)

View File

@@ -85,27 +85,24 @@ def test_util_dot_section():
"""
nlp_config = Config().from_str(cfg_string)
en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True)
default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
default_config["nlp"]["lang"] = "nl"
nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True)
# Test that creation went OK
assert isinstance(en_nlp, English)
assert isinstance(nl_nlp, Dutch)
assert nl_nlp.pipe_names == []
assert en_nlp.pipe_names == ["textcat"]
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False # not exclusive_classes
# not exclusive_classes
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
# Test that default values got overwritten
assert not en_config["nlp"]["load_vocab_data"]
assert nl_config["nlp"]["load_vocab_data"] # default value True
# Test proper functioning of 'dot_to_object'
with pytest.raises(KeyError):
obj = dot_to_object(en_config, "nlp.pipeline.tagger")
dot_to_object(en_config, "nlp.pipeline.tagger")
with pytest.raises(KeyError):
obj = dot_to_object(en_config, "nlp.unknownattribute")
dot_to_object(en_config, "nlp.unknownattribute")
assert not dot_to_object(en_config, "nlp.load_vocab_data")
assert dot_to_object(nl_config, "nlp.load_vocab_data")
assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
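dot_to_object resolves a dotted path such as "training.optimizer" to a value in the nested config and raises KeyError when a segment is missing, which is what the pytest.raises blocks above check. An illustrative re-implementation of that behavior (not spaCy's actual code):

def dot_to_object_sketch(config: dict, path: str):
    # Walk the nested dicts one dotted segment at a time.
    obj = config
    for key in path.split("."):
        if not isinstance(obj, dict) or key not in obj:
            raise KeyError(f"Can't resolve '{path}': no section or value '{key}'")
        obj = obj[key]
    return obj

cfg = {"nlp": {"lang": "en", "load_vocab_data": False}}
assert dot_to_object_sketch(cfg, "nlp.load_vocab_data") is False
# dot_to_object_sketch(cfg, "nlp.unknownattribute") raises KeyError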

View File

@@ -1,5 +1,5 @@
from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
from typing import Iterator, Type, Pattern, Sequence, TYPE_CHECKING
from typing import Iterator, Type, Pattern, TYPE_CHECKING
from types import ModuleType
import os
import importlib
@@ -764,7 +764,6 @@ def normalize_slice(
return start, stop
def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
"""Filter a sequence of spans and remove duplicates or overlaps. Useful for
creating named entities (where one token can only be part of one entity) or
@@ -1113,6 +1112,3 @@ def minibatch(items, size):
if len(batch) == 0:
break
yield list(batch)
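Finally, the filter_spans docstring above describes a small public utility: it drops overlapping spans, preferring longer ones (and, on ties, earlier ones), which entity annotation needs because a token may belong to at most one entity. A short usage example:

from spacy.lang.en import English
from spacy.util import filter_spans

nlp = English()
doc = nlp("The quick brown fox jumps")
# "brown fox" overlaps the longer "quick brown fox", so it is dropped.
spans = [doc[1:4], doc[2:4], doc[4:5]]
print([span.text for span in filter_spans(spans)])  # ['quick brown fox', 'jumps']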