Tidy up and auto-format

Ines Montani 2020-08-05 16:00:59 +02:00
parent 2a4d56e730
commit e68459296d
24 changed files with 72 additions and 107 deletions

View File

@@ -7,8 +7,6 @@ import typer
 from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 from .. import util
-from ..lang.en import English
-from ..util import dot_to_object
 @debug_cli.command("model")
@@ -130,8 +128,8 @@ def _sentences():
     ]
-def _get_docs():
-    nlp = English()
+def _get_docs(lang: str = "en"):
+    nlp = util.get_lang_class(lang)()
     return list(nlp.pipe(_sentences()))

View File

@@ -1,5 +1,4 @@
 from typing import Optional, List, Dict
-from timeit import default_timer as timer
 from wasabi import Printer
 from pathlib import Path
 import re

View File

@@ -1,7 +1,6 @@
 from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import tqdm
 import re
 import shutil
 import requests

View File

@@ -1,14 +1,8 @@
-from .corpus import Corpus
-from .example import Example
-from .align import Alignment
-from .iob_utils import iob_to_biluo, biluo_to_iob
-from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
-from .iob_utils import spans_from_biluo_tags
-from .iob_utils import tags_to_entities
-from .gold_io import docs_to_json
-from .gold_io import read_json_file
-from .batchers import minibatch_by_padded_size, minibatch_by_words
+from .corpus import Corpus  # noqa: F401
+from .example import Example  # noqa: F401
+from .align import Alignment  # noqa: F401
+from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
+from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401
+from .iob_utils import spans_from_biluo_tags, tags_to_entities  # noqa: F401
+from .gold_io import docs_to_json, read_json_file  # noqa: F401
+from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401

View File

@@ -3,7 +3,6 @@ from typing import Optional, Any
 from functools import partial
 import itertools
-from .example import Example
 from ..util import registry, minibatch
@@ -41,16 +40,13 @@ def configure_minibatch_by_words(
 ) -> BatcherT:
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
-        minibatch_by_words,
-        size=size,
-        discard_oversize=discard_oversize,
-        **optionals
+        minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals
     )
 @registry.batchers("batch_by_sequence.v1")
 def configure_minibatch(size: Sizing, get_length=None) -> BatcherT:
-    optionals = ({"get_length": get_length} if get_length is not None else {})
+    optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(minibatch, size=size, **optionals)

View File

@@ -1,4 +1,4 @@
 from .iob2docs import iob2docs  # noqa: F401
 from .conll_ner2docs import conll_ner2docs  # noqa: F401
-from .json2docs import json2docs
+from .json2docs import json2docs  # noqa: F401
 from .conllu2docs import conllu2docs  # noqa: F401

View File

@@ -1,6 +1,5 @@
-from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable, Tuple
+from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
 from pathlib import Path
-import random
 from .. import util
 from .example import Example
@@ -25,7 +24,7 @@ class Corpus:
     path (Path): The directory or filename to read from.
     gold_preproc (bool): Whether to set up the Example object with gold-standard
         sentences and tokens for the predictions. Gold preprocessing helps
        the annotations align to the tokenization, and may result in sequences
        of more consistent length. However, it may reduce run-time accuracy due
        to train/test skew. Defaults to False.
@@ -39,7 +38,12 @@ class Corpus:
     """
     def __init__(
-        self, path, *, limit: int = 0, gold_preproc: bool = False, max_length: bool = False,
+        self,
+        path,
+        *,
+        limit: int = 0,
+        gold_preproc: bool = False,
+        max_length: bool = False,
     ) -> None:
         self.path = util.ensure_path(path)
         self.gold_preproc = gold_preproc

View File

@@ -80,7 +80,7 @@ def _get_transition_table(
     B_start, B_end = (0, n_labels)
     I_start, I_end = (B_end, B_end + n_labels)
     L_start, L_end = (I_end, I_end + n_labels)
-    U_start, _ = (L_end, L_end + n_labels)
+    U_start, _ = (L_end, L_end + n_labels)  # noqa: F841
     # Using ranges allows us to set specific cells, which is necessary to express
     # that only actions of the same label are valid continuations.
     B_range = numpy.arange(B_start, B_end)

View File

@@ -17,9 +17,7 @@ MatcherPatternType = List[Dict[Union[int, str], Any]]
 AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
-@Language.factory(
-    "attribute_ruler",
-)
+@Language.factory("attribute_ruler")
 def make_attribute_ruler(
     nlp: Language,
     name: str,
@@ -58,7 +56,7 @@ class AttributeRuler(Pipe):
         self.vocab = vocab
         self.matcher = Matcher(self.vocab)
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []
         if pattern_dicts:

View File

@@ -1,17 +1,23 @@
 from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type
-from typing import Iterable, TypeVar
+from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
 from pydantic import root_validator
 from collections import defaultdict
 from thinc.api import Optimizer
-from pathlib import Path
 from .attrs import NAMES
+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401
+    from .gold import Example  # noqa: F401
 ItemT = TypeVar("ItemT")
 Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
+Reader = Callable[["Language", str], Iterable["Example"]]
 def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
@@ -183,7 +189,6 @@ class ModelMetaSchema(BaseModel):
 # check that against this schema in the test suite to make sure it's always
 # up to date.
-Reader = Callable[["Language", str], Iterable["Example"]]
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
@@ -209,7 +214,6 @@ class ConfigSchemaTraining(BaseModel):
         extra = "forbid"
         arbitrary_types_allowed = True
-    #eval_batch_size: StrictInt = Field(..., title="Evaluation batch size")
 class ConfigSchemaNlp(BaseModel):
     # fmt: off

View File

@@ -291,6 +291,6 @@ def test_span_boundaries(doc):
     for i in range(start, end):
         assert span[i - start] == doc[i]
     with pytest.raises(IndexError):
-        _ = span[-5]
+        span[-5]
     with pytest.raises(IndexError):
-        _ = span[5]
+        span[5]

View File

@@ -29,9 +29,7 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
     nlp = Chinese(
         meta={
-            "tokenizer": {
-                "config": {"segmenter": "pkuseg", "pkuseg_model": "medicine",}
-            }
+            "tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
         }
     )
     zh_tokenizer_serialize(nlp.tokenizer)

View File

@@ -21,7 +21,7 @@ re_pattern5 = "B*A*B"
 longest1 = "A A A A A"
 longest2 = "A A A A A"
 longest3 = "A A"
 longest4 = "B A A A A A B"  # "FIRST" would be "B B"
 longest5 = "B B A A A A A B"

View File

@@ -1,6 +1,6 @@
 import pytest
-from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle, is_nonproj_arc
-from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree
+from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle
+from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc
 from spacy.pipeline._parser_internals import nonproj
 from ..util import get_doc

View File

@@ -75,19 +75,18 @@ def test_attributeruler_init(nlp, pattern_dicts):
 def test_attributeruler_init_patterns(nlp, pattern_dicts):
     # initialize with patterns
-    a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
+    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert doc[2].morph_ == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
     assert doc[3].morph_ == "Case=Nom|Number=Sing"
     nlp.remove_pipe("attribute_ruler")
     # initialize with patterns from asset
-    a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}})
+    nlp.add_pipe(
+        "attribute_ruler",
+        config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
+    )
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert doc[2].morph_ == "Case=Nom|Number=Plur"

View File

@@ -117,12 +117,15 @@ def test_kb_default(nlp):
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
-    assert entity_linker.kb.entity_vector_length == 64  # default value from pipeline.entity_linker
+    # default value from pipeline.entity_linker
+    assert entity_linker.kb.entity_vector_length == 64
 def test_kb_custom_length(nlp):
     """Test that the default (empty) KB can be configured with a custom entity length"""
-    entity_linker = nlp.add_pipe("entity_linker", config={"kb": {"entity_vector_length": 35}})
+    entity_linker = nlp.add_pipe(
+        "entity_linker", config={"kb": {"entity_vector_length": 35}}
+    )
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0

View File

@@ -117,9 +117,7 @@ def test_overfitting_IO():
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
     # Test scoring
-    scores = nlp.evaluate(
-        train_examples, scorer_cfg={"positive_label": "POSITIVE"}
-    )
+    scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
     assert scores["cats_f"] == 1.0
     assert scores["cats_score"] == 1.0
     assert "cats_score_desc" in scores

View File

@@ -88,14 +88,9 @@ def my_parser():
             width=321,
             rows=5432,
             also_embed_subwords=True,
-            also_use_static_vectors=False
+            also_use_static_vectors=False,
         ),
-        MaxoutWindowEncoder(
-            width=321,
-            window_size=3,
-            maxout_pieces=4,
-            depth=2
-        )
+        MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
     parser = build_tb_parser_model(
         tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5

View File

@@ -1,5 +1,4 @@
 import spacy
-import pytest
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin

View File

@@ -711,16 +711,18 @@ def test_alignment_different_texts():
     with pytest.raises(ValueError):
         Alignment.from_strings(other_tokens, spacy_tokens)
 def test_retokenized_docs(doc):
     a = doc.to_array(["TAG"])
     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
     doc2 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
     example = Example(doc1, doc2)
-    assert example.get_aligned("ORTH", as_string=True) == ['Sarah', "'s", 'sister', 'flew', 'to', 'Silicon', 'Valley', 'via', 'London', '.']
+    # fmt: off
+    expected1 = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
+    expected2 = [None, "sister", "flew", "to", None, "via", "London", "."]
+    # fmt: on
+    assert example.get_aligned("ORTH", as_string=True) == expected1
     with doc1.retokenize() as retokenizer:
         retokenizer.merge(doc1[0:2])
         retokenizer.merge(doc1[5:7])
-    assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
+    assert example.get_aligned("ORTH", as_string=True) == expected2

View File

@@ -24,6 +24,7 @@ def get_textcat_kwargs():
         "nO": 7,
     }
 def get_textcat_cnn_kwargs():
     return {
         "tok2vec": test_tok2vec(),
@@ -31,6 +32,7 @@ def get_textcat_cnn_kwargs():
         "nO": 13,
     }
 def get_all_params(model):
     params = []
     for node in model.walk():
@@ -59,17 +61,11 @@ def get_tok2vec_kwargs():
     # This actually creates models, so seems best to put it in a function.
     return {
         "embed": MultiHashEmbed(
-            width=32,
-            rows=500,
-            also_embed_subwords=True,
-            also_use_static_vectors=False
+            width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
         ),
         "encode": MaxoutWindowEncoder(
-            width=32,
-            depth=2,
-            maxout_pieces=2,
-            window_size=1,
-        )
+            width=32, depth=2, maxout_pieces=2, window_size=1,
+        ),
     }

View File

@@ -19,14 +19,9 @@ def test_empty_doc():
             width=width,
             rows=embed_size,
             also_use_static_vectors=False,
-            also_embed_subwords=True
+            also_embed_subwords=True,
         ),
-        MaxoutWindowEncoder(
-            width=width,
-            depth=4,
-            window_size=1,
-            maxout_pieces=3
-        )
+        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
     tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update([doc])
@@ -44,14 +39,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
             width=width,
             rows=embed_size,
             also_use_static_vectors=False,
-            also_embed_subwords=True
+            also_embed_subwords=True,
         ),
-        MaxoutWindowEncoder(
-            width=width,
-            depth=4,
-            window_size=1,
-            maxout_pieces=3,
-        )
+        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,),
     )
     tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update(batch)

View File

@@ -85,27 +85,24 @@ def test_util_dot_section():
     """
     nlp_config = Config().from_str(cfg_string)
     en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True)
     default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
     default_config["nlp"]["lang"] = "nl"
     nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True)
     # Test that creation went OK
     assert isinstance(en_nlp, English)
     assert isinstance(nl_nlp, Dutch)
     assert nl_nlp.pipe_names == []
     assert en_nlp.pipe_names == ["textcat"]
-    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False  # not exclusive_classes
+    # not exclusive_classes
+    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
     assert not en_config["nlp"]["load_vocab_data"]
     assert nl_config["nlp"]["load_vocab_data"]  # default value True
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
-        obj = dot_to_object(en_config, "nlp.pipeline.tagger")
+        dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
-        obj = dot_to_object(en_config, "nlp.unknownattribute")
+        dot_to_object(en_config, "nlp.unknownattribute")
     assert not dot_to_object(en_config, "nlp.load_vocab_data")
     assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)

View File

@@ -1,5 +1,5 @@
 from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
-from typing import Iterator, Type, Pattern, Sequence, TYPE_CHECKING
+from typing import Iterator, Type, Pattern, TYPE_CHECKING
 from types import ModuleType
 import os
 import importlib
@@ -764,7 +764,6 @@ def normalize_slice(
     return start, stop
 def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
     """Filter a sequence of spans and remove duplicates or overlaps. Useful for
     creating named entities (where one token can only be part of one entity) or
@@ -1113,6 +1112,3 @@ def minibatch(items, size):
         if len(batch) == 0:
             break
         yield list(batch)