Fix imports, types and default configs

This commit is contained in:
Ines Montani 2020-08-07 18:40:54 +02:00
parent b1d83fc13e
commit 3a193eb8f1
14 changed files with 14 additions and 24 deletions

View File

@ -5,6 +5,7 @@ from thinc.types import Floats2d
from ...util import registry from ...util import registry
from .._precomputable_affine import PrecomputableAffine from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel from ..tb_framework import TransitionModel
from ...tokens import Doc
@registry.architectures.register("spacy.TransitionBasedParser.v1") @registry.architectures.register("spacy.TransitionBasedParser.v1")
@ -18,7 +19,7 @@ def build_tb_parser_model(
) -> Model: ) -> Model:
""" """
Build a transition-based parser model. Can apply to NER or dependency-parsing. Build a transition-based parser model. Can apply to NER or dependency-parsing.
Transition-based parsing is an approach to structured prediction where the Transition-based parsing is an approach to structured prediction where the
task of predicting the structure is mapped to a series of state transitions. task of predicting the structure is mapped to a series of state transitions.
You might find this tutorial helpful as background: You might find this tutorial helpful as background:
@ -35,7 +36,7 @@ def build_tb_parser_model(
and applying the non-linearity. and applying the non-linearity.
* upper (optional): A feed-forward network that predicts scores from the * upper (optional): A feed-forward network that predicts scores from the
state representation. If not present, the output from the lower model is state representation. If not present, the output from the lower model is
ued as action scores directly. used as action scores directly.
tok2vec (Model[List[Doc], List[Floats2d]]): tok2vec (Model[List[Doc], List[Floats2d]]):
Subnetwork to map tokens into vector representations. Subnetwork to map tokens into vector representations.
@ -44,10 +45,10 @@ def build_tb_parser_model(
2, 8 and 13 feature sets are designed for the parser, while the 3 and 6 2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
feature sets are designed for the NER. The recommended feature sets are feature sets are designed for the NER. The recommended feature sets are
3 for NER, and 8 for the dependency parser. 3 for NER, and 8 for the dependency parser.
TODO: This feature should be split into two, state_type: ["deps", "ner"] TODO: This feature should be split into two, state_type: ["deps", "ner"]
and extra_state_features: [True, False]. This would map into: and extra_state_features: [True, False]. This would map into:
(deps, False): 8 (deps, False): 8
(deps, True): 13 (deps, True): 13
(ner, False): 3 (ner, False): 3

View File

@ -10,7 +10,7 @@ from .._iob import IOB
from ...util import registry from ...util import registry
@registry.architectures.register("spacy.BiluoTagger.v1") @registry.architectures.register("spacy.BILUOTagger.v1")
def BiluoTagger( def BiluoTagger(
tok2vec: Model[List[Doc], List[Floats2d]] tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]: ) -> Model[List[Doc], List[Floats2d]]:
@ -59,7 +59,7 @@ def IOBTagger(
token and uses greedy decoding with transition-constraints to return a valid token and uses greedy decoding with transition-constraints to return a valid
IOB tag sequence. IOB tag sequence.
A IOB tag sequence encodes a sequence of non-overlapping labelled spans An IOB tag sequence encodes a sequence of non-overlapping labelled spans
into tags assigned to each token. The first token of a span is given the into tags assigned to each token. The first token of a span is given the
tag B-LABEL, and subsequent tokens are given the tag I-LABEL. tag B-LABEL, and subsequent tokens are given the tag I-LABEL.
All other tokens are assigned the tag O. All other tokens are assigned the tag O.

View File

@ -3,7 +3,7 @@ from thinc.api import zero_init, with_array, Softmax, chain, Model
from thinc.types import Floats2d from thinc.types import Floats2d
from ...util import registry from ...util import registry
from ..tokens import Doc from ...tokens import Doc
@registry.architectures.register("spacy.Tagger.v1") @registry.architectures.register("spacy.Tagger.v1")

View File

@ -77,7 +77,7 @@ def build_Tok2Vec_model(
"""Construct a tok2vec model out of embedding and encoding subnetworks. """Construct a tok2vec model out of embedding and encoding subnetworks.
See https://explosion.ai/blog/deep-learning-formula-nlp See https://explosion.ai/blog/deep-learning-formula-nlp
embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-indepdent embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent
word vector representations. word vector representations.
encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
embeddings, using an architecture such as a CNN, BiLSTM or transformer. embeddings, using an architecture such as a CNN, BiLSTM or transformer.
@ -187,7 +187,7 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
are between 16 and 64. are between 16 and 64.
nC (int): The number of UTF-8 bytes to embed per word. Recommended values nC (int): The number of UTF-8 bytes to embed per word. Recommended values
are between 3 and 8, although it may depend on the length of words in the are between 3 and 8, although it may depend on the length of words in the
language. language.
""" """
model = chain( model = chain(
concatenate( concatenate(
@ -212,7 +212,7 @@ def MaxoutWindowEncoder(
normalization and residual connections. normalization and residual connections.
width (int): The input and output width. These are required to be the same, width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300. width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1. to construct the convolution. Recommended value is 1.
@ -244,7 +244,7 @@ def MishWindowEncoder(
normalization and residual connections. normalization and residual connections.
width (int): The input and output width. These are required to be the same, width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300. width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1. to construct the convolution. Recommended value is 1.
@ -266,7 +266,7 @@ def BiLSTMEncoder(
"""Encode context using bidirectional LSTM layers. Requires PyTorch. """Encode context using bidirectional LSTM layers. Requires PyTorch.
width (int): The input and output width. These are required to be the same, width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300. width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1. to construct the convolution. Recommended value is 1.

View File

@ -27,7 +27,6 @@ embed_size = 2000
window_size = 1 window_size = 1
maxout_pieces = 3 maxout_pieces = 3
subword_features = true subword_features = true
dropout = null
""" """
DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -29,7 +29,6 @@ embed_size = 300
window_size = 1 window_size = 1
maxout_pieces = 3 maxout_pieces = 3
subword_features = true subword_features = true
dropout = null
""" """
DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -29,7 +29,6 @@ embed_size = 2000
window_size = 1 window_size = 1
maxout_pieces = 2 maxout_pieces = 2
subword_features = true subword_features = true
dropout = null
""" """
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -25,7 +25,6 @@ embed_size = 2000
window_size = 1 window_size = 1
maxout_pieces = 3 maxout_pieces = 3
subword_features = true subword_features = true
dropout = null
""" """
DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -25,7 +25,6 @@ embed_size = 2000
window_size = 1 window_size = 1
maxout_pieces = 2 maxout_pieces = 2
subword_features = true subword_features = true
dropout = null
""" """
DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -15,7 +15,7 @@ from .pipe import Pipe
default_model_config = """ default_model_config = """
[model] [model]
@architectures = "spacy.BiluoTagger.v1" @architectures = "spacy.BILUOTagger.v1"
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v1"
@ -26,7 +26,6 @@ embed_size = 7000
window_size = 1 window_size = 1
maxout_pieces = 3 maxout_pieces = 3
subword_features = true subword_features = true
dropout = null
""" """
DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -31,7 +31,6 @@ embed_size = 2000
window_size = 1 window_size = 1
maxout_pieces = 3 maxout_pieces = 3
subword_features = true subword_features = true
dropout = null
""" """
DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -48,7 +48,6 @@ embed_size = 2000
window_size = 1 window_size = 1
maxout_pieces = 3 maxout_pieces = 3
subword_features = true subword_features = true
dropout = null
""" """

View File

@ -20,7 +20,6 @@ embed_size = 2000
window_size = 1 window_size = 1
maxout_pieces = 3 maxout_pieces = 3
subword_features = true subword_features = true
dropout = null
""" """
DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"]

View File

@ -48,7 +48,6 @@ window_size = 1
embed_size = 2000 embed_size = 2000
maxout_pieces = 3 maxout_pieces = 3
subword_features = true subword_features = true
dropout = null
[components.tagger] [components.tagger]
factory = "tagger" factory = "tagger"
@ -78,7 +77,6 @@ embed_size = 5555
window_size = 1 window_size = 1
maxout_pieces = 7 maxout_pieces = 7
subword_features = false subword_features = false
dropout = null
""" """