Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 09:26:27 +03:00)

Commit 3a193eb8f1 (parent b1d83fc13e): Fix imports, types and default configs
@@ -5,6 +5,7 @@ from thinc.types import Floats2d
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
+from ...tokens import Doc


 @registry.architectures.register("spacy.TransitionBasedParser.v1")
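For context, a minimal sketch of how the @registry.architectures.register decorator used above makes a model builder resolvable by its string name (assuming spaCy v3 and thinc v8 are installed; the name "demo.ToyParserModel.v1" and the builder itself are illustrative, not part of this commit):

from typing import List

from thinc.api import Linear, Model, chain, with_array
from thinc.types import Floats2d

from spacy.tokens import Doc
from spacy.util import registry


@registry.architectures.register("demo.ToyParserModel.v1")
def build_toy_model(tok2vec: Model[List[Doc], List[Floats2d]], nO: int) -> Model:
    # Compose the tok2vec subnetwork with a per-token linear output layer.
    return chain(tok2vec, with_array(Linear(nO)))


# The registered builder can later be looked up by the same string name,
# which is how the @architectures references in the configs further down work.
builder = registry.architectures.get("demo.ToyParserModel.v1")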
@@ -18,7 +19,7 @@ def build_tb_parser_model(
 ) -> Model:
 """
 Build a transition-based parser model. Can apply to NER or dependency-parsing.

 Transition-based parsing is an approach to structured prediction where the
 task of predicting the structure is mapped to a series of state transitions.
 You might find this tutorial helpful as background:
@@ -35,7 +36,7 @@ def build_tb_parser_model(
 and applying the non-linearity.
 * upper (optional): A feed-forward network that predicts scores from the
 state representation. If not present, the output from the lower model is
-ued as action scores directly.
+used as action scores directly.

 tok2vec (Model[List[Doc], List[Floats2d]]):
 Subnetwork to map tokens into vector representations.
@@ -44,10 +45,10 @@ def build_tb_parser_model(
 2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
 feature sets are designed for the NER. The recommended feature sets are
 3 for NER, and 8 for the dependency parser.

 TODO: This feature should be split into two, state_type: ["deps", "ner"]
 and extra_state_features: [True, False]. This would map into:

 (deps, False): 8
 (deps, True): 13
 (ner, False): 3
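A hedged sketch of the mapping proposed in the TODO above (the helper name is hypothetical and not part of spaCy; the (ner, True) entry is inferred from the "3 and 6" feature sets mentioned for NER in the docstring):

def nr_feature_tokens(state_type: str, extra_state_features: bool) -> int:
    # Map the proposed (state_type, extra_state_features) pair back to the
    # existing feature-set sizes documented in the docstring above.
    return {
        ("deps", False): 8,
        ("deps", True): 13,
        ("ner", False): 3,
        ("ner", True): 6,  # inferred: "the 3 and 6 feature sets are designed for the NER"
    }[(state_type, extra_state_features)]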
@@ -10,7 +10,7 @@ from .._iob import IOB
 from ...util import registry


-@registry.architectures.register("spacy.BiluoTagger.v1")
+@registry.architectures.register("spacy.BILUOTagger.v1")
 def BiluoTagger(
 tok2vec: Model[List[Doc], List[Floats2d]]
 ) -> Model[List[Doc], List[Floats2d]]:
@@ -59,7 +59,7 @@ def IOBTagger(
 token and uses greedy decoding with transition-constraints to return a valid
 IOB tag sequence.

-A IOB tag sequence encodes a sequence of non-overlapping labelled spans
+An IOB tag sequence encodes a sequence of non-overlapping labelled spans
 into tags assigned to each token. The first token of a span is given the
 tag B-LABEL, and subsequent tokens are given the tag I-LABEL.
 All other tokens are assigned the tag O.
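As a concrete illustration of the encoding described in that docstring (the sentence and labels are made up):

# Two non-overlapping labelled spans encoded as IOB tags, one tag per token.
tokens   = ["Apple", "opened", "an", "office", "in", "San", "Francisco", "."]
iob_tags = ["B-ORG", "O", "O", "O", "O", "B-GPE", "I-GPE", "O"]
# The first token of a span gets B-LABEL, later tokens in the span get I-LABEL,
# and every token outside a span gets O.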
@@ -3,7 +3,7 @@ from thinc.api import zero_init, with_array, Softmax, chain, Model
 from thinc.types import Floats2d

 from ...util import registry
-from ..tokens import Doc
+from ...tokens import Doc


 @registry.architectures.register("spacy.Tagger.v1")
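A minimal sketch of how the imports in this hunk typically fit together (illustrative only, not the exact body of spaCy's Tagger architecture): the tok2vec subnetwork is chained with a zero-initialised Softmax output layer that produces per-token tag probabilities.

from typing import List

from thinc.api import Model, Softmax, chain, with_array, zero_init
from thinc.types import Floats2d

from spacy.tokens import Doc


def build_toy_tagger(tok2vec: Model[List[Doc], List[Floats2d]], nO: int) -> Model:
    # with_array applies the Softmax to each per-doc Floats2d array in the list.
    output_layer = Softmax(nO, init_W=zero_init)
    return chain(tok2vec, with_array(output_layer))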
@@ -77,7 +77,7 @@ def build_Tok2Vec_model(
 """Construct a tok2vec model out of embedding and encoding subnetworks.
 See https://explosion.ai/blog/deep-learning-formula-nlp

-embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-indepdent
+embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent
 word vector representations.
 encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
 embeddings, using an architecture such as a CNN, BiLSTM or transformer.
@@ -187,7 +187,7 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
 are between 16 and 64.
 nC (int): The number of UTF-8 bytes to embed per word. Recommended values
 are between 3 and 8, although it may depend on the length of words in the
 language.
 """
 model = chain(
 concatenate(
@@ -212,7 +212,7 @@ def MaxoutWindowEncoder(
 normalization and residual connections.

 width (int): The input and output width. These are required to be the same,
 to allow residual connections. This value will be determined by the
 width of the inputs. Recommended values are between 64 and 300.
 window_size (int): The number of words to concatenate around each token
 to construct the convolution. Recommended value is 1.
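A hedged sketch of the shape such a window encoder takes (assuming thinc v8; toy_window_encoder is an illustrative stand-in, not spaCy's implementation): each block concatenates a window of neighbouring vectors, mixes them with a Maxout layer, and is wrapped in a residual connection, which is why the input and output widths must match.

from thinc.api import Maxout, chain, clone, expand_window, residual, with_array


def toy_window_encoder(width: int = 96, window_size: int = 1,
                       maxout_pieces: int = 3, depth: int = 4):
    # Each block sees (2 * window_size + 1) * width features per token and
    # projects back down to `width`, so the residual connections line up.
    block = residual(
        chain(
            expand_window(window_size=window_size),
            Maxout(nO=width, nI=width * (window_size * 2 + 1),
                   nP=maxout_pieces, normalize=True),
        )
    )
    return with_array(clone(block, depth))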
@@ -244,7 +244,7 @@ def MishWindowEncoder(
 normalization and residual connections.

 width (int): The input and output width. These are required to be the same,
 to allow residual connections. This value will be determined by the
 width of the inputs. Recommended values are between 64 and 300.
 window_size (int): The number of words to concatenate around each token
 to construct the convolution. Recommended value is 1.
@@ -266,7 +266,7 @@ def BiLSTMEncoder(
 """Encode context using bidirectonal LSTM layers. Requires PyTorch.

 width (int): The input and output width. These are required to be the same,
 to allow residual connections. This value will be determined by the
 width of the inputs. Recommended values are between 64 and 300.
 window_size (int): The number of words to concatenate around each token
 to construct the convolution. Recommended value is 1.
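A hedged sketch of a bidirectional LSTM encoder along these lines (assumes thinc v8 with PyTorch available; toy_bilstm_encoder is illustrative, not spaCy's implementation):

from thinc.api import PyTorchLSTM, with_padded


def toy_bilstm_encoder(width: int = 96, depth: int = 2, dropout: float = 0.0):
    # nO is the total output width; with bi=True each direction contributes half,
    # so the encoder keeps the same input and output width.
    return with_padded(PyTorchLSTM(nO=width, nI=width, bi=True,
                                   depth=depth, dropout=dropout))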
@@ -27,7 +27,6 @@ embed_size = 2000
 window_size = 1
 maxout_pieces = 3
 subword_features = true
-dropout = null
 """
 DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]

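For context, a minimal sketch of the pattern used by these default_model_config strings (assuming thinc v8; the keys and values shown are illustrative, not the exact defaults being edited here): the string is parsed with thinc's Config, and the [model] section, an @architectures reference plus its arguments, becomes the component's default model config.

from thinc.api import Config

demo_config = """
[model]
@architectures = "spacy.HashEmbedCNN.v1"
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""

DEFAULT_DEMO_MODEL = Config().from_str(demo_config)["model"]
print(DEFAULT_DEMO_MODEL["@architectures"])  # "spacy.HashEmbedCNN.v1"
print(DEFAULT_DEMO_MODEL["maxout_pieces"])   # 3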
@@ -29,7 +29,6 @@ embed_size = 300
 window_size = 1
 maxout_pieces = 3
 subword_features = true
-dropout = null
 """
 DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]

@@ -29,7 +29,6 @@ embed_size = 2000
 window_size = 1
 maxout_pieces = 2
 subword_features = true
-dropout = null
 """
 DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]

@@ -25,7 +25,6 @@ embed_size = 2000
 window_size = 1
 maxout_pieces = 3
 subword_features = true
-dropout = null
 """
 DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]

@@ -25,7 +25,6 @@ embed_size = 2000
 window_size = 1
 maxout_pieces = 2
 subword_features = true
-dropout = null
 """
 DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]

@@ -15,7 +15,7 @@ from .pipe import Pipe

 default_model_config = """
 [model]
-@architectures = "spacy.BiluoTagger.v1"
+@architectures = "spacy.BILUOTagger.v1"

 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -26,7 +26,6 @@ embed_size = 7000
 window_size = 1
 maxout_pieces = 3
 subword_features = true
-dropout = null
 """
 DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"]

@@ -31,7 +31,6 @@ embed_size = 2000
 window_size = 1
 maxout_pieces = 3
 subword_features = true
-dropout = null
 """
 DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]

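A short usage sketch (assuming spaCy v3; illustrative only): these DEFAULT_*_MODEL configs are the model settings a pipeline component falls back to when the pipeline config does not override its [model] block.

import spacy

nlp = spacy.blank("en")
# The tagger factory fills in its default model config (DEFAULT_TAGGER_MODEL above)
# unless the pipeline config overrides [components.tagger.model].
tagger = nlp.add_pipe("tagger")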
@@ -48,7 +48,6 @@ embed_size = 2000
 window_size = 1
 maxout_pieces = 3
 subword_features = true
-dropout = null
 """

@@ -20,7 +20,6 @@ embed_size = 2000
 window_size = 1
 maxout_pieces = 3
 subword_features = true
-dropout = null
 """
 DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"]

@@ -48,7 +48,6 @@ window_size = 1
 embed_size = 2000
 maxout_pieces = 3
 subword_features = true
-dropout = null

 [components.tagger]
 factory = "tagger"
@@ -78,7 +77,6 @@ embed_size = 5555
 window_size = 1
 maxout_pieces = 7
 subword_features = false
-dropout = null
 """
