mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 18:24:08 +03:00
ec45f704b1
* Drop python 3.6/3.7, remove unneeded compat * Remove unused import * Minimal python 3.8+ docs updates
115 lines
3.9 KiB
Python
115 lines
3.9 KiB
Python
from typing import Optional, List, Tuple, Any, Literal
|
|
from thinc.types import Floats2d
|
|
from thinc.api import Model
|
|
import warnings
|
|
|
|
from ...errors import Errors, Warnings
|
|
from ...util import registry
|
|
from ..tb_framework import TransitionModel
|
|
from ...tokens.doc import Doc
|
|
|
|
# Placeholder alias: no concrete type is declared here yet, so it is `Any`.
TransitionSystem = Any # TODO
|
|
# Placeholder alias: no concrete type is declared here yet, so it is `Any`.
State = Any # TODO
|
|
|
|
|
|
@registry.architectures.register("spacy.TransitionBasedParser.v2")
def transition_parser_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,
    hidden_width: int,
    maxout_pieces: int,
    use_upper: bool,
    nO: Optional[int] = None,
) -> Model:
    """Registered "spacy.TransitionBasedParser.v2" architecture.

    Delegates to `build_tb_parser_model`. The `use_upper` setting is accepted
    so that configs written for this version keep working, but it is not
    passed on to the builder: a falsy value only triggers `Warnings.W400`.

    tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork mapping tokens to
        vectors.
    state_type (str): Either "parser" or "ner".
    extra_state_tokens (bool): Whether to use the larger set of state tokens.
    hidden_width (int): The width of the hidden layer.
    maxout_pieces (int): Number of maxout pieces in the state prediction layer.
    use_upper (bool): Not forwarded; a falsy value emits a warning.
    nO (int or None): The number of actions to predict between.
    RETURNS (Model): The model produced by `build_tb_parser_model`.
    """
    # The flag has no effect on the model that gets built, so tell the user
    # when they asked for the non-default behaviour.
    if not use_upper:
        warnings.warn(Warnings.W400)

    model = build_tb_parser_model(
        tok2vec=tok2vec,
        state_type=state_type,
        extra_state_tokens=extra_state_tokens,
        hidden_width=hidden_width,
        maxout_pieces=maxout_pieces,
        nO=nO,
    )
    return model
|
|
|
|
|
|
@registry.architectures.register("spacy.TransitionBasedParser.v3")
def transition_parser_v3(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,
    hidden_width: int,
    maxout_pieces: int,
    nO: Optional[int] = None,
) -> Model:
    """Registered "spacy.TransitionBasedParser.v3" architecture.

    Forwards every argument unchanged to `build_tb_parser_model` and returns
    its result.

    tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork mapping tokens to
        vectors.
    state_type (str): Either "parser" or "ner".
    extra_state_tokens (bool): Whether to use the larger set of state tokens.
    hidden_width (int): The width of the hidden layer.
    maxout_pieces (int): Number of maxout pieces in the state prediction layer.
    nO (int or None): The number of actions to predict between.
    RETURNS (Model): The model produced by `build_tb_parser_model`.
    """
    return build_tb_parser_model(
        tok2vec=tok2vec,
        state_type=state_type,
        extra_state_tokens=extra_state_tokens,
        hidden_width=hidden_width,
        maxout_pieces=maxout_pieces,
        nO=nO,
    )
|
|
|
|
|
|
def build_tb_parser_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,
    hidden_width: int,
    maxout_pieces: int,
    nO: Optional[int] = None,
) -> Model:
    """
    Build a transition-based parser model, usable for NER or dependency
    parsing.

    In transition-based parsing, predicting a structure is recast as
    predicting a sequence of state transitions. Background reading:
    https://explosion.ai/blog/parsing-english-in-python

    This function selects how many context tokens make up the state
    representation and hands everything to `TransitionModel`, starting with an
    empty set of unseen classes.

    tok2vec (Model[List[Doc], List[Floats2d]]):
        Subnetwork to map tokens into vector representations.
    state_type (str):
        The type of parser model: "parser" or "ner".
    extra_state_tokens (bool): Whether to use additional context tokens when
        constructing the state vector. When false, 3 tokens are used for the
        NER and 8 for the parser; when true, 6 (NER) or 13 (parser).
    hidden_width (int): The width of the hidden layer.
    maxout_pieces (int): How many pieces to use in the state prediction layer.
        Recommended values are 1, 2 or 3.
    nO (int or None): The number of actions the model will predict between.
        Usually inferred from data at the beginning of training, or loaded
        from disk.
    RETURNS (Model): The constructed `TransitionModel`.
    RAISES (ValueError): If `state_type` is neither "parser" nor "ner".
    """
    # (default, extended) number of state tokens for each supported state type.
    if state_type == "parser":
        default_count, extended_count = 8, 13
    elif state_type == "ner":
        default_count, extended_count = 3, 6
    else:
        raise ValueError(Errors.E917.format(value=state_type))

    if extra_state_tokens:
        nr_feature_tokens = extended_count
    else:
        nr_feature_tokens = default_count

    model = TransitionModel(
        tok2vec=tok2vec,
        state_tokens=nr_feature_tokens,
        hidden_width=hidden_width,
        maxout_pieces=maxout_pieces,
        nO=nO,
        unseen_classes=set(),
    )
    return model
|