Mirror of https://github.com/explosion/spaCy.git, synced 2025-02-18 20:40:34 +03:00

Commit 1075b7ebb7: Merge branch 'develop' into nightly.spacy.io
@@ -8,7 +8,6 @@ requires = [
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0rc0,<8.1.0",
     "blis>=0.4.0,<0.8.0",
-    "pytokenizations",
     "pathy"
 ]
 build-backend = "setuptools.build_meta"
@@ -14,8 +14,7 @@ pathy
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.5.0,<2.0.0
-pytokenizations
+pydantic>=1.5.0,<1.7.0
 # Official Python utilities
 setuptools
 packaging>=20.0
@@ -51,8 +51,8 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.5.0,<2.0.0
-    pytokenizations
+    pydantic>=1.5.0,<1.7.0
+    jinja2
     # Official Python utilities
     setuptools
     packaging>=20.0
setup.py (1 line changed)
@@ -49,6 +49,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.tokenizer",
+    "spacy.training.align",
     "spacy.training.gold_io",
     "spacy.tokens.doc",
     "spacy.tokens.span",
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0rc1"
+__version__ = "3.0.0rc2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -93,27 +93,42 @@ def evaluate(
         "SPEED": "speed",
     }
     results = {}
+    data = {}
     for metric, key in metrics.items():
         if key in scores:
             if key == "cats_score":
                 metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            if key == "speed":
-                results[metric] = f"{scores[key]:.0f}"
+            if isinstance(scores[key], (int, float)):
+                if key == "speed":
+                    results[metric] = f"{scores[key]:.0f}"
+                else:
+                    results[metric] = f"{scores[key]*100:.2f}"
             else:
-                results[metric] = f"{scores[key]*100:.2f}"
-    data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
+                results[metric] = "-"
+            data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]

     msg.table(results, title="Results")

+    if "morph_per_feat" in scores:
+        if scores["morph_per_feat"]:
+            print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
+            data["morph_per_feat"] = scores["morph_per_feat"]
+    if "dep_las_per_type" in scores:
+        if scores["dep_las_per_type"]:
+            print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
+            data["dep_las_per_type"] = scores["dep_las_per_type"]
     if "ents_per_type" in scores:
         if scores["ents_per_type"]:
-            print_ents_per_type(msg, scores["ents_per_type"])
+            print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
+            data["ents_per_type"] = scores["ents_per_type"]
     if "cats_f_per_type" in scores:
         if scores["cats_f_per_type"]:
-            print_textcats_f_per_cat(msg, scores["cats_f_per_type"])
+            print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
+            data["cats_f_per_type"] = scores["cats_f_per_type"]
     if "cats_auc_per_type" in scores:
         if scores["cats_auc_per_type"]:
             print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
+            data["cats_auc_per_type"] = scores["cats_auc_per_type"]

     if displacy_path:
         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
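
Note on the evaluate() change above: scores that are missing or non-numeric now render as "-" instead of crashing the formatter, and the machine-readable data dict is built from the raw scores rather than the formatted strings. A minimal sketch of the formatting rule (standalone toy, not spaCy's code):

# Hedged sketch of the new formatting rule: numeric scores are formatted,
# anything else (e.g. None for components that saw no data) prints as "-".
def format_score(value, key):
    if isinstance(value, (int, float)):
        return f"{value:.0f}" if key == "speed" else f"{value * 100:.2f}"
    return "-"

assert format_score(0.8312, "ents_f") == "83.12"
assert format_score(15000, "speed") == "15000"
assert format_score(None, "ents_f") == "-"
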
@@ -157,7 +172,7 @@ def render_parses(
             file_.write(html)


-def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
+def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None:
     data = [
         (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
         for k, v in scores.items()
@@ -166,20 +181,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
         data,
         header=("", "P", "R", "F"),
         aligns=("l", "r", "r", "r"),
-        title="NER (per type)",
-    )
-
-
-def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
-    data = [
-        (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
-        for k, v in scores.items()
-    ]
-    msg.table(
-        data,
-        header=("", "P", "R", "F"),
-        aligns=("l", "r", "r", "r"),
-        title="Textcat F (per label)",
+        title=f"{name} (per {type})",
     )

@@ -39,7 +39,7 @@ def init_vectors_cli(
     nlp.to_disk(output_dir)
     msg.good(
         "Saved nlp object with vectors to output directory. You can now use the "
-        "path to it in your config as the 'vectors' setting in [initialize.vocab].",
+        "path to it in your config as the 'vectors' setting in [initialize].",
         output_dir.resolve(),
     )

@@ -100,7 +100,7 @@ def init_labels_cli(
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     if not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
@@ -136,15 +136,19 @@ factory = "textcat"
 
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
 
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"
@@ -271,15 +275,19 @@ factory = "textcat"
 
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
 
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"
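
Note on the template changes above: TextCatEnsemble.v2 no longer takes inline width/embed_size settings; its tok2vec and linear_model are full child sections. A hedged sketch of parsing such a block with thinc's Config (parsing only; resolving the architectures needs the spaCy registry):

from thinc.api import Config

cfg = Config().from_str("""
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
""")
# Dotted section names nest: the linear_model block lives under the model.
model_cfg = cfg["components"]["textcat"]["model"]
assert model_cfg["@architectures"] == "spacy.TextCatEnsemble.v2"
assert model_cfg["linear_model"]["ngram_size"] == 1
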
@@ -44,7 +44,7 @@ def train_cli(
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
     if output_path is not None and not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
         msg.good(f"Created output directory: {output_path}")
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
@@ -398,8 +398,8 @@ class Errors:
     E163 = ("cumsum was found to be unstable: its last element does not "
             "correspond to sum")
     E164 = ("x is neither increasing nor decreasing: {x}.")
-    E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
-            "that case.")
+    E165 = ("Only one class present in the gold labels: {label}. "
+            "ROC AUC score is not defined in that case.")
     E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
@@ -456,6 +456,8 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")

     # TODO: fix numbering after merging develop into master
+    E897 = ("Field '{field}' should be a dot-notation string referring to the "
+            "relevant section in the config, but found type {type} instead.")
     E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
             "is not set or None. If you've implemented a custom component, make "
             "sure to store the component model as `self.model` in your "
@@ -562,7 +564,10 @@ class Errors:
             "a string value from {expected} but got: '{arg}'")
     E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
             "a list, but got: {arg_type}")
-    E949 = ("Can only create an alignment when the texts are the same.")
+    E949 = ("Unable to align tokens for the predicted and reference docs. It "
+            "is only possible to align the docs when both texts are the same "
+            "except for whitespace and capitalization. The predicted tokens "
+            "start with: {x}. The reference tokens start with: {y}.")
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
@@ -286,10 +286,10 @@ cdef class DependencyMatcher:
         self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees)
         for matched_tree in matched_trees:
             matched_key_trees.append((key, matched_tree))
         for i, (match_id, nodes) in enumerate(matched_key_trees):
             on_match = self._callbacks.get(match_id)
             if on_match is not None:
                 on_match(self, doc, i, matched_key_trees)
         return matched_key_trees

     def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):
@@ -1,4 +1,6 @@
-from typing import Optional
+from typing import Optional, List
+
+from thinc.types import Floats2d
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
@@ -10,12 +12,13 @@ from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
+from ...tokens import Doc


 @registry.architectures.register("spacy.TextCatCNN.v1")
 def build_simple_cnn_text_classifier(
     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
-) -> Model:
+) -> Model[List[Doc], Floats2d]:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the
@@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier(
     is applied instead, so that outputs are in the range [0, 1].
     """
     with Model.define_operators({">>": chain}):
+        cnn = tok2vec >> list2ragged() >> reduce_mean()
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
+            model = cnn >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
             linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = (
-                tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
-            )
+            model = cnn >> linear_layer >> Logistic()
             model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)
@@ -45,8 +47,7 @@ def build_bow_text_classifier(
     ngram_size: int,
     no_output_layer: bool,
     nO: Optional[int] = None,
-) -> Model:
-    # Don't document this yet, I'm not sure it's right.
+) -> Model[List[Doc], Floats2d]:
     with Model.define_operators({">>": chain}):
         sparse_linear = SparseLinear(nO)
         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@@ -59,6 +60,39 @@ def build_bow_text_classifier(
     return model


+@registry.architectures.register("spacy.TextCatEnsemble.v2")
+def build_text_classifier(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    linear_model: Model[List[Doc], Floats2d],
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    exclusive_classes = not linear_model.attrs["multi_label"]
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.get_dim("nO")
+        cnn_model = (
+            tok2vec
+            >> list2ragged()
+            >> ParametricAttention(width)  # TODO: benchmark performance difference of this layer
+            >> reduce_sum()
+            >> residual(Maxout(nO=width, nI=width))
+            >> Linear(nO=nO, nI=width)
+            >> Dropout(0.0)
+        )
+
+        nO_double = nO * 2 if nO else None
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO, nI=nO_double)
+        else:
+            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
+        model = (linear_model | cnn_model) >> output_layer
+        model.set_ref("tok2vec", tok2vec)
+        if model.has_dim("nO") is not False:
+            model.set_dim("nO", nO)
+        model.set_ref("output_layer", linear_model.get_ref("output_layer"))
+        model.attrs["multi_label"] = not exclusive_classes
+    return model
+
+# TODO: move to legacy
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
 def build_text_classifier(
     width: int,
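
Note on the new v2 builder above: the ensemble is the concatenation of a bag-of-words model and an attention-pooled CNN, followed by a shared output layer. A toy illustration of the `(linear_model | cnn_model) >> output_layer` pattern with plain thinc layers (dimensions invented, not the spaCy models themselves):

import numpy
from thinc.api import Linear, chain, concatenate

# Two toy "sub-models" over the same 8-dim input. concatenate() runs both on
# the input and joins their 4-dim outputs into 8 dims; chain() then maps the
# joined vector down to 2 class scores, mirroring nO_double = nO * 2 above.
left = Linear(nO=4, nI=8)
right = Linear(nO=4, nI=8)
ensemble = chain(concatenate(left, right), Linear(nO=2, nI=8))
X = numpy.zeros((3, 8), dtype="f")
ensemble.initialize(X=X)
assert ensemble.predict(X).shape == (3, 2)
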
@@ -158,11 +192,8 @@ def build_text_classifier(

 @registry.architectures.register("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
-    width: int,
-    pretrained_vectors: Optional[bool],
-    dropout: Optional[float],
-    nO: Optional[int] = None,
-) -> Model:
+    width: int, dropout: Optional[float], nO: Optional[int] = None
+) -> Model[List[Doc], Floats2d]:
     # Don't document this yet, I'm not sure it's right.
     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
     with Model.define_operators({">>": chain, "**": clone}):
@@ -106,7 +106,7 @@ def MultiHashEmbed(
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
-    through a feed-forward subnetwork to build a mixed representations.
+    through a feed-forward subnetwork to build a mixed representation.

     The features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
@@ -226,6 +226,9 @@ class AttributeRuler(Pipe):

         DOCS: https://nightly.spacy.io/api/tagger#score
         """
+        def morph_key_getter(token, attr):
+            return getattr(token, attr).key
+
         validate_examples(examples, "AttributeRuler.score")
         results = {}
         attrs = set()
@@ -237,7 +240,8 @@ class AttributeRuler(Pipe):
             elif attr == POS:
                 results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
             elif attr == MORPH:
-                results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+                results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
+                results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs))
             elif attr == LEMMA:
                 results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
         return results
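
Note on the morph getter above: in v3, Token.morph is a MorphAnalysis object rather than a plain string, so the scorers compare its stable hash key instead of the object itself. A toy illustration of the idea (stand-in class, not spaCy's):

# Two equal analyses are distinct objects but share a stable key; that key
# is what score_token_attr ends up comparing.
class ToyMorph:
    def __init__(self, feats):
        self.key = hash(feats)  # MorphAnalysis.key plays this role in spaCy

a = ToyMorph("Case=Nom|Number=Sing")
b = ToyMorph("Case=Nom|Number=Sing")
assert a is not b
assert a.key == b.key
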
@@ -155,13 +155,16 @@ cdef class DependencyParser(Parser):

         DOCS: https://nightly.spacy.io/api/dependencyparser#score
         """
+        def has_sents(doc):
+            return doc.has_annotation("SENT_START")
+
         validate_examples(examples, "DependencyParser.score")
         def dep_getter(token, attr):
             dep = getattr(token, attr)
             dep = token.vocab.strings.as_string(dep).lower()
             return dep
         results = {}
-        results.update(Scorer.score_spans(examples, "sents", **kwargs))
+        results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
         kwargs.setdefault("getter", dep_getter)
         kwargs.setdefault("ignore_labels", ("p", "punct"))
         results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -10,7 +10,7 @@ from ..errors import Errors
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
-from ..scorer import Scorer
+from ..scorer import get_ner_prf
 from ..training import validate_examples

@@ -340,7 +340,7 @@ class EntityRuler(Pipe):

     def score(self, examples, **kwargs):
         validate_examples(examples, "EntityRuler.score")
-        return Scorer.score_spans(examples, "ents", **kwargs)
+        return get_ner_prf(examples)

     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
@@ -251,10 +251,13 @@ class Morphologizer(Tagger):

         DOCS: https://nightly.spacy.io/api/morphologizer#score
         """
+        def morph_key_getter(token, attr):
+            return getattr(token, attr).key
+
         validate_examples(examples, "Morphologizer.score")
         results = {}
         results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-        results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+        results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
         results.update(Scorer.score_token_attr_per_feat(examples,
-            "morph", **kwargs))
+            "morph", getter=morph_key_getter, **kwargs))
         return results
@@ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser):
         DOCS: https://nightly.spacy.io/api/entityrecognizer#score
         """
         validate_examples(examples, "EntityRecognizer.score")
-        score_per_type = get_ner_prf(examples)
-        totals = PRFScore()
-        for prf in score_per_type.values():
-            totals += prf
-        return {
-            "ents_p": totals.precision,
-            "ents_r": totals.recall,
-            "ents_f": totals.fscore,
-            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-        }
+        return get_ner_prf(examples)
@@ -155,8 +155,11 @@ class Sentencizer(Pipe):

         DOCS: https://nightly.spacy.io/api/sentencizer#score
         """
+        def has_sents(doc):
+            return doc.has_annotation("SENT_START")
+
         validate_examples(examples, "Sentencizer.score")
-        results = Scorer.score_spans(examples, "sents", **kwargs)
+        results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
         del results["sents_per_type"]
         return results

@@ -160,7 +160,10 @@ class SentenceRecognizer(Tagger):
         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#score
         """
+        def has_sents(doc):
+            return doc.has_annotation("SENT_START")
+
         validate_examples(examples, "SentenceRecognizer.score")
-        results = Scorer.score_spans(examples, "sents", **kwargs)
+        results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
         del results["sents_per_type"]
         return results
@@ -16,15 +16,30 @@ from ..vocab import Vocab

 default_model_config = """
 [model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
+@architectures = "spacy.TextCatEnsemble.v2"
+
+[model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
 width = 64
-conv_depth = 2
-embed_size = 2000
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${model.tok2vec.embed.width}
 window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
 ngram_size = 1
-dropout = null
+no_output_layer = false
 """
 DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]

@@ -60,9 +75,11 @@ subword_features = true
     default_score_weights={
         "cats_score": 1.0,
         "cats_score_desc": None,
-        "cats_p": None,
-        "cats_r": None,
-        "cats_f": None,
+        "cats_micro_p": None,
+        "cats_micro_r": None,
+        "cats_micro_f": None,
+        "cats_macro_p": None,
+        "cats_macro_r": None,
         "cats_macro_f": None,
         "cats_macro_auc": None,
         "cats_f_per_type": None,
spacy/scorer.py (287 lines changed)
@@ -1,9 +1,9 @@
-from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
+from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING
 import numpy as np
 from collections import defaultdict

 from .training import Example
-from .tokens import Token, Doc, Span
+from .tokens import Token, Doc, Span, MorphAnalysis
 from .errors import Errors
 from .util import get_lang_class, SimpleFrozenList
 from .morphology import Morphology
@@ -13,7 +13,8 @@ if TYPE_CHECKING:
     from .language import Language  # noqa: F401


-DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
+DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
+MISSING_VALUES = frozenset([None, 0, ""])


 class PRFScore:
@@ -24,6 +25,9 @@ class PRFScore:
         self.fp = 0
         self.fn = 0

+    def __len__(self) -> int:
+        return self.tp + self.fp + self.fn
+
     def __iadd__(self, other):
         self.tp += other.tp
         self.fp += other.fp
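
Note on PRFScore.__len__ above: tp + fp + fn == 0 means the score object never saw any data, which the scorers below use to return None instead of a misleading 0.0. A minimal sketch of the convention (toy class, not spaCy's):

class ToyPRFScore:
    def __init__(self):
        self.tp = self.fp = self.fn = 0

    def __len__(self):
        # Zero means "no gold or predicted items ever counted".
        return self.tp + self.fp + self.fn

score = ToyPRFScore()
result = score.tp / (score.tp + score.fp) if len(score) > 0 else None
assert result is None  # unevaluated component reports None, not 0.0
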
@@ -59,7 +63,9 @@ class PRFScore:


 class ROCAUCScore:
-    """An AUC ROC score."""
+    """An AUC ROC score. This is only defined for binary classification.
+    Use the method is_binary before calculating the score, otherwise it
+    may throw an error."""

     def __init__(self) -> None:
         self.golds = []
@@ -71,16 +77,16 @@ class ROCAUCScore:
         self.cands.append(cand)
         self.golds.append(gold)

+    def is_binary(self):
+        return len(np.unique(self.golds)) == 2
+
     @property
     def score(self):
+        if not self.is_binary():
+            raise ValueError(Errors.E165.format(label=set(self.golds)))
         if len(self.golds) == self.saved_score_at_len:
             return self.saved_score
-        try:
-            self.saved_score = _roc_auc_score(self.golds, self.cands)
-        # catch ValueError: Only one class present in y_true.
-        # ROC AUC score is not defined in that case.
-        except ValueError:
-            self.saved_score = -float("inf")
+        self.saved_score = _roc_auc_score(self.golds, self.cands)
         self.saved_score_at_len = len(self.golds)
         return self.saved_score

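
Note on the ROCAUCScore change above: instead of silently caching -inf when only one class is present, callers must now check is_binary() first, and score raises E165 otherwise. A sketch of the check itself:

import numpy as np

golds = [1, 1, 1]                  # only one class in the gold labels
assert len(np.unique(golds)) != 2  # is_binary() would be False here,
                                   # so .score would now raise E165
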
@@ -92,7 +98,7 @@ class Scorer:
         self,
         nlp: Optional["Language"] = None,
         default_lang: str = "xx",
-        default_pipeline=DEFAULT_PIPELINE,
+        default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
         """Initialize the Scorer.
@@ -124,13 +130,13 @@ class Scorer:
         return scores

     @staticmethod
-    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
+    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
         """Returns accuracy and PRF scores for tokenization.
         * token_acc: # correct tokens / # gold tokens
         * token_p/r/f: PRF for token character spans

         examples (Iterable[Example]): Examples to score
-        RETURNS (Dict[str, float]): A dictionary containing the scores
+        RETURNS (Dict[str, Any]): A dictionary containing the scores
             token_acc/p/r/f.

         DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
@@ -140,6 +146,8 @@ class Scorer:
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
+            if gold_doc.has_unknown_spaces:
+                continue
             align = example.alignment
             gold_spans = set()
             pred_spans = set()
@@ -156,12 +164,20 @@ class Scorer:
             else:
                 acc_score.tp += 1
             prf_score.score_set(pred_spans, gold_spans)
-        return {
-            "token_acc": acc_score.fscore,
-            "token_p": prf_score.precision,
-            "token_r": prf_score.recall,
-            "token_f": prf_score.fscore,
-        }
+        if len(acc_score) > 0:
+            return {
+                "token_acc": acc_score.fscore,
+                "token_p": prf_score.precision,
+                "token_r": prf_score.recall,
+                "token_f": prf_score.fscore,
+            }
+        else:
+            return {
+                "token_acc": None,
+                "token_p": None,
+                "token_r": None,
+                "token_f": None
+            }

     @staticmethod
     def score_token_attr(
|
@ -169,8 +185,9 @@ class Scorer:
|
||||||
attr: str,
|
attr: str,
|
||||||
*,
|
*,
|
||||||
getter: Callable[[Token, str], Any] = getattr,
|
getter: Callable[[Token, str], Any] = getattr,
|
||||||
|
missing_values: Set[Any] = MISSING_VALUES,
|
||||||
**cfg,
|
**cfg,
|
||||||
) -> Dict[str, float]:
|
) -> Dict[str, Any]:
|
||||||
"""Returns an accuracy score for a token-level attribute.
|
"""Returns an accuracy score for a token-level attribute.
|
||||||
|
|
||||||
examples (Iterable[Example]): Examples to score
|
examples (Iterable[Example]): Examples to score
|
||||||
|
@@ -178,7 +195,7 @@ class Scorer:
         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
+        RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
             under the key attr_acc.

         DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
@@ -189,17 +206,27 @@ class Scorer:
             pred_doc = example.predicted
             align = example.alignment
             gold_tags = set()
+            missing_indices = set()
             for gold_i, token in enumerate(gold_doc):
-                gold_tags.add((gold_i, getter(token, attr)))
+                value = getter(token, attr)
+                if value not in missing_values:
+                    gold_tags.add((gold_i, getter(token, attr)))
+                else:
+                    missing_indices.add(gold_i)
             pred_tags = set()
             for token in pred_doc:
                 if token.orth_.isspace():
                     continue
                 if align.x2y.lengths[token.i] == 1:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                    pred_tags.add((gold_i, getter(token, attr)))
+                    if gold_i not in missing_indices:
+                        pred_tags.add((gold_i, getter(token, attr)))
             tag_score.score_set(pred_tags, gold_tags)
-        return {f"{attr}_acc": tag_score.fscore}
+        score_key = f"{attr}_acc"
+        if len(tag_score) == 0:
+            return {score_key: None}
+        else:
+            return {score_key: tag_score.fscore}

     @staticmethod
     def score_token_attr_per_feat(
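
Note on the missing_values parameter above: gold tokens whose attribute is None, 0 or "" are treated as unannotated; they are dropped from the gold set, and their aligned predictions are dropped too, so partial annotation no longer counts against the model. A toy sketch of the filtering:

MISSING_VALUES = frozenset([None, 0, ""])

# (token index, gold tag); token 1 carries no annotation.
gold = [(0, "DET"), (1, ""), (2, "NOUN")]
missing_indices = {i for i, tag in gold if tag in MISSING_VALUES}
gold_tags = {(i, tag) for i, tag in gold if tag not in MISSING_VALUES}

pred = [(0, "DET"), (1, "VERB"), (2, "NOUN")]
pred_tags = {(i, tag) for i, tag in pred if i not in missing_indices}

assert gold_tags == pred_tags  # token 1 is ignored on both sides
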
@@ -207,8 +234,9 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Token, str], Any] = getattr,
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
-    ):
+    ) -> Dict[str, Any]:
         """Return PRF scores per feat for a token attribute in UFEATS format.

         examples (Iterable[Example]): Examples to score
@@ -216,7 +244,7 @@ class Scorer:
         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
             getter(token, attr) should return the value of the attribute for an
             individual token.
-        RETURNS (dict): A dictionary containing the per-feat PRF scores unders
+        RETURNS (dict): A dictionary containing the per-feat PRF scores under
             the key attr_per_feat.
         """
         per_feat = {}
@@ -225,9 +253,11 @@ class Scorer:
             gold_doc = example.reference
             align = example.alignment
             gold_per_feat = {}
+            missing_indices = set()
             for gold_i, token in enumerate(gold_doc):
-                morph = str(getter(token, attr))
-                if morph:
+                value = getter(token, attr)
+                morph = gold_doc.vocab.strings[value]
+                if value not in missing_values and morph != Morphology.EMPTY_MORPH:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
@@ -235,27 +265,35 @@ class Scorer:
                         if field not in gold_per_feat:
                             gold_per_feat[field] = set()
                         gold_per_feat[field].add((gold_i, feat))
+                else:
+                    missing_indices.add(gold_i)
             pred_per_feat = {}
             for token in pred_doc:
                 if token.orth_.isspace():
                     continue
                 if align.x2y.lengths[token.i] == 1:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                    morph = str(getter(token, attr))
-                    if morph:
-                        for feat in morph.split("|"):
-                            field, values = feat.split("=")
-                            if field not in per_feat:
-                                per_feat[field] = PRFScore()
-                            if field not in pred_per_feat:
-                                pred_per_feat[field] = set()
-                            pred_per_feat[field].add((gold_i, feat))
+                    if gold_i not in missing_indices:
+                        value = getter(token, attr)
+                        morph = gold_doc.vocab.strings[value]
+                        if value not in missing_values and morph != Morphology.EMPTY_MORPH:
+                            for feat in morph.split(Morphology.FEATURE_SEP):
+                                field, values = feat.split(Morphology.FIELD_SEP)
+                                if field not in per_feat:
+                                    per_feat[field] = PRFScore()
+                                if field not in pred_per_feat:
+                                    pred_per_feat[field] = set()
+                                pred_per_feat[field].add((gold_i, feat))
         for field in per_feat:
             per_feat[field].score_set(
                 pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
             )
-        result = {k: v.to_dict() for k, v in per_feat.items()}
-        return {f"{attr}_per_feat": result}
+        score_key = f"{attr}_per_feat"
+        if any([len(v) for v in per_feat.values()]):
+            result = {k: v.to_dict() for k, v in per_feat.items()}
+            return {score_key: result}
+        else:
+            return {score_key: None}

     @staticmethod
     def score_spans(
@@ -263,6 +301,7 @@ class Scorer:
         attr: str,
         *,
         getter: Callable[[Doc, str], Iterable[Span]] = getattr,
+        has_annotation: Optional[Callable[[Doc], bool]] = None,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns PRF scores for labeled spans.
@@ -282,18 +321,10 @@ class Scorer:
         for example in examples:
             pred_doc = example.predicted
             gold_doc = example.reference
-            # TODO
-            # This is a temporary hack to work around the problem that the scorer
-            # fails if you have examples that are not fully annotated for all
-            # the tasks in your pipeline. For instance, you might have a corpus
-            # of NER annotations that does not set sentence boundaries, but the
-            # pipeline includes a parser or senter, and then the score_weights
-            # are used to evaluate that component. When the scorer attempts
-            # to read the sentences from the gold document, it fails.
-            try:
-                list(getter(gold_doc, attr))
-            except ValueError:
-                continue
+            # Option to handle docs without sents
+            if has_annotation is not None:
+                if not has_annotation(gold_doc):
+                    continue
             # Find all labels in gold and doc
             labels = set(
                 [k.label_ for k in getter(gold_doc, attr)]
@@ -321,13 +352,21 @@ class Scorer:
             v.score_set(pred_per_type[k], gold_per_type[k])
         # Score for all labels
         score.score_set(pred_spans, gold_spans)
-        results = {
-            f"{attr}_p": score.precision,
-            f"{attr}_r": score.recall,
-            f"{attr}_f": score.fscore,
-            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-        }
-        return results
+        if len(score) > 0:
+            return {
+                f"{attr}_p": score.precision,
+                f"{attr}_r": score.recall,
+                f"{attr}_f": score.fscore,
+                f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+            }
+        else:
+            return {
+                f"{attr}_p": None,
+                f"{attr}_r": None,
+                f"{attr}_f": None,
+                f"{attr}_per_type": None,
+            }
+

     @staticmethod
     def score_cats(
|
@ -362,9 +401,13 @@ class Scorer:
|
||||||
for all:
|
for all:
|
||||||
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
|
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
|
||||||
attr_score_desc (text description of the overall score),
|
attr_score_desc (text description of the overall score),
|
||||||
|
attr_micro_p,
|
||||||
|
attr_micro_r,
|
||||||
attr_micro_f,
|
attr_micro_f,
|
||||||
|
attr_macro_p,
|
||||||
|
attr_macro_r,
|
||||||
attr_macro_f,
|
attr_macro_f,
|
||||||
attr_auc,
|
attr_macro_auc,
|
||||||
attr_f_per_type,
|
attr_f_per_type,
|
||||||
attr_auc_per_type
|
attr_auc_per_type
|
||||||
|
|
||||||
|
@@ -384,9 +427,6 @@ class Scorer:
             pred_cats = getter(example.predicted, attr)
             gold_cats = getter(example.reference, attr)

-            # I think the AUC metric is applicable regardless of whether we're
-            # doing multi-label classification? Unsure. If not, move this into
-            # the elif pred_cats and gold_cats block below.
             for label in labels:
                 pred_score = pred_cats.get(label, 0.0)
                 gold_score = gold_cats.get(label, 0.0)
@@ -431,7 +471,9 @@ class Scorer:
         macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
         macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
         macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
-        macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
+        # Limit macro_auc to those labels with gold annotations,
+        # but still divide by all cats to avoid artificial boosting of datasets with missing labels
+        macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
         results = {
             f"{attr}_score": None,
             f"{attr}_score_desc": None,
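
Note on the AUC changes above: per-type AUC is None for labels whose gold data is not binary, and the macro average counts those labels as 0.0 while keeping them in the denominator. Toy numbers:

auc_per_type = {"POS": 0.9, "NEG": 0.8, "RARE": None}  # RARE: one gold class
n_cats = len(auc_per_type)
macro_auc = sum(s if s is not None else 0.0 for s in auc_per_type.values()) / n_cats
assert abs(macro_auc - (0.9 + 0.8) / 3) < 1e-9  # missing label lowers the average
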
@@ -443,7 +485,7 @@ class Scorer:
             f"{attr}_macro_f": macro_f,
             f"{attr}_macro_auc": macro_auc,
             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+            f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
             positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
@@ -534,6 +576,7 @@ class Scorer:
         head_attr: str = "head",
         head_getter: Callable[[Token, str], Token] = getattr,
         ignore_labels: Iterable[str] = SimpleFrozenList(),
+        missing_values: Set[Any] = MISSING_VALUES,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns the UAS, LAS, and LAS per type scores for dependency
@@ -558,6 +601,7 @@ class Scorer:
         unlabelled = PRFScore()
         labelled = PRFScore()
         labelled_per_dep = dict()
+        missing_indices = set()
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -567,13 +611,16 @@ class Scorer:
             for gold_i, token in enumerate(gold_doc):
                 dep = getter(token, attr)
                 head = head_getter(token, head_attr)
-                if dep not in ignore_labels:
-                    gold_deps.add((gold_i, head.i, dep))
-                    if dep not in labelled_per_dep:
-                        labelled_per_dep[dep] = PRFScore()
-                    if dep not in gold_deps_per_dep:
-                        gold_deps_per_dep[dep] = set()
-                    gold_deps_per_dep[dep].add((gold_i, head.i, dep))
+                if dep not in missing_values:
+                    if dep not in ignore_labels:
+                        gold_deps.add((gold_i, head.i, dep))
+                        if dep not in labelled_per_dep:
+                            labelled_per_dep[dep] = PRFScore()
+                        if dep not in gold_deps_per_dep:
+                            gold_deps_per_dep[dep] = set()
+                        gold_deps_per_dep[dep].add((gold_i, head.i, dep))
+                else:
+                    missing_indices.add(gold_i)
             pred_deps = set()
             pred_deps_per_dep = {}
             for token in pred_doc:
@@ -583,25 +630,26 @@ class Scorer:
                     gold_i = None
                 else:
                     gold_i = align.x2y[token.i].dataXd[0, 0]
-                dep = getter(token, attr)
-                head = head_getter(token, head_attr)
-                if dep not in ignore_labels and token.orth_.strip():
-                    if align.x2y.lengths[head.i] == 1:
-                        gold_head = align.x2y[head.i].dataXd[0, 0]
-                    else:
-                        gold_head = None
-                    # None is indistinct, so we can't just add it to the set
-                    # Multiple (None, None) deps are possible
-                    if gold_i is None or gold_head is None:
-                        unlabelled.fp += 1
-                        labelled.fp += 1
-                    else:
-                        pred_deps.add((gold_i, gold_head, dep))
-                        if dep not in labelled_per_dep:
-                            labelled_per_dep[dep] = PRFScore()
-                        if dep not in pred_deps_per_dep:
-                            pred_deps_per_dep[dep] = set()
-                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
+                if gold_i not in missing_indices:
+                    dep = getter(token, attr)
+                    head = head_getter(token, head_attr)
+                    if dep not in ignore_labels and token.orth_.strip():
+                        if align.x2y.lengths[head.i] == 1:
+                            gold_head = align.x2y[head.i].dataXd[0, 0]
+                        else:
+                            gold_head = None
+                        # None is indistinct, so we can't just add it to the set
+                        # Multiple (None, None) deps are possible
+                        if gold_i is None or gold_head is None:
+                            unlabelled.fp += 1
+                            labelled.fp += 1
+                        else:
+                            pred_deps.add((gold_i, gold_head, dep))
+                            if dep not in labelled_per_dep:
+                                labelled_per_dep[dep] = PRFScore()
+                            if dep not in pred_deps_per_dep:
+                                pred_deps_per_dep[dep] = set()
+                            pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
         labelled.score_set(pred_deps, gold_deps)
         for dep in labelled_per_dep:
             labelled_per_dep[dep].score_set(
@@ -610,29 +658,34 @@ class Scorer:
         unlabelled.score_set(
             set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
         )
-        return {
-            f"{attr}_uas": unlabelled.fscore,
-            f"{attr}_las": labelled.fscore,
-            f"{attr}_las_per_type": {
-                k: v.to_dict() for k, v in labelled_per_dep.items()
-            },
-        }
+        if len(unlabelled) > 0:
+            return {
+                f"{attr}_uas": unlabelled.fscore,
+                f"{attr}_las": labelled.fscore,
+                f"{attr}_las_per_type": {
+                    k: v.to_dict() for k, v in labelled_per_dep.items()
+                },
+            }
+        else:
+            return {
+                f"{attr}_uas": None,
+                f"{attr}_las": None,
+                f"{attr}_las_per_type": None,
+            }


-def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
-    """Compute per-entity PRFScore objects for a sequence of examples. The
-    results are returned as a dictionary keyed by the entity type. You can
-    add the PRFScore objects to get micro-averaged total.
+def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
+    """Compute micro-PRF and per-entity PRF scores for a sequence of examples.
     """
-    scores = defaultdict(PRFScore)
+    score_per_type = defaultdict(PRFScore)
     for eg in examples:
         if not eg.y.has_annotation("ENT_IOB"):
             continue
         golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
         align_x2y = eg.alignment.x2y
         for pred_ent in eg.x.ents:
-            if pred_ent.label_ not in scores:
-                scores[pred_ent.label_] = PRFScore()
+            if pred_ent.label_ not in score_per_type:
+                score_per_type[pred_ent.label_] = PRFScore()
             indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
             if len(indices):
                 g_span = eg.y[indices[0] : indices[-1] + 1]
@ -642,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
|
||||||
if all(token.ent_iob != 0 for token in g_span):
|
if all(token.ent_iob != 0 for token in g_span):
|
||||||
key = (pred_ent.label_, indices[0], indices[-1] + 1)
|
key = (pred_ent.label_, indices[0], indices[-1] + 1)
|
||||||
if key in golds:
|
if key in golds:
|
||||||
scores[pred_ent.label_].tp += 1
|
score_per_type[pred_ent.label_].tp += 1
|
||||||
golds.remove(key)
|
golds.remove(key)
|
||||||
else:
|
else:
|
||||||
scores[pred_ent.label_].fp += 1
|
score_per_type[pred_ent.label_].fp += 1
|
||||||
for label, start, end in golds:
|
for label, start, end in golds:
|
||||||
scores[label].fn += 1
|
score_per_type[label].fn += 1
|
||||||
return scores
|
totals = PRFScore()
|
||||||
|
for prf in score_per_type.values():
|
||||||
|
totals += prf
|
||||||
|
if len(totals) > 0:
|
||||||
|
return {
|
||||||
|
"ents_p": totals.precision,
|
||||||
|
"ents_r": totals.recall,
|
||||||
|
"ents_f": totals.fscore,
|
||||||
|
"ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
return {
|
||||||
|
"ents_p": None,
|
||||||
|
"ents_r": None,
|
||||||
|
"ents_f": None,
|
||||||
|
"ents_per_type": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
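The micro-averaged totals above are just summed counts scored once. A minimal standalone sketch of that logic, with a hypothetical `PRF` class standing in for `spacy.scorer.PRFScore` and made-up counts:

```python
from dataclasses import dataclass

@dataclass
class PRF:
    tp: int = 0
    fp: int = 0
    fn: int = 0

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def fscore(self) -> float:
        p, r = self.precision, self.recall
        return 2 * p * r / (p + r + 1e-100)

per_type = {"PERSON": PRF(tp=10, fp=2, fn=3), "ORG": PRF(tp=4, fp=5, fn=1)}
# micro average: pool all tp/fp/fn across entity types, then score once
totals = PRF()
for prf in per_type.values():
    totals.tp += prf.tp
    totals.fp += prf.fp
    totals.fn += prf.fn
print(totals.precision, totals.recall, totals.fscore)
```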
 #############################################################################
@@ -726,7 +795,7 @@ def _roc_auc_score(y_true, y_score):
     <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
     """
     if len(np.unique(y_true)) != 2:
-        raise ValueError(Errors.E165)
+        raise ValueError(Errors.E165.format(label=np.unique(y_true)))
     fpr, tpr, _ = _roc_curve(y_true, y_score)
     return _auc(fpr, tpr)
@@ -218,11 +218,16 @@ def test_dependency_matcher_callback(en_vocab, doc):
     pattern = [
         {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}},
     ]
+    nomatch_pattern = [
+        {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}},
+    ]
     matcher = DependencyMatcher(en_vocab)
     mock = Mock()
     matcher.add("pattern", [pattern], on_match=mock)
+    matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock)
     matches = matcher(doc)
+    assert len(matches) == 1
     mock.assert_called_once_with(matcher, doc, 0, matches)

     # check that matches with and without callback are the same (#4590)
@@ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts):
     scores = nlp.evaluate(dev_examples)
     # "cat" is the only correct lemma
     assert scores["lemma_acc"] == pytest.approx(0.2)
-    # the empty morphs are correct
-    assert scores["morph_acc"] == pytest.approx(0.6)
+    # no morphs are set
+    assert scores["morph_acc"] == None


 def test_attributeruler_rule_order(nlp):
@@ -2,6 +2,7 @@ import pytest
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
 from spacy.util import registry, SimpleFrozenDict, combine_score_weights
 from thinc.api import Model, Linear, ConfigValidationError
@@ -156,15 +157,10 @@ def test_pipe_class_component_model():
     name = "test_class_component_model"
     default_config = {
         "model": {
-            "@architectures": "spacy.TextCatEnsemble.v1",
-            "exclusive_classes": False,
-            "pretrained_vectors": None,
-            "width": 64,
-            "embed_size": 2000,
-            "window_size": 1,
-            "conv_depth": 2,
-            "ngram_size": 1,
-            "dropout": None,
+            "@architectures": "spacy.TextCatEnsemble.v2",
+            "tok2vec": DEFAULT_TOK2VEC_MODEL,
+            "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
+                             "no_output_layer": False},
         },
         "value1": 10,
     }
|
@ -140,7 +140,7 @@ def test_overfitting_IO():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
||||||
# Set exclusive labels
|
# Set exclusive labels
|
||||||
config = {"model": {"exclusive_classes": True}}
|
config = {"model": {"linear_model": {"exclusive_classes": True}}}
|
||||||
textcat = nlp.add_pipe("textcat", config=config)
|
textcat = nlp.add_pipe("textcat", config=config)
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for text, annotations in TRAIN_DATA:
|
for text, annotations in TRAIN_DATA:
|
||||||
|
@ -192,9 +192,8 @@ def test_overfitting_IO():
|
||||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
|
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
|
||||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
|
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
|
||||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
|
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
|
||||||
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
|
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
|
||||||
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
|
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
|
||||||
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
|
|
||||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
|
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
|
||||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
|
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
|
||||||
],
|
],
|
||||||
|
|
|
@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate
|
||||||
from numpy.testing import assert_array_equal
|
from numpy.testing import assert_array_equal
|
||||||
import numpy
|
import numpy
|
||||||
from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
|
from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
|
||||||
from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
|
from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier
|
||||||
from spacy.ml.staticvectors import StaticVectors
|
from spacy.ml.staticvectors import StaticVectors
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.en.examples import sentences as EN_SENTENCES
|
from spacy.lang.en.examples import sentences as EN_SENTENCES
|
||||||
|
|
||||||
|
|
||||||
def get_textcat_kwargs():
|
def get_textcat_bow_kwargs():
|
||||||
return {
|
return {
|
||||||
"width": 64,
|
"exclusive_classes": True,
|
||||||
"embed_size": 2000,
|
|
||||||
"pretrained_vectors": None,
|
|
||||||
"exclusive_classes": False,
|
|
||||||
"ngram_size": 1,
|
"ngram_size": 1,
|
||||||
"window_size": 1,
|
"no_output_layer": False,
|
||||||
"conv_depth": 2,
|
"nO": 34,
|
||||||
"dropout": None,
|
|
||||||
"nO": 7,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_textcat_cnn_kwargs():
|
def get_textcat_cnn_kwargs():
|
||||||
return {
|
return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
|
||||||
"tok2vec": test_tok2vec(),
|
|
||||||
"exclusive_classes": False,
|
|
||||||
"nO": 13,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_params(model):
|
def get_all_params(model):
|
||||||
|
@ -105,7 +96,7 @@ def test_multi_hash_embed():
|
||||||
"seed,model_func,kwargs",
|
"seed,model_func,kwargs",
|
||||||
[
|
[
|
||||||
(0, build_Tok2Vec_model, get_tok2vec_kwargs()),
|
(0, build_Tok2Vec_model, get_tok2vec_kwargs()),
|
||||||
(0, build_text_classifier, get_textcat_kwargs()),
|
(0, build_bow_text_classifier, get_textcat_bow_kwargs()),
|
||||||
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
|
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs):
|
||||||
"seed,model_func,kwargs,get_X",
|
"seed,model_func,kwargs,get_X",
|
||||||
[
|
[
|
||||||
(0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
(0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
||||||
(0, build_text_classifier, get_textcat_kwargs(), get_docs),
|
(0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
|
||||||
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X):
|
||||||
"seed,dropout,model_func,kwargs,get_X",
|
"seed,dropout,model_func,kwargs,get_X",
|
||||||
[
|
[
|
||||||
(0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
(0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
||||||
(0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs),
|
(0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
|
||||||
(0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
(0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
@@ -277,6 +277,62 @@ def test_tag_score(tagged_doc):
     assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)


+def test_partial_annotation(en_tokenizer):
+    pred_doc = en_tokenizer("a b c d e")
+    pred_doc[0].tag_ = "A"
+    pred_doc[0].pos_ = "X"
+    pred_doc[0].set_morph("Feat=Val")
+    pred_doc[0].dep_ = "dep"
+
+    # unannotated reference
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    for key in scores:
+        # cats doesn't have an unset state
+        if key.startswith("cats"):
+            continue
+        assert scores[key] == None
+
+    # partially annotated reference, not overlapping with predicted annotation
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    ref_doc[1].tag_ = "A"
+    ref_doc[1].pos_ = "X"
+    ref_doc[1].set_morph("Feat=Val")
+    ref_doc[1].dep_ = "dep"
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    assert scores["token_acc"] == None
+    assert scores["tag_acc"] == 0.0
+    assert scores["pos_acc"] == 0.0
+    assert scores["morph_acc"] == 0.0
+    assert scores["dep_uas"] == 1.0
+    assert scores["dep_las"] == 0.0
+    assert scores["sents_f"] == None
+
+    # partially annotated reference, overlapping with predicted annotation
+    ref_doc = en_tokenizer("a b c d e")
+    ref_doc.has_unknown_spaces = True
+    ref_doc[0].tag_ = "A"
+    ref_doc[0].pos_ = "X"
+    ref_doc[1].set_morph("Feat=Val")
+    ref_doc[1].dep_ = "dep"
+    example = Example(pred_doc, ref_doc)
+    scorer = Scorer()
+    scores = scorer.score([example])
+    assert scores["token_acc"] == None
+    assert scores["tag_acc"] == 1.0
+    assert scores["pos_acc"] == 1.0
+    assert scores["morph_acc"] == 0.0
+    assert scores["dep_uas"] == 1.0
+    assert scores["dep_las"] == 0.0
+    assert scores["sents_f"] == None
+
+
 def test_roc_auc_score():
     # Binary classification, toy tests from scikit-learn test suite
     y_true = [0, 1]
@@ -334,7 +390,8 @@ def test_roc_auc_score():
     score = ROCAUCScore()
     score.score_set(0.25, 0)
     score.score_set(0.75, 0)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score

     y_true = [1, 1]
     y_score = [0.25, 0.75]
@@ -344,4 +401,5 @@ def test_roc_auc_score():
     score = ROCAUCScore()
     score.score_set(0.25, 1)
     score.score_set(0.75, 1)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score
@@ -51,7 +51,7 @@ def test_readers():
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
     scores = nlp.evaluate(list(dev_corpus(nlp)))
-    assert scores["cats_score"]
+    assert scores["cats_score"] == 0.0
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
@@ -2,6 +2,7 @@ import numpy
 from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
 from spacy.training import biluo_tags_to_spans, iob_to_biluo
 from spacy.training import Corpus, docs_to_json, Example
+from spacy.training.align import get_alignments
 from spacy.training.converters import json_to_docs
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
@@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc):
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]


-@pytest.mark.skip("Outdated")
 @pytest.mark.parametrize(
     "tokens_a,tokens_b,expected",
     [
-        (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
+        (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])),
         (
             ["a", "b", '"', "c"],
             ['ab"', "c"],
-            (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
+            ([[0], [0], [0], [1]], [[0, 1, 2], [3]]),
         ),
-        (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})),
+        (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])),
         (
             ["ab", "c", "d"],
             ["a", "b", "cd"],
-            (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}),
+            ([[0, 1], [2], [2]], [[0], [0], [1, 2]]),
         ),
        (
             ["a", "b", "cd"],
             ["a", "b", "c", "d"],
-            (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
+            ([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
         ),
-        ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
+        ([" ", "a"], ["a"], ([[], [0]], [[1]])),
     ],
 )
 def test_align(tokens_a, tokens_b, expected):  # noqa
-    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)  # noqa
-    assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected  # noqa
+    a2b, b2a = get_alignments(tokens_a, tokens_b)
+    assert (a2b, b2a) == expected  # noqa
     # check symmetry
-    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)  # noqa
-    assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected  # noqa
+    a2b, b2a = get_alignments(tokens_b, tokens_a)  # noqa
+    assert (b2a, a2b) == expected  # noqa


 def test_goldparse_startswith_space(en_tokenizer):
@@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer):
     assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]


+def test_goldparse_endswith_space(en_tokenizer):
+    text = "a\n"
+    doc = en_tokenizer(text)
+    gold_words = ["a"]
+    entities = ["U-DATE"]
+    deps = ["ROOT"]
+    heads = [0]
+    example = Example.from_dict(
+        doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
+    )
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == ["U-DATE", "O"]
+    assert example.get_aligned("DEP", as_string=True) == ["ROOT", None]
+
+
 def test_gold_constructor():
     """Test that the Example constructor works fine"""
     nlp = English()
@@ -676,6 +691,87 @@ def test_alignment_different_texts():
         Alignment.from_strings(other_tokens, spacy_tokens)


+def test_alignment_spaces(en_vocab):
+    # single leading whitespace
+    other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]
+
+    # multiple leading whitespace tokens
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]
+
+    # both with leading whitespace, not identical
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7]
+
+    # same leading whitespace, different tokenization
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["  ", "i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7]
+
+    # only one with trailing whitespace
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
+
+    # different trailing whitespace
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6]
+
+    # same trailing whitespace, different tokenization
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", "  "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7]
+
+    # differing whitespace is allowed
+    other_tokens = ["a", " \n ", "b", "c"]
+    spacy_tokens = ["a", "b", " ", "c"]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.dataXd) == [0, 1, 3]
+    assert list(align.y2x.dataXd) == [0, 2, 3]
+
+    # other differences in whitespace are allowed
+    other_tokens = [" ", "a"]
+    spacy_tokens = [" ", "a", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+
+    other_tokens = ["a", " "]
+    spacy_tokens = ["a", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+
+
 def test_retokenized_docs(doc):
     a = doc.to_array(["TAG"])
     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
@@ -399,14 +399,13 @@ cdef class Doc:
             return True
         cdef int i
         cdef int range_start = 0
+        if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
+            attr = SENT_START
         attr = intify_attr(attr)
         # adjust attributes
         if attr == HEAD:
             # HEAD does not have an unset state, so rely on DEP
             attr = DEP
-        elif attr == self.vocab.strings["IS_SENT_START"]:
-            # as in Matcher, allow IS_SENT_START as an alias of SENT_START
-            attr = SENT_START
         # special cases for sentence boundaries
         if attr == SENT_START:
            if "sents" in self.user_hooks:
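With the alias resolved before `intify_attr`, both spellings go through the same path. A quick hedged check (the exact return value depends on whether sentence boundaries are set in the pipeline):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("One sentence. Another sentence.")
# IS_SENT_START is accepted as an alias of SENT_START, as in the Matcher
print(doc.has_annotation("SENT_START") == doc.has_annotation("IS_SENT_START"))
```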
@@ -1,6 +1,6 @@
 from .corpus import Corpus  # noqa: F401
 from .example import Example, validate_examples, validate_get_examples  # noqa: F401
-from .align import Alignment  # noqa: F401
+from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
spacy/training/align.pyx (new file, 66 lines)
@@ -0,0 +1,66 @@
+from typing import List, Tuple
+from itertools import chain
+import re
+
+from ..errors import Errors
+
+
+def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
+    # Create character-to-token mappings
+    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
+    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
+    str_a = "".join(A).lower()
+    str_b = "".join(B).lower()
+    cdef int len_str_a = len(str_a)
+    cdef int len_str_b = len(str_b)
+    # Check that the two texts only differ in whitespace and capitalization
+    if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
+            len_str_a != len(char_to_token_a) or \
+            len_str_b != len(char_to_token_b):
+        raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
+    cdef int char_idx_a = 0
+    cdef int char_idx_b = 0
+    cdef int token_idx_a = 0
+    cdef int token_idx_b = 0
+    cdef int prev_token_idx_a = -1
+    cdef int prev_token_idx_b = -1
+    a2b = []
+    b2a = []
+    while char_idx_a < len_str_a and char_idx_b < len_str_b:
+        # Find the current token position from the character position
+        token_idx_a = char_to_token_a[char_idx_a]
+        token_idx_b = char_to_token_b[char_idx_b]
+        # Add a set for the next token if a token boundary has been crossed
+        if prev_token_idx_a != token_idx_a:
+            a2b.append(set())
+        if prev_token_idx_b != token_idx_b:
+            b2a.append(set())
+        # Process the alignment at the current position
+        if A[token_idx_a] == B[token_idx_b]:
+            # Current tokens are identical
+            a2b[-1].add(token_idx_b)
+            b2a[-1].add(token_idx_a)
+            char_idx_a += len(A[token_idx_a])
+            char_idx_b += len(B[token_idx_b])
+        elif str_a[char_idx_a] == str_b[char_idx_b]:
+            # Current chars are identical
+            a2b[-1].add(token_idx_b)
+            b2a[-1].add(token_idx_a)
+            char_idx_a += 1
+            char_idx_b += 1
+        elif str_a[char_idx_a].isspace():
+            # Skip unaligned whitespace char in A
+            char_idx_a += 1
+        elif str_b[char_idx_b].isspace():
+            # Skip unaligned whitespace char in B
+            char_idx_b += 1
+        else:
+            # This should never happen
+            raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
+        prev_token_idx_a = token_idx_a
+        prev_token_idx_b = token_idx_b
+    # Process unaligned trailing whitespace
+    a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:])))
+    b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:])))
+    # Return values as sorted lists per token position
+    return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
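For reference, the output format is one list of mapped token indices per input token, in both directions. A short usage example; the expected values here are taken from the test parameters in this commit:

```python
from spacy.training.align import get_alignments

# "ab" covers both "a" and "b", so a2b maps both to y token 0
a2b, b2a = get_alignments(["a", "b", "c"], ["ab", "c"])
assert a2b == [[0], [0], [1]]
assert b2a == [[0, 1], [2]]
```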
@@ -2,9 +2,8 @@ from typing import List
 import numpy
 from thinc.types import Ragged
 from dataclasses import dataclass
-import tokenizations

-from ..errors import Errors
+from .align import get_alignments


 @dataclass
@@ -20,9 +19,7 @@ class Alignment:

     @classmethod
     def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
-        if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
-            raise ValueError(Errors.E949)
-        x2y, y2x = tokenizations.get_alignments(A, B)
+        x2y, y2x = get_alignments(A, B)
         return Alignment.from_indices(x2y=x2y, y2x=y2x)
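The resulting `Alignment` stores the mappings as `Ragged` arrays: `lengths` gives the number of mapped tokens per position and `dataXd` the flattened indices. A usage example, with values matching the alignment tests in this commit:

```python
from spacy.training import Alignment

other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
# "i listened to" maps to three spaCy tokens; "'" and "s" both map to "'s"
print(list(align.x2y.lengths))  # [3, 1, 1, 1, 1, 1]
print(list(align.x2y.dataXd))   # [0, 1, 2, 3, 4, 4, 5, 5]
```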
@@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
 from ..tokens.span import Span
 from ..attrs import IDS
-from .align import Alignment
+from .alignment import Alignment
 from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
@@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
+    if not isinstance(T["train_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"])))
+    if not isinstance(T["dev_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"])))
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     optimizer = T["optimizer"]
     # Components that shouldn't be updated during training
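The check enforces that the corpora are referenced by string dot names rather than inlined. For reference, the expected shape is a registered reader under `[corpora]` plus a dot name in `[training]`, as in the default config (paths are placeholders):

```ini
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
```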
@@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 from ..errors import Errors
-from ..util import registry, load_model_from_config, dot_to_object
+from ..util import registry, load_model_from_config, resolve_dot_names


 def pretrain(
@@ -38,7 +38,7 @@ def pretrain(
     _config = nlp.config.interpolate()
     T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
     P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
-    corpus = dot_to_object(T, P["corpus"])
+    corpus = resolve_dot_names(_config, [P["corpus"]])[0]
     batcher = P["batcher"]
     model = create_pretraining_model(nlp, P)
     optimizer = P["optimizer"]
@@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline.

 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representations. The features used
+a feed-forward subnetwork to build a mixed representation. The features used
 can be configured with the `attrs` argument. The suggested attributes are
 `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
 some subword information, without constructing a fully character-based
@@ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with
 different architectures and settings to determine what works best on your
 specific data and challenge.

-### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
+### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}

 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TextCatEnsemble.v1"
-> exclusive_classes = false
-> pretrained_vectors = null
-> width = 64
-> embed_size = 2000
-> conv_depth = 2
-> window_size = 1
-> ngram_size = 1
-> dropout = null
+> @architectures = "spacy.TextCatEnsemble.v2"
 > nO = null
+>
+> [model.linear_model]
+> @architectures = "spacy.TextCatBOW.v1"
+> exclusive_classes = true
+> ngram_size = 1
+> no_output_layer = false
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v1"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v1"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v1"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
 > ```

-Stacked ensemble of a bag-of-words model and a neural network model. The neural
-network has an internal CNN Tok2Vec layer and uses attention.
+Stacked ensemble of a linear bag-of-words model and a neural network model. The
+neural network is built upon a Tok2Vec layer and uses attention. The setting that
+determines whether the model should cater for multi-label classification is
+taken from the linear model, where it is stored in `model.attrs["multi_label"]`.
+
+| Name           | Description |
+| -------------- | ----------- |
+| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~ |
+| `tok2vec`      | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `nO`           | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**    | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
+<Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
+
+The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as argument.

 | Name                 | Description |
 | -------------------- | ----------- |
@@ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention.
 | `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

+</Accordion>
+
 ### spacy.TextCatCNN.v1 {#TextCatCNN}

 > #### Example Config
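A small sketch of where that attribute comes from: resolving just the linear sub-model from its config and reading `model.attrs["multi_label"]`. This assumes, per the description above, that the BOW architecture derives the attribute from `exclusive_classes`:

```python
from thinc.api import Config
from spacy.util import registry

cfg = """
[model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
"""
linear_model = registry.resolve(Config().from_str(cfg))["model"]
print(linear_model.attrs["multi_label"])  # False: classes are exclusive
```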
@@ -683,6 +683,7 @@ The L2 norm of the document's vector representation.
 | `user_hooks`         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
 | `user_token_hooks`   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
 | `user_span_hooks`    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
+| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
 | `_`                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |

 ## Serialization fields {#serialization-fields}
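A quick illustration of the new attribute, assuming spacing defaults to unknown when the `spaces` argument is omitted from the `Doc` constructor:

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

# constructed from gold tokenization, without explicit spacing information
doc = Doc(Vocab(), words=["Hello", "world"])
print(doc.has_unknown_spaces)  # expected True, since no spaces were provided
```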
@@ -68,6 +68,8 @@ Scores the tokenization:
 - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
   character spans

+Docs with `has_unknown_spaces` are skipped during scoring.
+
 > #### Example
 >
 > ```python
@@ -81,7 +83,8 @@ Scores the tokenization:

 ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}

-Scores a single token attribute.
+Scores a single token attribute. Tokens with missing values in the reference doc
+are skipped during scoring.

 > #### Example
 >
@@ -90,20 +93,22 @@ Scores a single token attribute.
 > print(scores["pos_acc"])
 > ```

 | Name             | Description |
-| -------------- | ----------- |
+| ---------------- | ----------- |
 | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr`           | The attribute to score. ~~str~~ |
 | _keyword-only_   | |
 | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
 | **RETURNS**      | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |

 ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}

 Scores a single token attribute per feature for a token attribute in the
 Universal Dependencies
 [FEATS](https://universaldependencies.org/format.html#morphological-annotation)
-format.
+format. Tokens with missing values in the reference doc are skipped during
+scoring.

 > #### Example
 >
@@ -112,13 +117,14 @@ format.
 > print(scores["morph_per_feat"])
 > ```

 | Name             | Description |
-| -------------- | ----------- |
+| ---------------- | ----------- |
 | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr`           | The attribute to score. ~~str~~ |
 | _keyword-only_   | |
 | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
 | **RETURNS**      | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |

 ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}

@@ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans.
 > print(scores["ents_f"])
 > ```

 | Name             | Description |
-| -------------- | ----------- |
+| ---------------- | ----------- |
 | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr`           | The attribute to score. ~~str~~ |
 | _keyword-only_   | |
 | `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
+| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ |
 | **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |

 ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}

-Calculate the UAS, LAS, and LAS per type scores for dependency parses.
+Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens
+with missing values for the `attr` (typically `dep`) are skipped during scoring.

 > #### Example
 >
@@ -160,29 +168,40 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
 > print(scores["dep_uas"], scores["dep_las"])
 > ```

 | Name             | Description |
-| --------------- | ----------- |
+| ---------------- | ----------- |
 | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr`           | The attribute to score. ~~str~~ |
 | _keyword-only_   | |
 | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
 | `head_attr`      | The attribute containing the head token. ~~str~~ |
 | `head_getter`    | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
 | `ignore_labels`  | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
+| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
 | **RETURNS**      | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |

 ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}

 Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
-containing scores for each label like `Doc.cats`. The reported overall score
-depends on the scorer settings:
+containing scores for each label like `Doc.cats`. The returned dictionary
+contains the following scores:

-1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
-   `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
-   score), `{attr}_f_per_type`, `{attr}_auc_per_type`
-2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
-3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
-4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
+- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across
+  each label is weighted equally
+- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values
+  across evaluations per label
+- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of
+  scores, keyed by label
+- A final `{attr}_score` and corresponding `{attr}_score_desc` (text
+  description)
+
+The reported `{attr}_score` depends on the classification properties:
+
+- **binary exclusive with positive label:** `{attr}_score` is set to the F-score
+  of the positive label
+- **3+ exclusive classes**, macro-averaged F-score:
+  `{attr}_score = {attr}_macro_f`
+- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc`

 > #### Example
 >
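To make the micro/macro distinction concrete, a minimal standalone sketch in plain Python (not the spaCy implementation; the per-label counts are made up):

```python
from collections import Counter

# per-label true positives, false positives, false negatives
counts = {"POS": Counter(tp=8, fp=2, fn=1), "NEG": Counter(tp=1, fp=4, fn=5)}

def f_score(tp, fp, fn):
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    return 2 * p * r / (p + r) if p + r else 0.0

# micro: pool all instances across labels, then score once
total = Counter()
for c in counts.values():
    total.update(c)
micro_f = f_score(total["tp"], total["fp"], total["fn"])

# macro: score each label separately, then average the scores
macro_f = sum(f_score(c["tp"], c["fp"], c["fn"]) for c in counts.values()) / len(counts)
print(micro_f, macro_f)  # micro favors frequent labels; macro weights labels equally
```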
@@ -115,7 +115,7 @@ print(french_fries, "<->", burgers, french_fries.similarity(burgers))

 Computing similarity scores can be helpful in many situations, but it's also
 important to maintain **realistic expectations** about what information it can
-provide. Words can be related to each over in many ways, so a single
+provide. Words can be related to each other in many ways, so a single
 "similarity" score will always be a **mix of different signals**, and vectors
 trained on different data can produce very different results that may not be
 useful for your purpose. Here are some important considerations to keep in mind:
@@ -130,16 +130,31 @@ factory = "textcat"
 labels = []
 
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
-dropout = 0
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
 
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.textcat.model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 64
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[components.textcat.model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${components.textcat.model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
 ```
 
 spaCy has two additional built-in `textcat` architectures, and you can easily
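As a quick way to try the component this config defines, here is a sketch that adds a `textcat` pipe in code with its default model; the labels are placeholders, not from the config above:

```python
import spacy

# Sketch, assuming spaCy v3: create a blank pipeline and add a textcat
# component with placeholder labels.
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.initialize()
doc = nlp("This is a sentence about something.")
print(doc.cats)  # one score per label
```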
@@ -687,7 +702,7 @@ Before the model can be used, it needs to be
 [initialized](/usage/training#initialization). This function receives a callback
 to access the full **training data set**, or a representative sample. This data
 set can be used to deduce all **relevant labels**. Alternatively, a list of
 labels can be provided to `initialize`, or you can call
 `RelationExtractor.add_label` directly. The number of labels defines the output
 dimensionality of the network, and will be used to do
 [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
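A sketch of the two initialization routes described here, shown with a built-in trainable component ("ner") since `RelationExtractor` is a custom component defined elsewhere in the tutorial; the label and corpus variable are assumptions:

```python
import spacy

# Sketch of the two initialization routes from the text above.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
# Route 1: pass a callback so labels can be deduced from the training data:
# nlp.initialize(get_examples=lambda: train_examples)
# Route 2: add the labels directly; they define the output dimensionality:
ner.add_label("ORG")
nlp.initialize()
```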
@@ -1244,15 +1244,10 @@ labels = []
 # This function is created and then passed to the "textcat" component as
 # the argument "model"
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
+@architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
 ngram_size = 1
-dropout = null
+no_output_layer = false
 
 [components.other_textcat]
 factory = "textcat"
@@ -1142,7 +1142,7 @@ pattern = [
     {
         "LEFT_ID": "anchor_founded",
         "REL_OP": ">",
-        "RIGHT_ID": "subject",
+        "RIGHT_ID": "founded_subject",
         "RIGHT_ATTRS": {"DEP": "nsubj"},
     }
     # ...
@@ -1212,7 +1212,7 @@ pattern = [
     {
         "LEFT_ID": "anchor_founded",
         "REL_OP": ">",
-        "RIGHT_ID": "subject",
+        "RIGHT_ID": "founded_subject",
         "RIGHT_ATTRS": {"DEP": "nsubj"},
     },
     {
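To see the renamed `RIGHT_ID` in context, here is a sketch of a complete pattern and match loop; the anchor token and example sentence are assumptions based on the surrounding docs, and `en_core_web_sm` is assumed installed:

```python
import spacy
from spacy.matcher import DependencyMatcher

# Sketch of the full pattern these hunks edit, anchored on the verb "founded".
nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "anchor_founded", "RIGHT_ATTRS": {"ORTH": "founded"}},
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
]
matcher.add("FOUNDED", [pattern])
doc = nlp("Smith founded a healthcare company in 2005.")
for match_id, token_ids in matcher(doc):
    # token_ids are aligned with the pattern order: anchor first, then subject
    print([doc[t].text for t in token_ids])
```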
@@ -717,7 +717,7 @@ tabular results to a file:
 ```python
 ### functions.py
 import sys
-from typing import IO, Tuple, Callable, Dict, Any
+from typing import IO, Tuple, Callable, Dict, Any, Optional
 import spacy
 from spacy import Language
 from pathlib import Path
@@ -729,7 +729,7 @@ def custom_logger(log_path):
         stdout: IO=sys.stdout,
         stderr: IO=sys.stderr
     ) -> Tuple[Callable, Callable]:
-        stdout.write(f"Logging to {log_path}\n")
+        stdout.write(f"Logging to {log_path}\\n")
         log_file = Path(log_path).open("w", encoding="utf8")
         log_file.write("step\\t")
         log_file.write("score\\t")
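For context, a hedged sketch of how a logger like this is wired up: it is registered under a name so a config `[training.logger]` block can reference it. The registry name `"my_custom_logger.v1"` and the message format are assumptions, not from the diff:

```python
import sys
import spacy

# Register the logger so a config can reference it via @loggers
# (the name "my_custom_logger.v1" is an assumption).
@spacy.registry.loggers("my_custom_logger.v1")
def custom_logger(log_path):
    def setup_logger(nlp, stdout=sys.stdout, stderr=sys.stderr):
        stdout.write(f"Logging to {log_path}\n")
        def log_step(info):
            # info is a dict with keys like "step" and "score" during training
            if info is not None:
                stdout.write(f"Step {info['step']}: score {info['score']}\n")
        def finalize():
            pass
        return log_step, finalize
    return setup_logger
```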
@@ -433,14 +433,14 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | Name | Description |
 | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
-| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
+| [`Token.morph`](/api/token#attributes) | Access a token's morphological analysis. |
 | [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. |
 | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
 | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
 | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
 | [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
 | [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
 | [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
 | [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
 | [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
 | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. |
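A quick sketch exercising a few of the additions in this table; assumes `en_core_web_sm` is installed, and the printed morphology is illustrative:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She was reading the paper.")
print(doc[2].morph)               # Token.morph, e.g. Aspect=Prog|Tense=Pres|VerbForm=Part
print(doc.has_annotation("DEP"))  # Doc.has_annotation: True once parsed
with nlp.select_pipes(disable=["parser"]):  # Language.select_pipes
    print(nlp("No parse here.").has_annotation("DEP"))
```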
@@ -1032,9 +1032,9 @@ change your names and imports:
 Thanks to everyone who's been contributing to the spaCy ecosystem by developing
 and maintaining one of the many awesome [plugins and extensions](/universe).
 We've tried to make it as easy as possible for you to upgrade your packages for
-spaCy v3.0. The most common use case for plugins is providing pipeline components
-and extension attributes. When migrating your plugin, double-check the
-following:
+spaCy v3.0. The most common use case for plugins is providing pipeline
+components and extension attributes. When migrating your plugin, double-check
+the following:
 
 - Use the [`@Language.factory`](/api/language#factory) decorator to register
   your component and assign it a name. This allows users to refer to your
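A minimal sketch of that registration pattern; the component name, class, and config key are placeholders, not from the diff:

```python
from spacy.language import Language

# Placeholder component: logs the doc text if configured to.
class SnekComponent:
    def __init__(self, nlp, log: bool):
        self.log = log

    def __call__(self, doc):
        if self.log:
            print("Processing:", doc.text)
        return doc

# Register the factory under a name so users can do nlp.add_pipe("snek").
@Language.factory("snek", default_config={"log": False})
def create_snek_component(nlp, name, log: bool):
    return SnekComponent(nlp, log)
```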
@@ -257,7 +257,7 @@ output_path.open("w", encoding="utf-8").write(svg)
 Since each visualization is generated as a separate SVG, exporting `.svg` files
 only works if you're rendering **one single doc** at a time. (This makes sense –
 after all, each visualization should be a standalone graphic.) So instead of
-rendering all `Doc`s at one, loop over them and export them separately.
+rendering all `Doc`s at once, loop over them and export them separately.
 
 </Infobox>
 
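A sketch of that loop, writing one SVG per doc; assumes `en_core_web_sm` and write access to the current directory:

```python
import spacy
from spacy import displacy
from pathlib import Path

nlp = spacy.load("en_core_web_sm")
texts = ["This is a sentence.", "This is another one."]
for i, doc in enumerate(nlp.pipe(texts)):
    svg = displacy.render(doc, style="dep")  # one standalone SVG per doc
    output_path = Path(f"sentence_{i}.svg")
    output_path.open("w", encoding="utf-8").write(svg)
```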
@@ -120,7 +120,7 @@ function formatAccuracy(data) {
         ? null
         : {
               label,
-              value: value.toFixed(2),
+              value: (value * 100).toFixed(2),
               help: MODEL_META[label],
           }
     })