mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-13 18:10:35 +03:00
Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
commit
1075b7ebb7
|
@ -8,7 +8,6 @@ requires = [
|
|||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.0.0rc0,<8.1.0",
|
||||
"blis>=0.4.0,<0.8.0",
|
||||
"pytokenizations",
|
||||
"pathy"
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -14,8 +14,7 @@ pathy
|
|||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
pydantic>=1.5.0,<2.0.0
|
||||
pytokenizations
|
||||
pydantic>=1.5.0,<1.7.0
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging>=20.0
|
||||
|
|
|
@ -51,8 +51,8 @@ install_requires =
|
|||
tqdm>=4.38.0,<5.0.0
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
pydantic>=1.5.0,<2.0.0
|
||||
pytokenizations
|
||||
pydantic>=1.5.0,<1.7.0
|
||||
jinja2
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging>=20.0
|
||||
|
|
1
setup.py
1
setup.py
|
@ -49,6 +49,7 @@ MOD_NAMES = [
|
|||
"spacy.pipeline._parser_internals.stateclass",
|
||||
"spacy.pipeline._parser_internals.transition_system",
|
||||
"spacy.tokenizer",
|
||||
"spacy.training.align",
|
||||
"spacy.training.gold_io",
|
||||
"spacy.tokens.doc",
|
||||
"spacy.tokens.span",
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "3.0.0rc1"
|
||||
__version__ = "3.0.0rc2"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -93,27 +93,42 @@ def evaluate(
|
|||
"SPEED": "speed",
|
||||
}
|
||||
results = {}
|
||||
data = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
if key == "cats_score":
|
||||
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
if isinstance(scores[key], (int, float)):
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
else:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
else:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
|
||||
results[metric] = "-"
|
||||
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
||||
|
||||
msg.table(results, title="Results")
|
||||
|
||||
if "morph_per_feat" in scores:
|
||||
if scores["morph_per_feat"]:
|
||||
print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
|
||||
data["morph_per_feat"] = scores["morph_per_feat"]
|
||||
if "dep_las_per_type" in scores:
|
||||
if scores["dep_las_per_type"]:
|
||||
print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
|
||||
data["dep_las_per_type"] = scores["dep_las_per_type"]
|
||||
if "ents_per_type" in scores:
|
||||
if scores["ents_per_type"]:
|
||||
print_ents_per_type(msg, scores["ents_per_type"])
|
||||
print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
|
||||
data["ents_per_type"] = scores["ents_per_type"]
|
||||
if "cats_f_per_type" in scores:
|
||||
if scores["cats_f_per_type"]:
|
||||
print_textcats_f_per_cat(msg, scores["cats_f_per_type"])
|
||||
print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
|
||||
data["cats_f_per_type"] = scores["cats_f_per_type"]
|
||||
if "cats_auc_per_type" in scores:
|
||||
if scores["cats_auc_per_type"]:
|
||||
print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
|
||||
data["cats_auc_per_type"] = scores["cats_auc_per_type"]
|
||||
|
||||
if displacy_path:
|
||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||
|
@ -157,7 +172,7 @@ def render_parses(
|
|||
file_.write(html)
|
||||
|
||||
|
||||
def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
|
||||
def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None:
|
||||
data = [
|
||||
(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
|
||||
for k, v in scores.items()
|
||||
|
@ -166,20 +181,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> No
|
|||
data,
|
||||
header=("", "P", "R", "F"),
|
||||
aligns=("l", "r", "r", "r"),
|
||||
title="NER (per type)",
|
||||
)
|
||||
|
||||
|
||||
def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
|
||||
data = [
|
||||
(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
|
||||
for k, v in scores.items()
|
||||
]
|
||||
msg.table(
|
||||
data,
|
||||
header=("", "P", "R", "F"),
|
||||
aligns=("l", "r", "r", "r"),
|
||||
title="Textcat F (per label)",
|
||||
title=f"{name} (per {type})",
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ def init_vectors_cli(
|
|||
nlp.to_disk(output_dir)
|
||||
msg.good(
|
||||
"Saved nlp object with vectors to output directory. You can now use the "
|
||||
"path to it in your config as the 'vectors' setting in [initialize.vocab].",
|
||||
"path to it in your config as the 'vectors' setting in [initialize].",
|
||||
output_dir.resolve(),
|
||||
)
|
||||
|
||||
|
@ -100,7 +100,7 @@ def init_labels_cli(
|
|||
extract the labels."""
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
output_path.mkdir(parents=True)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu)
|
||||
|
|
|
@ -136,15 +136,19 @@ factory = "textcat"
|
|||
|
||||
{% if optimize == "accuracy" %}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatEnsemble.v1"
|
||||
exclusive_classes = false
|
||||
width = 64
|
||||
conv_depth = 2
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
ngram_size = 1
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
nO = null
|
||||
|
||||
[components.textcat.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
|
@ -271,15 +275,19 @@ factory = "textcat"
|
|||
|
||||
{% if optimize == "accuracy" %}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatEnsemble.v1"
|
||||
exclusive_classes = false
|
||||
width = 64
|
||||
conv_depth = 2
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
ngram_size = 1
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
nO = null
|
||||
|
||||
[components.textcat.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
|
|
|
@ -44,7 +44,7 @@ def train_cli(
|
|||
if not config_path or not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
if output_path is not None and not output_path.exists():
|
||||
output_path.mkdir()
|
||||
output_path.mkdir(parents=True)
|
||||
msg.good(f"Created output directory: {output_path}")
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
|
|
|
@ -398,8 +398,8 @@ class Errors:
|
|||
E163 = ("cumsum was found to be unstable: its last element does not "
|
||||
"correspond to sum")
|
||||
E164 = ("x is neither increasing nor decreasing: {x}.")
|
||||
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
||||
"that case.")
|
||||
E165 = ("Only one class present in the gold labels: {label}. "
|
||||
"ROC AUC score is not defined in that case.")
|
||||
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
|
||||
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||
E169 = ("Can't find module: {module}")
|
||||
|
@ -456,6 +456,8 @@ class Errors:
|
|||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E897 = ("Field '{field}' should be a dot-notation string referring to the "
|
||||
"relevant section in the config, but found type {type} instead.")
|
||||
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
|
||||
"is not set or None. If you've implemented a custom component, make "
|
||||
"sure to store the component model as `self.model` in your "
|
||||
|
@ -562,7 +564,10 @@ class Errors:
|
|||
"a string value from {expected} but got: '{arg}'")
|
||||
E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
|
||||
"a list, but got: {arg_type}")
|
||||
E949 = ("Can only create an alignment when the texts are the same.")
|
||||
E949 = ("Unable to align tokens for the predicted and reference docs. It "
|
||||
"is only possible to align the docs when both texts are the same "
|
||||
"except for whitespace and capitalization. The predicted tokens "
|
||||
"start with: {x}. The reference tokens start with: {y}.")
|
||||
E952 = ("The section '{name}' is not a valid section in the provided config.")
|
||||
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
||||
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
||||
|
|
|
@ -286,10 +286,10 @@ cdef class DependencyMatcher:
|
|||
self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees)
|
||||
for matched_tree in matched_trees:
|
||||
matched_key_trees.append((key, matched_tree))
|
||||
for i, (match_id, nodes) in enumerate(matched_key_trees):
|
||||
on_match = self._callbacks.get(match_id)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matched_key_trees)
|
||||
for i, (match_id, nodes) in enumerate(matched_key_trees):
|
||||
on_match = self._callbacks.get(match_id)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matched_key_trees)
|
||||
return matched_key_trees
|
||||
|
||||
def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from typing import Optional
|
||||
from typing import Optional, List
|
||||
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
||||
|
@ -10,12 +12,13 @@ from ...util import registry
|
|||
from ..extract_ngrams import extract_ngrams
|
||||
from ..staticvectors import StaticVectors
|
||||
from ..featureextractor import FeatureExtractor
|
||||
from ...tokens import Doc
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.TextCatCNN.v1")
|
||||
def build_simple_cnn_text_classifier(
|
||||
tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
|
||||
) -> Model:
|
||||
) -> Model[List[Doc], Floats2d]:
|
||||
"""
|
||||
Build a simple CNN text classifier, given a token-to-vector model as inputs.
|
||||
If exclusive_classes=True, a softmax non-linearity is applied, so that the
|
||||
|
@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier(
|
|||
is applied instead, so that outputs are in the range [0, 1].
|
||||
"""
|
||||
with Model.define_operators({">>": chain}):
|
||||
cnn = tok2vec >> list2ragged() >> reduce_mean()
|
||||
if exclusive_classes:
|
||||
output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
|
||||
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
|
||||
model = cnn >> output_layer
|
||||
model.set_ref("output_layer", output_layer)
|
||||
else:
|
||||
linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
|
||||
model = (
|
||||
tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
|
||||
)
|
||||
model = cnn >> linear_layer >> Logistic()
|
||||
model.set_ref("output_layer", linear_layer)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
model.set_dim("nO", nO)
|
||||
|
@ -45,8 +47,7 @@ def build_bow_text_classifier(
|
|||
ngram_size: int,
|
||||
no_output_layer: bool,
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
# Don't document this yet, I'm not sure it's right.
|
||||
) -> Model[List[Doc], Floats2d]:
|
||||
with Model.define_operators({">>": chain}):
|
||||
sparse_linear = SparseLinear(nO)
|
||||
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
|
||||
|
@ -59,6 +60,39 @@ def build_bow_text_classifier(
|
|||
return model
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.TextCatEnsemble.v2")
|
||||
def build_text_classifier(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
linear_model: Model[List[Doc], Floats2d],
|
||||
nO: Optional[int] = None,
|
||||
) -> Model[List[Doc], Floats2d]:
|
||||
exclusive_classes = not linear_model.attrs["multi_label"]
|
||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||
width = tok2vec.get_dim("nO")
|
||||
cnn_model = (
|
||||
tok2vec
|
||||
>> list2ragged()
|
||||
>> ParametricAttention(width) # TODO: benchmark performance difference of this layer
|
||||
>> reduce_sum()
|
||||
>> residual(Maxout(nO=width, nI=width))
|
||||
>> Linear(nO=nO, nI=width)
|
||||
>> Dropout(0.0)
|
||||
)
|
||||
|
||||
nO_double = nO * 2 if nO else None
|
||||
if exclusive_classes:
|
||||
output_layer = Softmax(nO=nO, nI=nO_double)
|
||||
else:
|
||||
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
|
||||
model = (linear_model | cnn_model) >> output_layer
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
if model.has_dim("nO") is not False:
|
||||
model.set_dim("nO", nO)
|
||||
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
||||
model.attrs["multi_label"] = not exclusive_classes
|
||||
return model
|
||||
|
||||
# TODO: move to legacy
|
||||
@registry.architectures.register("spacy.TextCatEnsemble.v1")
|
||||
def build_text_classifier(
|
||||
width: int,
|
||||
|
@ -158,11 +192,8 @@ def build_text_classifier(
|
|||
|
||||
@registry.architectures.register("spacy.TextCatLowData.v1")
|
||||
def build_text_classifier_lowdata(
|
||||
width: int,
|
||||
pretrained_vectors: Optional[bool],
|
||||
dropout: Optional[float],
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
width: int, dropout: Optional[float], nO: Optional[int] = None
|
||||
) -> Model[List[Doc], Floats2d]:
|
||||
# Don't document this yet, I'm not sure it's right.
|
||||
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
|
||||
with Model.define_operators({">>": chain, "**": clone}):
|
||||
|
|
|
@ -106,7 +106,7 @@ def MultiHashEmbed(
|
|||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Construct an embedding layer that separately embeds a number of lexical
|
||||
attributes using hash embedding, concatenates the results, and passes it
|
||||
through a feed-forward subnetwork to build a mixed representations.
|
||||
through a feed-forward subnetwork to build a mixed representation.
|
||||
|
||||
The features used can be configured with the 'attrs' argument. The suggested
|
||||
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
|
||||
|
|
|
@ -226,6 +226,9 @@ class AttributeRuler(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/tagger#score
|
||||
"""
|
||||
def morph_key_getter(token, attr):
|
||||
return getattr(token, attr).key
|
||||
|
||||
validate_examples(examples, "AttributeRuler.score")
|
||||
results = {}
|
||||
attrs = set()
|
||||
|
@ -237,7 +240,8 @@ class AttributeRuler(Pipe):
|
|||
elif attr == POS:
|
||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
elif attr == MORPH:
|
||||
results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
|
||||
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
||||
results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs))
|
||||
elif attr == LEMMA:
|
||||
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
|
||||
return results
|
||||
|
|
|
@ -155,13 +155,16 @@ cdef class DependencyParser(Parser):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
validate_examples(examples, "DependencyParser.score")
|
||||
def dep_getter(token, attr):
|
||||
dep = getattr(token, attr)
|
||||
dep = token.vocab.strings.as_string(dep).lower()
|
||||
return dep
|
||||
results = {}
|
||||
results.update(Scorer.score_spans(examples, "sents", **kwargs))
|
||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
||||
kwargs.setdefault("getter", dep_getter)
|
||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||
|
|
|
@ -10,7 +10,7 @@ from ..errors import Errors
|
|||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
|
||||
from ..tokens import Doc, Span
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..scorer import Scorer
|
||||
from ..scorer import get_ner_prf
|
||||
from ..training import validate_examples
|
||||
|
||||
|
||||
|
@ -340,7 +340,7 @@ class EntityRuler(Pipe):
|
|||
|
||||
def score(self, examples, **kwargs):
|
||||
validate_examples(examples, "EntityRuler.score")
|
||||
return Scorer.score_spans(examples, "ents", **kwargs)
|
||||
return get_ner_prf(examples)
|
||||
|
||||
def from_bytes(
|
||||
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
|
|
|
@ -251,10 +251,13 @@ class Morphologizer(Tagger):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#score
|
||||
"""
|
||||
def morph_key_getter(token, attr):
|
||||
return getattr(token, attr).key
|
||||
|
||||
validate_examples(examples, "Morphologizer.score")
|
||||
results = {}
|
||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
|
||||
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
||||
results.update(Scorer.score_token_attr_per_feat(examples,
|
||||
"morph", **kwargs))
|
||||
"morph", getter=morph_key_getter, **kwargs))
|
||||
return results
|
||||
|
|
|
@ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser):
|
|||
DOCS: https://nightly.spacy.io/api/entityrecognizer#score
|
||||
"""
|
||||
validate_examples(examples, "EntityRecognizer.score")
|
||||
score_per_type = get_ner_prf(examples)
|
||||
totals = PRFScore()
|
||||
for prf in score_per_type.values():
|
||||
totals += prf
|
||||
return {
|
||||
"ents_p": totals.precision,
|
||||
"ents_r": totals.recall,
|
||||
"ents_f": totals.fscore,
|
||||
"ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
|
||||
}
|
||||
return get_ner_prf(examples)
|
||||
|
|
|
@ -155,8 +155,11 @@ class Sentencizer(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/sentencizer#score
|
||||
"""
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
validate_examples(examples, "Sentencizer.score")
|
||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
||||
|
|
|
@ -160,7 +160,10 @@ class SentenceRecognizer(Tagger):
|
|||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#score
|
||||
"""
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
validate_examples(examples, "SentenceRecognizer.score")
|
||||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
|
|
@ -16,15 +16,30 @@ from ..vocab import Vocab
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatEnsemble.v1"
|
||||
exclusive_classes = false
|
||||
pretrained_vectors = null
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = 64
|
||||
conv_depth = 2
|
||||
embed_size = 2000
|
||||
rows = [2000, 2000, 1000, 1000, 1000, 1000]
|
||||
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
|
||||
include_static_vectors = false
|
||||
|
||||
[model.tok2vec.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
width = ${model.tok2vec.embed.width}
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
depth = 2
|
||||
|
||||
[model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
dropout = null
|
||||
no_output_layer = false
|
||||
"""
|
||||
DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
@ -60,9 +75,11 @@ subword_features = true
|
|||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
"cats_score_desc": None,
|
||||
"cats_p": None,
|
||||
"cats_r": None,
|
||||
"cats_f": None,
|
||||
"cats_micro_p": None,
|
||||
"cats_micro_r": None,
|
||||
"cats_micro_f": None,
|
||||
"cats_macro_p": None,
|
||||
"cats_macro_r": None,
|
||||
"cats_macro_f": None,
|
||||
"cats_macro_auc": None,
|
||||
"cats_f_per_type": None,
|
||||
|
|
287
spacy/scorer.py
287
spacy/scorer.py
|
@ -1,9 +1,9 @@
|
|||
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
|
||||
from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
|
||||
from .training import Example
|
||||
from .tokens import Token, Doc, Span
|
||||
from .tokens import Token, Doc, Span, MorphAnalysis
|
||||
from .errors import Errors
|
||||
from .util import get_lang_class, SimpleFrozenList
|
||||
from .morphology import Morphology
|
||||
|
@ -13,7 +13,8 @@ if TYPE_CHECKING:
|
|||
from .language import Language # noqa: F401
|
||||
|
||||
|
||||
DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]
|
||||
DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
|
||||
MISSING_VALUES = frozenset([None, 0, ""])
|
||||
|
||||
|
||||
class PRFScore:
|
||||
|
@ -24,6 +25,9 @@ class PRFScore:
|
|||
self.fp = 0
|
||||
self.fn = 0
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.tp + self.fp + self.fn
|
||||
|
||||
def __iadd__(self, other):
|
||||
self.tp += other.tp
|
||||
self.fp += other.fp
|
||||
|
@ -59,7 +63,9 @@ class PRFScore:
|
|||
|
||||
|
||||
class ROCAUCScore:
|
||||
"""An AUC ROC score."""
|
||||
"""An AUC ROC score. This is only defined for binary classification.
|
||||
Use the method is_binary before calculating the score, otherwise it
|
||||
may throw an error."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.golds = []
|
||||
|
@ -71,16 +77,16 @@ class ROCAUCScore:
|
|||
self.cands.append(cand)
|
||||
self.golds.append(gold)
|
||||
|
||||
def is_binary(self):
|
||||
return len(np.unique(self.golds)) == 2
|
||||
|
||||
@property
|
||||
def score(self):
|
||||
if not self.is_binary():
|
||||
raise ValueError(Errors.E165.format(label=set(self.golds)))
|
||||
if len(self.golds) == self.saved_score_at_len:
|
||||
return self.saved_score
|
||||
try:
|
||||
self.saved_score = _roc_auc_score(self.golds, self.cands)
|
||||
# catch ValueError: Only one class present in y_true.
|
||||
# ROC AUC score is not defined in that case.
|
||||
except ValueError:
|
||||
self.saved_score = -float("inf")
|
||||
self.saved_score = _roc_auc_score(self.golds, self.cands)
|
||||
self.saved_score_at_len = len(self.golds)
|
||||
return self.saved_score
|
||||
|
||||
|
@ -92,7 +98,7 @@ class Scorer:
|
|||
self,
|
||||
nlp: Optional["Language"] = None,
|
||||
default_lang: str = "xx",
|
||||
default_pipeline=DEFAULT_PIPELINE,
|
||||
default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
|
||||
**cfg,
|
||||
) -> None:
|
||||
"""Initialize the Scorer.
|
||||
|
@ -124,13 +130,13 @@ class Scorer:
|
|||
return scores
|
||||
|
||||
@staticmethod
|
||||
def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
|
||||
def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
|
||||
"""Returns accuracy and PRF scores for tokenization.
|
||||
* token_acc: # correct tokens / # gold tokens
|
||||
* token_p/r/f: PRF for token character spans
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
RETURNS (Dict[str, float]): A dictionary containing the scores
|
||||
RETURNS (Dict[str, Any]): A dictionary containing the scores
|
||||
token_acc/p/r/f.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
|
||||
|
@ -140,6 +146,8 @@ class Scorer:
|
|||
for example in examples:
|
||||
gold_doc = example.reference
|
||||
pred_doc = example.predicted
|
||||
if gold_doc.has_unknown_spaces:
|
||||
continue
|
||||
align = example.alignment
|
||||
gold_spans = set()
|
||||
pred_spans = set()
|
||||
|
@ -156,12 +164,20 @@ class Scorer:
|
|||
else:
|
||||
acc_score.tp += 1
|
||||
prf_score.score_set(pred_spans, gold_spans)
|
||||
return {
|
||||
"token_acc": acc_score.fscore,
|
||||
"token_p": prf_score.precision,
|
||||
"token_r": prf_score.recall,
|
||||
"token_f": prf_score.fscore,
|
||||
}
|
||||
if len(acc_score) > 0:
|
||||
return {
|
||||
"token_acc": acc_score.fscore,
|
||||
"token_p": prf_score.precision,
|
||||
"token_r": prf_score.recall,
|
||||
"token_f": prf_score.fscore,
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"token_acc": None,
|
||||
"token_p": None,
|
||||
"token_r": None,
|
||||
"token_f": None
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def score_token_attr(
|
||||
|
@ -169,8 +185,9 @@ class Scorer:
|
|||
attr: str,
|
||||
*,
|
||||
getter: Callable[[Token, str], Any] = getattr,
|
||||
missing_values: Set[Any] = MISSING_VALUES,
|
||||
**cfg,
|
||||
) -> Dict[str, float]:
|
||||
) -> Dict[str, Any]:
|
||||
"""Returns an accuracy score for a token-level attribute.
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
|
@ -178,7 +195,7 @@ class Scorer:
|
|||
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
|
||||
getter(token, attr) should return the value of the attribute for an
|
||||
individual token.
|
||||
RETURNS (Dict[str, float]): A dictionary containing the accuracy score
|
||||
RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
|
||||
under the key attr_acc.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
|
||||
|
@ -189,17 +206,27 @@ class Scorer:
|
|||
pred_doc = example.predicted
|
||||
align = example.alignment
|
||||
gold_tags = set()
|
||||
missing_indices = set()
|
||||
for gold_i, token in enumerate(gold_doc):
|
||||
gold_tags.add((gold_i, getter(token, attr)))
|
||||
value = getter(token, attr)
|
||||
if value not in missing_values:
|
||||
gold_tags.add((gold_i, getter(token, attr)))
|
||||
else:
|
||||
missing_indices.add(gold_i)
|
||||
pred_tags = set()
|
||||
for token in pred_doc:
|
||||
if token.orth_.isspace():
|
||||
continue
|
||||
if align.x2y.lengths[token.i] == 1:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
pred_tags.add((gold_i, getter(token, attr)))
|
||||
if gold_i not in missing_indices:
|
||||
pred_tags.add((gold_i, getter(token, attr)))
|
||||
tag_score.score_set(pred_tags, gold_tags)
|
||||
return {f"{attr}_acc": tag_score.fscore}
|
||||
score_key = f"{attr}_acc"
|
||||
if len(tag_score) == 0:
|
||||
return {score_key: None}
|
||||
else:
|
||||
return {score_key: tag_score.fscore}
|
||||
|
||||
@staticmethod
|
||||
def score_token_attr_per_feat(
|
||||
|
@ -207,8 +234,9 @@ class Scorer:
|
|||
attr: str,
|
||||
*,
|
||||
getter: Callable[[Token, str], Any] = getattr,
|
||||
missing_values: Set[Any] = MISSING_VALUES,
|
||||
**cfg,
|
||||
):
|
||||
) -> Dict[str, Any]:
|
||||
"""Return PRF scores per feat for a token attribute in UFEATS format.
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
|
@ -216,7 +244,7 @@ class Scorer:
|
|||
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
|
||||
getter(token, attr) should return the value of the attribute for an
|
||||
individual token.
|
||||
RETURNS (dict): A dictionary containing the per-feat PRF scores unders
|
||||
RETURNS (dict): A dictionary containing the per-feat PRF scores under
|
||||
the key attr_per_feat.
|
||||
"""
|
||||
per_feat = {}
|
||||
|
@ -225,9 +253,11 @@ class Scorer:
|
|||
gold_doc = example.reference
|
||||
align = example.alignment
|
||||
gold_per_feat = {}
|
||||
missing_indices = set()
|
||||
for gold_i, token in enumerate(gold_doc):
|
||||
morph = str(getter(token, attr))
|
||||
if morph:
|
||||
value = getter(token, attr)
|
||||
morph = gold_doc.vocab.strings[value]
|
||||
if value not in missing_values and morph != Morphology.EMPTY_MORPH:
|
||||
for feat in morph.split(Morphology.FEATURE_SEP):
|
||||
field, values = feat.split(Morphology.FIELD_SEP)
|
||||
if field not in per_feat:
|
||||
|
@ -235,27 +265,35 @@ class Scorer:
|
|||
if field not in gold_per_feat:
|
||||
gold_per_feat[field] = set()
|
||||
gold_per_feat[field].add((gold_i, feat))
|
||||
else:
|
||||
missing_indices.add(gold_i)
|
||||
pred_per_feat = {}
|
||||
for token in pred_doc:
|
||||
if token.orth_.isspace():
|
||||
continue
|
||||
if align.x2y.lengths[token.i] == 1:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
morph = str(getter(token, attr))
|
||||
if morph:
|
||||
for feat in morph.split("|"):
|
||||
field, values = feat.split("=")
|
||||
if field not in per_feat:
|
||||
per_feat[field] = PRFScore()
|
||||
if field not in pred_per_feat:
|
||||
pred_per_feat[field] = set()
|
||||
pred_per_feat[field].add((gold_i, feat))
|
||||
if gold_i not in missing_indices:
|
||||
value = getter(token, attr)
|
||||
morph = gold_doc.vocab.strings[value]
|
||||
if value not in missing_values and morph != Morphology.EMPTY_MORPH:
|
||||
for feat in morph.split(Morphology.FEATURE_SEP):
|
||||
field, values = feat.split(Morphology.FIELD_SEP)
|
||||
if field not in per_feat:
|
||||
per_feat[field] = PRFScore()
|
||||
if field not in pred_per_feat:
|
||||
pred_per_feat[field] = set()
|
||||
pred_per_feat[field].add((gold_i, feat))
|
||||
for field in per_feat:
|
||||
per_feat[field].score_set(
|
||||
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
|
||||
)
|
||||
result = {k: v.to_dict() for k, v in per_feat.items()}
|
||||
return {f"{attr}_per_feat": result}
|
||||
score_key = f"{attr}_per_feat"
|
||||
if any([len(v) for v in per_feat.values()]):
|
||||
result = {k: v.to_dict() for k, v in per_feat.items()}
|
||||
return {score_key: result}
|
||||
else:
|
||||
return {score_key: None}
|
||||
|
||||
@staticmethod
|
||||
def score_spans(
|
||||
|
@ -263,6 +301,7 @@ class Scorer:
|
|||
attr: str,
|
||||
*,
|
||||
getter: Callable[[Doc, str], Iterable[Span]] = getattr,
|
||||
has_annotation: Optional[Callable[[Doc], bool]] = None,
|
||||
**cfg,
|
||||
) -> Dict[str, Any]:
|
||||
"""Returns PRF scores for labeled spans.
|
||||
|
@ -282,18 +321,10 @@ class Scorer:
|
|||
for example in examples:
|
||||
pred_doc = example.predicted
|
||||
gold_doc = example.reference
|
||||
# TODO
|
||||
# This is a temporary hack to work around the problem that the scorer
|
||||
# fails if you have examples that are not fully annotated for all
|
||||
# the tasks in your pipeline. For instance, you might have a corpus
|
||||
# of NER annotations that does not set sentence boundaries, but the
|
||||
# pipeline includes a parser or senter, and then the score_weights
|
||||
# are used to evaluate that component. When the scorer attempts
|
||||
# to read the sentences from the gold document, it fails.
|
||||
try:
|
||||
list(getter(gold_doc, attr))
|
||||
except ValueError:
|
||||
continue
|
||||
# Option to handle docs without sents
|
||||
if has_annotation is not None:
|
||||
if not has_annotation(gold_doc):
|
||||
continue
|
||||
# Find all labels in gold and doc
|
||||
labels = set(
|
||||
[k.label_ for k in getter(gold_doc, attr)]
|
||||
|
@ -321,13 +352,21 @@ class Scorer:
|
|||
v.score_set(pred_per_type[k], gold_per_type[k])
|
||||
# Score for all labels
|
||||
score.score_set(pred_spans, gold_spans)
|
||||
results = {
|
||||
f"{attr}_p": score.precision,
|
||||
f"{attr}_r": score.recall,
|
||||
f"{attr}_f": score.fscore,
|
||||
f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
|
||||
}
|
||||
return results
|
||||
if len(score) > 0:
|
||||
return {
|
||||
f"{attr}_p": score.precision,
|
||||
f"{attr}_r": score.recall,
|
||||
f"{attr}_f": score.fscore,
|
||||
f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
|
||||
}
|
||||
else:
|
||||
return {
|
||||
f"{attr}_p": None,
|
||||
f"{attr}_r": None,
|
||||
f"{attr}_f": None,
|
||||
f"{attr}_per_type": None,
|
||||
}
|
||||
|
||||
|
||||
@staticmethod
|
||||
def score_cats(
|
||||
|
@ -362,9 +401,13 @@ class Scorer:
|
|||
for all:
|
||||
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
|
||||
attr_score_desc (text description of the overall score),
|
||||
attr_micro_p,
|
||||
attr_micro_r,
|
||||
attr_micro_f,
|
||||
attr_macro_p,
|
||||
attr_macro_r,
|
||||
attr_macro_f,
|
||||
attr_auc,
|
||||
attr_macro_auc,
|
||||
attr_f_per_type,
|
||||
attr_auc_per_type
|
||||
|
||||
|
@ -384,9 +427,6 @@ class Scorer:
|
|||
pred_cats = getter(example.predicted, attr)
|
||||
gold_cats = getter(example.reference, attr)
|
||||
|
||||
# I think the AUC metric is applicable regardless of whether we're
|
||||
# doing multi-label classification? Unsure. If not, move this into
|
||||
# the elif pred_cats and gold_cats block below.
|
||||
for label in labels:
|
||||
pred_score = pred_cats.get(label, 0.0)
|
||||
gold_score = gold_cats.get(label, 0.0)
|
||||
|
@ -431,7 +471,9 @@ class Scorer:
|
|||
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
|
||||
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
|
||||
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
|
||||
macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
|
||||
# Limit macro_auc to those labels with gold annotations,
|
||||
# but still divide by all cats to avoid artificial boosting of datasets with missing labels
|
||||
macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
|
||||
results = {
|
||||
f"{attr}_score": None,
|
||||
f"{attr}_score_desc": None,
|
||||
|
@ -443,7 +485,7 @@ class Scorer:
|
|||
f"{attr}_macro_f": macro_f,
|
||||
f"{attr}_macro_auc": macro_auc,
|
||||
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
|
||||
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
|
||||
f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
|
||||
}
|
||||
if len(labels) == 2 and not multi_label and positive_label:
|
||||
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
|
||||
|
@ -534,6 +576,7 @@ class Scorer:
|
|||
head_attr: str = "head",
|
||||
head_getter: Callable[[Token, str], Token] = getattr,
|
||||
ignore_labels: Iterable[str] = SimpleFrozenList(),
|
||||
missing_values: Set[Any] = MISSING_VALUES,
|
||||
**cfg,
|
||||
) -> Dict[str, Any]:
|
||||
"""Returns the UAS, LAS, and LAS per type scores for dependency
|
||||
|
@ -558,6 +601,7 @@ class Scorer:
|
|||
unlabelled = PRFScore()
|
||||
labelled = PRFScore()
|
||||
labelled_per_dep = dict()
|
||||
missing_indices = set()
|
||||
for example in examples:
|
||||
gold_doc = example.reference
|
||||
pred_doc = example.predicted
|
||||
|
@ -567,13 +611,16 @@ class Scorer:
|
|||
for gold_i, token in enumerate(gold_doc):
|
||||
dep = getter(token, attr)
|
||||
head = head_getter(token, head_attr)
|
||||
if dep not in ignore_labels:
|
||||
gold_deps.add((gold_i, head.i, dep))
|
||||
if dep not in labelled_per_dep:
|
||||
labelled_per_dep[dep] = PRFScore()
|
||||
if dep not in gold_deps_per_dep:
|
||||
gold_deps_per_dep[dep] = set()
|
||||
gold_deps_per_dep[dep].add((gold_i, head.i, dep))
|
||||
if dep not in missing_values:
|
||||
if dep not in ignore_labels:
|
||||
gold_deps.add((gold_i, head.i, dep))
|
||||
if dep not in labelled_per_dep:
|
||||
labelled_per_dep[dep] = PRFScore()
|
||||
if dep not in gold_deps_per_dep:
|
||||
gold_deps_per_dep[dep] = set()
|
||||
gold_deps_per_dep[dep].add((gold_i, head.i, dep))
|
||||
else:
|
||||
missing_indices.add(gold_i)
|
||||
pred_deps = set()
|
||||
pred_deps_per_dep = {}
|
||||
for token in pred_doc:
|
||||
|
@ -583,25 +630,26 @@ class Scorer:
|
|||
gold_i = None
|
||||
else:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
dep = getter(token, attr)
|
||||
head = head_getter(token, head_attr)
|
||||
if dep not in ignore_labels and token.orth_.strip():
|
||||
if align.x2y.lengths[head.i] == 1:
|
||||
gold_head = align.x2y[head.i].dataXd[0, 0]
|
||||
else:
|
||||
gold_head = None
|
||||
# None is indistinct, so we can't just add it to the set
|
||||
# Multiple (None, None) deps are possible
|
||||
if gold_i is None or gold_head is None:
|
||||
unlabelled.fp += 1
|
||||
labelled.fp += 1
|
||||
else:
|
||||
pred_deps.add((gold_i, gold_head, dep))
|
||||
if dep not in labelled_per_dep:
|
||||
labelled_per_dep[dep] = PRFScore()
|
||||
if dep not in pred_deps_per_dep:
|
||||
pred_deps_per_dep[dep] = set()
|
||||
pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
|
||||
if gold_i not in missing_indices:
|
||||
dep = getter(token, attr)
|
||||
head = head_getter(token, head_attr)
|
||||
if dep not in ignore_labels and token.orth_.strip():
|
||||
if align.x2y.lengths[head.i] == 1:
|
||||
gold_head = align.x2y[head.i].dataXd[0, 0]
|
||||
else:
|
||||
gold_head = None
|
||||
# None is indistinct, so we can't just add it to the set
|
||||
# Multiple (None, None) deps are possible
|
||||
if gold_i is None or gold_head is None:
|
||||
unlabelled.fp += 1
|
||||
labelled.fp += 1
|
||||
else:
|
||||
pred_deps.add((gold_i, gold_head, dep))
|
||||
if dep not in labelled_per_dep:
|
||||
labelled_per_dep[dep] = PRFScore()
|
||||
if dep not in pred_deps_per_dep:
|
||||
pred_deps_per_dep[dep] = set()
|
||||
pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
|
||||
labelled.score_set(pred_deps, gold_deps)
|
||||
for dep in labelled_per_dep:
|
||||
labelled_per_dep[dep].score_set(
|
||||
|
@ -610,29 +658,34 @@ class Scorer:
|
|||
unlabelled.score_set(
|
||||
set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
|
||||
)
|
||||
return {
|
||||
f"{attr}_uas": unlabelled.fscore,
|
||||
f"{attr}_las": labelled.fscore,
|
||||
f"{attr}_las_per_type": {
|
||||
k: v.to_dict() for k, v in labelled_per_dep.items()
|
||||
},
|
||||
}
|
||||
if len(unlabelled) > 0:
|
||||
return {
|
||||
f"{attr}_uas": unlabelled.fscore,
|
||||
f"{attr}_las": labelled.fscore,
|
||||
f"{attr}_las_per_type": {
|
||||
k: v.to_dict() for k, v in labelled_per_dep.items()
|
||||
},
|
||||
}
|
||||
else:
|
||||
return {
|
||||
f"{attr}_uas": None,
|
||||
f"{attr}_las": None,
|
||||
f"{attr}_las_per_type": None,
|
||||
}
|
||||
|
||||
|
||||
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
|
||||
"""Compute per-entity PRFScore objects for a sequence of examples. The
|
||||
results are returned as a dictionary keyed by the entity type. You can
|
||||
add the PRFScore objects to get micro-averaged total.
|
||||
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
|
||||
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples.
|
||||
"""
|
||||
scores = defaultdict(PRFScore)
|
||||
score_per_type = defaultdict(PRFScore)
|
||||
for eg in examples:
|
||||
if not eg.y.has_annotation("ENT_IOB"):
|
||||
continue
|
||||
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
|
||||
align_x2y = eg.alignment.x2y
|
||||
for pred_ent in eg.x.ents:
|
||||
if pred_ent.label_ not in scores:
|
||||
scores[pred_ent.label_] = PRFScore()
|
||||
if pred_ent.label_ not in score_per_type:
|
||||
score_per_type[pred_ent.label_] = PRFScore()
|
||||
indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
|
||||
if len(indices):
|
||||
g_span = eg.y[indices[0] : indices[-1] + 1]
|
||||
|
@ -642,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
|
|||
if all(token.ent_iob != 0 for token in g_span):
|
||||
key = (pred_ent.label_, indices[0], indices[-1] + 1)
|
||||
if key in golds:
|
||||
scores[pred_ent.label_].tp += 1
|
||||
score_per_type[pred_ent.label_].tp += 1
|
||||
golds.remove(key)
|
||||
else:
|
||||
scores[pred_ent.label_].fp += 1
|
||||
score_per_type[pred_ent.label_].fp += 1
|
||||
for label, start, end in golds:
|
||||
scores[label].fn += 1
|
||||
return scores
|
||||
score_per_type[label].fn += 1
|
||||
totals = PRFScore()
|
||||
for prf in score_per_type.values():
|
||||
totals += prf
|
||||
if len(totals) > 0:
|
||||
return {
|
||||
"ents_p": totals.precision,
|
||||
"ents_r": totals.recall,
|
||||
"ents_f": totals.fscore,
|
||||
"ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"ents_p": None,
|
||||
"ents_r": None,
|
||||
"ents_f": None,
|
||||
"ents_per_type": None,
|
||||
}
|
||||
|
||||
|
||||
#############################################################################
|
||||
|
@ -726,7 +795,7 @@ def _roc_auc_score(y_true, y_score):
|
|||
<https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
|
||||
"""
|
||||
if len(np.unique(y_true)) != 2:
|
||||
raise ValueError(Errors.E165)
|
||||
raise ValueError(Errors.E165.format(label=np.unique(y_true)))
|
||||
fpr, tpr, _ = _roc_curve(y_true, y_score)
|
||||
return _auc(fpr, tpr)
|
||||
|
||||
|
|
|
@ -218,11 +218,16 @@ def test_dependency_matcher_callback(en_vocab, doc):
|
|||
pattern = [
|
||||
{"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}},
|
||||
]
|
||||
nomatch_pattern = [
|
||||
{"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}},
|
||||
]
|
||||
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
mock = Mock()
|
||||
matcher.add("pattern", [pattern], on_match=mock)
|
||||
matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock)
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
mock.assert_called_once_with(matcher, doc, 0, matches)
|
||||
|
||||
# check that matches with and without callback are the same (#4590)
|
||||
|
|
|
@ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts):
|
|||
scores = nlp.evaluate(dev_examples)
|
||||
# "cat" is the only correct lemma
|
||||
assert scores["lemma_acc"] == pytest.approx(0.2)
|
||||
# the empty morphs are correct
|
||||
assert scores["morph_acc"] == pytest.approx(0.6)
|
||||
# no morphs are set
|
||||
assert scores["morph_acc"] == None
|
||||
|
||||
|
||||
def test_attributeruler_rule_order(nlp):
|
||||
|
|
|
@ -2,6 +2,7 @@ import pytest
|
|||
from spacy.language import Language
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.de import German
|
||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
from spacy.tokens import Doc
|
||||
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
|
||||
from thinc.api import Model, Linear, ConfigValidationError
|
||||
|
@ -156,15 +157,10 @@ def test_pipe_class_component_model():
|
|||
name = "test_class_component_model"
|
||||
default_config = {
|
||||
"model": {
|
||||
"@architectures": "spacy.TextCatEnsemble.v1",
|
||||
"exclusive_classes": False,
|
||||
"pretrained_vectors": None,
|
||||
"width": 64,
|
||||
"embed_size": 2000,
|
||||
"window_size": 1,
|
||||
"conv_depth": 2,
|
||||
"ngram_size": 1,
|
||||
"dropout": None,
|
||||
"@architectures": "spacy.TextCatEnsemble.v2",
|
||||
"tok2vec": DEFAULT_TOK2VEC_MODEL,
|
||||
"linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
|
||||
"no_output_layer": False},
|
||||
},
|
||||
"value1": 10,
|
||||
}
|
||||
|
|
|
@ -140,7 +140,7 @@ def test_overfitting_IO():
|
|||
nlp = English()
|
||||
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
|
||||
# Set exclusive labels
|
||||
config = {"model": {"exclusive_classes": True}}
|
||||
config = {"model": {"linear_model": {"exclusive_classes": True}}}
|
||||
textcat = nlp.add_pipe("textcat", config=config)
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
|
@ -192,9 +192,8 @@ def test_overfitting_IO():
|
|||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
|
||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
|
||||
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
|
||||
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
|
||||
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
|
||||
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
|
||||
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
|
||||
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
|
||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
|
||||
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
|
||||
],
|
||||
|
|
|
@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate
|
|||
from numpy.testing import assert_array_equal
|
||||
import numpy
|
||||
from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
|
||||
from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
|
||||
from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier
|
||||
from spacy.ml.staticvectors import StaticVectors
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.en.examples import sentences as EN_SENTENCES
|
||||
|
||||
|
||||
def get_textcat_kwargs():
|
||||
def get_textcat_bow_kwargs():
|
||||
return {
|
||||
"width": 64,
|
||||
"embed_size": 2000,
|
||||
"pretrained_vectors": None,
|
||||
"exclusive_classes": False,
|
||||
"exclusive_classes": True,
|
||||
"ngram_size": 1,
|
||||
"window_size": 1,
|
||||
"conv_depth": 2,
|
||||
"dropout": None,
|
||||
"nO": 7,
|
||||
"no_output_layer": False,
|
||||
"nO": 34,
|
||||
}
|
||||
|
||||
|
||||
def get_textcat_cnn_kwargs():
|
||||
return {
|
||||
"tok2vec": test_tok2vec(),
|
||||
"exclusive_classes": False,
|
||||
"nO": 13,
|
||||
}
|
||||
return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
|
||||
|
||||
|
||||
def get_all_params(model):
|
||||
|
@ -105,7 +96,7 @@ def test_multi_hash_embed():
|
|||
"seed,model_func,kwargs",
|
||||
[
|
||||
(0, build_Tok2Vec_model, get_tok2vec_kwargs()),
|
||||
(0, build_text_classifier, get_textcat_kwargs()),
|
||||
(0, build_bow_text_classifier, get_textcat_bow_kwargs()),
|
||||
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
|
||||
],
|
||||
)
|
||||
|
@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs):
|
|||
"seed,model_func,kwargs,get_X",
|
||||
[
|
||||
(0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
||||
(0, build_text_classifier, get_textcat_kwargs(), get_docs),
|
||||
(0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
|
||||
(0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
||||
],
|
||||
)
|
||||
|
@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X):
|
|||
"seed,dropout,model_func,kwargs,get_X",
|
||||
[
|
||||
(0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
|
||||
(0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs),
|
||||
(0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
|
||||
(0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
|
||||
],
|
||||
)
|
||||
|
|
|
@ -277,6 +277,62 @@ def test_tag_score(tagged_doc):
|
|||
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
|
||||
|
||||
|
||||
def test_partial_annotation(en_tokenizer):
|
||||
pred_doc = en_tokenizer("a b c d e")
|
||||
pred_doc[0].tag_ = "A"
|
||||
pred_doc[0].pos_ = "X"
|
||||
pred_doc[0].set_morph("Feat=Val")
|
||||
pred_doc[0].dep_ = "dep"
|
||||
|
||||
# unannotated reference
|
||||
ref_doc = en_tokenizer("a b c d e")
|
||||
ref_doc.has_unknown_spaces = True
|
||||
example = Example(pred_doc, ref_doc)
|
||||
scorer = Scorer()
|
||||
scores = scorer.score([example])
|
||||
for key in scores:
|
||||
# cats doesn't have an unset state
|
||||
if key.startswith("cats"):
|
||||
continue
|
||||
assert scores[key] == None
|
||||
|
||||
# partially annotated reference, not overlapping with predicted annotation
|
||||
ref_doc = en_tokenizer("a b c d e")
|
||||
ref_doc.has_unknown_spaces = True
|
||||
ref_doc[1].tag_ = "A"
|
||||
ref_doc[1].pos_ = "X"
|
||||
ref_doc[1].set_morph("Feat=Val")
|
||||
ref_doc[1].dep_ = "dep"
|
||||
example = Example(pred_doc, ref_doc)
|
||||
scorer = Scorer()
|
||||
scores = scorer.score([example])
|
||||
assert scores["token_acc"] == None
|
||||
assert scores["tag_acc"] == 0.0
|
||||
assert scores["pos_acc"] == 0.0
|
||||
assert scores["morph_acc"] == 0.0
|
||||
assert scores["dep_uas"] == 1.0
|
||||
assert scores["dep_las"] == 0.0
|
||||
assert scores["sents_f"] == None
|
||||
|
||||
# partially annotated reference, overlapping with predicted annotation
|
||||
ref_doc = en_tokenizer("a b c d e")
|
||||
ref_doc.has_unknown_spaces = True
|
||||
ref_doc[0].tag_ = "A"
|
||||
ref_doc[0].pos_ = "X"
|
||||
ref_doc[1].set_morph("Feat=Val")
|
||||
ref_doc[1].dep_ = "dep"
|
||||
example = Example(pred_doc, ref_doc)
|
||||
scorer = Scorer()
|
||||
scores = scorer.score([example])
|
||||
assert scores["token_acc"] == None
|
||||
assert scores["tag_acc"] == 1.0
|
||||
assert scores["pos_acc"] == 1.0
|
||||
assert scores["morph_acc"] == 0.0
|
||||
assert scores["dep_uas"] == 1.0
|
||||
assert scores["dep_las"] == 0.0
|
||||
assert scores["sents_f"] == None
|
||||
|
||||
|
||||
def test_roc_auc_score():
|
||||
# Binary classification, toy tests from scikit-learn test suite
|
||||
y_true = [0, 1]
|
||||
|
@ -334,7 +390,8 @@ def test_roc_auc_score():
|
|||
score = ROCAUCScore()
|
||||
score.score_set(0.25, 0)
|
||||
score.score_set(0.75, 0)
|
||||
assert score.score == -float("inf")
|
||||
with pytest.raises(ValueError):
|
||||
s = score.score
|
||||
|
||||
y_true = [1, 1]
|
||||
y_score = [0.25, 0.75]
|
||||
|
@ -344,4 +401,5 @@ def test_roc_auc_score():
|
|||
score = ROCAUCScore()
|
||||
score.score_set(0.25, 1)
|
||||
score.score_set(0.75, 1)
|
||||
assert score.score == -float("inf")
|
||||
with pytest.raises(ValueError):
|
||||
s = score.score
|
||||
|
|
|
@ -51,7 +51,7 @@ def test_readers():
|
|||
for example in train_corpus(nlp):
|
||||
nlp.update([example], sgd=optimizer)
|
||||
scores = nlp.evaluate(list(dev_corpus(nlp)))
|
||||
assert scores["cats_score"]
|
||||
assert scores["cats_score"] == 0.0
|
||||
# ensure the pipeline runs
|
||||
doc = nlp("Quick test")
|
||||
assert doc.cats
|
||||
|
|
|
@ -2,6 +2,7 @@ import numpy
|
|||
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
|
||||
from spacy.training import biluo_tags_to_spans, iob_to_biluo
|
||||
from spacy.training import Corpus, docs_to_json, Example
|
||||
from spacy.training.align import get_alignments
|
||||
from spacy.training.converters import json_to_docs
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc, DocBin
|
||||
|
@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc):
|
|||
assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
|
||||
|
||||
|
||||
@pytest.mark.skip("Outdated")
|
||||
@pytest.mark.parametrize(
|
||||
"tokens_a,tokens_b,expected",
|
||||
[
|
||||
(["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
|
||||
(["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])),
|
||||
(
|
||||
["a", "b", '"', "c"],
|
||||
['ab"', "c"],
|
||||
(4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
|
||||
([[0], [0], [0], [1]], [[0, 1, 2], [3]]),
|
||||
),
|
||||
(["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})),
|
||||
(["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])),
|
||||
(
|
||||
["ab", "c", "d"],
|
||||
["a", "b", "cd"],
|
||||
(6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}),
|
||||
([[0, 1], [2], [2]], [[0], [0], [1, 2]]),
|
||||
),
|
||||
(
|
||||
["a", "b", "cd"],
|
||||
["a", "b", "c", "d"],
|
||||
(3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
|
||||
([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
|
||||
),
|
||||
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
|
||||
([" ", "a"], ["a"], ([[], [0]], [[1]])),
|
||||
],
|
||||
)
|
||||
def test_align(tokens_a, tokens_b, expected): # noqa
|
||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa
|
||||
assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa
|
||||
a2b, b2a = get_alignments(tokens_a, tokens_b)
|
||||
assert (a2b, b2a) == expected # noqa
|
||||
# check symmetry
|
||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa
|
||||
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa
|
||||
a2b, b2a = get_alignments(tokens_b, tokens_a) # noqa
|
||||
assert (b2a, a2b) == expected # noqa
|
||||
|
||||
|
||||
def test_goldparse_startswith_space(en_tokenizer):
|
||||
|
@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer):
|
|||
assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]
|
||||
|
||||
|
||||
def test_goldparse_endswith_space(en_tokenizer):
|
||||
text = "a\n"
|
||||
doc = en_tokenizer(text)
|
||||
gold_words = ["a"]
|
||||
entities = ["U-DATE"]
|
||||
deps = ["ROOT"]
|
||||
heads = [0]
|
||||
example = Example.from_dict(
|
||||
doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
|
||||
)
|
||||
ner_tags = example.get_aligned_ner()
|
||||
assert ner_tags == ["U-DATE", "O"]
|
||||
assert example.get_aligned("DEP", as_string=True) == ["ROOT", None]
|
||||
|
||||
|
||||
def test_gold_constructor():
|
||||
"""Test that the Example constructor works fine"""
|
||||
nlp = English()
|
||||
|
@@ -676,6 +691,87 @@ def test_alignment_different_texts():
        Alignment.from_strings(other_tokens, spacy_tokens)


def test_alignment_spaces(en_vocab):
    # single leading whitespace
    other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."]
    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
    assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]

    # multiple leading whitespace tokens
    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
    assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]

    # both with leading whitespace, not identical
    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
    spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."]
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1]
    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6]
    assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2]
    assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7]

    # same leading whitespace, different tokenization
    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
    spacy_tokens = ["  ", "i", "listened", "to", "obama", "'s", "podcasts."]
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1]
    assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6]
    assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2]
    assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7]

    # only one with trailing whitespace
    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "]
    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0]
    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]

    # different trailing whitespace
    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "]
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0]
    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6]
    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1]
    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6]

    # same trailing whitespace, different tokenization
    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", "  "]
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1]
    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6]
    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2]
    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7]

    # differing whitespace is allowed
    other_tokens = ["a", " \n ", "b", "c"]
    spacy_tokens = ["a", "b", " ", "c"]
    align = Alignment.from_strings(other_tokens, spacy_tokens)
    assert list(align.x2y.dataXd) == [0, 1, 3]
    assert list(align.y2x.dataXd) == [0, 2, 3]

    # other differences in whitespace are allowed
    other_tokens = [" ", "a"]
    spacy_tokens = [" ", "a", " "]
    align = Alignment.from_strings(other_tokens, spacy_tokens)

    other_tokens = ["a", " "]
    spacy_tokens = ["a", " "]
    align = Alignment.from_strings(other_tokens, spacy_tokens)


def test_retokenized_docs(doc):
    a = doc.to_array(["TAG"])
    doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
@@ -399,14 +399,13 @@ cdef class Doc:
            return True
        cdef int i
        cdef int range_start = 0
        if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
            attr = SENT_START
        attr = intify_attr(attr)
        # adjust attributes
        if attr == HEAD:
            # HEAD does not have an unset state, so rely on DEP
            attr = DEP
        elif attr == self.vocab.strings["IS_SENT_START"]:
            # as in Matcher, allow IS_SENT_START as an alias of SENT_START
            attr = SENT_START
        # special cases for sentence boundaries
        if attr == SENT_START:
            if "sents" in self.user_hooks:
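For orientation, a hedged sketch of the behaviour this hunk adjusts, not taken from the commit itself: the attribute name is intified first, `HEAD` is checked via `DEP` (since `HEAD` has no unset state), and `IS_SENT_START` is accepted as an alias of `SENT_START`. The blank English pipeline below is an assumption for illustration.

```python
# Hedged sketch of Doc.has_annotation after this change.
import spacy

nlp = spacy.blank("en")
doc = nlp("A short example.")
print(doc.has_annotation("DEP"))            # False: no parser has run
print(doc.has_annotation("IS_SENT_START"))  # treated like SENT_START
```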
@@ -1,6 +1,6 @@
from .corpus import Corpus  # noqa: F401
from .example import Example, validate_examples, validate_get_examples  # noqa: F401
from .align import Alignment  # noqa: F401
from .alignment import Alignment  # noqa: F401
from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
spacy/training/align.pyx (new file, 66 lines)

@@ -0,0 +1,66 @@
from typing import List, Tuple
from itertools import chain
import re

from ..errors import Errors


def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
    # Create character-to-token mappings
    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
    str_a = "".join(A).lower()
    str_b = "".join(B).lower()
    cdef int len_str_a = len(str_a)
    cdef int len_str_b = len(str_b)
    # Check that the two texts only differ in whitespace and capitalization
    if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
            len_str_a != len(char_to_token_a) or \
            len_str_b != len(char_to_token_b):
        raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
    cdef int char_idx_a = 0
    cdef int char_idx_b = 0
    cdef int token_idx_a = 0
    cdef int token_idx_b = 0
    cdef int prev_token_idx_a = -1
    cdef int prev_token_idx_b = -1
    a2b = []
    b2a = []
    while char_idx_a < len_str_a and char_idx_b < len_str_b:
        # Find the current token position from the character position
        token_idx_a = char_to_token_a[char_idx_a]
        token_idx_b = char_to_token_b[char_idx_b]
        # Add a set for the next token if a token boundary has been crossed
        if prev_token_idx_a != token_idx_a:
            a2b.append(set())
        if prev_token_idx_b != token_idx_b:
            b2a.append(set())
        # Process the alignment at the current position
        if A[token_idx_a] == B[token_idx_b]:
            # Current tokens are identical
            a2b[-1].add(token_idx_b)
            b2a[-1].add(token_idx_a)
            char_idx_a += len(A[token_idx_a])
            char_idx_b += len(B[token_idx_b])
        elif str_a[char_idx_a] == str_b[char_idx_b]:
            # Current chars are identical
            a2b[-1].add(token_idx_b)
            b2a[-1].add(token_idx_a)
            char_idx_a += 1
            char_idx_b += 1
        elif str_a[char_idx_a].isspace():
            # Skip unaligned whitespace char in A
            char_idx_a += 1
        elif str_b[char_idx_b].isspace():
            # Skip unaligned whitespace char in B
            char_idx_b += 1
        else:
            # This should never happen
            raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
        prev_token_idx_a = token_idx_a
        prev_token_idx_b = token_idx_b
    # Process unaligned trailing whitespace
    a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:])))
    b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:])))
    # Return values as sorted lists per token position
    return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
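For orientation, the expected behaviour of the new aligner can be read directly off the test cases earlier in this diff. The following is an illustrative sketch (not part of the commit) of how `get_alignments` is used; the import path follows the `spacy.training.align` module added in `setup.py`:

```python
# Illustrative sketch based on the test cases above (not part of this commit).
# get_alignments maps token indices in A to token indices in B and vice versa;
# the texts may differ only in whitespace and capitalization, otherwise E949 is raised.
from spacy.training.align import get_alignments

a2b, b2a = get_alignments(["a", "b", "c"], ["ab", "c"])
assert a2b == [[0], [0], [1]]  # "a" and "b" both fall inside "ab"
assert b2a == [[0, 1], [2]]    # "ab" covers both "a" and "b"

# whitespace-only tokens align to nothing
a2b, b2a = get_alignments([" ", "a"], ["a"])
assert (a2b, b2a) == ([[], [0]], [[1]])
```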
@@ -2,9 +2,8 @@ from typing import List
import numpy
from thinc.types import Ragged
from dataclasses import dataclass
import tokenizations

from ..errors import Errors
from .align import get_alignments


@dataclass

@@ -20,9 +19,7 @@ class Alignment:

    @classmethod
    def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
        if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
            raise ValueError(Errors.E949)
        x2y, y2x = tokenizations.get_alignments(A, B)
        x2y, y2x = get_alignments(A, B)
        return Alignment.from_indices(x2y=x2y, y2x=y2x)
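At the API level, the swap away from the external `tokenizations` module is invisible: `Alignment.from_strings` still exposes `Ragged` views in `x2y` and `y2x`. A minimal sketch, with values taken from `test_alignment_spaces` above:

```python
# Minimal sketch (values taken from test_alignment_spaces above).
from spacy.training import Alignment

other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
# Ragged views: lengths gives the number of aligned tokens per position,
# dataXd the flat target indices.
assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
```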
@@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
from ..tokens.span cimport Span
from ..tokens.span import Span
from ..attrs import IDS
from .align import Alignment
from .alignment import Alignment
from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings
@@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    if not isinstance(T["train_corpus"], str):
        raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"])))
    if not isinstance(T["dev_corpus"], str):
        raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"])))
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
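For context on the new `E897` check above: `training.train_corpus` and `training.dev_corpus` are expected to be dot-name strings that point at reader blocks elsewhere in the config, which `resolve_dot_names` then resolves. Roughly like the following fragment, which is illustrative and not part of this diff:

```ini
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
```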
@@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..errors import Errors
from ..util import registry, load_model_from_config, dot_to_object
from ..util import registry, load_model_from_config, resolve_dot_names


def pretrain(

@@ -38,7 +38,7 @@ def pretrain(
    _config = nlp.config.interpolate()
    T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
    P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
    corpus = dot_to_object(T, P["corpus"])
    corpus = resolve_dot_names(_config, [P["corpus"]])[0]
    batcher = P["batcher"]
    model = create_pretraining_model(nlp, P)
    optimizer = P["optimizer"]
@@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline.

Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build a mixed representations. The features used
a feed-forward subnetwork to build a mixed representation. The features used
can be configured with the `attrs` argument. The suggested attributes are
`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
some subword information, without construction a fully character-based
@@ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with
different architectures and settings to determine what works best on your
specific data and challenge.

### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}

> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatEnsemble.v1"
> exclusive_classes = false
> pretrained_vectors = null
> width = 64
> embed_size = 2000
> conv_depth = 2
> window_size = 1
> ngram_size = 1
> dropout = null
> @architectures = "spacy.TextCatEnsemble.v2"
> nO = null
>
> [model.linear_model]
> @architectures = "spacy.TextCatBOW.v1"
> exclusive_classes = true
> ngram_size = 1
> no_output_layer = false
>
> [model.tok2vec]
> @architectures = "spacy.Tok2Vec.v1"
>
> [model.tok2vec.embed]
> @architectures = "spacy.MultiHashEmbed.v1"
> width = 64
> rows = [2000, 2000, 1000, 1000, 1000, 1000]
> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
> include_static_vectors = false
>
> [model.tok2vec.encode]
> @architectures = "spacy.MaxoutWindowEncoder.v1"
> width = ${model.tok2vec.embed.width}
> window_size = 1
> maxout_pieces = 3
> depth = 2
> ```

Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.
Stacked ensemble of a linear bag-of-words model and a neural network model. The
neural network is built upon a Tok2Vec layer and uses attention. The setting for
whether or not this model should cater for multi-label classification, is taken
from the linear model, where it is stored in `model.attrs["multi_label"]`.

| Name           | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~ |
| `tok2vec`      | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ |
| `nO`           | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES**    | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

<Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>

The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as argument.

| Name                 | Description |
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |

@@ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `nO`        | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

</Accordion>

### spacy.TextCatCNN.v1 {#TextCatCNN}

> #### Example Config
@@ -683,6 +683,7 @@ The L2 norm of the document's vector representation.
| `user_hooks`          | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks`    | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks`     | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
| `has_unknown_spaces`  | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
| `_`                   | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |

## Serialization fields {#serialization-fields}
@@ -68,6 +68,8 @@ Scores the tokenization:
- `token_p`, `token_r`, `token_f`: precision, recall and F-score for token
  character spans

Docs with `has_unknown_spaces` are skipped during scoring.

> #### Example
>
> ```python

@@ -81,7 +83,8 @@ Scores the tokenization:

## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}

Scores a single token attribute.
Scores a single token attribute. Tokens with missing values in the reference doc
are skipped during scoring.

> #### Example
>

@@ -90,20 +93,22 @@ Scores a single token attribute.
> print(scores["pos_acc"])
> ```

| Name           | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`         | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter`       | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| **RETURNS**    | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |
| Name             | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`           | The attribute to score. ~~str~~ |
| _keyword-only_   | |
| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS**      | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |

## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}

Scores a single token attribute per feature for a token attribute in the
Universal Dependencies
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
format.
format. Tokens with missing values in the reference doc are skipped during
scoring.

> #### Example
>

@@ -112,13 +117,14 @@ format.
> print(scores["morph_per_feat"])
> ```

| Name           | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`         | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter`       | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| **RETURNS**    | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
| Name             | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`           | The attribute to score. ~~str~~ |
| _keyword-only_   | |
| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS**      | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |

## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}

@@ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans.
> print(scores["ents_f"])
> ```

| Name           | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`         | The attribute to score. ~~str~~ |
| _keyword-only_ | |
| `getter`       | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
| **RETURNS**    | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
| Name             | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`           | The attribute to score. ~~str~~ |
| _keyword-only_   | |
| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~str~~ |
| **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |

## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}

Calculate the UAS, LAS, and LAS per type scores for dependency parses.
Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens
with missing values for the `attr` (typically `dep`) are skipped during scoring.

> #### Example
>

@@ -160,29 +168,40 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
> print(scores["dep_uas"], scores["dep_las"])
> ```

| Name            | Description |
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`      | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`          | The attribute to score. ~~str~~ |
| _keyword-only_  | |
| `getter`        | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `head_attr`     | The attribute containing the head token. ~~str~~ |
| `head_getter`   | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
| **RETURNS**     | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
| Name             | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr`           | The attribute to score. ~~str~~ |
| _keyword-only_   | |
| `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `head_attr`      | The attribute containing the head token. ~~str~~ |
| `head_getter`    | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ |
| `ignore_labels`  | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ |
| **RETURNS**      | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |

## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}

Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
containing scores for each label like `Doc.cats`. The reported overall score
depends on the scorer settings:
containing scores for each label like `Doc.cats`. The returned dictionary
contains the following scores:

1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
   `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
   score), `{attr}_f_per_type`, `{attr}_auc_per_type`
2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across
  each label is weighted equally
- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values
  across evaluations per label
- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of
  scores, keyed by label
- A final `{attr}_score` and corresponding `{attr}_score_desc` (text
  description)

The reported `{attr}_score` depends on the classification properties:

- **binary exclusive with positive label:** `{attr}_score` is set to the F-score
  of the positive label
- **3+ exclusive classes**, macro-averaged F-score:
  `{attr}_score = {attr}_macro_f`
- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc`

> #### Example
>
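To make the key layout above concrete, here is a hedged sketch of calling `Scorer.score_cats` and reading the documented keys; the tiny `Example` constructed below is an assumption for illustration and is not taken from this diff:

```python
# Hedged sketch: reading the score_cats output keys documented above.
from spacy.lang.en import English
from spacy.scorer import Scorer
from spacy.training import Example

nlp = English()
ref_doc = nlp.make_doc("This was great")
example = Example.from_dict(ref_doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
example.predicted.cats = {"POSITIVE": 0.9, "NEGATIVE": 0.1}

scores = Scorer.score_cats(
    [example], "cats", labels=["POSITIVE", "NEGATIVE"], multi_label=False
)
print(scores["cats_score"], scores["cats_score_desc"])
print(scores["cats_macro_f"], scores["cats_f_per_type"])
```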
@@ -115,7 +115,7 @@ print(french_fries, "<->", burgers, french_fries.similarity(burgers))

Computing similarity scores can be helpful in many situations, but it's also
important to maintain **realistic expectations** about what information it can
provide. Words can be related to each over in many ways, so a single
provide. Words can be related to each other in many ways, so a single
"similarity" score will always be a **mix of different signals**, and vectors
trained on different data can produce very different results that may not be
useful for your purpose. Here are some important considerations to keep in mind:
@@ -130,16 +130,31 @@ factory = "textcat"
labels = []

[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
exclusive_classes = false
pretrained_vectors = null
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
dropout = 0
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"

[components.textcat.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false

[components.textcat.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${components.textcat.model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 2

[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
```

spaCy has two additional built-in `textcat` architectures, and you can easily
@@ -687,7 +702,7 @@ Before the model can be used, it needs to be
[initialized](/usage/training#initialization). This function receives a callback
to access the full **training data set**, or a representative sample. This data
set can be used to deduce all **relevant labels**. Alternatively, a list of
labels can be provided to `initialize`, or you can call
labels can be provided to `initialize`, or you can call
`RelationExtractor.add_label` directly. The number of labels defines the output
dimensionality of the network, and will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
@@ -1244,15 +1244,10 @@ labels = []
# This function is created and then passed to the "textcat" component as
# the argument "model"
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
pretrained_vectors = null
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
dropout = null
no_output_layer = false

[components.other_textcat]
factory = "textcat"
@@ -1142,7 +1142,7 @@ pattern = [
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    }
    # ...

@@ -1212,7 +1212,7 @@ pattern = [
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
    {
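The rename to `founded_subject` only changes the node name within the pattern. For readers following along, this is a hedged sketch of how such a pattern is used with the `DependencyMatcher`; the anchor node, pipeline package and example sentence are assumptions loosely based on the surrounding docs, not part of this diff:

```python
# Hedged sketch of using a pattern like the one above with DependencyMatcher.
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
pattern = [
    {"RIGHT_ID": "anchor_founded", "RIGHT_ATTRS": {"ORTH": "founded"}},
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
]
matcher = DependencyMatcher(nlp.vocab)
matcher.add("FOUNDED", [pattern])
doc = nlp("Smith founded a healthcare company in 2005.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # e.g. ["founded", "Smith"]
```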
@@ -717,7 +717,7 @@ tabular results to a file:

```python
### functions.py
import sys
from typing import IO, Tuple, Callable, Dict, Any
from typing import IO, Tuple, Callable, Dict, Any, Optional
import spacy
from spacy import Language
from pathlib import Path

@@ -729,7 +729,7 @@ def custom_logger(log_path):
        stdout: IO=sys.stdout,
        stderr: IO=sys.stderr
    ) -> Tuple[Callable, Callable]:
        stdout.write(f"Logging to {log_path}\n")
        stdout.write(f"Logging to {log_path}\\n")
        log_file = Path(log_path).open("w", encoding="utf8")
        log_file.write("step\\t")
        log_file.write("score\\t")
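For readers skimming only this hunk: the snippet lives inside the custom-logger example in the docs, which is registered so that the training config can refer to it by name. A hedged sketch of that outer structure, where the registry name `"my_custom_logger.v1"` and the inner callback bodies are assumptions for illustration:

```python
# Hedged sketch of registering a custom logger like the one above.
import sys
from typing import IO, Tuple, Callable
import spacy
from spacy import Language

@spacy.registry.loggers("my_custom_logger.v1")
def custom_logger(log_path):
    def setup_printer(
        nlp: Language,
        stdout: IO = sys.stdout,
        stderr: IO = sys.stderr,
    ) -> Tuple[Callable, Callable]:
        def log_step(info):
            ...  # write one tabular row per step

        def finalize():
            ...  # close the log file

        return log_step, finalize

    return setup_printer
```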
@@ -433,14 +433,14 @@ The following methods, attributes and commands are new in spaCy v3.0.
| Name | Description |
| --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
| [`Token.morph`](/api/token#attributes) | Access a token's morphological analysis. |
| [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. |
| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
| [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
| [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
| [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
| [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. |

@@ -1032,9 +1032,9 @@ change your names and imports:
Thanks to everyone who's been contributing to the spaCy ecosystem by developing
and maintaining one of the many awesome [plugins and extensions](/universe).
We've tried to make it as easy as possible for you to upgrade your packages for
spaCy v3.0. The most common use case for plugins is providing pipeline components
and extension attributes. When migrating your plugin, double-check the
following:
spaCy v3.0. The most common use case for plugins is providing pipeline
components and extension attributes. When migrating your plugin, double-check
the following:

- Use the [`@Language.factory`](/api/language#factory) decorator to register
  your component and assign it a name. This allows users to refer to your
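The first checklist item above refers to the v3 factory decorator; a minimal sketch of what that registration looks like, where the component name `"my_component"` and the `"some_setting"` default are assumptions for illustration:

```python
# Minimal sketch of registering a custom component factory in spaCy v3.
from spacy.language import Language
from spacy.tokens import Doc

@Language.factory("my_component", default_config={"some_setting": True})
def create_my_component(nlp: Language, name: str, some_setting: bool):
    def my_component(doc: Doc) -> Doc:
        # this sketch just passes the Doc through unchanged
        return doc

    return my_component
```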
@@ -257,7 +257,7 @@ output_path.open("w", encoding="utf-8").write(svg)

Since each visualization is generated as a separate SVG, exporting `.svg` files
only works if you're rendering **one single doc** at a time. (This makes sense –
after all, each visualization should be a standalone graphic.) So instead of
rendering all `Doc`s at one, loop over them and export them separately.
rendering all `Doc`s at once, loop over them and export them separately.

</Infobox>
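A hedged sketch of the per-doc export loop this paragraph recommends; the file names and the `en_core_web_sm` pipeline are assumptions for illustration:

```python
# Hedged sketch of exporting one SVG per Doc, as recommended above.
from pathlib import Path
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
docs = [nlp("This is one sentence."), nlp("This is another one.")]
for i, doc in enumerate(docs):
    svg = displacy.render(doc, style="dep")
    Path(f"sentence_{i}.svg").open("w", encoding="utf-8").write(svg)
```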
@@ -120,7 +120,7 @@ function formatAccuracy(data) {
                ? null
                : {
                      label,
                      value: value.toFixed(2),
                      value: (value * 100).toFixed(2),
                      help: MODEL_META[label],
                  }
        })