mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-11 00:32:40 +03:00
Merge branch 'v4' into feature/multiple-code-files
This commit is contained in:
commit
7ef87e24ca
|
@ -1248,17 +1248,12 @@ class Language:
|
||||||
component_cfg[name].setdefault("drop", drop)
|
component_cfg[name].setdefault("drop", drop)
|
||||||
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
|
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
# ignore statements are used here because mypy ignores hasattr
|
if (
|
||||||
if name not in exclude and hasattr(proc, "update"):
|
name not in exclude
|
||||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore
|
and isinstance(proc, ty.TrainableComponent)
|
||||||
if sgd not in (None, False):
|
and proc.is_trainable
|
||||||
if (
|
):
|
||||||
name not in exclude
|
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
||||||
and isinstance(proc, ty.TrainableComponent)
|
|
||||||
and proc.is_trainable
|
|
||||||
and proc.model not in (True, False, None)
|
|
||||||
):
|
|
||||||
proc.finish_update(sgd)
|
|
||||||
if name in annotates:
|
if name in annotates:
|
||||||
for doc, eg in zip(
|
for doc, eg in zip(
|
||||||
_pipe(
|
_pipe(
|
||||||
|
@ -1271,6 +1266,17 @@ class Language:
|
||||||
examples,
|
examples,
|
||||||
):
|
):
|
||||||
eg.predicted = doc
|
eg.predicted = doc
|
||||||
|
# Only finish the update after all component updates are done. Some
|
||||||
|
# components may share weights (such as tok2vec) and we only want
|
||||||
|
# to apply weight updates after all gradients are accumulated.
|
||||||
|
for name, proc in self.pipeline:
|
||||||
|
if (
|
||||||
|
name not in exclude
|
||||||
|
and isinstance(proc, ty.TrainableComponent)
|
||||||
|
and proc.is_trainable
|
||||||
|
):
|
||||||
|
proc.finish_update(sgd)
|
||||||
|
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def rehearse(
|
def rehearse(
|
||||||
|
|
|
@ -27,9 +27,6 @@ ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
|
||||||
|
|
||||||
KNOWLEDGE_BASE_IDS = "kb_ids"
|
KNOWLEDGE_BASE_IDS = "kb_ids"
|
||||||
|
|
||||||
# See #9050
|
|
||||||
BACKWARD_OVERWRITE = True
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
@architectures = "spacy.EntityLinker.v2"
|
@architectures = "spacy.EntityLinker.v2"
|
||||||
|
@ -60,7 +57,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
"entity_vector_length": 64,
|
"entity_vector_length": 64,
|
||||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||||
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||||
"overwrite": True,
|
"overwrite": False,
|
||||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||||
"use_gold_ents": True,
|
"use_gold_ents": True,
|
||||||
"candidates_batch_size": 1,
|
"candidates_batch_size": 1,
|
||||||
|
@ -191,7 +188,7 @@ class EntityLinker(TrainablePipe):
|
||||||
get_candidates_batch: Callable[
|
get_candidates_batch: Callable[
|
||||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||||
],
|
],
|
||||||
overwrite: bool = BACKWARD_OVERWRITE,
|
overwrite: bool = False,
|
||||||
scorer: Optional[Callable] = entity_linker_score,
|
scorer: Optional[Callable] = entity_linker_score,
|
||||||
use_gold_ents: bool,
|
use_gold_ents: bool,
|
||||||
candidates_batch_size: int,
|
candidates_batch_size: int,
|
||||||
|
@ -215,6 +212,7 @@ class EntityLinker(TrainablePipe):
|
||||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||||
Iterable[Candidate]]
|
Iterable[Candidate]]
|
||||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||||
|
overwrite (bool): Whether to overwrite existing non-empty annotations.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||||
component must provide entity annotations.
|
component must provide entity annotations.
|
||||||
|
|
|
@ -21,10 +21,6 @@ from ..scorer import Scorer
|
||||||
from ..training import validate_examples, validate_get_examples
|
from ..training import validate_examples, validate_get_examples
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
|
|
||||||
# See #9050
|
|
||||||
BACKWARD_OVERWRITE = True
|
|
||||||
BACKWARD_EXTEND = False
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
@architectures = "spacy.Tagger.v2"
|
@architectures = "spacy.Tagger.v2"
|
||||||
|
@ -102,8 +98,8 @@ class Morphologizer(Tagger):
|
||||||
model: Model,
|
model: Model,
|
||||||
name: str = "morphologizer",
|
name: str = "morphologizer",
|
||||||
*,
|
*,
|
||||||
overwrite: bool = BACKWARD_OVERWRITE,
|
overwrite: bool = False,
|
||||||
extend: bool = BACKWARD_EXTEND,
|
extend: bool = False,
|
||||||
scorer: Optional[Callable] = morphologizer_score,
|
scorer: Optional[Callable] = morphologizer_score,
|
||||||
save_activations: bool = False,
|
save_activations: bool = False,
|
||||||
):
|
):
|
||||||
|
@ -113,6 +109,8 @@ class Morphologizer(Tagger):
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
|
overwrite (bool): Whether to overwrite existing annotations.
|
||||||
|
extend (bool): Whether to extend existing annotations.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||||
Scorer.score_token_attr_per_feat for the attribute "morph".
|
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||||
|
|
|
@ -10,9 +10,6 @@ from ..language import Language
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
# see #9050
|
|
||||||
BACKWARD_OVERWRITE = False
|
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"sentencizer",
|
"sentencizer",
|
||||||
assigns=["token.is_sent_start", "doc.sents"],
|
assigns=["token.is_sent_start", "doc.sents"],
|
||||||
|
@ -52,13 +49,14 @@ class Sentencizer(Pipe):
|
||||||
name="sentencizer",
|
name="sentencizer",
|
||||||
*,
|
*,
|
||||||
punct_chars=None,
|
punct_chars=None,
|
||||||
overwrite=BACKWARD_OVERWRITE,
|
overwrite=False,
|
||||||
scorer=senter_score,
|
scorer=senter_score,
|
||||||
):
|
):
|
||||||
"""Initialize the sentencizer.
|
"""Initialize the sentencizer.
|
||||||
|
|
||||||
punct_chars (list): Punctuation characters to split on. Will be
|
punct_chars (list): Punctuation characters to split on. Will be
|
||||||
serialized with the nlp object.
|
serialized with the nlp object.
|
||||||
|
overwrite (bool): Whether to overwrite existing annotations.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
Scorer.score_spans for the attribute "sents".
|
Scorer.score_spans for the attribute "sents".
|
||||||
|
|
||||||
|
|
|
@ -18,8 +18,6 @@ from ..training import validate_examples, validate_get_examples
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
# See #9050
|
|
||||||
BACKWARD_OVERWRITE = False
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
|
@ -83,7 +81,7 @@ class SentenceRecognizer(Tagger):
|
||||||
model,
|
model,
|
||||||
name="senter",
|
name="senter",
|
||||||
*,
|
*,
|
||||||
overwrite=BACKWARD_OVERWRITE,
|
overwrite=False,
|
||||||
scorer=senter_score,
|
scorer=senter_score,
|
||||||
save_activations: bool = False,
|
save_activations: bool = False,
|
||||||
):
|
):
|
||||||
|
@ -93,6 +91,7 @@ class SentenceRecognizer(Tagger):
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
|
overwrite (bool): Whether to overwrite existing annotations.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
Scorer.score_spans for the attribute "sents".
|
Scorer.score_spans for the attribute "sents".
|
||||||
save_activations (bool): save model activations in Doc when annotating.
|
save_activations (bool): save model activations in Doc when annotating.
|
||||||
|
|
|
@ -27,9 +27,6 @@ from .. import util
|
||||||
|
|
||||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||||
|
|
||||||
# See #9050
|
|
||||||
BACKWARD_OVERWRITE = False
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
@architectures = "spacy.Tagger.v2"
|
@architectures = "spacy.Tagger.v2"
|
||||||
|
@ -99,7 +96,7 @@ class Tagger(TrainablePipe):
|
||||||
model,
|
model,
|
||||||
name="tagger",
|
name="tagger",
|
||||||
*,
|
*,
|
||||||
overwrite=BACKWARD_OVERWRITE,
|
overwrite=False,
|
||||||
scorer=tagger_score,
|
scorer=tagger_score,
|
||||||
neg_prefix="!",
|
neg_prefix="!",
|
||||||
save_activations: bool = False,
|
save_activations: bool = False,
|
||||||
|
@ -110,6 +107,7 @@ class Tagger(TrainablePipe):
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
|
overwrite (bool): Whether to overwrite existing annotations.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
Scorer.score_token_attr for the attribute "tag".
|
Scorer.score_token_attr for the attribute "tag".
|
||||||
save_activations (bool): save model activations in Doc when annotating.
|
save_activations (bool): save model activations in Doc when annotating.
|
||||||
|
|
|
@ -175,6 +175,18 @@ def test_modify_span_group(doc):
|
||||||
assert group[0].label == doc.vocab.strings["TEST"]
|
assert group[0].label == doc.vocab.strings["TEST"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_char_span_attributes(doc):
|
||||||
|
label = "LABEL"
|
||||||
|
kb_id = "KB_ID"
|
||||||
|
span_id = "SPAN_ID"
|
||||||
|
span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
|
||||||
|
span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
|
||||||
|
assert span1.text == span2.text
|
||||||
|
assert span1.label_ == span2.label_ == label
|
||||||
|
assert span1.kb_id_ == span2.kb_id_ == kb_id
|
||||||
|
assert span1.id_ == span2.id_ == span_id
|
||||||
|
|
||||||
|
|
||||||
def test_spans_sent_spans(doc):
|
def test_spans_sent_spans(doc):
|
||||||
sents = list(doc.sents)
|
sents = list(doc.sents)
|
||||||
assert sents[0].start == 0
|
assert sents[0].start == 0
|
||||||
|
@ -354,6 +366,14 @@ def test_spans_by_character(doc):
|
||||||
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
|
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Span.char_span + alignment mode "contract"
|
||||||
|
span2 = doc[0:2].char_span(
|
||||||
|
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
|
||||||
|
)
|
||||||
|
assert span1.start_char == span2.start_char
|
||||||
|
assert span1.end_char == span2.end_char
|
||||||
|
assert span2.label_ == "GPE"
|
||||||
|
|
||||||
|
|
||||||
def test_span_to_array(doc):
|
def test_span_to_array(doc):
|
||||||
span = doc[1:-2]
|
span = doc[1:-2]
|
||||||
|
|
|
@ -54,9 +54,11 @@ def test_annotates_on_update():
|
||||||
return AssertSents(name)
|
return AssertSents(name)
|
||||||
|
|
||||||
class AssertSents:
|
class AssertSents:
|
||||||
|
model = None
|
||||||
|
is_trainable = True
|
||||||
|
|
||||||
def __init__(self, name, **cfg):
|
def __init__(self, name, **cfg):
|
||||||
self.name = name
|
self.name = name
|
||||||
pass
|
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
if not doc.has_annotation("SENT_START"):
|
if not doc.has_annotation("SENT_START"):
|
||||||
|
@ -64,10 +66,16 @@ def test_annotates_on_update():
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def update(self, examples, *, drop=0.0, sgd=None, losses=None):
|
def update(self, examples, *, drop=0.0, sgd=None, losses=None):
|
||||||
|
losses.setdefault(self.name, 0.0)
|
||||||
|
|
||||||
for example in examples:
|
for example in examples:
|
||||||
if not example.predicted.has_annotation("SENT_START"):
|
if not example.predicted.has_annotation("SENT_START"):
|
||||||
raise ValueError("No sents")
|
raise ValueError("No sents")
|
||||||
return {}
|
|
||||||
|
return losses
|
||||||
|
|
||||||
|
def finish_update(self, sgd=None):
|
||||||
|
pass
|
||||||
|
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("sentencizer")
|
nlp.add_pipe("sentencizer")
|
||||||
|
|
|
@ -1017,8 +1017,6 @@ def test_local_remote_storage_pull_missing():
|
||||||
|
|
||||||
|
|
||||||
def test_cli_find_threshold(capsys):
|
def test_cli_find_threshold(capsys):
|
||||||
thresholds = numpy.linspace(0, 1, 10)
|
|
||||||
|
|
||||||
def make_examples(nlp: Language) -> List[Example]:
|
def make_examples(nlp: Language) -> List[Example]:
|
||||||
docs: List[Example] = []
|
docs: List[Example] = []
|
||||||
|
|
||||||
|
@ -1082,8 +1080,6 @@ def test_cli_find_threshold(capsys):
|
||||||
scores_key="cats_macro_f",
|
scores_key="cats_macro_f",
|
||||||
silent=True,
|
silent=True,
|
||||||
)
|
)
|
||||||
assert best_threshold != thresholds[0]
|
|
||||||
assert thresholds[0] < best_threshold < thresholds[9]
|
|
||||||
assert best_score == max(res.values())
|
assert best_score == max(res.values())
|
||||||
assert res[1.0] == 0.0
|
assert res[1.0] == 0.0
|
||||||
|
|
||||||
|
@ -1091,7 +1087,7 @@ def test_cli_find_threshold(capsys):
|
||||||
nlp, _ = init_nlp((("spancat", {}),))
|
nlp, _ = init_nlp((("spancat", {}),))
|
||||||
with make_tempdir() as nlp_dir:
|
with make_tempdir() as nlp_dir:
|
||||||
nlp.to_disk(nlp_dir)
|
nlp.to_disk(nlp_dir)
|
||||||
res = find_threshold(
|
best_threshold, best_score, res = find_threshold(
|
||||||
model=nlp_dir,
|
model=nlp_dir,
|
||||||
data_path=docs_dir / "docs.spacy",
|
data_path=docs_dir / "docs.spacy",
|
||||||
pipe_name="spancat",
|
pipe_name="spancat",
|
||||||
|
@ -1099,10 +1095,8 @@ def test_cli_find_threshold(capsys):
|
||||||
scores_key="spans_sc_f",
|
scores_key="spans_sc_f",
|
||||||
silent=True,
|
silent=True,
|
||||||
)
|
)
|
||||||
assert res[0] != thresholds[0]
|
assert best_score == max(res.values())
|
||||||
assert thresholds[0] < res[0] < thresholds[8]
|
assert res[1.0] == 0.0
|
||||||
assert res[1] >= 0.6
|
|
||||||
assert res[2][1.0] == 0.0
|
|
||||||
|
|
||||||
# Having multiple textcat_multilabel components should work, since the name has to be specified.
|
# Having multiple textcat_multilabel components should work, since the name has to be specified.
|
||||||
nlp, _ = init_nlp((("textcat_multilabel", {}),))
|
nlp, _ = init_nlp((("textcat_multilabel", {}),))
|
||||||
|
|
|
@ -9,7 +9,7 @@ import spacy
|
||||||
from spacy.cli._util import app
|
from spacy.cli._util import app
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.tokens import DocBin
|
from spacy.tokens import DocBin
|
||||||
from .util import make_tempdir
|
from .util import make_tempdir, normalize_whitespace
|
||||||
|
|
||||||
|
|
||||||
def test_convert_auto():
|
def test_convert_auto():
|
||||||
|
@ -247,8 +247,8 @@ def test_benchmark_accuracy_alias():
|
||||||
# Verify that the `evaluate` alias works correctly.
|
# Verify that the `evaluate` alias works correctly.
|
||||||
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
|
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
|
||||||
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
|
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
|
||||||
assert result_benchmark.stdout == result_evaluate.stdout.replace(
|
assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
|
||||||
"spacy evaluate", "spacy benchmark accuracy"
|
result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,8 +10,9 @@ from spacy.training import Example
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
from spacy.util import registry, ignore_error, raise_error, find_matching_language
|
from spacy.util import registry, ignore_error, raise_error, find_matching_language
|
||||||
|
from spacy.util import load_model_from_config
|
||||||
import spacy
|
import spacy
|
||||||
from thinc.api import CupyOps, NumpyOps, get_current_ops
|
from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
|
||||||
|
|
||||||
from .util import add_vecs_to_vocab, assert_docs_equal
|
from .util import add_vecs_to_vocab, assert_docs_equal
|
||||||
|
|
||||||
|
@ -25,6 +26,51 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
TAGGER_CFG_STRING = """
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["tok2vec","tagger"]
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v2"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v2"
|
||||||
|
|
||||||
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
rows = [2000, 1000, 1000, 1000]
|
||||||
|
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||||
|
include_static_vectors = false
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||||
|
width = 96
|
||||||
|
depth = 4
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
TAGGER_TRAIN_DATA = [
|
||||||
|
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
|
||||||
|
("Eat blue ham", {"tags": ["V", "J", "N"]}),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
TAGGER_TRAIN_DATA = [
|
TAGGER_TRAIN_DATA = [
|
||||||
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
|
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
|
||||||
|
@ -91,6 +137,26 @@ def test_language_update(nlp):
|
||||||
example = Example.from_dict(doc, wrongkeyannots)
|
example = Example.from_dict(doc, wrongkeyannots)
|
||||||
|
|
||||||
|
|
||||||
|
def test_language_update_updates():
|
||||||
|
config = Config().from_str(TAGGER_CFG_STRING)
|
||||||
|
nlp = load_model_from_config(config, auto_fill=True, validate=True)
|
||||||
|
|
||||||
|
train_examples = []
|
||||||
|
for t in TAGGER_TRAIN_DATA:
|
||||||
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
|
|
||||||
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
|
||||||
|
docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
|
||||||
|
nlp.update(train_examples, sgd=optimizer)
|
||||||
|
docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
|
||||||
|
|
||||||
|
xp = get_array_module(docs_after_update[0].tensor)
|
||||||
|
assert xp.any(
|
||||||
|
xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_language_evaluate(nlp):
|
def test_language_evaluate(nlp):
|
||||||
text = "hello world"
|
text = "hello world"
|
||||||
annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
|
annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import numpy
|
import numpy
|
||||||
import tempfile
|
import tempfile
|
||||||
import contextlib
|
import contextlib
|
||||||
|
import re
|
||||||
import srsly
|
import srsly
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
|
||||||
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
|
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
|
||||||
assert k1 == k2
|
assert k1 == k2
|
||||||
assert v1 == v2
|
assert v1 == v2
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_whitespace(s):
|
||||||
|
return re.sub(r"\s+", " ", s)
|
||||||
|
|
|
@ -108,6 +108,7 @@ class Doc:
|
||||||
kb_id: Union[int, str] = ...,
|
kb_id: Union[int, str] = ...,
|
||||||
vector: Optional[Floats1d] = ...,
|
vector: Optional[Floats1d] = ...,
|
||||||
alignment_mode: str = ...,
|
alignment_mode: str = ...,
|
||||||
|
span_id: Union[int, str] = ...,
|
||||||
) -> Span: ...
|
) -> Span: ...
|
||||||
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
|
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -528,9 +528,9 @@ cdef class Doc:
|
||||||
doc (Doc): The parent document.
|
doc (Doc): The parent document.
|
||||||
start_idx (int): The index of the first character of the span.
|
start_idx (int): The index of the first character of the span.
|
||||||
end_idx (int): The index of the first character after the span.
|
end_idx (int): The index of the first character after the span.
|
||||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
label (Union[int, str]): A label to attach to the Span, e.g. for
|
||||||
named entities.
|
named entities.
|
||||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a
|
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a
|
||||||
named entity.
|
named entity.
|
||||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||||
the span.
|
the span.
|
||||||
|
@ -539,6 +539,7 @@ cdef class Doc:
|
||||||
with token boundaries), "contract" (span of all tokens completely
|
with token boundaries), "contract" (span of all tokens completely
|
||||||
within the character span), "expand" (span of all tokens at least
|
within the character span), "expand" (span of all tokens at least
|
||||||
partially covered by the character span). Defaults to "strict".
|
partially covered by the character span). Defaults to "strict".
|
||||||
|
span_id (Union[int, str]): An identifier to associate with the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#char_span
|
DOCS: https://spacy.io/api/doc#char_span
|
||||||
|
|
|
@ -96,6 +96,9 @@ class Span:
|
||||||
label: Union[int, str] = ...,
|
label: Union[int, str] = ...,
|
||||||
kb_id: Union[int, str] = ...,
|
kb_id: Union[int, str] = ...,
|
||||||
vector: Optional[Floats1d] = ...,
|
vector: Optional[Floats1d] = ...,
|
||||||
|
id: Union[int, str] = ...,
|
||||||
|
alignment_mode: str = ...,
|
||||||
|
span_id: Union[int, str] = ...,
|
||||||
) -> Span: ...
|
) -> Span: ...
|
||||||
@property
|
@property
|
||||||
def conjuncts(self) -> Tuple[Token]: ...
|
def conjuncts(self) -> Tuple[Token]: ...
|
||||||
|
|
|
@ -656,22 +656,29 @@ cdef class Span:
|
||||||
else:
|
else:
|
||||||
return self.doc[root]
|
return self.doc[root]
|
||||||
|
|
||||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
|
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
|
||||||
"""Create a `Span` object from the slice `span.text[start : end]`.
|
"""Create a `Span` object from the slice `span.text[start : end]`.
|
||||||
|
|
||||||
start (int): The index of the first character of the span.
|
start (int): The index of the first character of the span.
|
||||||
end (int): The index of the first character after the span.
|
end (int): The index of the first character after the span.
|
||||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
label (Union[int, str]): A label to attach to the Span, e.g. for
|
||||||
named entities.
|
named entities.
|
||||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
|
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
|
||||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||||
the span.
|
the span.
|
||||||
|
id (Union[int, str]): Unused.
|
||||||
|
alignment_mode (str): How character indices are aligned to token
|
||||||
|
boundaries. Options: "strict" (character indices must be aligned
|
||||||
|
with token boundaries), "contract" (span of all tokens completely
|
||||||
|
within the character span), "expand" (span of all tokens at least
|
||||||
|
partially covered by the character span). Defaults to "strict".
|
||||||
|
span_id (Union[int, str]): An identifier to associate with the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
cdef SpanC* span_c = self.span_c()
|
cdef SpanC* span_c = self.span_c()
|
||||||
start_idx += span_c.start_char
|
start_idx += span_c.start_char
|
||||||
end_idx += span_c.start_char
|
end_idx += span_c.start_char
|
||||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
|
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def conjuncts(self):
|
def conjuncts(self):
|
||||||
|
|
|
@ -210,7 +210,7 @@ def train_while_improving(
|
||||||
subbatch,
|
subbatch,
|
||||||
drop=dropout,
|
drop=dropout,
|
||||||
losses=losses,
|
losses=losses,
|
||||||
sgd=False, # type: ignore[arg-type]
|
sgd=None,
|
||||||
exclude=exclude,
|
exclude=exclude,
|
||||||
annotates=annotating_components,
|
annotates=annotating_components,
|
||||||
)
|
)
|
||||||
|
|
|
@ -1410,12 +1410,13 @@ $ python -m spacy project assets [project_dir]
|
||||||
> $ python -m spacy project assets [--sparse]
|
> $ python -m spacy project assets [--sparse]
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||||
| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
|
| `--extra`, `-e` <Tag variant="new">3.3.1</Tag> | Download assets marked as "extra". Default false. ~~bool (flag)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
|
||||||
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
|
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
|
||||||
|
|
||||||
### project run {id="project-run",tag="command"}
|
### project run {id="project-run",tag="command"}
|
||||||
|
|
||||||
|
|
|
@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
||||||
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
|
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
|
||||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
|
@ -209,15 +209,16 @@ alignment mode `"strict"`.
|
||||||
> assert span.text == "New York"
|
> assert span.text == "New York"
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `start` | The index of the first character of the span. ~~int~~ |
|
| `start` | The index of the first character of the span. ~~int~~ |
|
||||||
| `end` | The index of the first character after the span. ~~int~~ |
|
| `end` | The index of the first character after the span. ~~int~~ |
|
||||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
| `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
|
||||||
|
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||||
|
|
||||||
## Doc.set_ents {id="set_ents",tag="method",version="3"}
|
## Doc.set_ents {id="set_ents",tag="method",version="3"}
|
||||||
|
|
||||||
|
|
|
@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters.
|
||||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
|
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
|
||||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
|
||||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||||
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
|
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
|
||||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||||
|
|
|
@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
|
||||||
| Setting | Description |
|
| Setting | Description |
|
||||||
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
|
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ |
|
||||||
| `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
|
| `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
|
||||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
|
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
|
||||||
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
|
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
|
||||||
|
|
|
@ -186,14 +186,17 @@ the character indices don't map to a valid span.
|
||||||
> assert span.text == "New York"
|
> assert span.text == "New York"
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ----------------------------------------------------------------------------------------- |
|
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `start` | The index of the first character of the span. ~~int~~ |
|
| `start` | The index of the first character of the span. ~~int~~ |
|
||||||
| `end` | The index of the first character after the span. ~~int~~ |
|
| `end` | The index of the first character after the span. ~~int~~ |
|
||||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
| `id` | Unused. ~~Union[int, str]~~ |
|
||||||
|
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||||
|
| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
|
||||||
|
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||||
|
|
||||||
## Span.similarity {id="similarity",tag="method",model="vectors"}
|
## Span.similarity {id="similarity",tag="method",model="vectors"}
|
||||||
|
|
||||||
|
|
|
@ -21,8 +21,8 @@ menu:
|
||||||
## Package naming conventions {id="conventions"}
|
## Package naming conventions {id="conventions"}
|
||||||
|
|
||||||
In general, spaCy expects all pipeline packages to follow the naming convention
|
In general, spaCy expects all pipeline packages to follow the naming convention
|
||||||
of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name
|
of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into
|
||||||
into three components:
|
three components:
|
||||||
|
|
||||||
1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
|
1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
|
||||||
tagging, parsing, lemmatization and named entity recognition, or `dep` for
|
tagging, parsing, lemmatization and named entity recognition, or `dep` for
|
||||||
|
|
|
@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a
|
||||||
`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
|
`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
|
||||||
as explained in the [docs](/api/textcategorizer#assigned-attributes).
|
as explained in the [docs](/api/textcategorizer#assigned-attributes).
|
||||||
|
|
||||||
|
### Using the default knowledge base
|
||||||
|
|
||||||
|
As `KnowledgeBase` is now an abstract class, you should call the constructor of
|
||||||
|
the new `InMemoryLookupKB` instead when you want to use spaCy's default KB
|
||||||
|
implementation:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- kb = KnowledgeBase()
|
||||||
|
+ kb = InMemoryLookupKB()
|
||||||
|
```
|
||||||
|
|
||||||
|
If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to
|
||||||
|
implement its abstract methods, or alternatively inherit from `InMemoryLookupKB`
|
||||||
|
instead.
|
||||||
|
|
||||||
### Updated scorers for tokenization and textcat {id="scores"}
|
### Updated scorers for tokenization and textcat {id="scores"}
|
||||||
|
|
||||||
We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
|
We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
|
||||||
|
|
Loading…
Reference in New Issue
Block a user