diff --git a/spacy/language.py b/spacy/language.py index d2b89029d..fb86689bc 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1248,17 +1248,12 @@ class Language: component_cfg[name].setdefault("drop", drop) pipe_kwargs[name].setdefault("batch_size", self.batch_size) for name, proc in self.pipeline: - # ignore statements are used here because mypy ignores hasattr - if name not in exclude and hasattr(proc, "update"): - proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore - if sgd not in (None, False): - if ( - name not in exclude - and isinstance(proc, ty.TrainableComponent) - and proc.is_trainable - and proc.model not in (True, False, None) - ): - proc.finish_update(sgd) + if ( + name not in exclude + and isinstance(proc, ty.TrainableComponent) + and proc.is_trainable + ): + proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) if name in annotates: for doc, eg in zip( _pipe( @@ -1271,6 +1266,17 @@ class Language: examples, ): eg.predicted = doc + # Only finish the update after all component updates are done. Some + # components may share weights (such as tok2vec) and we only want + # to apply weight updates after all gradients are accumulated. + for name, proc in self.pipeline: + if ( + name not in exclude + and isinstance(proc, ty.TrainableComponent) + and proc.is_trainable + ): + proc.finish_update(sgd) + return losses def rehearse( diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 6fe322b62..63d5cccc2 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -27,9 +27,6 @@ ActivationsT = Dict[str, Union[List[Ragged], List[str]]] KNOWLEDGE_BASE_IDS = "kb_ids" -# See #9050 -BACKWARD_OVERWRITE = True - default_model_config = """ [model] @architectures = "spacy.EntityLinker.v2" @@ -60,7 +57,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, - "overwrite": True, + "overwrite": False, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, "candidates_batch_size": 1, @@ -191,7 +188,7 @@ class EntityLinker(TrainablePipe): get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], - overwrite: bool = BACKWARD_OVERWRITE, + overwrite: bool = False, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, candidates_batch_size: int, @@ -215,6 +212,7 @@ class EntityLinker(TrainablePipe): Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + overwrite (bool): Whether to overwrite existing non-empty annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 293add9e1..fabc51fee 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -21,10 +21,6 @@ from ..scorer import Scorer from ..training import validate_examples, validate_get_examples from ..util import registry -# See #9050 -BACKWARD_OVERWRITE = True -BACKWARD_EXTEND = False - default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -102,8 +98,8 @@ class Morphologizer(Tagger): model: Model, name: str = "morphologizer", *, - overwrite: bool = BACKWARD_OVERWRITE, - extend: bool = BACKWARD_EXTEND, + overwrite: bool = False, + extend: bool = False, scorer: Optional[Callable] = morphologizer_score, save_activations: bool = False, ): @@ -113,6 +109,8 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. + extend (bool): Whether to extend existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attributes "pos" and "morph" and Scorer.score_token_attr_per_feat for the attribute "morph". diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 77f4e8adb..6c2565170 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -10,9 +10,6 @@ from ..language import Language from ..scorer import Scorer from .. import util -# see #9050 -BACKWARD_OVERWRITE = False - @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], @@ -52,13 +49,14 @@ class Sentencizer(Pipe): name="sentencizer", *, punct_chars=None, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=senter_score, ): """Initialize the sentencizer. punct_chars (list): Punctuation characters to split on. Will be serialized with the nlp object. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 42feeb277..a7d263e94 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -18,8 +18,6 @@ from ..training import validate_examples, validate_get_examples from ..util import registry from .. import util -# See #9050 -BACKWARD_OVERWRITE = False default_model_config = """ [model] @@ -83,7 +81,7 @@ class SentenceRecognizer(Tagger): model, name="senter", *, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=senter_score, save_activations: bool = False, ): @@ -93,6 +91,7 @@ class SentenceRecognizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". save_activations (bool): save model activations in Doc when annotating. diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index a6be51c3c..101d8bcea 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -27,9 +27,6 @@ from .. import util ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] -# See #9050 -BACKWARD_OVERWRITE = False - default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -99,7 +96,7 @@ class Tagger(TrainablePipe): model, name="tagger", *, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=tagger_score, neg_prefix="!", save_activations: bool = False, @@ -110,6 +107,7 @@ class Tagger(TrainablePipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attribute "tag". save_activations (bool): save model activations in Doc when annotating. diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 21d247b74..a99f8b561 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -175,6 +175,18 @@ def test_modify_span_group(doc): assert group[0].label == doc.vocab.strings["TEST"] +def test_char_span_attributes(doc): + label = "LABEL" + kb_id = "KB_ID" + span_id = "SPAN_ID" + span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id) + span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id) + assert span1.text == span2.text + assert span1.label_ == span2.label_ == label + assert span1.kb_id_ == span2.kb_id_ == kb_id + assert span1.id_ == span2.id_ == span_id + + def test_spans_sent_spans(doc): sents = list(doc.sents) assert sents[0].start == 0 @@ -354,6 +366,14 @@ def test_spans_by_character(doc): span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk" ) + # Span.char_span + alignment mode "contract" + span2 = doc[0:2].char_span( + span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract" + ) + assert span1.start_char == span2.start_char + assert span1.end_char == span2.end_char + assert span2.label_ == "GPE" + def test_span_to_array(doc): span = doc[1:-2] diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py index 869b8b874..10fb22c97 100644 --- a/spacy/tests/pipeline/test_annotates_on_update.py +++ b/spacy/tests/pipeline/test_annotates_on_update.py @@ -54,9 +54,11 @@ def test_annotates_on_update(): return AssertSents(name) class AssertSents: + model = None + is_trainable = True + def __init__(self, name, **cfg): self.name = name - pass def __call__(self, doc): if not doc.has_annotation("SENT_START"): @@ -64,10 +66,16 @@ def test_annotates_on_update(): return doc def update(self, examples, *, drop=0.0, sgd=None, losses=None): + losses.setdefault(self.name, 0.0) + for example in examples: if not example.predicted.has_annotation("SENT_START"): raise ValueError("No sents") - return {} + + return losses + + def finish_update(self, sgd=None): + pass nlp = English() nlp.add_pipe("sentencizer") diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 42ffae22d..dc7ce46fe 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1017,8 +1017,6 @@ def test_local_remote_storage_pull_missing(): def test_cli_find_threshold(capsys): - thresholds = numpy.linspace(0, 1, 10) - def make_examples(nlp: Language) -> List[Example]: docs: List[Example] = [] @@ -1082,8 +1080,6 @@ def test_cli_find_threshold(capsys): scores_key="cats_macro_f", silent=True, ) - assert best_threshold != thresholds[0] - assert thresholds[0] < best_threshold < thresholds[9] assert best_score == max(res.values()) assert res[1.0] == 0.0 @@ -1091,7 +1087,7 @@ def test_cli_find_threshold(capsys): nlp, _ = init_nlp((("spancat", {}),)) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) - res = find_threshold( + best_threshold, best_score, res = find_threshold( model=nlp_dir, data_path=docs_dir / "docs.spacy", pipe_name="spancat", @@ -1099,10 +1095,8 @@ def test_cli_find_threshold(capsys): scores_key="spans_sc_f", silent=True, ) - assert res[0] != thresholds[0] - assert thresholds[0] < res[0] < thresholds[8] - assert res[1] >= 0.6 - assert res[2][1.0] == 0.0 + assert best_score == max(res.values()) + assert res[1.0] == 0.0 # Having multiple textcat_multilabel components should work, since the name has to be specified. nlp, _ = init_nlp((("textcat_multilabel", {}),)) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 9b099ccb5..648a52374 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -9,7 +9,7 @@ import spacy from spacy.cli._util import app from spacy.language import Language from spacy.tokens import DocBin -from .util import make_tempdir +from .util import make_tempdir, normalize_whitespace def test_convert_auto(): @@ -247,8 +247,8 @@ def test_benchmark_accuracy_alias(): # Verify that the `evaluate` alias works correctly. result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"]) result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"]) - assert result_benchmark.stdout == result_evaluate.stdout.replace( - "spacy evaluate", "spacy benchmark accuracy" + assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace( + result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy") ) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index f2d6d5fc0..3d0905dd3 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -10,8 +10,9 @@ from spacy.training import Example from spacy.lang.en import English from spacy.lang.de import German from spacy.util import registry, ignore_error, raise_error, find_matching_language +from spacy.util import load_model_from_config import spacy -from thinc.api import CupyOps, NumpyOps, get_current_ops +from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops from .util import add_vecs_to_vocab, assert_docs_equal @@ -25,6 +26,51 @@ try: except ImportError: pass +TAGGER_CFG_STRING = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","tagger"] + + [components] + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v2" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v1" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +TAGGER_TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), +] + TAGGER_TRAIN_DATA = [ ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), @@ -91,6 +137,26 @@ def test_language_update(nlp): example = Example.from_dict(doc, wrongkeyannots) +def test_language_update_updates(): + config = Config().from_str(TAGGER_CFG_STRING) + nlp = load_model_from_config(config, auto_fill=True, validate=True) + + train_examples = [] + for t in TAGGER_TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + optimizer = nlp.initialize(get_examples=lambda: train_examples) + + docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + nlp.update(train_examples, sgd=optimizer) + docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + + xp = get_array_module(docs_after_update[0].tensor) + assert xp.any( + xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor) + ) + + def test_language_evaluate(nlp): text = "hello world" annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}} diff --git a/spacy/tests/util.py b/spacy/tests/util.py index d5f3c39ff..c2647558d 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -1,6 +1,7 @@ import numpy import tempfile import contextlib +import re import srsly from spacy.tokens import Doc from spacy.vocab import Vocab @@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2): for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): assert k1 == k2 assert v1 == v2 + + +def normalize_whitespace(s): + return re.sub(r"\s+", " ", s) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 1c7c18bf3..93cd8de05 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -108,6 +108,7 @@ class Doc: kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., + span_id: Union[int, str] = ..., ) -> Span: ... def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... @property diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2b3b83e6a..2eca1aafd 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -528,9 +528,9 @@ cdef class Doc: doc (Doc): The parent document. start_idx (int): The index of the first character of the span. end_idx (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for + label (Union[int, str]): A label to attach to the Span, e.g. for named entities. - kb_id (uint64 or string): An ID from a KB to capture the meaning of a + kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. @@ -539,6 +539,7 @@ cdef class Doc: with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". + span_id (Union[int, str]): An identifier to associate with the span. RETURNS (Span): The newly constructed object. DOCS: https://spacy.io/api/doc#char_span diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 5168f3b03..979e74e7e 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -96,6 +96,9 @@ class Span: label: Union[int, str] = ..., kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., + id: Union[int, str] = ..., + alignment_mode: str = ..., + span_id: Union[int, str] = ..., ) -> Span: ... @property def conjuncts(self) -> Tuple[Token]: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index b605434fd..aefea4f71 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -382,7 +382,7 @@ cdef class Span: result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) # ensure we get a scalar back (numpy does this automatically but cupy doesn't) return result.item() - + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. @@ -656,22 +656,29 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0): + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `span.text[start : end]`. start (int): The index of the first character of the span. end (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for + label (Union[int, str]): A label to attach to the Span, e.g. for named entities. - kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. + kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + id (Union[int, str]): Unused. + alignment_mode (str): How character indices are aligned to token + boundaries. Options: "strict" (character indices must be aligned + with token boundaries), "contract" (span of all tokens completely + within the character span), "expand" (span of all tokens at least + partially covered by the character span). Defaults to "strict". + span_id (Union[int, str]): An identifier to associate with the span. RETURNS (Span): The newly constructed object. """ cdef SpanC* span_c = self.span_c() start_idx += span_c.start_char end_idx += span_c.start_char - return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) + return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id) @property def conjuncts(self): diff --git a/spacy/training/loop.py b/spacy/training/loop.py index fc929816d..fcc023a0d 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -210,7 +210,7 @@ def train_while_improving( subbatch, drop=dropout, losses=losses, - sgd=False, # type: ignore[arg-type] + sgd=None, exclude=exclude, annotates=annotating_components, ) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 0bf708183..9777650a9 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1410,12 +1410,13 @@ $ python -m spacy project assets [project_dir] > $ python -m spacy project assets [--sparse] > ``` -| Name | Description | -| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | -| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Downloaded or copied assets defined in the `project.yml`. | +| Name | Description | +| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--extra`, `-e` 3.3.1 | Download assets marked as "extra". Default false. ~~bool (flag)~~ | +| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Downloaded or copied assets defined in the `project.yml`. | ### project run {id="project-run",tag="command"} diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index a303d628e..1a3f6179f 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | | _keyword-only_ | | -| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | +| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ | | `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | @@ -209,15 +209,16 @@ alignment mode `"strict". > assert span.text == "New York" > ``` -| Name | Description | -| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | -| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | -| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +| Name | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| `span_id` 3.3.1 | An identifier to associate with the span. ~~Union[int, str]~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Doc.set_ents {id="set_ents",tag="method",version="3"} diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 238b62a2e..12b2f6bef 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters. | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | | `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx index 4660ec312..9514bc773 100644 --- a/website/docs/api/morphologizer.mdx +++ b/website/docs/api/morphologizer.mdx @@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ | | `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index 878bb30c3..e62d9c724 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -186,14 +186,17 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | -| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +| Name | Description | +| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `id` | Unused. ~~Union[int, str]~~ | +| `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| `span_id` 3.5.1 | An identifier to associate with the span. ~~Union[int, str]~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Span.similarity {id="similarity",tag="method",model="vectors"} diff --git a/website/docs/models/index.mdx b/website/docs/models/index.mdx index 371e4460f..366d44f0e 100644 --- a/website/docs/models/index.mdx +++ b/website/docs/models/index.mdx @@ -21,8 +21,8 @@ menu: ## Package naming conventions {id="conventions"} In general, spaCy expects all pipeline packages to follow the naming convention -of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name -into three components: +of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into +three components: 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with tagging, parsing, lemmatization and named entity recognition, or `dep` for diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx index ac61338e3..3ca64f8a2 100644 --- a/website/docs/usage/v3-5.mdx +++ b/website/docs/usage/v3-5.mdx @@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a `textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0` as explained in the [docs](/api/textcategorizer#assigned-attributes). +### Using the default knowledge base + +As `KnowledgeBase` is now an abstract class, you should call the constructor of +the new `InMemoryLookupKB` instead when you want to use spaCy's default KB +implementation: + +```diff +- kb = KnowledgeBase() ++ kb = InMemoryLookupKB() +``` + +If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to +implement its abstract methods, or alternatively inherit from `InMemoryLookupKB` +instead. + ### Updated scorers for tokenization and textcat {id="scores"} We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported