diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 138b4b94b..168465fab 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -643,7 +643,7 @@
 Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a
 sample text and checking how it updates its internal weights and parameters.
 
 ```cli
-$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu-id]
+$ python -m spacy debug model [config_path] [component] [--layers] [--dimensions] [--parameters] [--gradients] [--attributes] [--print-step0] [--print-step1] [--print-step2] [--print-step3] [--gpu-id]
 ```
 
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 549c3bcc4..73540b3d3 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -232,7 +232,9 @@ transformers as subnetworks directly, you can also use them via the
 
 The `Transformer` component sets the
 [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
-which lets you access the transformers outputs at runtime.
+which lets you access the transformers outputs at runtime. The trained
+transformer-based [pipelines](/models) provided by spaCy end on `_trf`, e.g.
+[`en_core_web_trf`](/models/en#en_core_web_trf).
 
 ```cli
 $ python -m spacy download en_core_web_trf
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 6dbf2525e..eb443c645 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1656,9 +1656,10 @@ because it only requires annotated sentence boundaries rather than full
 dependency parses. spaCy's [trained pipelines](/models) include both a parser
 and a trained sentence segmenter, which is
 [disabled](/usage/processing-pipelines#disabling) by default. If you only need
-sentence boundaries and no parser, you can use the `enable` and `disable`
-arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and
-disable the parser.
+sentence boundaries and no parser, you can use the `exclude` or `disable`
+argument on [`spacy.load`](/api/top-level#spacy.load) to load the pipeline
+without the parser and then enable the sentence recognizer explicitly with
+[`nlp.enable_pipe`](/api/language#enable_pipe).
 
 > #### senter vs. parser
 >
@@ -1670,7 +1671,8 @@ disable the parser.
 ### {executable="true"}
 import spacy
 
-nlp = spacy.load("en_core_web_sm", enable=["senter"], disable=["parser"])
+nlp = spacy.load("en_core_web_sm", exclude=["parser"])
+nlp.enable_pipe("senter")
 doc = nlp("This is a sentence. This is another sentence.")
 for sent in doc.sents:
     print(sent.text)
@@ -1734,7 +1736,7 @@ nlp = spacy.load("en_core_web_sm")
 doc = nlp(text)
 print("Before:", [sent.text for sent in doc.sents])
 
-@Language.component("set_custom_coundaries")
+@Language.component("set_custom_boundaries")
 def set_custom_boundaries(doc):
     for token in doc[:-1]:
         if token.text == "...":
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index e33ea6001..fdae6d3e5 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1159,7 +1159,8 @@ class DebugComponent:
         self.logger.info(f"Pipeline: {nlp.pipe_names}")
 
     def __call__(self, doc: Doc) -> Doc:
-        self.logger.debug(f"Doc: {len(doc)} tokens, is_tagged: {doc.is_tagged}")
+        is_tagged = doc.has_annotation("TAG")
+        self.logger.debug(f"Doc: {len(doc)} tokens, is tagged: {is_tagged}")
         return doc
 
 nlp = spacy.load("en_core_web_sm")
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index a510398e6..f5825f3a9 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -838,7 +838,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab)
 
 # Add pattern for valid hashtag, i.e. '#' plus any ASCII token
-matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
+matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
 
 # Register token extension
 Token.set_extension("is_hashtag", default=False)
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 250fdb4f4..9191a7db2 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -285,6 +285,7 @@ add to your pipeline and customize for your use case:
 | [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
 | [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
 | [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
+| [`TrainablePipe`](/api/pipe) | Base class for trainable pipeline components. |
@@ -396,8 +397,8 @@ type-check model definitions.
 For data validation, spaCy v3.0 adopts
 [`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
 validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
-lets you register **custom functions with typed arguments**, reference them
-in your config and see validation errors if the argument values don't match.
+lets you register **custom functions with typed arguments**, reference them in
+your config and see validation errors if the argument values don't match.
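
The last hunk above describes registering custom functions with typed arguments in the config system. A minimal sketch of that workflow, assuming spaCy v3 / Thinc 8: the function name `make_greeting.v1` and the `[greeting]` block are invented for illustration, while `@spacy.registry.misc`, `Config` and `registry.resolve` are the documented APIs.

```python
import spacy
from thinc.api import Config

# Register a custom function; its type hints drive pydantic validation
@spacy.registry.misc("make_greeting.v1")
def make_greeting(template: str, times: int) -> str:
    return template * times

# Reference the registered function from a config block (hypothetical section)
CONFIG_STR = """
[greeting]
@misc = "make_greeting.v1"
template = "hello "
times = 3
"""

config = Config().from_str(CONFIG_STR)
# Resolving validates the arguments and calls the function
resolved = spacy.registry.resolve(config)
print(resolved["greeting"])  # "hello hello hello "
```

If a config value doesn't match the annotated type (e.g. `times = "three"`), resolving raises a validation error up front instead of failing later at runtime.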