Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 01:04:34 +03:00

Update syntax for code block meta data

parent 47e4c62741
commit b305510227
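The change is mechanical: the title that previously sat on a `###` line inside each fenced code block moves into the fence's meta string as a `title` attribute, with options such as `highlight` kept alongside it. A minimal sketch of the pattern (the snippet below is a hypothetical illustration, not a hunk from this commit):

-```python
-### Load a pipeline {highlight="2"}
+```python {title="Load a pipeline",highlight="2"}
 import spacy
 nlp = spacy.load("en_core_web_sm")
 ```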
@@ -120,8 +120,7 @@ file.
 > srsly.write_jsonl("/path/to/text.jsonl", data)
 > ```

-```json
-### Example
+```json {title="Example"}
 {"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
 {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
 {"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}

@@ -317,8 +317,7 @@ $ python -m spacy convert ./data.json .
 > [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can
 > help you convert entity offsets to the right format.

-```python
-### Example structure
+```python {title="Example structure"}
 [{
 "id": int, # ID of the document within the corpus
 "paragraphs": [{ # list of paragraphs in the corpus

@@ -436,8 +435,7 @@ file to keep track of your settings and hyperparameters and your own

 </Infobox>

-```python
-### Examples
+```python {title="Examples"}
 # Training data for a part-of-speech tagger
 doc = Doc(vocab, words=["I", "like", "stuff"])
 gold_dict = {"tags": ["NOUN", "VERB", "NOUN"]}

@@ -483,13 +481,11 @@ spaCy's [`Lexeme`](/api/lexeme#attributes) object.
 > vocab_data = "/path/to/vocab-data.jsonl"
 > ```

-```python
-### First line
+```python {title="First line"}
 {"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
 ```

-```python
-### Entry structure
+```python {title="Entry structure"}
 {
 "orth": string, # the word text
 "id": int, # can correspond to row in vectors table
@@ -15,8 +15,7 @@ notable downside to this format is that you can't easily extract just one
 document from the `DocBin`. The serialization format is gzipped msgpack, where
 the msgpack object has the following structure:

-```python
-### msgpack object structure
+```python {title="msgpack object structure"}
 {
 "version": str, # DocBin version number
 "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
@@ -780,8 +780,7 @@ doesn't, the pipeline analysis won't catch that.

 <Accordion title="Example output" spaced>

-```json
-### Structured
+```json {title="Structured"}
 {
 "summary": {
 "tagger": {
@@ -69,8 +69,7 @@ Initialize the sentencizer.
 | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
 | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"` ~~Optional[Callable]~~ |

-```python
-### punct_chars defaults
+```python {title="punct_chars defaults"}
 ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።',
 '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫',
 '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉',
@@ -61,8 +61,7 @@ Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
 information to construct a `Language` object, loads in the model data and
 weights, and returns it.

-```python
-### Abstract example
+```python {title="Abstract example"}
 cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
 nlp = cls() # 2. Initialize it
 for name in pipeline:
@@ -384,8 +384,7 @@ to raw text with no highlighting. An optional label can be added as the first
 line with the prefix `####` (Python-like) and `///` (JavaScript-like). the
 indented block as plain text and preserve whitespace.

-```python
-### Using spaCy
+```python {title="Using spaCy"}
 import spacy
 nlp = spacy.load("en_core_web_sm")
 doc = nlp("This is a sentence.")

@@ -407,8 +406,7 @@ adding `{highlight="..."}` to the headline. Acceptable ranges are spans like
 > ```
 > ````

-```python
-### Using the matcher {highlight="5-7"}
+```python {title="Using the matcher",highlight="5-7"}
 import spacy
 from spacy.matcher import Matcher

@@ -435,8 +433,7 @@ interactive widget defaults to a regular code block.
 > ```
 > ````

-```python
-### {executable="true"}
+```python {executable="true"}
 import spacy
 nlp = spacy.load("en_core_web_sm")
 doc = nlp("This is a sentence.")

@@ -554,8 +551,7 @@ This is a regular paragraph with a [link](https://spacy.io) and **bold text**.
 | -------- | -------- |
 | Column 1 | Column 2 |

-```python
-### Code block title {highlight="2-3"}
+```python {title="Code block title",highlight="2-3"}
 import spacy
 nlp = spacy.load("en_core_web_sm")
 doc = nlp("Hello world")
@@ -7,8 +7,7 @@ later, depending on your use case.

 Named entities are available as the `ents` property of a `Doc`:

-```python
-### {executable="true"}
+```python {executable="true"}
 import spacy

 nlp = spacy.load("en_core_web_sm")
@@ -11,8 +11,7 @@ Linguistic annotations are available as
 efficiency. So to get the readable string representation of an attribute, we
 need to add an underscore `_` to its name:

-```python
-### {executable="true"}
+```python {executable="true"}
 import spacy

 nlp = spacy.load("en_core_web_sm")
@@ -4,8 +4,7 @@ language. For example, punctuation at the end of a sentence should be split off
 – whereas "U.K." should remain one token. Each `Doc` consists of individual
 tokens, and we can iterate over them:

-```python
-### {executable="true"}
+```python {executable="true"}
 import spacy

 nlp = spacy.load("en_core_web_sm")
@@ -3,8 +3,7 @@ multi-dimensional meaning representations of a word. Word vectors can be
 generated using an algorithm like
 [word2vec](https://en.wikipedia.org/wiki/Word2vec) and usually look like this:

-```python
-### banana.vector
+```python {title="banana.vector"}
 array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
 3.28450017e-02, -4.19569999e-01, 7.20689967e-02,
 -3.74760002e-01, 5.74599989e-02, -1.24009997e-02,

@@ -42,8 +41,7 @@ the [`Token.vector`](/api/token#vector) attribute.
 default to an average of their token vectors. You can also check if a token has
 a vector assigned, and get the L2 norm, which can be used to normalize vectors.

-```python
-### {executable="true"}
+```python {executable="true"}
 import spacy

 nlp = spacy.load("en_core_web_md")

@@ -93,8 +91,7 @@ similarity.
 > You should see that the similarity results are identical to the token
 > similarity.

-```python
-### {executable="true"}
+```python {executable="true"}
 import spacy

 nlp = spacy.load("en_core_web_md") # make sure to use larger package!
@@ -123,8 +123,7 @@ the entity recognizer, use a
 [Tok2VecListener](/api/architectures#Tok2VecListener) layer as their model's
 `tok2vec` argument, which connects to the `tok2vec` component model.

-```ini
-### Shared {highlight="1-2,4-5,19-20"}
+```ini {title="Shared",highlight="1-2,4-5,19-20"}
 [components.tok2vec]
 factory = "tok2vec"

@@ -152,8 +151,7 @@ In the independent setup, the entity recognizer component defines its own
 same. This makes them fully independent and doesn't require an upstream
 [`Tok2Vec`](/api/tok2vec) component to be present in the pipeline.

-```ini
-### Independent {highlight="7-8"}
+```ini {title="Independent", highlight="7-8"}
 [components.ner]
 factory = "ner"

@@ -210,8 +208,7 @@ your package manager and CUDA version. If you skip this step, pip will install
 PyTorch as a dependency below, but it may not find the best version for your
 setup.

-```bash
-### Example: Install PyTorch 1.11.0 for CUDA 11.3 with pip
+```bash {title="Example: Install PyTorch 1.11.0 for CUDA 11.3 with pip"}
 # See: https://pytorch.org/get-started/locally/
 $ pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
 ```

@@ -224,8 +221,7 @@ environment variable if your CUDA runtime is installed in a non-standard
 location. Putting it all together, if you had installed CUDA 11.3 in
 `/opt/nvidia/cuda`, you would run:

-```bash
-### Installation with CUDA
+```bash {title="Installation with CUDA"}
 $ export CUDA_PATH="/opt/nvidia/cuda"
 $ pip install -U %%SPACY_PKG_NAME[cuda113,transformers]%%SPACY_PKG_FLAGS
 ```

@@ -235,8 +231,7 @@ that require [`SentencePiece`](https://github.com/google/sentencepiece) (e.g.,
 ALBERT, CamemBERT, XLNet, Marian, and T5), install the additional dependencies
 with:

-```bash
-### Install sentencepiece
+```bash {title="Install sentencepiece"}
 $ pip install transformers[sentencepiece]
 ```

@@ -261,8 +256,7 @@ transformer-based [pipelines](/models) provided by spaCy end on `_trf`, e.g.
 $ python -m spacy download en_core_web_trf
 ```

-```python
-### Example
+```python {title="Example"}
 import spacy
 from thinc.api import set_gpu_allocator, require_gpu

@@ -343,8 +337,7 @@ component:
 > )
 > ```

-```ini
-### config.cfg (excerpt)
+```ini {title="config.cfg",excerpt="true"}
 [components.transformer]
 factory = "transformer"
 max_batch_items = 4096

@@ -424,8 +417,7 @@ subsentences of at most `max_length` tokens are returned.
 > max_length = 25
 > ```

-```python
-### code.py
+```python {title="code.py"}
 import spacy_transformers

 @spacy_transformers.registry.span_getters("custom_sent_spans")

@@ -475,8 +467,7 @@ is where we'll plug in our transformer model, using the
 [TransformerListener](/api/architectures#TransformerListener) layer, which
 sneakily delegates to the `Transformer` pipeline component.

-```ini
-### config.cfg (excerpt) {highlight="12"}
+```ini {title="config.cfg (excerpt)",highlight="12"}
 [components.ner]
 factory = "ner"

@@ -735,8 +726,7 @@ whole model), or a
 spaCy's built-in model architectures have a reference named `"tok2vec"` that
 will refer to the right layer.

-```ini
-### config.cfg
+```ini {title="config.cfg"}
 # 1. Use the whole model of the "tok2vec" component
 [pretraining]
 component = "tok2vec"

@@ -759,8 +749,7 @@ A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
 an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
 make use of the final output, you could fill in this value in your config file:

-```ini
-### config.cfg
+```ini {title="config.cfg"}

 [paths]
 init_tok2vec = "pretrain/model4.bin"
|
@ -40,8 +40,7 @@ this config, you won't be able to change it anymore. The architecture is like a
|
|||
recipe for the network, and you can't change the recipe once the dish has
|
||||
already been prepared. You have to make a new one.
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt)
|
||||
```ini {title="config.cfg (excerpt)"}
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
|
@ -126,8 +125,7 @@ default. This architecture combines a simple bag-of-words model with a neural
|
|||
network, usually resulting in the most accurate results, but at the cost of
|
||||
speed. The config file for this model would look something like this:
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt)
|
||||
```ini {title="config.cfg (excerpt)"}
|
||||
[components.textcat]
|
||||
factory = "textcat"
|
||||
labels = []
|
||||
|
@ -165,8 +163,7 @@ use those by swapping out the definition of the textcat's model. For instance,
|
|||
to use the simple and fast bag-of-words model
|
||||
[TextCatBOW](/api/architectures#TextCatBOW), you can change the config to:
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt) {highlight="6-10"}
|
||||
```ini {title="config.cfg (excerpt)",highlight="6-10"}
|
||||
[components.textcat]
|
||||
factory = "textcat"
|
||||
labels = []
|
||||
|
@ -198,8 +195,7 @@ These steps together compute dense, context-sensitive representations of the
|
|||
tokens, and their combination forms a typical
|
||||
[`Tok2Vec`](/api/architectures#Tok2Vec) layer:
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt)
|
||||
```ini {title="config.cfg (excerpt)"}
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
|
@ -220,8 +216,7 @@ a sublayer for another one, for instance changing the first sublayer to a
|
|||
character embedding with the [CharacterEmbed](/api/architectures#CharacterEmbed)
|
||||
architecture:
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt)
|
||||
```ini {title="config.cfg (excerpt)"}
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.CharacterEmbed.v2"
|
||||
# ...
|
||||
|
@ -260,8 +255,7 @@ Let's use PyTorch to define a very simple neural network consisting of two
|
|||
hidden `Linear` layers with `ReLU` activation and dropout, and a
|
||||
softmax-activated output layer:
|
||||
|
||||
```python
|
||||
### PyTorch model
|
||||
```python {title="PyTorch model"}
|
||||
from torch import nn
|
||||
|
||||
torch_model = nn.Sequential(
|
||||
|
@ -312,8 +306,7 @@ architecture a name so spaCy knows how to find it, and allows passing in
|
|||
arguments like hyperparameters via the [config](/usage/training#config). The
|
||||
full example then becomes:
|
||||
|
||||
```python
|
||||
### Registering the architecture {highlight="9"}
|
||||
```python {title="Registering the architecture",highlight="9"}
|
||||
from typing import List
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model, PyTorchWrapper, chain, with_array
|
||||
|
@ -352,8 +345,7 @@ by specifying it in the config file. In this configuration, all required
|
|||
parameters for the various subcomponents of the custom architecture are passed
|
||||
in as settings via the config.
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt) {highlight="5-5"}
|
||||
```ini {title="config.cfg (excerpt)",highlight="5-5"}
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
|
@ -381,8 +373,7 @@ GPU memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or
|
|||
respective libraries, preventing OOM errors when there's available memory
|
||||
sitting in the other library's pool.
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt)
|
||||
```ini {title="config.cfg (excerpt)"}
|
||||
[training]
|
||||
gpu_allocator = "pytorch"
|
||||
```
|
||||
|
@ -461,8 +452,7 @@ you have to call
|
|||
[`Model.initialize`](https://thinc.ai/docs/api-model#initialize) with an **input
|
||||
sample** `X` and an **output sample** `Y` with the correct dimensions:
|
||||
|
||||
```python
|
||||
### Shape inference with initialization {highlight="3,7,10"}
|
||||
```python {title="Shape inference with initialization",highlight="3,7,10"}
|
||||
with Model.define_operators({">>": chain}):
|
||||
layers = (
|
||||
Relu(hidden_width)
|
||||
|
@ -564,8 +554,7 @@ matrix** (~~Floats2d~~) of predictions:
|
|||
> type checks and validation. See the section on [type signatures](#type-sigs)
|
||||
> for details.
|
||||
|
||||
```python
|
||||
### The model architecture
|
||||
```python {title="The model architecture"}
|
||||
@spacy.registry.architectures("rel_model.v1")
|
||||
def create_relation_model(...) -> Model[List[Doc], Floats2d]:
|
||||
model = ... # 👈 model will go here
|
||||
|
@ -590,8 +579,7 @@ transforms the instance tensor into a final tensor holding the predictions:
|
|||
> # ...
|
||||
> ```
|
||||
|
||||
```python
|
||||
### The model architecture {highlight="6"}
|
||||
```python {title="The model architecture",highlight="6"}
|
||||
@spacy.registry.architectures("rel_model.v1")
|
||||
def create_relation_model(
|
||||
create_instance_tensor: Model[List[Doc], Floats2d],
|
||||
|
@ -614,8 +602,7 @@ The `classification_layer` could be something like a
|
|||
> nO = null
|
||||
> ```
|
||||
|
||||
```python
|
||||
### The classification layer
|
||||
```python {title="The classification layer"}
|
||||
@spacy.registry.architectures("rel_classification_layer.v1")
|
||||
def create_classification_layer(
|
||||
nO: int = None, nI: int = None
|
||||
|
@ -651,8 +638,7 @@ that has the full implementation.
|
|||
> # ...
|
||||
> ```
|
||||
|
||||
```python
|
||||
### The layer that creates the instance tensor
|
||||
```python {title="The layer that creates the instance tensor"}
|
||||
@spacy.registry.architectures("rel_instance_tensor.v1")
|
||||
def create_tensors(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
|
@ -732,8 +718,7 @@ are within a **maximum distance** (in number of tokens) of each other:
|
|||
> max_length = 100
|
||||
> ```
|
||||
|
||||
```python
|
||||
### Candidate generation
|
||||
```python {title="Candidate generation"}
|
||||
@spacy.registry.misc("rel_instance_generator.v1")
|
||||
def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
|
||||
def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
|
||||
|
@ -776,8 +761,7 @@ above 0.5 to be a `True` relation. The ~~Example~~ instances that we'll use as
|
|||
training data, will include their gold-standard relation annotations in
|
||||
`example.reference._.rel`.
|
||||
|
||||
```python
|
||||
### Registering the extension attribute
|
||||
```python {title="Registering the extension attribute"}
|
||||
from spacy.tokens import Doc
|
||||
Doc.set_extension("rel", default={})
|
||||
```
|
||||
|
@ -790,8 +774,7 @@ create a subclass of [`TrainablePipe`](/api/pipe) that holds the model.
|
|||
|
||||
![Illustration of Pipe methods](/images/trainable_component.svg)
|
||||
|
||||
```python
|
||||
### Pipeline component skeleton
|
||||
```python {title="Pipeline component skeleton"}
|
||||
from spacy.pipeline import TrainablePipe
|
||||
|
||||
class RelationExtractor(TrainablePipe):
|
||||
|
@ -828,8 +811,7 @@ and the name of this component. Additionally, this component, just like the
|
|||
will predict scores for each label. We add convenience methods to easily
|
||||
retrieve and add to them.
|
||||
|
||||
```python
|
||||
### The constructor (continued)
|
||||
```python {title="The constructor (continued)"}
|
||||
def __init__(self, vocab, model, name="rel"):
|
||||
"""Create a component instance."""
|
||||
# ...
|
||||
|
@ -858,8 +840,7 @@ will be used to do
|
|||
layers of the neural network. This is triggered by calling
|
||||
[`Model.initialize`](https://thinc.ai/api/model#initialize).
|
||||
|
||||
```python
|
||||
### The initialize method {highlight="12,15,18,22"}
|
||||
```python {title="The initialize method",highlight="12,15,18,22"}
|
||||
from itertools import islice
|
||||
|
||||
def initialize(
|
||||
|
@ -899,8 +880,7 @@ update the weights of the model layers. Thinc provides several
|
|||
[loss functions](https://thinc.ai/docs/api-loss) that can be used for the
|
||||
implementation of the `get_loss` function.
|
||||
|
||||
```python
|
||||
### The update method {highlight="12-14"}
|
||||
```python {title="The update method",highlight="12-14"}
|
||||
def update(
|
||||
self,
|
||||
examples: Iterable[Example],
|
||||
|
@ -926,8 +906,7 @@ delegate to the internal model's
|
|||
[predict](https://thinc.ai/docs/api-model#predict) function that takes a batch
|
||||
of `Doc` objects and returns a ~~Floats2d~~ array:
|
||||
|
||||
```python
|
||||
### The predict method
|
||||
```python {title="The predict method"}
|
||||
def predict(self, docs: Iterable[Doc]) -> Floats2d:
|
||||
predictions = self.model.predict(docs)
|
||||
return self.model.ops.asarray(predictions)
|
||||
|
@ -944,8 +923,7 @@ need to refer to the model's `get_instances` function that defined which pairs
|
|||
of entities were relevant candidates, so that the predictions can be linked to
|
||||
those exact entities:
|
||||
|
||||
```python
|
||||
### The set_annotations method {highlight="5-6,10"}
|
||||
```python {title="The set_annotations method",highlight="5-6,10"}
|
||||
def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
|
||||
c = 0
|
||||
get_instances = self.model.attrs["get_instances"]
|
||||
|
@ -962,8 +940,7 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
|
|||
Under the hood, when the pipe is applied to a document, it delegates to the
|
||||
`predict` and `set_annotations` methods:
|
||||
|
||||
```python
|
||||
### The __call__ method
|
||||
```python {title="The __call__ method"}
|
||||
def __call__(self, doc: Doc):
|
||||
predictions = self.predict([doc])
|
||||
self.set_annotations([doc], predictions)
|
||||
|
@ -974,8 +951,7 @@ There is one more optional method to implement: [`score`](/api/pipe#score)
|
|||
calculates the performance of your component on a set of examples, and returns
|
||||
the results as a dictionary:
|
||||
|
||||
```python
|
||||
### The score method
|
||||
```python {title="The score method"}
|
||||
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
|
||||
prf = PRFScore()
|
||||
for example in examples:
|
||||
|
@ -1014,8 +990,7 @@ assigns it a name and lets you create the component with
|
|||
> rel_micro_f = 1.0
|
||||
> ```
|
||||
|
||||
```python
|
||||
### Registering the pipeline component
|
||||
```python {title="Registering the pipeline component"}
|
||||
from spacy.language import Language
|
||||
|
||||
@Language.factory("relation_extractor")
|
||||
|
@ -1027,8 +1002,7 @@ You can extend the decorator to include information such as the type of
|
|||
annotations that are required for this component to run, the type of annotations
|
||||
it produces, and the scores that can be calculated:
|
||||
|
||||
```python
|
||||
### Factory annotations {highlight="5-11"}
|
||||
```python {title="Factory annotations",highlight="5-11"}
|
||||
from spacy.language import Language
|
||||
|
||||
@Language.factory(
|
||||
|
|
|
@ -62,8 +62,7 @@ allows you to access individual morphological features.
|
|||
> and express that it's a pronoun in the third person.
|
||||
> 2. Inspect `token.morph` for the other tokens.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -80,8 +79,7 @@ spaCy's statistical [`Morphologizer`](/api/morphologizer) component assigns the
|
|||
morphological features and coarse-grained part-of-speech tags as `Token.morph`
|
||||
and `Token.pos`.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("de_core_news_sm")
|
||||
|
@ -106,8 +104,7 @@ coarse-grained part-of-speech tags and morphological features.
|
|||
[mapping table](#mappings-exceptions) maps the fine-grained tags to a
|
||||
coarse-grained POS tags and morphological features.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -126,8 +123,7 @@ spaCy provides two pipeline components for lemmatization:
|
|||
2. The [`EditTreeLemmatizer`](/api/edittreelemmatizer)
|
||||
<Tag variant="new">3.3</Tag> component provides a trainable lemmatizer.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
# English pipelines include a rule-based lemmatizer
|
||||
|
@ -238,8 +234,7 @@ head. You can think of noun chunks as a noun plus the words describing the noun
|
|||
get the noun chunks in a document, simply iterate over
|
||||
[`Doc.noun_chunks`](/api/doc#noun_chunks).
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -269,8 +264,7 @@ label, which describes the type of syntactic relation that connects the child to
|
|||
the head. As with other attributes, the value of `.dep` is a hash value. You can
|
||||
get the string value with `.dep_`.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -307,8 +301,7 @@ head**. You can therefore iterate over the arcs in the tree by iterating over
|
|||
the words in the sentence. This is usually the best way to match an arc of
|
||||
interest – from below:
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.symbols import nsubj, VERB
|
||||
|
||||
|
@ -351,8 +344,7 @@ order. There are also two integer-typed attributes,
|
|||
[`Token.n_rights`](/api/token#n_rights) that give the number of left and right
|
||||
children.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -363,8 +355,7 @@ print(doc[2].n_lefts) # 2
|
|||
print(doc[2].n_rights) # 1
|
||||
```
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("de_core_news_sm")
|
||||
|
@ -387,8 +378,7 @@ sequence of tokens. You can walk up the tree with the
|
|||
> true for the German pipelines, which have many
|
||||
> [non-projective dependencies](https://explosion.ai/blog/german-model#word-order).
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -417,8 +407,7 @@ easiest way to create a `Span` object for a syntactic phrase. Note that
|
|||
`.right_edge` gives a token **within** the subtree – so if you use it as the
|
||||
end-point of a range, don't forget to `+1`!
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -445,8 +434,7 @@ currency values, i.e. entities labeled as `MONEY`, and then uses the dependency
|
|||
parse to find the noun phrase they are referring to – for example `"Net income"`
|
||||
→ `"$9.4 million"`.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -491,8 +479,7 @@ If you want to know how to write rules that hook into some type of syntactic
|
|||
construction, just plug the sentence into the visualizer and see how spaCy
|
||||
annotates it.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy import displacy
|
||||
|
||||
|
@ -567,8 +554,7 @@ on a token, it will return an empty string.
|
|||
> - `U` – Token is a single-token **unit** entity.
|
||||
> - `O` – Token is **outside** an entity.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -603,8 +589,7 @@ directly to the `token.ent_iob` or `token.ent_type` attributes, so the easiest
|
|||
way to set entities is to use the [`doc.set_ents`](/api/doc#set_ents) function
|
||||
and create the new entity as a [`Span`](/api/span).
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.tokens import Span
|
||||
|
||||
|
@ -644,8 +629,7 @@ You can also assign entity annotations using the
|
|||
both the `ENT_TYPE` and the `ENT_IOB` attributes in the array you're importing
|
||||
from.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import numpy
|
||||
import spacy
|
||||
from spacy.attrs import ENT_IOB, ENT_TYPE
|
||||
|
@ -714,8 +698,7 @@ list of `Doc` objects to displaCy and run
|
|||
For more details and examples, see the
|
||||
[usage guide on visualizing spaCy](/usage/visualizers).
|
||||
|
||||
```python
|
||||
### Named Entity example
|
||||
```python {title="Named Entity example"}
|
||||
import spacy
|
||||
from spacy import displacy
|
||||
|
||||
|
@ -919,8 +902,7 @@ rules. This could be very certain expressions, or abbreviations only used in
|
|||
this specific field. Here's how to add a special case rule to an existing
|
||||
[`Tokenizer`](/api/tokenizer) instance:
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.symbols import ORTH
|
||||
|
||||
|
@ -967,8 +949,7 @@ tokens produced are identical to `nlp.tokenizer()` except for whitespace tokens:
|
|||
> " SUFFIX
|
||||
> ```
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
from spacy.lang.en import English
|
||||
|
||||
nlp = English()
|
||||
|
@ -1002,8 +983,7 @@ You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
|
|||
to use `re.compile()` to build a regular expression object, and pass its
|
||||
`.search()` and `.finditer()` methods:
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import re
|
||||
import spacy
|
||||
from spacy.tokenizer import Tokenizer
|
||||
|
@ -1096,8 +1076,7 @@ letters as an infix. If you do not want the tokenizer to split on hyphens
|
|||
between letters, you can modify the existing infix definition from
|
||||
[`lang/punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py):
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
|
||||
|
@ -1181,8 +1160,7 @@ a `Doc` object consisting of the text split on single space characters. We can
|
|||
then overwrite the `nlp.tokenizer` attribute with an instance of our custom
|
||||
tokenizer.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
@ -1232,8 +1210,7 @@ produced by the tokenizer.
|
|||
> **training transformer models** in spaCy, as well as helpful utilities for
|
||||
> aligning word pieces to linguistic tokenization.
|
||||
|
||||
```python
|
||||
### Custom BERT word piece tokenizer
|
||||
```python {title="Custom BERT word piece tokenizer"}
|
||||
from tokenizers import BertWordPieceTokenizer
|
||||
from spacy.tokens import Doc
|
||||
import spacy
|
||||
|
@ -1295,8 +1272,7 @@ setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
|
|||
> @tokenizers = "whitespace_tokenizer"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### functions.py {highlight="1"}
|
||||
```python {title="functions.py",highlight="1"}
|
||||
@spacy.registry.tokenizers("whitespace_tokenizer")
|
||||
def create_whitespace_tokenizer():
|
||||
def create_tokenizer(nlp):
|
||||
|
@ -1321,8 +1297,7 @@ correct type.
|
|||
> lowercase = true
|
||||
> ```
|
||||
|
||||
```python
|
||||
### functions.py {highlight="1"}
|
||||
```python {title="functions.py",highlight="1"}
|
||||
@spacy.registry.tokenizers("bert_word_piece_tokenizer")
|
||||
def create_whitespace_tokenizer(vocab_file: str, lowercase: bool):
|
||||
def create_tokenizer(nlp):
|
||||
|
@ -1365,8 +1340,7 @@ boolean values, indicating whether each word is followed by a space.
|
|||
> `Doc` with `words` and `spaces` so that the `doc.text` matches the original
|
||||
> input text.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
@ -1412,8 +1386,7 @@ token.
|
|||
> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that all
|
||||
> tokens now correspond 1-to-1.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
from spacy.training import Alignment
|
||||
|
||||
other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
|
||||
|
@ -1465,8 +1438,7 @@ root.
|
|||
> recognized as a named entity, this change will also be reflected in the
|
||||
> `doc.ents`.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -1535,8 +1507,7 @@ second split subtoken) and "York" should be attached to "in".
|
|||
> 3. Split the token into three tokens instead of two – for example,
|
||||
> `["New", "Yo", "rk"]`.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy import displacy
|
||||
|
||||
|
@ -1565,8 +1536,7 @@ the token indices after splitting.
|
|||
If you don't care about the heads (for example, if you're only running the
|
||||
tokenizer and not the parser), you can attach each subtoken to itself:
|
||||
|
||||
```python
|
||||
### {highlight="3"}
|
||||
```python {highlight="3"}
|
||||
doc = nlp("I live in NewYorkCity")
|
||||
with doc.retokenize() as retokenizer:
|
||||
heads = [(doc[3], 0), (doc[3], 1), (doc[3], 2)]
|
||||
|
@ -1622,8 +1592,7 @@ values can't be overwritten. For more details, see the
|
|||
> you need to provide a list of extension attribute values as the `"_"`
|
||||
> property, one for each split subtoken.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.tokens import Token
|
||||
|
||||
|
@ -1648,8 +1617,7 @@ has sentence boundaries by calling
|
|||
[`Doc.has_annotation`](/api/doc#has_annotation) with the attribute name
|
||||
`"SENT_START"`.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -1684,8 +1652,7 @@ with spaCy's provided trained pipelines. For social media or conversational text
|
|||
that doesn't follow the same rules, your application may benefit from a custom
|
||||
trained or rule-based component.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -1719,8 +1686,7 @@ without the parser and then enable the sentence recognizer explicitly with
|
|||
> which is better at predicting sentence boundaries when punctuation is not
|
||||
> present.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm", exclude=["parser"])
|
||||
|
@ -1737,8 +1703,7 @@ The [`Sentencizer`](/api/sentencizer) component is a
|
|||
punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only
|
||||
need sentence boundaries without dependency parses.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
@ -1777,8 +1742,7 @@ for unset sentence boundaries. This approach can be useful if you want to
|
|||
implement **additional** rules specific to your data, while still being able to
|
||||
take advantage of dependency-based sentence segmentation.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
from spacy.language import Language
|
||||
import spacy
|
||||
|
||||
|
@ -1828,8 +1792,7 @@ The following example shows how the tag and POS `NNP`/`PROPN` can be specified
|
|||
for the phrase `"The Who"`, overriding the tags provided by the statistical
|
||||
tagger and the POS tag map.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
|
@ -1932,8 +1895,7 @@ the removed words, mapped to `(string, score)` tuples, where `string` is the
|
|||
entry the removed word was mapped to and `score` the similarity score between
|
||||
the two words.
|
||||
|
||||
```python
|
||||
### Removed words
|
||||
```python {title="Removed words"}
|
||||
{
|
||||
"Shore": ("coast", 0.732257),
|
||||
"Precautionary": ("caution", 0.490973),
|
||||
|
@ -1978,8 +1940,7 @@ be slower than approaches that work with the whole vectors table at once, but
|
|||
it's a great approach for once-off conversions before you save out your `nlp`
|
||||
object to disk.
|
||||
|
||||
```python
|
||||
### Adding vectors
|
||||
```python {title="Adding vectors"}
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
vector_data = {
|
||||
|
@ -2005,8 +1966,7 @@ own language subclass. The subclass should define two attributes: the `lang`
|
|||
overview of the available attributes that can be overwritten, see the
|
||||
[`Language.Defaults`](/api/language#defaults) documentation.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
from spacy.lang.en import English
|
||||
|
||||
class CustomEnglishDefaults(English.Defaults):
|
||||
|
@ -2048,8 +2008,7 @@ language name, and even train pipelines with it and refer to it in your
|
|||
> python -m spacy train config.cfg --code code.py
|
||||
> ```
|
||||
|
||||
```python
|
||||
### Registering a custom language {highlight="7,12-13"}
|
||||
```python {title="Registering a custom language",highlight="7,12-13"}
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
|
|
@ -113,8 +113,7 @@ The Chinese language class supports three word segmentation options, `char`,
|
|||
> nlp.tokenizer.initialize(pkuseg_model="mixed")
|
||||
> ```
|
||||
|
||||
```ini
|
||||
### config.cfg
|
||||
```ini {title="config.cfg"}
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.zh.ChineseTokenizer"
|
||||
segmenter = "char"
|
||||
|
@ -155,8 +154,7 @@ local path at runtime. See the usage guide on the
|
|||
[config lifecycle](/usage/training#config-lifecycle) for more background on
|
||||
this.
|
||||
|
||||
```ini
|
||||
### config.cfg
|
||||
```ini {title="config.cfg"}
|
||||
[initialize]
|
||||
|
||||
[initialize.tokenizer]
|
||||
|
@ -167,8 +165,7 @@ pkuseg_user_dict = "default"
|
|||
You can also initialize the tokenizer for a blank language class by calling its
|
||||
`initialize` method:
|
||||
|
||||
```python
|
||||
### Examples
|
||||
```python {title="Examples"}
|
||||
# Initialize the pkuseg tokenizer
|
||||
cfg = {"segmenter": "pkuseg"}
|
||||
nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
|
||||
|
@ -247,8 +244,7 @@ segmentation and part-of-speech tagging. The default Japanese language class and
|
|||
the provided Japanese pipelines use SudachiPy split mode `A`. The tokenizer
|
||||
config can be used to configure the split mode to `A`, `B` or `C`.
|
||||
|
||||
```ini
|
||||
### config.cfg
|
||||
```ini {title="config.cfg"}
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.ja.JapaneseTokenizer"
|
||||
split_mode = "A"
|
||||
|
@ -291,8 +287,7 @@ than MeCab. To configure a Korean pipeline with the rule-based tokenizer:
|
|||
> nlp = spacy.blank("ko", config=config)
|
||||
> ```
|
||||
|
||||
```ini
|
||||
### config.cfg
|
||||
```ini {title="config.cfg"}
|
||||
[nlp]
|
||||
lang = "ko"
|
||||
tokenizer = {"@tokenizers" = "spacy.Tokenizer.v1"}
|
||||
|
@ -416,8 +411,7 @@ or configure your own download script using the URL of the archive file. The
|
|||
archive consists of a package directory that contains another directory with the
|
||||
pipeline data.
|
||||
|
||||
```yaml
|
||||
### Directory structure {highlight="6"}
|
||||
```yaml {title="Directory structure",highlight="6"}
|
||||
└── en_core_web_md-3.0.0.tar.gz # downloaded archive
|
||||
├── setup.py # setup file for pip installation
|
||||
├── meta.json # copy of pipeline meta
|
||||
|
@ -493,8 +487,7 @@ If you've installed a trained pipeline via [`spacy download`](/api/cli#download)
|
|||
or directly via pip, you can also `import` it and then call its `load()` method
|
||||
with no arguments:
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import en_core_web_sm
|
||||
|
||||
nlp = en_core_web_sm.load()
|
||||
|
@ -535,8 +528,7 @@ installation, you can upload the pipeline packages there. pip's
|
|||
supports both package names to download via a PyPi server, as well as
|
||||
[direct URLs](#pipeline-urls).
|
||||
|
||||
```text
|
||||
### requirements.txt
|
||||
```text {title="requirements.txt"}
|
||||
spacy>=3.0.0,<4.0.0
|
||||
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
|
||||
```
|
||||
|
|
|
@ -60,8 +60,7 @@ so we can iterate over them and access the named entity predictions:
|
|||
> 1. Also disable the `"ner"` component. You'll see that the `doc.ents` are now
|
||||
> empty, because the entity recognizer didn't run.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
texts = [
|
||||
|
@ -95,8 +94,7 @@ the input should be a sequence of `(text, context)` tuples and the output will
|
|||
be a sequence of `(doc, context)` tuples. For example, you can pass metadata in
|
||||
the context and save it in a [custom attribute](#custom-components-attributes):
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
@ -246,8 +244,7 @@ tagging pipeline. This is also why the pipeline state is always held by the
|
|||
together and returns an instance of `Language` with a pipeline set and access to
|
||||
the binary data:
|
||||
|
||||
```python
|
||||
### spacy.load under the hood
|
||||
```python {title="spacy.load under the hood"}
|
||||
lang = "en"
|
||||
pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
|
||||
data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
|
||||
|
@ -266,8 +263,7 @@ subsequently to the `Token` and `Span` which are only views of the `Doc`, and
|
|||
don't own any data themselves. All components return the modified document,
|
||||
which is then processed by the next component in the pipeline.
|
||||
|
||||
```python
|
||||
### The pipeline under the hood
|
||||
```python {title="The pipeline under the hood"}
|
||||
doc = nlp.make_doc("This is a sentence") # Create a Doc from raw text
|
||||
for name, proc in nlp.pipeline: # Iterate over components in order
|
||||
doc = proc(doc) # Apply each component
|
||||
|
@ -390,8 +386,7 @@ call its `restore()` method to restore the disabled components when needed. This
|
|||
can be useful if you want to prevent unnecessary code indentation of large
|
||||
blocks.
|
||||
|
||||
```python
|
||||
### Disable for block
|
||||
```python {title="Disable for block"}
|
||||
# 1. Use as a context manager
|
||||
with nlp.select_pipes(disable=["tagger", "parser", "lemmatizer"]):
|
||||
doc = nlp("I won't be tagged and parsed")
|
||||
|
@ -502,8 +497,7 @@ vectors available – otherwise, it won't be able to make the same predictions.
|
|||
> frozen_components = ["ner"]
|
||||
> ```
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
# The source pipeline with different components
|
||||
|
@ -533,8 +527,7 @@ table instead of only returning the structured data.
|
|||
> `"entity_linker"`. The analysis should now show no problems, because
|
||||
> requirements are met.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.blank("en")
|
||||
|
@ -546,8 +539,7 @@ analysis = nlp.analyze_pipes(pretty=True)
|
|||
|
||||
<Accordion title="Example output">
|
||||
|
||||
```json
|
||||
### Structured
|
||||
```json {title="Structured"}
|
||||
{
|
||||
"summary": {
|
||||
"tagger": {
|
||||
|
@@ -565,7 +557,12 @@ analysis = nlp.analyze_pipes(pretty=True)
 },
 "problems": {
 "tagger": [],
-"entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
+"entity_linker": [
+"doc.ents",
+"doc.sents",
+"token.ent_iob",
+"token.ent_type"
+]
 },
 "attrs": {
 "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
@ -698,8 +695,7 @@ component under the name `"info_component"`.
|
|||
> else. spaCy should now complain that it doesn't know a component of the
|
||||
> name `"info_component"`.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.language import Language
|
||||
|
||||
|
@ -732,8 +728,7 @@ boundaries.
|
|||
> to `None` (missing value), the parser will assign sentence boundaries in
|
||||
> between.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.language import Language
|
||||
|
||||
|
@ -786,8 +781,7 @@ All other settings can be passed in by the user via the `config` argument on
|
|||
[`@Language.factory`](/api/language#factory) decorator also lets you define a
|
||||
`default_config` that's used as a fallback.
|
||||
|
||||
```python
|
||||
### With config {highlight="4,9"}
|
||||
```python {title="With config",highlight="4,9"}
|
||||
import spacy
|
||||
from spacy.language import Language
|
||||
|
||||
|
@ -851,8 +845,7 @@ a token, the `Token.norm_` with an entry from a language-specific lookup table.
|
|||
It's registered twice under the name `"token_normalizer"` – once using
|
||||
`@English.factory` and once using `@German.factory`:
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.de import German
|
||||
|
||||
|
@ -921,8 +914,7 @@ case-sensitive.
|
|||
> should see an entry for the acronyms component, referencing the factory
|
||||
> `acronyms` and the config settings.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
from spacy.language import Language
|
||||
from spacy.tokens import Doc
|
||||
from spacy.matcher import PhraseMatcher
|
||||
|
@ -1024,8 +1016,7 @@ argument, the name:
|
|||
> batchers. `misc` is intended for miscellaneous functions that don't fit
|
||||
> anywhere else.
|
||||
|
||||
```python
|
||||
### Registered function for assets {highlight="1"}
|
||||
```python {title="Registered function for assets",highlight="1"}
|
||||
@spacy.registry.misc("acronyms.slang_dict.v1")
|
||||
def create_acronyms_slang_dict():
|
||||
dictionary = {"lol": "laughing out loud", "brb": "be right back"}
|
||||
|
@ -1093,8 +1084,7 @@ on [serialization methods](/usage/saving-loading/#serialization-methods).
|
|||
> receive the directory path `/path/acronyms` and can then create files in this
|
||||
> directory.
|
||||
|
||||
```python
|
||||
### Custom serialization methods {highlight="7-11,13-15"}
|
||||
```python {title="Custom serialization methods",highlight="7-11,13-15"}
|
||||
import srsly
|
||||
from spacy.util import ensure_path
|
||||
|
||||
|
@ -1176,8 +1166,7 @@ be defined via the config – in this case a dictionary `data`.
|
|||
> path = "/path/to/slang_dict.json"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### Custom initialize method {highlight="5-6"}
|
||||
```python {title="Custom initialize method",highlight="5-6"}
|
||||
class AcronymComponent:
|
||||
def __init__(self):
|
||||
self.data = {}
|
||||
|
@ -1240,8 +1229,7 @@ string value.
|
|||
> and write a type hint for `log_level` that only accepts the exact string
|
||||
> values `"DEBUG"`, `"INFO"` or `"CRITICAL"`.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import spacy
|
||||
from spacy.language import Language
|
||||
from spacy.tokens import Doc
|
||||
|
@ -1330,8 +1318,7 @@ components. It also makes the components more **modular** and lets you
|
|||
[swap](/usage/layers-architectures#swap-architectures) different architectures
|
||||
in your config, and re-use model definitions.
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt)
|
||||
```ini {title="config.cfg (excerpt)"}
|
||||
[components]
|
||||
|
||||
[components.textcat]
|
||||
|
@ -1465,8 +1452,7 @@ particular instance. If an attribute of the same name already exists, or if
|
|||
you're trying to access an attribute that hasn't been registered, spaCy will
|
||||
raise an `AttributeError`.
|
||||
|
||||
```python
|
||||
### Example
|
||||
```python {title="Example"}
|
||||
from spacy.tokens import Doc, Span, Token
|
||||
|
||||
fruits = ["apple", "pear", "banana", "orange", "strawberry"]
|
||||
|
@ -1501,8 +1487,7 @@ entity annotations for countries and sets custom attributes on the `Doc` and
|
|||
`Span` – for example, the capital, latitude/longitude coordinates and even the
|
||||
country flag.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
```python {executable="true"}
|
||||
import requests
|
||||
from spacy.lang.en import English
|
||||
from spacy.language import Language
|
||||
|
@ -1600,8 +1585,7 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
|
|||
| `user_token_hooks` | [`Token.similarity`](/api/token#similarity), [`Token.vector`](/api/token#vector), [`Token.has_vector`](/api/token#has_vector), [`Token.vector_norm`](/api/token#vector_norm), [`Token.conjuncts`](/api/token#conjuncts) |
|
||||
| `user_span_hooks` | [`Span.similarity`](/api/span#similarity), [`Span.vector`](/api/span#vector), [`Span.has_vector`](/api/span#has_vector), [`Span.vector_norm`](/api/span#vector_norm), [`Span.root`](/api/span#root) |
|
||||
|
||||
```python
|
||||
### Add custom similarity hooks
|
||||
```python {title="Add custom similarity hooks"}
|
||||
from spacy.language import Language
|
||||
|
||||
|
||||
|
@ -1759,8 +1743,7 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
|
|||
> attributes. By definition, each token can only be part of one entity, so
|
||||
> overlapping entity spans are not allowed.
|
||||
|
||||
```python
|
||||
### {highlight="1,8-9"}
|
||||
```python {highlight="1,8-9"}
|
||||
import your_custom_entity_recognizer
|
||||
from spacy.training import biluo_tags_to_spans
|
||||
from spacy.language import Language
|
||||
|
@ -1798,8 +1781,7 @@ label scheme than spaCy's default models.
|
|||
> it fully replaces the `nlp` object instead of providing a pipeline component,
|
||||
> since it also needs to handle tokenization.
|
||||
|
||||
```python
|
||||
### {highlight="1,11,17-19"}
|
||||
```python {highlight="1,11,17-19"}
|
||||
import your_custom_model
|
||||
from spacy.language import Language
|
||||
from spacy.symbols import POS, TAG, DEP, HEAD
|
||||
|
|
|
@ -393,8 +393,7 @@ it will export a directory `model-best`, which you can then re-use in other
|
|||
commands.
|
||||
|
||||
{/* prettier-ignore */}
|
||||
```yaml
|
||||
### project.yml
|
||||
```yaml {title="project.yml"}
|
||||
commands:
|
||||
- name: train
|
||||
help: 'Train a spaCy pipeline using the specified corpus and config'
|
||||
|
@ -450,8 +449,7 @@ directory:
|
|||
> directories: ['assets', 'configs', 'corpus', 'metas', 'metrics', 'notebooks', 'packages', 'scripts', 'training']
|
||||
> ```
|
||||
|
||||
```yaml
|
||||
### Example project directory
|
||||
```yaml {title="Example project directory"}
|
||||
├── project.yml # the project settings
|
||||
├── project.lock # lockfile that tracks inputs/outputs
|
||||
├── assets/ # downloaded data assets
|
||||
|
@ -485,8 +483,7 @@ calls into [`pytest`](https://docs.pytest.org/en/latest/), runs your tests and
|
|||
uses [`pytest-html`](https://github.com/pytest-dev/pytest-html) to export a test
|
||||
report:
|
||||
|
||||
```yaml
|
||||
### project.yml
|
||||
```yaml {title="project.yml"}
|
||||
commands:
|
- name: test
help: 'Test the trained pipeline'

@@ -522,8 +519,7 @@ that you can define via your `project.yml`:
> types. For instance, `batch_size: int` means that the value provided via the
> command line is converted to an integer.

```python
### scripts/custom_evaluation.py
```python {title="scripts/custom_evaluation.py"}
import typer

def custom_evaluation(batch_size: int = 128, model_path: str, data_path: str):

@@ -550,8 +546,7 @@ override settings on the command line – for example using `--vars.batch_size`.
> system). It also normalizes references to `python3`, `pip3` and `pip`.

{/* prettier-ignore */}
```yaml
### project.yml
```yaml {title="project.yml"}
vars:
batch_size: 128

@@ -575,8 +570,7 @@ settings on the command line and passing through system-level settings.
> BATCH_SIZE=128 python -m spacy project run evaluate
> ```

```yaml
### project.yml
```yaml {title="project.yml"}
env:
batch_size: BATCH_SIZE
gpu_id: GPU_ID

@@ -675,8 +669,7 @@ protocols.
> $ python -m spacy project pull local
> ```

```yaml
### project.yml
```yaml {title="project.yml"}
remotes:
default: 's3://my-spacy-bucket'
local: '/mnt/scratch/cache'

@@ -707,8 +700,7 @@ their contents.

For instance, let's say you had the following command in your `project.yml`:

```yaml
### project.yml
```yaml {title="project.yml"}
- name: train
help: 'Train a spaCy pipeline using the specified corpus and config'
script:

@@ -858,8 +850,7 @@ collected with Prodigy and training a spaCy pipeline:
> ```

{/* prettier-ignore */}
```yaml
### project.yml
```yaml {title="project.yml"}
vars:
prodigy:
train_dataset: "fashion_brands_training"

@@ -904,8 +895,7 @@ if accuracy increases in the last segment, this could indicate that collecting
more annotations of the same type might improve the model further.

{/* prettier-ignore */}
```yaml
### project.yml (excerpt)
```yaml {title="project.yml (excerpt)"}
- name: "train_curve"
help: "Train the model with Prodigy by using different portions of training examples to evaluate if more annotations can potentially improve the performance"
script:

@@ -975,8 +965,7 @@ and explore your own custom trained pipelines.
> ```

{/* prettier-ignore */}
```yaml
### project.yml
```yaml {title="project.yml"}
commands:
- name: visualize
help: "Visualize the pipeline's output interactively using Streamlit"

@@ -1023,8 +1012,7 @@ query your API from Python and JavaScript (Vanilla JS and React).
> ```

{/* prettier-ignore */}
```yaml
### project.yml
```yaml {title="project.yml"}
- name: "serve"
help: "Serve the models via a FastAPI REST API using the given host and port"
script:

@@ -1135,8 +1123,7 @@ automatically as part of a workflow. Make sure to set `--build wheel` when
running `spacy package` to build a wheel file for your pipeline package.

{/* prettier-ignore */}
```yaml
### project.yml
```yaml {title="project.yml"}
- name: "push_to_hub"
help: "Upload the trained model to the Hugging Face Hub"
script:

@@ -102,8 +102,7 @@ First, we initialize the `Matcher` with a vocab. The matcher must always share
the same vocab with the documents it will operate on. We can now call
[`matcher.add()`](/api/matcher#add) with an ID and a list of patterns.

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.matcher import Matcher

@@ -305,8 +304,7 @@ more valid tokens, `Doc.char_span` returns `None`.
> `"USA"` is a single token and `Span` objects are **sequences of tokens**. So
> `"US"` cannot be its own span, because it does not end on a token boundary.

```python
### {executable="true"}
```python {executable="true"}
import spacy
import re

@@ -353,8 +351,7 @@ the (white)space tokens are split on. That hopefully shouldn't happen, though,
because it'd mean your regex is producing matches with leading or trailing
whitespace.

```python
### {highlight="5-8"}
```python {highlight="5-8"}
span = doc.char_span(start, end)
if span is not None:
print("Found match:", span.text)

@@ -427,8 +424,7 @@ The `Matcher` can validate patterns against a JSON schema with the option
`validate=True`. This is useful for debugging patterns during development, in
particular for catching unsupported attributes.

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.matcher import Matcher

@@ -452,8 +448,7 @@ corpus of blog articles, and you want to match all mentions of "Google I/O"
match on the uppercase versions, avoiding matches with phrases such as "Google
i/o".

```python
### {executable="true"}
```python {executable="true"}
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span

@@ -525,8 +520,7 @@ label. As of spaCy v3.0, you can also set `as_spans=True` when calling the
matcher on a `Doc`, which will return a list of [`Span`](/api/span) objects
using the `match_id` as the span label.

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

@@ -560,8 +554,7 @@ process the text. You can achieve this by adding a
that's called on each `Doc` object, merges the leftover HTML spans and sets an
attribute `bad_html` on the token.

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.language import Language
from spacy.matcher import Matcher

@@ -656,8 +649,7 @@ calculate the start and end of the matched span within the sentence. Using
displaCy in ["manual" mode](/usage/visualizers#manual-usage) lets you pass in a
list of dictionaries containing the text and entities to render.

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy import displacy
from spacy.matcher import Matcher

@@ -728,8 +720,7 @@ set of rules like this is often better than training a model. It'll produce more
predictable results, is much easier to modify and extend, and doesn't require
any training data – only a set of test cases.

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.matcher import Matcher

@@ -777,8 +768,7 @@ that you can create a pattern for one or more emoji tokens. Valid hashtags
usually consist of a `#`, plus a sequence of ASCII characters with no
whitespace, making them easy to match as well.

```python
### {executable="true"}
```python {executable="true"}
from spacy.lang.en import English
from spacy.matcher import Matcher

@@ -848,8 +838,7 @@ To label the hashtags, we can use a
[custom attribute](/usage/processing-pipelines#custom-components-attributes) set
on the respective token:

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token

@@ -888,8 +877,7 @@ patterns can contain single or multiple tokens.

### Adding phrase patterns {id="adding-phrase-patterns"}

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.matcher import PhraseMatcher

@@ -940,8 +928,7 @@ By default, the `PhraseMatcher` will match on the verbatim token text, e.g.
pattern to the matched `Doc`. For example, using the attribute `LOWER` lets you
match on `Token.lower` and create case-insensitive match patterns:

```python
### {executable="true"}
```python {executable="true"}
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

@@ -975,8 +962,7 @@ will be tokenized and you'll be able to find tokens and combinations of tokens
based on a few examples. Here, we're matching on the shapes `ddd.d.d.d` and
`ddd.ddd.d.d`:

```python
### {executable="true"}
```python {executable="true"}
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

@@ -1132,8 +1118,7 @@ head to an immediate dependent as `head > child`.
The simplest dependency matcher pattern will identify and name a single token in
the tree:

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.matcher import DependencyMatcher

@@ -1154,8 +1139,7 @@ print(matches) # [(4851363122962674176, [1])]
Now that we have a named anchor token (`anchor_founded`), we can add the founder
as the immediate dependent (`>`) of `founded` with the dependency label `nsubj`:

```python
### Step 1 {highlight="8,10"}
```python {title="Step 1",highlight="8,10"}
pattern = [
{
"RIGHT_ID": "anchor_founded",

@@ -1173,8 +1157,7 @@ pattern = [

The direct object (`dobj`) is added in the same way:

```python
### Step 2 {highlight=""}
```python {title="Step 2"}
pattern = [
#...
{

@@ -1194,8 +1177,7 @@ tokens into the pattern**. For the final part of our pattern, we'll specify that
the token `founded_object` should have a modifier with the dependency relation
`amod` or `compound`:

```python
### Step 3 {highlight="7"}
```python {title="Step 3",highlight="7"}
pattern = [
# ...
{

@@ -1218,8 +1200,7 @@ right:

The full pattern comes together as shown in the example below:

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.matcher import DependencyMatcher

@@ -1310,8 +1291,7 @@ matches were to overlap, the pattern matching most tokens takes priority. If
they also happen to be equally long, then the match occurring first in the `Doc`
is chosen.

```python
### {executable="true"}
```python {executable="true"}
from spacy.lang.en import English

nlp = English()

@@ -1333,8 +1313,7 @@ entity ruler will only add spans to the `doc.ents` if they don't overlap with
existing entities predicted by the model. To overwrite overlapping entities, you
can set `overwrite_ents=True` on initialization.

```python
### {executable="true"}
```python {executable="true"}
import spacy

nlp = spacy.load("en_core_web_sm")

@@ -1362,8 +1341,7 @@ The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each
pattern. Using the `id` attribute allows multiple patterns to be associated with
the same entity.

```python
### {executable="true"}
```python {executable="true"}
from spacy.lang.en import English

nlp = English()

@@ -1392,8 +1370,7 @@ The [`to_disk`](/api/entityruler#to_disk) and
from JSONL (newline-delimited JSON) files, containing one pattern object per
line.

```json
### patterns.jsonl
```json {title="patterns.jsonl"}
{"label": "ORG", "pattern": "Apple"}
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}
```

@@ -1491,8 +1468,7 @@ Unlike in `doc.ents`, overlapping matches are allowed in `doc.spans`, so no
filtering is required, but optional filtering and sorting can be applied to the
spans before they're saved.

```python
### {executable="true"}
```python {executable="true"}
import spacy

nlp = spacy.blank("en")

@@ -1514,8 +1490,7 @@ always filtered, using [`util.filter_spans`](/api/top-level#util.filter_spans)
by default. See the [`SpanRuler` API docs](/api/spanruler) for more information
about how to customize the sorting and filtering of matched spans.

```python
### {executable="true"}
```python {executable="true"}
import spacy

nlp = spacy.load("en_core_web_sm")

@@ -1535,8 +1510,7 @@ You can save patterns in a JSONL file (newline-delimited JSON) to load with
[`SpanRuler.initialize`](/api/spanruler#initialize) or
[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).

```json
### patterns.jsonl
```json {title="patterns.jsonl"}
{"label": "ORG", "pattern": "Apple"}
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}
```

@@ -1597,8 +1571,7 @@ or "Dr.". This makes sense, because it makes it easier to resolve the entity
type back to a knowledge base. But what if your application needs the full
names, _including_ the titles?

```python
### {executable="true"}
```python {executable="true"}
import spacy

nlp = spacy.load("en_core_web_sm")

@@ -1617,8 +1590,7 @@ expands the entity span by one token. After all, what all titles in this example
have in common is that _if_ they occur, they occur in the **previous token**
right before the person entity.

```python
### {highlight="9-13"}
```python {highlight="9-13"}
from spacy.language import Language
from spacy.tokens import Span

@@ -1646,8 +1618,7 @@ register it as a [pipeline component](/usage/processing-pipelines) so it can run
automatically when processing a text. We can use
[`nlp.add_pipe`](/api/language#add_pipe) to add it to the current pipeline.

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.language import Language
from spacy.tokens import Span

@@ -1696,8 +1667,7 @@ We can now use the [`Span.set_extension`](/api/span#set_extension) method to add
the custom extension attribute `"person_title"`, using `get_person_title` as the
getter function.

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.tokens import Span

@@ -1739,8 +1709,7 @@ tense**, whether company names are attached to it and whether the person is the
subject. All of this information is available in the part-of-speech tags and the
dependency parse.

```python
### {executable="true"}
```python {executable="true"}
import spacy

nlp = spacy.load("en_core_web_sm")

@@ -1804,8 +1773,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
> retokenizer.merge(ent)
> ```

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.language import Language
from spacy import displacy

@@ -1842,8 +1810,7 @@ information is in the attached auxiliary "was":

To solve this, we can adjust the rules to also check for the above construction:

```python
### {highlight="10-12"}
```python {highlight="10-12"}
@Language.component("extract_person_orgs")
def extract_person_orgs(doc):
person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]

@@ -28,14 +28,12 @@ contains the pipeline configuration and all the relevant settings.
> dictionary containing the training configuration, pipeline component factories
> and other settings. It is saved out with a pipeline as the `config.cfg`.

```python
### Serialize
```python {title="Serialize"}
config = nlp.config
bytes_data = nlp.to_bytes()
```

```python
### Deserialize
```python {title="Deserialize"}
lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
nlp = lang_cls.from_config(config)
nlp.from_bytes(bytes_data)

@@ -62,8 +60,7 @@ collection of `Doc` objects together, and is much more efficient than calling
also control what data gets saved, and you can merge pallets together for easy
map/reduce-style processing.

```python
### {highlight="4,8,9,13,14"}
```python {highlight="4,8,9,13,14"}
import spacy
from spacy.tokens import DocBin

@@ -122,8 +119,7 @@ the entire pipeline once. And instead of pickling several `Doc` objects
separately, pickle a list of `Doc` objects. Since they all share a reference to
the _same_ `Vocab` object, it will only be included once.

```python
### Pickling objects with shared data {highlight="8-9"}
```python {title="Pickling objects with shared data",highlight="8-9"}
doc1 = nlp("Hello world")
doc2 = nlp("This is a test")

@@ -199,8 +195,7 @@ the data to and from a JSON file.
> saving out a pipeline with a rule-based entity recognizer and including all
> rules _with_ the component data.

```python
### {highlight="16-23,25-30"}
```python {highlight="16-23,25-30"}
import json
from spacy import Language
from spacy.util import ensure_path

@@ -240,8 +235,7 @@ After adding the component to the pipeline and adding some data to it, we can
serialize the `nlp` object to a directory, which will call the custom
component's `to_disk` method.

```python
### {highlight="2-4"}
```python {highlight="2-4"}
nlp = spacy.load("en_core_web_sm")
my_component = nlp.add_pipe("my_component")
my_component.add({"hello": "world"})

@@ -252,8 +246,7 @@ The contents of the directory would then look like this.
`CustomComponent.to_disk` converted the data to a JSON string and saved it to a
file `data.json` in its subdirectory:

```yaml
### Directory structure {highlight="2-3"}
```yaml {title="Directory structure",highlight="2-3"}
└── /path/to/pipeline
├── my_component # data serialized by "my_component"
│ └── data.json

@@ -341,8 +334,7 @@ snake when it's called:
> └── setup.py # setup file for pip installation
> ```

```python
### snek.py
```python {title="snek.py"}
from spacy.language import Language

snek = """

@@ -375,8 +367,7 @@ entry to the factories, you can now expose it in your `setup.py` via the
> the created entry point is named `snek` and points to the function
> `snek_component` in the module `snek`, i.e. `snek.py`.

```python
### setup.py {highlight="5-7"}
```python {title="setup.py",highlight="5-7"}
from setuptools import setup

setup(

@@ -457,8 +448,7 @@ class SnekFactory:
return doc
```

```diff
### setup.py
```diff {title="setup.py"}
entry_points={
- "spacy_factories": ["snek = snek:snek_component"]
+ "spacy_factories": ["snek = snek:SnekFactory"]

@@ -503,8 +493,7 @@ custom pipeline – but you don't necessarily want to modify spaCy's code to ad
language. In your package, you could then implement the following
[custom language subclass](/usage/linguistic-features#language-subclass):

```python
### snek.py
```python {title="snek.py"}
from spacy.language import Language

class SnekDefaults(Language.Defaults):

@@ -519,8 +508,7 @@ Alongside the `spacy_factories`, there's also an entry point option for
`spacy_languages`, which maps language codes to language-specific `Language`
subclasses:

```diff
### setup.py
```diff {title="setup.py"}
from setuptools import setup

setup(

@@ -553,8 +541,7 @@ values.
> [scispaCy](/universe/project/scispacy) and
> [Blackstone](/universe/project/blackstone).

```python
### snek.py
```python {title="snek.py"}
displacy_colors = {"SNEK": "#3dff74", "HUMAN": "#cfc5ff"}
```

@@ -562,8 +549,7 @@ Given the above colors, the entry point can be defined as follows. Entry points
need to have a name, so we use the key `colors`. However, the name doesn't
matter and whatever is defined in the entry point group will be used.

```diff
### setup.py
```diff {title="setup.py"}
from setuptools import setup

setup(

@@ -671,8 +657,7 @@ This command will create a pipeline package directory and will run
`.tar.gz` archive of your package that can be installed using `pip install`.
Installing the binary wheel is usually more efficient.

```yaml
### Directory structure
```yaml {title="Directory structure"}
└── /
├── MANIFEST.in # to include meta.json
├── meta.json # pipeline meta data

@@ -180,8 +180,7 @@ can load it via [`spacy.load`](/api/top-level#spacy.load). This will return a
usually call it `nlp`. Calling the `nlp` object on a string of text will return
a processed `Doc`:

```python
### {executable="true"}
```python {executable="true"}
import spacy

nlp = spacy.load("en_core_web_sm")

@@ -293,8 +292,7 @@ way too much space. So instead, spaCy hashes the string and stores it in the
**lookup table that works in both directions** – you can look up a string to get
its hash, or a hash to get its string:

```python
### {executable="true"}
```python {executable="true"}
import spacy

nlp = spacy.load("en_core_web_sm")

@@ -311,8 +309,7 @@ a word. For example, no matter if "love" is used as a verb or a noun in some
context, its spelling and whether it consists of alphabetic characters won't
ever change. Its hash value will also always be the same.

```python
### {executable="true"}
```python {executable="true"}
import spacy

nlp = spacy.load("en_core_web_sm")

@@ -350,8 +347,7 @@ vocabulary. That's why you always need to make sure all objects you create have
access to the same vocabulary. If they don't, spaCy might not be able to find
the strings it needs.

```python
### {executable="true"}
```python {executable="true"}
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

@@ -310,8 +310,7 @@ define a setting once and can reference it across your config using the
the `[training]` block, and the whole block of `[training.optimizer]` is reused
in `[pretraining]` and will become `pretraining.optimizer`.

```ini
### config.cfg (excerpt) {highlight="5,18"}
```ini {title="config.cfg (excerpt)",highlight="5,18"}
[system]
seed = 0

@@ -372,8 +371,7 @@ are handled automatically.

Here's an example of creating a `.spacy` file from some NER annotations.

```python
### preprocess.py
```python {title="preprocess.py"}
import spacy
from spacy.tokens import DocBin

@@ -438,8 +436,7 @@ existing weights. This lets you include an already trained component in your
pipeline, or update a trained component with more data specific to your use
case.

```ini
### config.cfg (excerpt)
```ini {title="config.cfg (excerpt)"}
[components]

# "parser" and "ner" are sourced from a trained pipeline

@@ -532,8 +529,7 @@ list of components. For example, the feature `DEP` from the parser could be used
as a tagger feature by including `DEP` in the tok2vec `attrs` and including
`parser` in `annotating_components`:

```ini
### config.cfg (excerpt) {highlight="7,12"}
```ini {title="config.cfg (excerpt)",highlight="7,12"}
[nlp]
pipeline = ["parser", "tagger"]

@@ -555,8 +551,7 @@ pipeline is run. The config excerpt below shows how a frozen `ner` component and
a `sentencizer` can provide the required `doc.sents` and `doc.ents` for the
entity linker during training:

```ini
### config.cfg (excerpt)
```ini {title="config.cfg (excerpt)"}
[nlp]
pipeline = ["sentencizer", "ner", "entity_linker"]

@@ -588,8 +583,7 @@ the `batch_size` can be a number that doesn't change, or a schedule, like a
sequence of compounding values, which has shown to be an effective trick (see
[Smith et al., 2017](https://arxiv.org/abs/1711.00489)).

```ini
### With static value
```ini {title="With static value"}
[training]
batch_size = 128
```

@@ -612,8 +606,7 @@ from your configs.
> instance, a learning rate schedule can be provided as the an argument of an
> optimizer.

```ini
### With registered function
```ini {title="With registered function"}
[training.batch_size]
@schedules = "compounding.v1"
start = 100

@@ -799,8 +792,7 @@ stop word to the defaults:
> @callbacks = "customize_language_data"
> ```

```python
### functions.py {highlight="3,6"}
```python {title="functions.py",highlight="3,6"}
import spacy

@spacy.registry.callbacks("customize_language_data")

@@ -836,8 +828,7 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
> debug = true
> ```

```python
### functions.py {highlight="5,7-9"}
```python {title="functions.py",highlight="5,7-9"}
from typing import List
import spacy

@@ -892,8 +883,7 @@ settings and specify this callback in your config:
> @callbacks = "customize_tokenizer"
> ```

```python
### functions.py
```python {title="functions.py"}
from spacy.util import registry, compile_suffix_regex

@registry.callbacks("customize_tokenizer")

@@ -966,8 +956,7 @@ tabular results to a file:
> log_path = "my_file.tab"
> ```

```python
### functions.py
```python {title="functions.py"}
import sys
from typing import IO, Tuple, Callable, Dict, Any, Optional
import spacy

@@ -1019,8 +1008,7 @@ You can also implement your own batch size schedule to use during training. The
> know that a config referencing `v1` means a different function than a config
> referencing `v2`.

```python
### functions.py
```python {title="functions.py"}
import spacy

@spacy.registry.schedules("my_custom_schedule.v1")

@@ -1037,8 +1025,7 @@ settings in the block will be passed to the function as keyword arguments. Keep
in mind that the config shouldn't have any hidden defaults and all arguments on
the functions need to be represented in the config.

```ini
### config.cfg (excerpt)
```ini {title="config.cfg (excerpt)"}
[training.batch_size]
@schedules = "my_custom_schedule.v1"
start = 2

@@ -1064,8 +1051,7 @@ for more details.
> output_width = 512
> ```

```python
### functions.py
```python {title="functions.py"}
from typing import List
from thinc.types import Floats2d
from thinc.api import Model

@@ -1215,8 +1201,7 @@ especially when packing multiple documents together. You can also create `Doc`
objects manually, so you can write your own custom logic to convert and store
existing annotations for use in spaCy.

```python
### Training data from Doc objects {highlight="6-9"}
```python {title="Training data from Doc objects",highlight="6-9"}
import spacy
from spacy.tokens import Doc, DocBin

@@ -1296,8 +1281,7 @@ as **config settings** – in this case, `source`.
> source = "s3://your_bucket/path/data.csv"
> ```

```python
### functions.py {highlight="7-8"}
```python {title="functions.py",highlight="7-8"}
from typing import Callable, Iterator, List
import spacy
from spacy.training import Example

@@ -1354,8 +1338,7 @@ training should stop.
> max_steps = 2000
> ```

```python
### functions.py
```python {title="functions.py"}
from typing import Callable, Iterable, Iterator
from spacy import util
import random

@@ -1418,8 +1401,7 @@ annotations are the same.
> size = 150
> ```

```python
### functions.py
```python {title="functions.py"}
from typing import Callable, Iterable, Iterator, List
import spacy
from spacy.training import Example

@@ -1455,8 +1437,7 @@ your config. The built-in [`orth_variants`](/api/top-level#orth_variants)
augmenter creates a data augmentation callback that uses orth-variant
replacement.

```ini
### config.cfg (excerpt) {highlight="8,14"}
```ini {title="config.cfg (excerpt)",highlight="8,14"}
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}

@@ -1480,11 +1461,18 @@ typically loaded from a JSON file. There are two types of orth variant rules:
`"single"` for single tokens that should be replaced (e.g. hyphens) and
`"paired"` for pairs of tokens (e.g. quotes).

```json
### orth_variants.json
```json {title="orth_variants.json"}
{
"single": [{ "tags": ["NFP"], "variants": ["…", "..."] }],
"paired": [{ "tags": ["``", "''"], "variants": [["'", "'"], ["‘", "’"]] }]
"paired": [
{
"tags": ["``", "''"],
"variants": [
["'", "'"],
["‘", "’"]
]
}
]
}
```

@@ -1758,8 +1746,7 @@ of being dropped.
> - [`nlp.to_disk`](/api/language#to_disk): Save the updated pipeline to a
> directory.

```python
### Example training loop
```python {title="Example training loop"}
optimizer = nlp.initialize()
for itn in range(100):
random.shuffle(train_data)

@@ -389,8 +389,7 @@ with only tokenizers, you now need to install that data explicitly via
setup is required – the package just needs to be installed in the same
environment as spaCy.

```python
### {highlight="3-4"}
```python {highlight="3-4"}
nlp = Turkish()
doc = nlp("Bu bir cümledir.")
# 🚨 This now requires the lookups data to be installed explicitly

@@ -29,8 +29,7 @@ on the predicted docs during training. This makes it easy to use the predictions
of a previous component in the pipeline as features for a subsequent component,
e.g. the dependency labels in the tagger:

```ini
### config.cfg (excerpt) {highlight="7,12"}
```ini {title="config.cfg (excerpt)",highlight="7,12"}
[nlp]
pipeline = ["parser", "tagger"]

@@ -293,8 +292,7 @@ spaCy v3.0, a bug allowed vectors to be loaded implicitly through `source`,
however in v3.1 this setting must be provided explicitly as
`[initialize.vectors]`:

```ini
### config.cfg (excerpt)
```ini {title="config.cfg (excerpt)"}
[components.ner]
source = "en_core_web_md"

@@ -31,8 +31,7 @@ $ pip install spacy[apple]
To customize the scoring, you can specify a scoring function for each component
in your config from the new [`scorers` registry](/api/top-level#registry):

```ini
### config.cfg (excerpt) {highlight="3"}
```ini {title="config.cfg (excerpt)",highlight="3"}
[components.tagger]
factory = "tagger"
scorer = {"@scorers":"spacy.tagger_scorer.v1"}

@@ -43,8 +42,7 @@ scorer = {"@scorers":"spacy.tagger_scorer.v1"}
Most pipeline components now include an `overwrite` setting in the config that
determines whether existing annotation in the `Doc` is preserved or overwritten:

```ini
### config.cfg (excerpt) {highlight="3"}
```ini {title="config.cfg (excerpt)",highlight="3"}
[components.tagger]
factory = "tagger"
overwrite = false

@@ -647,8 +647,7 @@ Custom pipeline components now have to be registered explicitly using the
that take a `Doc` and return it, all you have to do is add the
`@Language.component` decorator to it and assign it a name:

```diff
### Stateless function components
```diff {title="Stateless function components"}
+ from spacy.language import Language

+ @Language.component("my_component")

@@ -662,8 +661,7 @@ the method used to initialize the factory has **two named arguments**: `nlp`
(the current `nlp` object) and `name` (the string name of the component
instance).

```diff
### Stateful class components
```diff {title="Stateful class components"}
+ from spacy.language import Language

+ @Language.factory("my_component")

@@ -679,8 +677,7 @@ class MyComponent:
Instead of decorating your class, you could also add a factory function that
takes the arguments `nlp` and `name` and returns an instance of your component:

```diff
### Stateful class components with factory function
```diff {title="Stateful class components with factory function"}
+ from spacy.language import Language

+ @Language.factory("my_component")

@@ -863,8 +860,7 @@ tables = ["lexeme_norm"]
> details see the [config lifecycle](/usage/training/#config-lifecycle) and
> [initialization](/usage/training/#initialization) docs.

```ini
### config.cfg (excerpt)
```ini {title="config.cfg (excerpt)"}
[initialize.components.attribute_ruler]

[initialize.components.attribute_ruler.tag_map]

@@ -981,8 +977,7 @@ this callback in your config:
> @callbacks = "customize_tokenizer"
> ```

```python
### functions.py
```python {title="functions.py"}
from spacy.util import registry, compile_suffix_regex

@registry.callbacks("customize_tokenizer")

@@ -1028,16 +1023,14 @@ classmethod takes a reference `Doc` and a
[dictionary of annotations](/api/data-formats#dict-input), similar to the
"simple training style" in spaCy v2.x:

```diff
### Migrating Doc and GoldParse
```diff {title="Migrating Doc and GoldParse"}
doc = nlp.make_doc("Mark Zuckerberg is the CEO of Facebook")
entities = [(0, 15, "PERSON"), (30, 38, "ORG")]
- gold = GoldParse(doc, entities=entities)
+ example = Example.from_dict(doc, {"entities": entities})
```

```diff
### Migrating simple training style
```diff {title="Migrating simple training style"}
text = "Mark Zuckerberg is the CEO of Facebook"
annotations = {"entities": [(0, 15, "PERSON"), (30, 38, "ORG")]}
+ doc = nlp.make_doc(text)

@@ -1050,8 +1043,7 @@ The [`Language.update`](/api/language#update),
[`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
raw text and a dictionary of annotations.

```python
### Training loop {highlight="5-8,12"}
```python {title="Training loop",highlight="5-8,12"}
TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London.", {"entities": [(7, 13, "LOC")]}),

@@ -1130,8 +1122,7 @@ the following:
- Update all references to [`nlp.add_pipe`](/api/language#add_pipe) in your docs
to use **string names** instead of the component functions.

```python
### {highlight="1-5"}
```python {highlight="1-5"}
from spacy.language import Language

@Language.factory("my_component", default_config={"some_setting": False})

@@ -34,8 +34,7 @@ package that helps you integrate spaCy visualizations into your apps!
The dependency visualizer, `dep`, shows part-of-speech tags and syntactic
dependencies.

```python
### Dependency example
```python {title="Dependency example"}
import spacy
from spacy import displacy

@@ -104,8 +103,7 @@ displacy.serve(sentence_spans, style="dep")
The entity visualizer, `ent`, highlights named entities and their labels in a
text.

```python
### Named Entity example
```python {title="Named Entity example"}
import spacy
from spacy import displacy

@@ -176,8 +174,7 @@ title for a brief description of the text example and the number of iterations.

The span visualizer, `span`, highlights overlapping spans in a text.

```python
### Span example
```python {title="Span example"}
import spacy
from spacy import displacy
from spacy.tokens import Span

@@ -234,8 +231,7 @@ displaCy is able to detect whether you're working in a
rendered in a cell straight away. When you export your notebook, the
visualizations will be included as HTML.

```python
### Jupyter example
```python {title="Jupyter example"}
# Don't forget to install a trained pipeline, e.g.: python -m spacy download en

# In[1]:

@@ -279,8 +275,7 @@ example, to export it to a file or serve it in a custom way – you can use
[`displacy.render`](/api/top-level#displacy.render). It works the same way, but
returns a string containing the markup.

```python
### Example
```python {title="Example"}
import spacy
from spacy import displacy

@@ -323,8 +318,7 @@ rendering all `Doc`s at once, loop over them and export them separately.

### Example: Export SVG graphics of dependency parses {id="examples-export-svg"}

```python
### Example
```python {title="Example"}
import spacy
from spacy import displacy
from pathlib import Path

@@ -372,8 +366,7 @@ helper functions for converting `Doc` objects to displaCy's format for use with
> html = displacy.render(ex, style="ent", manual=True)
> ```

```python
### DEP input
```python {title="DEP input"}
{
"words": [
{"text": "This", "tag": "DT"},

@@ -389,8 +382,7 @@ helper functions for converting `Doc` objects to displaCy's format for use with
}
```

```python
### ENT input
```python {title="ENT input"}
{
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],

@@ -398,8 +390,7 @@ helper functions for converting `Doc` objects to displaCy's format for use with
}
```

```python
### ENT input with knowledge base links
```python {title="ENT input with knowledge base links"}
{
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "https://www.wikidata.org/entity/Q95"}],

@@ -407,8 +398,7 @@ helper functions for converting `Doc` objects to displaCy's format for use with
}
```

```python
### SPANS input
```python {title="SPANS input"}
{
"text": "Welcome to the Bank of China.",
"spans": [