Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2021-03-22 22:47:30 +11:00
commit 4cb7125f7a
19 changed files with 333 additions and 95 deletions

View File

@ -73,8 +73,13 @@ class Warnings:
"degree. If this is intentional or the language you're using "
"doesn't have a normalization table, please ignore this warning. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed. The languages with lexeme normalization tables "
"are currently: {langs}")
"package installed and load the table in your config. The "
"languages with lexeme normalization tables are currently: "
"{langs}\n\nLoad the table in your config with:\n\n"
"[initialize.lookups]\n"
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
"lang = ${{nlp.lang}}\n"
"tables = [\"lexeme_norm\"]\n")
W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
"attribute or operator.")

View File

@ -1686,8 +1686,14 @@ class Language:
)
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
if getattr(proc, "listening_components", None): # e.g. tok2vec/transformer
for listener in proc.listening_components:
# Remove listeners not in the pipeline
listener_names = getattr(proc, "listening_components", [])
unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names]
for listener_name in unused_listener_names:
for listener in proc.listener_map.get(listener_name, []):
proc.remove_listener(listener, listener_name)
for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer
# If it's a component sourced from another pipeline, we check if
# the tok2vec listeners should be replaced with standalone tok2vec
# models (e.g. so component can be frozen without its performance

View File

@ -202,6 +202,8 @@ cdef class Matcher:
doclike (Doc or Span): The document to match over.
as_spans (bool): Return Span objects with labels instead of (match_id,
start, end) tuples.
allow_missing (bool): Whether to skip checks for missing annotation for
attributes included in patterns. Defaults to False.
RETURNS (list): A list of `(match_id, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
@ -222,7 +224,7 @@ cdef class Matcher:
if attr == TAG:
pipe = "tagger"
elif attr in (POS, MORPH):
pipe = "morphologizer"
pipe = "morphologizer or tagger+attribute_ruler"
elif attr == LEMMA:
pipe = "lemmatizer"
elif attr == DEP:

View File

@ -194,7 +194,7 @@ cdef class PhraseMatcher:
if attr == TAG:
pipe = "tagger"
elif attr in (POS, MORPH):
pipe = "morphologizer"
pipe = "morphologizer or tagger+attribute_ruler"
elif attr == LEMMA:
pipe = "lemmatizer"
elif attr == DEP:

View File

@ -137,6 +137,7 @@ class Morphologizer(Tagger):
DOCS: https://spacy.io/api/morphologizer#initialize
"""
validate_get_examples(get_examples, "Morphologizer.initialize")
util.check_lexeme_norms(self.vocab, "morphologizer")
if labels is not None:
self.cfg["labels_morph"] = labels["morph"]
self.cfg["labels_pos"] = labels["pos"]

View File

@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
DOCS: https://spacy.io/api/sentencerecognizer#initialize
"""
validate_get_examples(get_examples, "SentenceRecognizer.initialize")
util.check_lexeme_norms(self.vocab, "senter")
doc_sample = []
label_sample = []
assert self.labels, Errors.E924.format(name=self.name)

View File

@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger#initialize
"""
validate_get_examples(get_examples, "Tagger.initialize")
util.check_lexeme_norms(self.vocab, "tagger")
if labels is not None:
for tag in labels:
self.add_label(tag)

View File

@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):
def initialize(self, get_examples, nlp=None, labels=None):
validate_get_examples(get_examples, "Parser.initialize")
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
langs = ", ".join(util.LEXEME_NORM_LANGS)
util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
util.check_lexeme_norms(self.vocab, "parser or NER")
if labels is not None:
actions = dict(labels)
else:

View File

@ -155,7 +155,7 @@ class Corpus:
continue
elif self.max_length == 0 or len(reference) < self.max_length:
yield self._make_example(nlp, reference, False)
elif reference.is_sentenced:
elif reference.has_annotation("SENT_START"):
for ref_sent in reference.sents:
if len(ref_sent) == 0:
continue
@ -166,7 +166,7 @@ class Corpus:
self, nlp: "Language", reference_docs: Iterable[Doc]
) -> Iterator[Example]:
for reference in reference_docs:
if reference.is_sentenced:
if reference.has_annotation("SENT_START"):
ref_sents = [sent.as_doc() for sent in reference.sents]
else:
ref_sents = [reference]

View File

@ -72,8 +72,11 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
if getattr(proc, "listening_components", None): # e.g. tok2vec/transformer
for listener in proc.listening_components:
for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer
# Don't warn about components not in the pipeline
if listener not in nlp.pipe_names:
continue
if listener in frozen_components and name not in frozen_components:
logger.warning(Warnings.W087.format(name=name, listener=listener))
# We always check this regardless, in case user freezes tok2vec

View File

@ -59,7 +59,7 @@ if TYPE_CHECKING:
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config.cfg. Not all sections needs to exist,
# and additional sections are added at the end, in alphabetical order.
@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
logger = logging.getLogger("spacy")
logger_stream_handler = logging.StreamHandler()
logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
logger_stream_handler.setFormatter(
logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
)
logger.addHandler(logger_stream_handler)
@ -1454,8 +1456,11 @@ def is_cython_func(func: Callable) -> bool:
if hasattr(func, attr): # function or class instance
return True
# https://stackoverflow.com/a/55767059
if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
and func.__module__ in sys.modules: # method
if (
hasattr(func, "__qualname__")
and hasattr(func, "__module__")
and func.__module__ in sys.modules
): # method
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
return hasattr(cls_func, attr)
return False
@ -1508,7 +1513,16 @@ def warn_if_jupyter_cupy():
"""
if is_in_jupyter():
from thinc.backends.cupy_ops import CupyOps
if CupyOps.xp is not None:
from thinc.backends import contextvars_eq_thread_ops
if not contextvars_eq_thread_ops():
warnings.warn(Warnings.W111)
def check_lexeme_norms(vocab, component_name):
lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
langs = ", ".join(LEXEME_NORM_LANGS)
logger.debug(Warnings.W033.format(model=component_name, langs=langs))

View File

@ -77,7 +77,7 @@ $ python -m spacy info [model] [--markdown] [--silent] [--exclude]
| Name | Description |
| ------------------------------------------------ | --------------------------------------------------------------------------------------------- |
| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~ |
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
| `--exclude`, `-e` | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ |
@ -259,7 +259,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
| Name | Description |
| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `input_file` | Input file. ~~Path (positional)~~ |
| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~ |
| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ |
| `--converter`, `-c` <Tag variant="new">2</Tag> | Name of converter to use (see below). ~~str (option)~~ |
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
| `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ |
@ -642,7 +642,7 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts]
| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------- |
| `model` | A loadable spaCy pipeline (package name or path). ~~str (positional)~~ |
| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ |
| `inputs` | Path to input file, or `-` for standard input. ~~Path (positional)~~ |
| `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Profiling information for the pipeline. |
@ -1192,9 +1192,9 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
> ```
| Name | Description |
| ----------------- | ----------------------------------------------------------------------------------------------------------------- |
| ----------------- | ------------------------------------------------------------------------------------------------------------- |
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(positional)~~ |
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
@ -1236,7 +1236,7 @@ $ python -m spacy ray train [config_path] [--code] [--output] [--n-workers] [--a
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |

View File

@ -198,7 +198,6 @@ more efficient than processing texts one-by-one.
| `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
| `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ |
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
@ -873,7 +872,7 @@ when loading a config with
> ```
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec_name` | Name of the token-to-vector component, typically `"tok2vec"` or `"transformer"`.~~str~~ |
| `pipe_name` | Name of pipeline component to replace listeners for. ~~str~~ |
| `listeners` | The paths to the listeners, relative to the component config, e.g. `["model.tok2vec"]`. Typically, implementations will only connect to one tok2vec component, `model.tok2vec`, but in theory, custom models can use multiple listeners. The value here can either be an empty list to not replace any listeners, or a _complete_ list of the paths to all listener layers used by the model that should be replaced.~~Iterable[str]~~ |

View File

@ -133,8 +133,8 @@ The L2 norm of the lexeme's vector representation.
| `norm_` | The lexemes's norm, i.e. a normalized form of the lexeme text. ~~str~~ |
| `lower` | Lowercase form of the word. ~~int~~ |
| `lower_` | Lowercase form of the word. ~~str~~ |
| `shape` | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
| `shape` | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |

View File

@ -121,10 +121,11 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> ```
| Name | Description |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| _keyword-only_ | |
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
| `allow_missing` <Tag variant="new">3</Tag> | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
## Matcher.\_\_len\_\_ {#len tag="method" new="2"}

View File

@ -4,6 +4,7 @@ teaser: Downloadable trained pipelines and weights for spaCy
menu:
- ['Quickstart', 'quickstart']
- ['Conventions', 'conventions']
- ['Pipeline Design', 'design']
---
<!-- TODO: include interactive demo -->
@ -53,3 +54,146 @@ For a detailed compatibility overview, see the
[`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json).
This is also the source of spaCy's internal compatibility check, performed when
you run the [`download`](/api/cli#download) command.
## Pretrained pipeline design {#design}
The spaCy v3 pretrained pipelines are designed to be efficient and configurable.
For example, multiple components can share a common "token-to-vector" model and
it's easy to swap out or disable the lemmatizer. The pipelines are designed to
be efficient in terms of speed and size and work well when the pipeline is run
in full.
When modifying a pretrained pipeline, it's important to understand how the
components **depend on** each other. Unlike spaCy v2, where the `tagger`,
`parser` and `ner` components were all independent, some v3 components depend on
earlier components in the pipeline. As a result, disabling or reordering
components can affect the annotation quality or lead to warnings and errors.
Main changes from spaCy v2 models:
- The [`Tok2Vec`](/api/tok2vec) component may be a separate, shared component. A
component like a tagger or parser can
[listen](/api/architectures#Tok2VecListener) to an earlier `tok2vec` or
`transformer` rather than having its own separate tok2vec layer.
- Rule-based exceptions move from individual components to the
`attribute_ruler`. Lemma and POS exceptions move from the tokenizer exceptions
to the attribute ruler and the tag map and morph rules move from the tagger to
the attribute ruler.
- The lemmatizer tables and processing move from the vocab and tagger to a
separate `lemmatizer` component.
### CNN/CPU pipeline design
In the `sm`/`md`/`lg` models:
- The `tagger`, `morphologizer` and `parser` components listen to the `tok2vec`
component.
- The `attribute_ruler` maps `token.tag` to `token.pos` if there is no
`morphologizer`. The `attribute_ruler` additionally makes sure whitespace is
tagged consistently and copies `token.pos` to `token.tag` if there is no
tagger. For English, the attribute ruler can improve its mapping from
`token.tag` to `token.pos` if dependency parses from a `parser` are present,
but the parser is not required.
- The rule-based `lemmatizer` (Dutch, English, French, Greek, Macedonian,
Norwegian and Spanish) requires `token.pos` annotation from either
`tagger`+`attribute_ruler` or `morphologizer`.
- The `ner` component is independent with its own internal tok2vec layer.
<!-- TODO: pretty diagram -->
### Transformer pipeline design
In the tranformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
all listen to the `transformer` component. The `attribute_ruler` and
`lemmatizer` have the same configuration as in the CNN models.
<!-- TODO: pretty diagram -->
### Modifying the default pipeline
For faster processing, you may only want to run a subset of the components in a
pretrained pipeline. The `disable` and `exclude` arguments to
[`spacy.load`](/api/top-level#spacy.load) let you control which components are
loaded and run. Disabled components are loaded in the background so it's
possible to reenable them in the same pipeline in the future with
[`nlp.enable_pipe`](/api/language/#enable_pipe). To skip loading a component
completely, use `exclude` instead of `disable`.
#### Disable part-of-speech tagging and lemmatization
To disable part-of-speech tagging and lemmatization, disable the `tagger`,
`morphologizer`, `attribute_ruler` and `lemmatizer` components.
```python
# Note: English doesn't include a morphologizer
nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer"])
nlp = spacy.load("en_core_web_trf", disable=["tagger", "attribute_ruler", "lemmatizer"])
```
<Infobox variant="warning" title="Rule-based lemmatizers require Token.pos">
The lemmatizer depends on `tagger`+`attribute_ruler` or `morphologizer` for
Dutch, English, French, Greek, Macedonian, Norwegian and Spanish. If you disable
any of these components, you'll see lemmatizer warnings unless the lemmatizer is
also disabled.
</Infobox>
#### Use senter rather than parser for fast sentence segmentation
If you need fast sentence segmentation without dependency parses, disable the
`parser` use the `senter` component instead:
```python
nlp = spacy.load("en_core_web_sm")
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")
```
The `senter` component is ~10&times; faster than the parser and more accurate
than the rule-based `sentencizer`.
#### Switch from rule-based to lookup lemmatization
For the Dutch, English, French, Greek, Macedonian, Norwegian and Spanish
pipelines, you can switch from the default rule-based lemmatizer to a lookup
lemmatizer:
```python
# Requirements: pip install spacy-lookups-data
nlp = spacy.load("en_core_web_sm")
nlp.remove_pipe("lemmatizer")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()
```
#### Disable everything except NER
For the non-transformer models, the `ner` component is independent, so you can
disable everything else:
```python
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
```
In the transformer models, `ner` listens to the `transformer` component, so you
can disable all components related tagging, parsing, and lemmatization.
```python
nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
```
#### Move NER to the end of the pipeline
For access to `POS` and `LEMMA` features in an `entity_ruler`, move `ner` to the
end of the pipeline after `attribute_ruler` and `lemmatizer`:
```python
# load without NER
nlp = spacy.load("en_core_web_sm", exclude=["ner"])
# source NER from the same pipeline package as the last component
nlp.add_pipe("ner", source=spacy.load("en_core_web_sm"))
# insert the entity ruler
nlp.add_pipe("entity_ruler", before="ner")
```

View File

@ -599,18 +599,27 @@ ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents)
# The model didn't recognize "fb" as an entity :(
fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity
# Create a span for the new entity
fb_ent = Span(doc, 0, 1, label="ORG")
# Option 1: Modify the provided entity spans, leaving the rest unmodified
doc.set_ents([fb_ent], default="unmodified")
# Option 2: Assign a complete list of ents to doc.ents
doc.ents = list(doc.ents) + [fb_ent]
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
print('After', ents)
# [('fb', 0, 2, 'ORG')] 🎉
# [('fb', 0, 1, 'ORG')] 🎉
```
Keep in mind that you need to create a `Span` with the start and end index of
the **token**, not the start and end index of the entity in the document. In
this case, "fb" is token `(0, 1)` but at the document level, the entity will
have the start and end indices `(0, 2)`.
Keep in mind that `Span` is initialized with the start and end **token**
indices, not the character offsets. To create a span from character offsets, use
[`Doc.char_span`](/api/doc#char_span):
```python
fb_ent = doc.char_span(0, 2, label="ORG")
```
#### Setting entity annotations from array {#setting-from-array}
@ -645,9 +654,10 @@ write efficient native code.
```python
# cython: infer_types=True
from spacy.typedefs cimport attr_t
from spacy.tokens.doc cimport Doc
cpdef set_entity(Doc doc, int start, int end, int ent_type):
cpdef set_entity(Doc doc, int start, int end, attr_t ent_type):
for i in range(start, end):
doc.c[i].ent_type = ent_type
doc.c[start].ent_iob = 3

View File

@ -54,9 +54,8 @@ texts = ["This is a text", "These are lots of texts", "..."]
In this example, we're using [`nlp.pipe`](/api/language#pipe) to process a
(potentially very large) iterable of texts as a stream. Because we're only
accessing the named entities in `doc.ents` (set by the `ner` component), we'll
disable all other statistical components (the `tagger` and `parser`) during
processing. `nlp.pipe` yields `Doc` objects, so we can iterate over them and
access the named entity predictions:
disable all other components during processing. `nlp.pipe` yields `Doc` objects,
so we can iterate over them and access the named entity predictions:
> #### ✏️ Things to try
>
@ -73,7 +72,7 @@ texts = [
]
nlp = spacy.load("en_core_web_sm")
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
# Do something with the doc here
print([(ent.text, ent.label_) for ent in doc.ents])
```
@ -92,6 +91,54 @@ have to call `list()` on it first:
</Infobox>
### Multiprocessing {#multiprocessing}
spaCy includes built-in support for multiprocessing with
[`nlp.pipe`](/api/language#pipe) using the `n_process` option:
```python
# Multiprocessing with 4 processes
docs = nlp.pipe(texts, n_process=4)
# With as many processes as CPUs (use with caution!)
docs = nlp.pipe(texts, n_process=-1)
```
Depending on your platform, starting many processes with multiprocessing can add
a lot of overhead. In particular, the default start method `spawn` used in
macOS/OS X (as of Python 3.8) and in Windows can be slow for larger models
because the model data is copied in memory for each new process. See the
[Python docs on multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
for further details.
For shorter tasks and in particular with `spawn`, it can be faster to use a
smaller number of processes with a larger batch size. The optimal `batch_size`
setting will depend on the pipeline components, the length of your documents,
the number of processes and how much memory is available.
```python
# Default batch size is `nlp.batch_size` (typically 1000)
docs = nlp.pipe(texts, n_process=2, batch_size=2000)
```
<Infobox title="Multiprocessing on GPU" variant="warning">
Multiprocessing is not generally recommended on GPU because RAM is too limited.
If you want to try it out, be aware that it is only possible using `spawn` due
to limitations in CUDA.
</Infobox>
<Infobox title="Multiprocessing with transformer models" variant="warning">
In Linux, transformer models may hang or deadlock with multiprocessing due to an
[issue in PyTorch](https://github.com/pytorch/pytorch/issues/17199). One
suggested workaround is to use `spawn` instead of `fork` and another is to limit
the number of threads before loading any models using
`torch.set_num_threads(1)`.
</Infobox>
## Pipelines and built-in components {#pipelines}
spaCy makes it very easy to create your own pipelines consisting of reusable
@ -144,10 +191,12 @@ nlp = spacy.load("en_core_web_sm")
```
... the pipeline's `config.cfg` tells spaCy to use the language `"en"` and the
pipeline `["tok2vec", "tagger", "parser", "ner"]`. spaCy will then initialize
`spacy.lang.en.English`, and create each pipeline component and add it to the
processing pipeline. It'll then load in the model data from the data directory
and return the modified `Language` class for you to use as the `nlp` object.
pipeline
`["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]`. spaCy
will then initialize `spacy.lang.en.English`, and create each pipeline component
and add it to the processing pipeline. It'll then load in the model data from
the data directory and return the modified `Language` class for you to use as
the `nlp` object.
<Infobox title="Changed in v3.0" variant="warning">
@ -171,7 +220,7 @@ the binary data:
```python
### spacy.load under the hood
lang = "en"
pipeline = ["tok2vec", "tagger", "parser", "ner"]
pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
@ -186,7 +235,7 @@ component** on the `Doc`, in order. Since the model data is loaded, the
components can access it to assign annotations to the `Doc` object, and
subsequently to the `Token` and `Span` which are only views of the `Doc`, and
don't own any data themselves. All components return the modified document,
which is then processed by the component next in the pipeline.
which is then processed by the next component in the pipeline.
```python
### The pipeline under the hood
@ -201,9 +250,9 @@ list of human-readable component names.
```python
print(nlp.pipeline)
# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>), ('attribute_ruler', <spacy.pipeline.AttributeRuler>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer>)]
print(nlp.pipe_names)
# ['tok2vec', 'tagger', 'parser', 'ner']
# ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
```
### Built-in pipeline components {#built-in}
@ -300,7 +349,7 @@ blocks.
```python
### Disable for block
# 1. Use as a context manager
with nlp.select_pipes(disable=["tagger", "parser"]):
with nlp.select_pipes(disable=["tagger", "parser", "lemmatizer"]):
doc = nlp("I won't be tagged and parsed")
doc = nlp("I will be tagged and parsed")
@ -324,7 +373,7 @@ The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword
argument if you only want to disable components during processing:
```python
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
for doc in nlp.pipe(texts, disable=["tagger", "parser", "lemmatizer"]):
# Do something with the doc here
```
@ -1497,24 +1546,33 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
| Name | Customizes |
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `user_hooks` | [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) |
| `user_hooks` | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) |
| `user_token_hooks` | [`Token.similarity`](/api/token#similarity), [`Token.vector`](/api/token#vector), [`Token.has_vector`](/api/token#has_vector), [`Token.vector_norm`](/api/token#vector_norm), [`Token.conjuncts`](/api/token#conjuncts) |
| `user_span_hooks` | [`Span.similarity`](/api/span#similarity), [`Span.vector`](/api/span#vector), [`Span.has_vector`](/api/span#has_vector), [`Span.vector_norm`](/api/span#vector_norm), [`Span.root`](/api/span#root) |
```python
### Add custom similarity hooks
from spacy.language import Language
class SimilarityModel:
def __init__(self, model):
self._model = model
def __init__(self, name: str, index: int):
self.name = name
self.index = index
def __call__(self, doc):
doc.user_hooks["similarity"] = self.similarity
doc.user_span_hooks["similarity"] = self.similarity
doc.user_token_hooks["similarity"] = self.similarity
return doc
def similarity(self, obj1, obj2):
y = self._model([obj1.vector, obj2.vector])
return float(y[0])
return obj1.vector[self.index] + obj2.vector[self.index]
@Language.factory("similarity_component", default_config={"index": 0})
def create_similarity_component(nlp, name, index: int):
return SimilarityModel(name, index)
```
## Developing plugins and wrappers {#plugins}

View File

@ -19,9 +19,8 @@ import Serialization101 from 'usage/101/\_serialization.md'
When serializing the pipeline, keep in mind that this will only save out the
**binary data for the individual components** to allow spaCy to restore them
not the entire objects. This is a good thing, because it makes serialization
safe. But it also means that you have to take care of storing the language name
and pipeline component names as well, and restoring them separately before you
can load in the data.
safe. But it also means that you have to take care of storing the config, which
contains the pipeline configuration and all the relevant settings.
> #### Saving the meta and config
>
@ -33,24 +32,21 @@ can load in the data.
```python
### Serialize
config = nlp.config
bytes_data = nlp.to_bytes()
lang = nlp.config["nlp"]["lang"] # "en"
pipeline = nlp.config["nlp"]["pipeline"] # ["tagger", "parser", "ner"]
```
```python
### Deserialize
nlp = spacy.blank(lang)
for pipe_name in pipeline:
nlp.add_pipe(pipe_name)
lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
nlp = lang_cls.from_config(config)
nlp.from_bytes(bytes_data)
```
This is also how spaCy does it under the hood when loading a pipeline: it loads
the `config.cfg` containing the language and pipeline information, initializes
the language class, creates and adds the pipeline components based on the
defined [factories](/usage/processing-pipeline#custom-components-factories) and
_then_ loads in the binary data. You can read more about this process
the language class, creates and adds the pipeline components based on the config
and _then_ loads in the binary data. You can read more about this process
[here](/usage/processing-pipelines#pipelines).
## Serializing Doc objects efficiently {#docs new="2.2"}