mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-01 04:46:38 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
4cb7125f7a
|
@ -73,8 +73,13 @@ class Warnings:
|
|||
"degree. If this is intentional or the language you're using "
|
||||
"doesn't have a normalization table, please ignore this warning. "
|
||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||
"package installed. The languages with lexeme normalization tables "
|
||||
"are currently: {langs}")
|
||||
"package installed and load the table in your config. The "
|
||||
"languages with lexeme normalization tables are currently: "
|
||||
"{langs}\n\nLoad the table in your config with:\n\n"
|
||||
"[initialize.lookups]\n"
|
||||
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
|
||||
"lang = ${{nlp.lang}}\n"
|
||||
"tables = [\"lexeme_norm\"]\n")
|
||||
W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
|
||||
"attribute or operator.")
|
||||
|
||||
|
|
|
@ -1686,15 +1686,21 @@ class Language:
|
|||
)
|
||||
# Detect components with listeners that are not frozen consistently
|
||||
for name, proc in nlp.pipeline:
|
||||
if getattr(proc, "listening_components", None): # e.g. tok2vec/transformer
|
||||
for listener in proc.listening_components:
|
||||
# If it's a component sourced from another pipeline, we check if
|
||||
# the tok2vec listeners should be replaced with standalone tok2vec
|
||||
# models (e.g. so component can be frozen without its performance
|
||||
# degrading when other components/tok2vec are updated)
|
||||
paths = sourced.get(listener, {}).get("replace_listeners", [])
|
||||
if paths:
|
||||
nlp.replace_listeners(name, listener, paths)
|
||||
# Remove listeners not in the pipeline
|
||||
listener_names = getattr(proc, "listening_components", [])
|
||||
unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names]
|
||||
for listener_name in unused_listener_names:
|
||||
for listener in proc.listener_map.get(listener_name, []):
|
||||
proc.remove_listener(listener, listener_name)
|
||||
|
||||
for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer
|
||||
# If it's a component sourced from another pipeline, we check if
|
||||
# the tok2vec listeners should be replaced with standalone tok2vec
|
||||
# models (e.g. so component can be frozen without its performance
|
||||
# degrading when other components/tok2vec are updated)
|
||||
paths = sourced.get(listener, {}).get("replace_listeners", [])
|
||||
if paths:
|
||||
nlp.replace_listeners(name, listener, paths)
|
||||
return nlp
|
||||
|
||||
def replace_listeners(
|
||||
|
|
|
@ -202,6 +202,8 @@ cdef class Matcher:
|
|||
doclike (Doc or Span): The document to match over.
|
||||
as_spans (bool): Return Span objects with labels instead of (match_id,
|
||||
start, end) tuples.
|
||||
allow_missing (bool): Whether to skip checks for missing annotation for
|
||||
attributes included in patterns. Defaults to False.
|
||||
RETURNS (list): A list of `(match_id, start, end)` tuples,
|
||||
describing the matches. A match tuple describes a span
|
||||
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
|
||||
|
@ -222,7 +224,7 @@ cdef class Matcher:
|
|||
if attr == TAG:
|
||||
pipe = "tagger"
|
||||
elif attr in (POS, MORPH):
|
||||
pipe = "morphologizer"
|
||||
pipe = "morphologizer or tagger+attribute_ruler"
|
||||
elif attr == LEMMA:
|
||||
pipe = "lemmatizer"
|
||||
elif attr == DEP:
|
||||
|
|
|
@ -194,7 +194,7 @@ cdef class PhraseMatcher:
|
|||
if attr == TAG:
|
||||
pipe = "tagger"
|
||||
elif attr in (POS, MORPH):
|
||||
pipe = "morphologizer"
|
||||
pipe = "morphologizer or tagger+attribute_ruler"
|
||||
elif attr == LEMMA:
|
||||
pipe = "lemmatizer"
|
||||
elif attr == DEP:
|
||||
|
|
|
@ -137,6 +137,7 @@ class Morphologizer(Tagger):
|
|||
DOCS: https://spacy.io/api/morphologizer#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "Morphologizer.initialize")
|
||||
util.check_lexeme_norms(self.vocab, "morphologizer")
|
||||
if labels is not None:
|
||||
self.cfg["labels_morph"] = labels["morph"]
|
||||
self.cfg["labels_pos"] = labels["pos"]
|
||||
|
|
|
@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
|
|||
DOCS: https://spacy.io/api/sentencerecognizer#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "SentenceRecognizer.initialize")
|
||||
util.check_lexeme_norms(self.vocab, "senter")
|
||||
doc_sample = []
|
||||
label_sample = []
|
||||
assert self.labels, Errors.E924.format(name=self.name)
|
||||
|
|
|
@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/tagger#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "Tagger.initialize")
|
||||
util.check_lexeme_norms(self.vocab, "tagger")
|
||||
if labels is not None:
|
||||
for tag in labels:
|
||||
self.add_label(tag)
|
||||
|
|
|
@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
def initialize(self, get_examples, nlp=None, labels=None):
|
||||
validate_get_examples(get_examples, "Parser.initialize")
|
||||
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
||||
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
||||
langs = ", ".join(util.LEXEME_NORM_LANGS)
|
||||
util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
|
||||
util.check_lexeme_norms(self.vocab, "parser or NER")
|
||||
if labels is not None:
|
||||
actions = dict(labels)
|
||||
else:
|
||||
|
|
|
@ -155,7 +155,7 @@ class Corpus:
|
|||
continue
|
||||
elif self.max_length == 0 or len(reference) < self.max_length:
|
||||
yield self._make_example(nlp, reference, False)
|
||||
elif reference.is_sentenced:
|
||||
elif reference.has_annotation("SENT_START"):
|
||||
for ref_sent in reference.sents:
|
||||
if len(ref_sent) == 0:
|
||||
continue
|
||||
|
@ -166,7 +166,7 @@ class Corpus:
|
|||
self, nlp: "Language", reference_docs: Iterable[Doc]
|
||||
) -> Iterator[Example]:
|
||||
for reference in reference_docs:
|
||||
if reference.is_sentenced:
|
||||
if reference.has_annotation("SENT_START"):
|
||||
ref_sents = [sent.as_doc() for sent in reference.sents]
|
||||
else:
|
||||
ref_sents = [reference]
|
||||
|
|
|
@ -72,13 +72,16 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
|||
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
|
||||
# Detect components with listeners that are not frozen consistently
|
||||
for name, proc in nlp.pipeline:
|
||||
if getattr(proc, "listening_components", None): # e.g. tok2vec/transformer
|
||||
for listener in proc.listening_components:
|
||||
if listener in frozen_components and name not in frozen_components:
|
||||
logger.warning(Warnings.W087.format(name=name, listener=listener))
|
||||
# We always check this regardless, in case user freezes tok2vec
|
||||
if listener not in frozen_components and name in frozen_components:
|
||||
logger.warning(Warnings.W086.format(name=name, listener=listener))
|
||||
for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer
|
||||
# Don't warn about components not in the pipeline
|
||||
if listener not in nlp.pipe_names:
|
||||
continue
|
||||
|
||||
if listener in frozen_components and name not in frozen_components:
|
||||
logger.warning(Warnings.W087.format(name=name, listener=listener))
|
||||
# We always check this regardless, in case user freezes tok2vec
|
||||
if listener not in frozen_components and name in frozen_components:
|
||||
logger.warning(Warnings.W086.format(name=name, listener=listener))
|
||||
return nlp
|
||||
|
||||
|
||||
|
|
|
@ -59,7 +59,7 @@ if TYPE_CHECKING:
|
|||
|
||||
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
||||
DEFAULT_OOV_PROB = -20
|
||||
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
|
||||
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
||||
|
||||
# Default order of sections in the config.cfg. Not all sections need to exist,
|
||||
# and additional sections are added at the end, in alphabetical order.
|
||||
|
@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
|
|||
|
||||
logger = logging.getLogger("spacy")
|
||||
logger_stream_handler = logging.StreamHandler()
|
||||
logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
|
||||
logger_stream_handler.setFormatter(
|
||||
logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
|
||||
)
|
||||
logger.addHandler(logger_stream_handler)
|
||||
|
||||
|
||||
|
@ -1454,10 +1456,13 @@ def is_cython_func(func: Callable) -> bool:
|
|||
if hasattr(func, attr): # function or class instance
|
||||
return True
|
||||
# https://stackoverflow.com/a/55767059
|
||||
if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
|
||||
and func.__module__ in sys.modules: # method
|
||||
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
|
||||
return hasattr(cls_func, attr)
|
||||
if (
|
||||
hasattr(func, "__qualname__")
|
||||
and hasattr(func, "__module__")
|
||||
and func.__module__ in sys.modules
|
||||
): # method
|
||||
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
|
||||
return hasattr(cls_func, attr)
|
||||
return False
|
||||
|
||||
|
||||
|
@ -1508,7 +1513,16 @@ def warn_if_jupyter_cupy():
|
|||
"""
|
||||
if is_in_jupyter():
|
||||
from thinc.backends.cupy_ops import CupyOps
|
||||
|
||||
if CupyOps.xp is not None:
|
||||
from thinc.backends import contextvars_eq_thread_ops
|
||||
|
||||
if not contextvars_eq_thread_ops():
|
||||
warnings.warn(Warnings.W111)
|
||||
|
||||
|
||||
def check_lexeme_norms(vocab, component_name):
|
||||
lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
|
||||
if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
|
||||
langs = ", ".join(LEXEME_NORM_LANGS)
|
||||
logger.debug(Warnings.W033.format(model=component_name, langs=langs))
|
||||
|
|
|
@ -77,7 +77,7 @@ $ python -m spacy info [model] [--markdown] [--silent] [--exclude]
|
|||
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
|
||||
| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~ |
|
||||
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
|
||||
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
|
||||
| `--exclude`, `-e` | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ |
|
||||
|
@ -259,7 +259,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
|
|||
| Name | Description |
|
||||
| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `input_file` | Input file. ~~Path (positional)~~ |
|
||||
| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~ |
|
||||
| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ |
|
||||
| `--converter`, `-c` <Tag variant="new">2</Tag> | Name of converter to use (see below). ~~str (option)~~ |
|
||||
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
|
||||
| `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ |
|
||||
|
@ -642,7 +642,7 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts]
|
|||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------- |
|
||||
| `model` | A loadable spaCy pipeline (package name or path). ~~str (positional)~~ |
|
||||
| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ |
|
||||
| `inputs` | Path to input file, or `-` for standard input. ~~Path (positional)~~ |
|
||||
| `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Profiling information for the pipeline. |
|
||||
|
@ -1191,14 +1191,14 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
|||
> $ python -m spacy project dvc all
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------- | ----------------------------------------------------------------------------------------------------------------- |
|
||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(positional)~~ |
|
||||
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
|
||||
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
|
||||
| Name | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
|
||||
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
|
||||
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
|
||||
|
||||
## ray {#ray new="3"}
|
||||
|
||||
|
@ -1236,7 +1236,7 @@ $ python -m spacy ray train [config_path] [--code] [--output] [--n-workers] [--a
|
|||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
|
||||
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
|
||||
| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
|
||||
| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
|
|
|
@ -198,7 +198,6 @@ more efficient than processing texts one-by-one.
|
|||
| `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
|
||||
| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
|
||||
| `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ |
|
||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
|
||||
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
|
||||
|
@ -872,10 +871,10 @@ when loading a config with
|
|||
> replace_listeners = ["model.tok2vec"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec_name` | Name of the token-to-vector component, typically `"tok2vec"` or `"transformer"`.~~str~~ |
|
||||
| `pipe_name` | Name of pipeline component to replace listeners for. ~~str~~ |
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec_name` | Name of the token-to-vector component, typically `"tok2vec"` or `"transformer"`. ~~str~~ |
|
||||
| `pipe_name` | Name of pipeline component to replace listeners for. ~~str~~ |
|
||||
| `listeners` | The paths to the listeners, relative to the component config, e.g. `["model.tok2vec"]`. Typically, implementations will only connect to one tok2vec component, `model.tok2vec`, but in theory, custom models can use multiple listeners. The value here can either be an empty list to not replace any listeners, or a _complete_ list of the paths to all listener layers used by the model that should be replaced. ~~Iterable[str]~~ |
|
||||
|
||||
## Language.meta {#meta tag="property"}
|
||||
|
|
|
@ -133,8 +133,8 @@ The L2 norm of the lexeme's vector representation.
|
|||
| `norm_` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~str~~ |
|
||||
| `lower` | Lowercase form of the word. ~~int~~ |
|
||||
| `lower_` | Lowercase form of the word. ~~str~~ |
|
||||
| `shape` | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
|
||||
| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
|
||||
| `shape` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
|
||||
| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
|
||||
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
|
||||
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
|
||||
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
|
||||
|
|
|
@ -120,12 +120,13 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
|||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
|
||||
| `allow_missing` <Tag variant="new">3</Tag> | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
|
||||
|
||||
## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ teaser: Downloadable trained pipelines and weights for spaCy
|
|||
menu:
|
||||
- ['Quickstart', 'quickstart']
|
||||
- ['Conventions', 'conventions']
|
||||
- ['Pipeline Design', 'design']
|
||||
---
|
||||
|
||||
<!-- TODO: include interactive demo -->
|
||||
|
@ -53,3 +54,146 @@ For a detailed compatibility overview, see the
|
|||
[`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json).
|
||||
This is also the source of spaCy's internal compatibility check, performed when
|
||||
you run the [`download`](/api/cli#download) command.
|
||||
|
||||
## Pretrained pipeline design {#design}
|
||||
|
||||
The spaCy v3 pretrained pipelines are designed to be efficient and configurable.
|
||||
For example, multiple components can share a common "token-to-vector" model and
|
||||
it's easy to swap out or disable the lemmatizer. The pipelines are designed to
|
||||
be efficient in terms of speed and size and work well when the pipeline is run
|
||||
in full.
|
||||
|
||||
When modifying a pretrained pipeline, it's important to understand how the
|
||||
components **depend on** each other. Unlike spaCy v2, where the `tagger`,
|
||||
`parser` and `ner` components were all independent, some v3 components depend on
|
||||
earlier components in the pipeline. As a result, disabling or reordering
|
||||
components can affect the annotation quality or lead to warnings and errors.
|
||||
|
||||
Main changes from spaCy v2 models:
|
||||
|
||||
- The [`Tok2Vec`](/api/tok2vec) component may be a separate, shared component. A
|
||||
component like a tagger or parser can
|
||||
[listen](/api/architectures#Tok2VecListener) to an earlier `tok2vec` or
|
||||
`transformer` rather than having its own separate tok2vec layer.
|
||||
- Rule-based exceptions move from individual components to the
|
||||
`attribute_ruler`. Lemma and POS exceptions move from the tokenizer exceptions
|
||||
to the attribute ruler and the tag map and morph rules move from the tagger to
|
||||
the attribute ruler.
|
||||
- The lemmatizer tables and processing move from the vocab and tagger to a
|
||||
separate `lemmatizer` component.
|
||||
|
||||
### CNN/CPU pipeline design
|
||||
|
||||
In the `sm`/`md`/`lg` models:
|
||||
|
||||
- The `tagger`, `morphologizer` and `parser` components listen to the `tok2vec`
|
||||
component.
|
||||
- The `attribute_ruler` maps `token.tag` to `token.pos` if there is no
|
||||
`morphologizer`. The `attribute_ruler` additionally makes sure whitespace is
|
||||
tagged consistently and copies `token.pos` to `token.tag` if there is no
|
||||
tagger. For English, the attribute ruler can improve its mapping from
|
||||
`token.tag` to `token.pos` if dependency parses from a `parser` are present,
|
||||
but the parser is not required.
|
||||
- The rule-based `lemmatizer` (Dutch, English, French, Greek, Macedonian,
|
||||
Norwegian and Spanish) requires `token.pos` annotation from either
|
||||
`tagger`+`attribute_ruler` or `morphologizer`.
|
||||
- The `ner` component is independent with its own internal tok2vec layer.
|
||||
|
||||
<!-- TODO: pretty diagram -->
|
||||
|
||||
### Transformer pipeline design
|
||||
|
||||
In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
|
||||
all listen to the `transformer` component. The `attribute_ruler` and
|
||||
`lemmatizer` have the same configuration as in the CNN models.
|
||||
|
||||
<!-- TODO: pretty diagram -->
|
||||
|
||||
### Modifying the default pipeline
|
||||
|
||||
For faster processing, you may only want to run a subset of the components in a
|
||||
pretrained pipeline. The `disable` and `exclude` arguments to
|
||||
[`spacy.load`](/api/top-level#spacy.load) let you control which components are
|
||||
loaded and run. Disabled components are loaded in the background so it's
|
||||
possible to reenable them in the same pipeline in the future with
|
||||
[`nlp.enable_pipe`](/api/language/#enable_pipe). To skip loading a component
|
||||
completely, use `exclude` instead of `disable`.
|
||||
|
||||
#### Disable part-of-speech tagging and lemmatization
|
||||
|
||||
To disable part-of-speech tagging and lemmatization, disable the `tagger`,
|
||||
`morphologizer`, `attribute_ruler` and `lemmatizer` components.
|
||||
|
||||
```python
|
||||
# Note: English doesn't include a morphologizer
|
||||
nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer"])
|
||||
nlp = spacy.load("en_core_web_trf", disable=["tagger", "attribute_ruler", "lemmatizer"])
|
||||
```
|
||||
|
||||
<Infobox variant="warning" title="Rule-based lemmatizers require Token.pos">
|
||||
|
||||
The lemmatizer depends on `tagger`+`attribute_ruler` or `morphologizer` for
|
||||
Dutch, English, French, Greek, Macedonian, Norwegian and Spanish. If you disable
|
||||
any of these components, you'll see lemmatizer warnings unless the lemmatizer is
|
||||
also disabled.
|
||||
|
||||
</Infobox>
|
||||
|
||||
#### Use senter rather than parser for fast sentence segmentation
|
||||
|
||||
If you need fast sentence segmentation without dependency parses, disable the
|
||||
`parser` and use the `senter` component instead:
|
||||
|
||||
```python
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
nlp.disable_pipe("parser")
|
||||
nlp.enable_pipe("senter")
|
||||
```
|
||||
|
||||
The `senter` component is ~10× faster than the parser and more accurate
|
||||
than the rule-based `sentencizer`.
|
||||
|
||||
#### Switch from rule-based to lookup lemmatization
|
||||
|
||||
For the Dutch, English, French, Greek, Macedonian, Norwegian and Spanish
|
||||
pipelines, you can switch from the default rule-based lemmatizer to a lookup
|
||||
lemmatizer:
|
||||
|
||||
```python
|
||||
# Requirements: pip install spacy-lookups-data
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
nlp.remove_pipe("lemmatizer")
|
||||
nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()
|
||||
```
|
||||
|
||||
#### Disable everything except NER
|
||||
|
||||
For the non-transformer models, the `ner` component is independent, so you can
|
||||
disable everything else:
|
||||
|
||||
```python
|
||||
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
|
||||
```
|
||||
|
||||
In the transformer models, `ner` listens to the `transformer` component, so you
|
||||
can disable all components related to tagging, parsing, and lemmatization.
|
||||
|
||||
```python
|
||||
nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
|
||||
```
|
||||
|
||||
#### Move NER to the end of the pipeline
|
||||
|
||||
For access to `POS` and `LEMMA` features in an `entity_ruler`, move `ner` to the
|
||||
end of the pipeline after `attribute_ruler` and `lemmatizer`:
|
||||
|
||||
```python
|
||||
# load without NER
|
||||
nlp = spacy.load("en_core_web_sm", exclude=["ner"])
|
||||
|
||||
# source NER from the same pipeline package as the last component
|
||||
nlp.add_pipe("ner", source=spacy.load("en_core_web_sm"))
|
||||
|
||||
# insert the entity ruler
|
||||
nlp.add_pipe("entity_ruler", before="ner")
|
||||
```
|
||||
|
|
|
@ -599,18 +599,27 @@ ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
|
|||
print('Before', ents)
|
||||
# The model didn't recognize "fb" as an entity :(
|
||||
|
||||
fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity
|
||||
# Create a span for the new entity
|
||||
fb_ent = Span(doc, 0, 1, label="ORG")
|
||||
|
||||
# Option 1: Modify the provided entity spans, leaving the rest unmodified
|
||||
doc.set_ents([fb_ent], default="unmodified")
|
||||
|
||||
# Option 2: Assign a complete list of ents to doc.ents
|
||||
doc.ents = list(doc.ents) + [fb_ent]
|
||||
|
||||
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
|
||||
ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
|
||||
print('After', ents)
|
||||
# [('fb', 0, 2, 'ORG')] 🎉
|
||||
# [('fb', 0, 1, 'ORG')] 🎉
|
||||
```
|
||||
|
||||
Keep in mind that you need to create a `Span` with the start and end index of
|
||||
the **token**, not the start and end index of the entity in the document. In
|
||||
this case, "fb" is token `(0, 1)` – but at the document level, the entity will
|
||||
have the start and end indices `(0, 2)`.
|
||||
Keep in mind that `Span` is initialized with the start and end **token**
|
||||
indices, not the character offsets. To create a span from character offsets, use
|
||||
[`Doc.char_span`](/api/doc#char_span):
|
||||
|
||||
```python
|
||||
fb_ent = doc.char_span(0, 2, label="ORG")
|
||||
```
|
||||
|
||||
#### Setting entity annotations from array {#setting-from-array}
|
||||
|
||||
|
@ -645,9 +654,10 @@ write efficient native code.
|
|||
|
||||
```python
|
||||
# cython: infer_types=True
|
||||
from spacy.typedefs cimport attr_t
|
||||
from spacy.tokens.doc cimport Doc
|
||||
|
||||
cpdef set_entity(Doc doc, int start, int end, int ent_type):
|
||||
cpdef set_entity(Doc doc, int start, int end, attr_t ent_type):
|
||||
for i in range(start, end):
|
||||
doc.c[i].ent_type = ent_type
|
||||
doc.c[start].ent_iob = 3
|
||||
|
|
|
@ -54,9 +54,8 @@ texts = ["This is a text", "These are lots of texts", "..."]
|
|||
In this example, we're using [`nlp.pipe`](/api/language#pipe) to process a
|
||||
(potentially very large) iterable of texts as a stream. Because we're only
|
||||
accessing the named entities in `doc.ents` (set by the `ner` component), we'll
|
||||
disable all other statistical components (the `tagger` and `parser`) during
|
||||
processing. `nlp.pipe` yields `Doc` objects, so we can iterate over them and
|
||||
access the named entity predictions:
|
||||
disable all other components during processing. `nlp.pipe` yields `Doc` objects,
|
||||
so we can iterate over them and access the named entity predictions:
|
||||
|
||||
> #### ✏️ Things to try
|
||||
>
|
||||
|
@ -73,7 +72,7 @@ texts = [
|
|||
]
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
|
||||
for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
|
||||
# Do something with the doc here
|
||||
print([(ent.text, ent.label_) for ent in doc.ents])
|
||||
```
|
||||
|
@ -92,6 +91,54 @@ have to call `list()` on it first:
|
|||
|
||||
</Infobox>
|
||||
|
||||
### Multiprocessing {#multiprocessing}
|
||||
|
||||
spaCy includes built-in support for multiprocessing with
|
||||
[`nlp.pipe`](/api/language#pipe) using the `n_process` option:
|
||||
|
||||
```python
|
||||
# Multiprocessing with 4 processes
|
||||
docs = nlp.pipe(texts, n_process=4)
|
||||
|
||||
# With as many processes as CPUs (use with caution!)
|
||||
docs = nlp.pipe(texts, n_process=-1)
|
||||
```
|
||||
|
||||
Depending on your platform, starting many processes with multiprocessing can add
|
||||
a lot of overhead. In particular, the default start method `spawn` used on
|
||||
macOS/OS X (as of Python 3.8) and on Windows can be slow for larger models
|
||||
because the model data is copied in memory for each new process. See the
|
||||
[Python docs on multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
|
||||
for further details.
|
||||
|
||||
For shorter tasks and in particular with `spawn`, it can be faster to use a
|
||||
smaller number of processes with a larger batch size. The optimal `batch_size`
|
||||
setting will depend on the pipeline components, the length of your documents,
|
||||
the number of processes and how much memory is available.
|
||||
|
||||
```python
|
||||
# Default batch size is `nlp.batch_size` (typically 1000)
|
||||
docs = nlp.pipe(texts, n_process=2, batch_size=2000)
|
||||
```
|
||||
|
||||
<Infobox title="Multiprocessing on GPU" variant="warning">
|
||||
|
||||
Multiprocessing is not generally recommended on GPU because RAM is too limited.
|
||||
If you want to try it out, be aware that it is only possible using `spawn` due
|
||||
to limitations in CUDA.
|
||||
|
||||
</Infobox>
|
||||
|
||||
<Infobox title="Multiprocessing with transformer models" variant="warning">
|
||||
|
||||
On Linux, transformer models may hang or deadlock with multiprocessing due to an
|
||||
[issue in PyTorch](https://github.com/pytorch/pytorch/issues/17199). One
|
||||
suggested workaround is to use `spawn` instead of `fork` and another is to limit
|
||||
the number of threads before loading any models using
|
||||
`torch.set_num_threads(1)`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
## Pipelines and built-in components {#pipelines}
|
||||
|
||||
spaCy makes it very easy to create your own pipelines consisting of reusable
|
||||
|
@ -144,10 +191,12 @@ nlp = spacy.load("en_core_web_sm")
|
|||
```
|
||||
|
||||
... the pipeline's `config.cfg` tells spaCy to use the language `"en"` and the
|
||||
pipeline `["tok2vec", "tagger", "parser", "ner"]`. spaCy will then initialize
|
||||
`spacy.lang.en.English`, and create each pipeline component and add it to the
|
||||
processing pipeline. It'll then load in the model data from the data directory
|
||||
and return the modified `Language` class for you to use as the `nlp` object.
|
||||
pipeline
|
||||
`["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]`. spaCy
|
||||
will then initialize `spacy.lang.en.English`, and create each pipeline component
|
||||
and add it to the processing pipeline. It'll then load in the model data from
|
||||
the data directory and return the modified `Language` instance for you to use as
|
||||
the `nlp` object.
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
|
@ -171,7 +220,7 @@ the binary data:
|
|||
```python
|
||||
### spacy.load under the hood
|
||||
lang = "en"
|
||||
pipeline = ["tok2vec", "tagger", "parser", "ner"]
|
||||
pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
|
||||
data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
|
||||
|
||||
cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
|
||||
|
@ -186,7 +235,7 @@ component** on the `Doc`, in order. Since the model data is loaded, the
|
|||
components can access it to assign annotations to the `Doc` object, and
|
||||
subsequently to the `Token` and `Span` which are only views of the `Doc`, and
|
||||
don't own any data themselves. All components return the modified document,
|
||||
which is then processed by the component next in the pipeline.
|
||||
which is then processed by the next component in the pipeline.
|
||||
|
||||
```python
|
||||
### The pipeline under the hood
|
||||
|
@ -201,9 +250,9 @@ list of human-readable component names.
|
|||
|
||||
```python
|
||||
print(nlp.pipeline)
|
||||
# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
|
||||
# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>), ('attribute_ruler', <spacy.pipeline.AttributeRuler>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer>)]
|
||||
print(nlp.pipe_names)
|
||||
# ['tok2vec', 'tagger', 'parser', 'ner']
|
||||
# ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
|
||||
```
|
||||
|
||||
### Built-in pipeline components {#built-in}
|
||||
|
@ -300,7 +349,7 @@ blocks.
|
|||
```python
|
||||
### Disable for block
|
||||
# 1. Use as a context manager
|
||||
with nlp.select_pipes(disable=["tagger", "parser"]):
|
||||
with nlp.select_pipes(disable=["tagger", "parser", "lemmatizer"]):
|
||||
doc = nlp("I won't be tagged and parsed")
|
||||
doc = nlp("I will be tagged and parsed")
|
||||
|
||||
|
@ -324,7 +373,7 @@ The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword
|
|||
argument if you only want to disable components during processing:
|
||||
|
||||
```python
|
||||
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
|
||||
for doc in nlp.pipe(texts, disable=["tagger", "parser", "lemmatizer"]):
|
||||
# Do something with the doc here
|
||||
```
|
||||
|
||||
|
@ -1497,24 +1546,33 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
|
|||
|
||||
| Name | Customizes |
|
||||
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `user_hooks` | [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) |
|
||||
| `user_hooks` | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) |
|
||||
| `user_token_hooks` | [`Token.similarity`](/api/token#similarity), [`Token.vector`](/api/token#vector), [`Token.has_vector`](/api/token#has_vector), [`Token.vector_norm`](/api/token#vector_norm), [`Token.conjuncts`](/api/token#conjuncts) |
|
||||
| `user_span_hooks` | [`Span.similarity`](/api/span#similarity), [`Span.vector`](/api/span#vector), [`Span.has_vector`](/api/span#has_vector), [`Span.vector_norm`](/api/span#vector_norm), [`Span.root`](/api/span#root) |
|
||||
|
||||
```python
|
||||
### Add custom similarity hooks
|
||||
from spacy.language import Language
|
||||
|
||||
|
||||
class SimilarityModel:
|
||||
def __init__(self, model):
|
||||
self._model = model
|
||||
def __init__(self, name: str, index: int):
|
||||
self.name = name
|
||||
self.index = index
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.user_hooks["similarity"] = self.similarity
|
||||
doc.user_span_hooks["similarity"] = self.similarity
|
||||
doc.user_token_hooks["similarity"] = self.similarity
|
||||
return doc
|
||||
|
||||
def similarity(self, obj1, obj2):
|
||||
y = self._model([obj1.vector, obj2.vector])
|
||||
return float(y[0])
|
||||
return obj1.vector[self.index] + obj2.vector[self.index]
|
||||
|
||||
|
||||
@Language.factory("similarity_component", default_config={"index": 0})
|
||||
def create_similarity_component(nlp, name, index: int):
|
||||
return SimilarityModel(name, index)
|
||||
```
|
||||
|
||||
## Developing plugins and wrappers {#plugins}
|
||||
|
|
|
@ -19,9 +19,8 @@ import Serialization101 from 'usage/101/\_serialization.md'
|
|||
When serializing the pipeline, keep in mind that this will only save out the
|
||||
**binary data for the individual components** to allow spaCy to restore them –
|
||||
not the entire objects. This is a good thing, because it makes serialization
|
||||
safe. But it also means that you have to take care of storing the language name
|
||||
and pipeline component names as well, and restoring them separately before you
|
||||
can load in the data.
|
||||
safe. But it also means that you have to take care of storing the config, which
|
||||
contains the pipeline configuration and all the relevant settings.
|
||||
|
||||
> #### Saving the meta and config
|
||||
>
|
||||
|
@ -33,24 +32,21 @@ can load in the data.
|
|||
|
||||
```python
|
||||
### Serialize
|
||||
config = nlp.config
|
||||
bytes_data = nlp.to_bytes()
|
||||
lang = nlp.config["nlp"]["lang"] # "en"
|
||||
pipeline = nlp.config["nlp"]["pipeline"] # ["tagger", "parser", "ner"]
|
||||
```
|
||||
|
||||
```python
|
||||
### Deserialize
|
||||
nlp = spacy.blank(lang)
|
||||
for pipe_name in pipeline:
|
||||
nlp.add_pipe(pipe_name)
|
||||
lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
|
||||
nlp = lang_cls.from_config(config)
|
||||
nlp.from_bytes(bytes_data)
|
||||
```
|
||||
|
||||
This is also how spaCy does it under the hood when loading a pipeline: it loads
|
||||
the `config.cfg` containing the language and pipeline information, initializes
|
||||
the language class, creates and adds the pipeline components based on the
|
||||
defined [factories](/usage/processing-pipelines#custom-components-factories) and
|
||||
_then_ loads in the binary data. You can read more about this process
|
||||
the language class, creates and adds the pipeline components based on the config
|
||||
and _then_ loads in the binary data. You can read more about this process
|
||||
[here](/usage/processing-pipelines#pipelines).
|
||||
|
||||
## Serializing Doc objects efficiently {#docs new="2.2"}
|
||||
|
|
Loading…
Reference in New Issue
Block a user