From ef77c886388713f7651daced6c90c63165a88d6b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 17 Mar 2021 14:56:04 +0900 Subject: [PATCH 01/18] Don't warn about components not in the pipeline See here: https://github.com/explosion/spaCy/discussions/7463 Still need to check if there are any side effects of listeners being present but not in the pipeline, but this commit will silence the warnings. --- spacy/training/initialize.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index f7f2f21a4..d017aa909 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -74,6 +74,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": for name, proc in nlp.pipeline: if getattr(proc, "listening_components", None): # e.g. tok2vec/transformer for listener in proc.listening_components: + # Don't warn about components not in the pipeline + if listener not in nlp.pipeline: + continue + if listener in frozen_components and name not in frozen_components: logger.warning(Warnings.W087.format(name=name, listener=listener)) # We always check this regardless, in case user freezes tok2vec From a5ffe8dfed105b089678353c5517a787c8c4240c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 17 Mar 2021 11:29:57 +0100 Subject: [PATCH 02/18] Add details about pretrained pipeline design --- website/docs/models/index.md | 144 +++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 30b4f11d9..2ca1bf6b3 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -4,6 +4,7 @@ teaser: Downloadable trained pipelines and weights for spaCy menu: - ['Quickstart', 'quickstart'] - ['Conventions', 'conventions'] + - ['Pipeline Design', 'design'] --- @@ -53,3 +54,146 @@ For a detailed compatibility overview, see the [`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json). This is also the source of spaCy's internal compatibility check, performed when you run the [`download`](/api/cli#download) command. + +## Pretrained pipeline design {#design} + +The spaCy v3 pretrained pipelines are designed to be efficient and configurable. +For example, multiple components can share a common "token-to-vector" model and +it's easy to swap out or disable the lemmatizer. The pipelines are designed to +be efficient in terms of speed and size and work well when the pipeline is run +in full. + +When modifying a pretrained v3 pipeline, it's important to understand how the +components **depend on** each other. Unlike spaCy v2, where the `tagger`, +`parser` and `ner` components were all independent, some v3 components depend on +earlier components in the pipeline. As a result, disabling or reordering +components can affect the annotation quality or lead to warnings and errors. + +Main changes from spaCy v2 models: + +- The [`Tok2Vec`](/api/tok2vec) component may be a separate, shared component. A + component like a tagger or parser can + [listen](/api/architectures#Tok2VecListener) to an earlier `tok2vec` or + `transformer` rather than having its own separate tok2vec layer. +- Rule-based exceptions move from individual components to the + `attribute_ruler`. Lemma and POS exceptions move from the tokenizer exceptions + to the attribute ruler and the tag map and morph rules move from the tagger to + the attribute ruler. 
+- The lemmatizer tables and processing move from the vocab and tagger to a + separate `lemmatizer` component. + +### CNN/CPU pipeline design + +In the `sm`/`md`/`lg` models: + +- The `tagger`, `morphologizer` and `parser` components listen to the `tok2vec` + component. +- The `attribute_ruler` maps `token.tag` to `token.pos` if there is no + `morphologizer`. The `attribute_ruler` additionally makes sure whitespace is + tagged consistently and copies `token.pos` to `token.tag` if there is no + tagger. For English, the attribute ruler can improve its mapping from + `token.tag` to `token.pos` if dependency parses from a `parser` are present, + but the parser is not required. +- The rule-based `lemmatizer` (Dutch, English, French, Greek, Macedonian, + Norwegian and Spanish) requires `token.pos` annotation from either + `tagger`+`attribute_ruler` or `morphologizer`. +- The `ner` component is independent with its own internal tok2vec layer. + + + +### Transformer pipeline design + +In the tranformer (`trf`) models, the `tagger`, `parser` and `ner` (if present) +all listen to the `transformer` component. The `attribute_ruler` and +`lemmatizer` have the same configuration as in the CNN models. + + + +### Modifying the default pipeline + +For faster processing, you may only want to run a subset of the components in a +pretrained pipeline. The `disable` and `exclude` arguments to +[`spacy.load`](/api/top-level#spacy.load) let you control which components are +loaded and run. Disabled components are loaded in the background so it's +possible to reenable them in the same pipeline in the future with +[`nlp.enable_pipe`](/api/language/#enable_pipe). To skip loading a component +completely, use `exclude` instead of `disable`. + +#### Disable part-of-speech tagging and lemmatization + +To disable part-of-speech tagging and lemmatization, disable the `tagger`, +`morphologizer`, `attribute_ruler` and `lemmatizer` components. + +```python +# Note: English doesn't include a morphologizer +nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer"]) +nlp = spacy.load("en_core_web_trf", disable=["tagger", "attribute_ruler", "lemmatizer"]) +``` + + + +The lemmatizer depends on `tagger`+`attribute_ruler` or `morphologizer` for +Dutch, English, French, Greek, Macedonian, Norwegian and Spanish. If you disable +any of these components, you'll see lemmatizer warnings unless the lemmatizer is +also disabled. + + + +#### Use senter rather than parser for fast sentence segmentation + +If you need fast sentence segmentation without dependency parses, disable the +`parser` use the `senter` component instead: + +```python +nlp = spacy.load("en_core_web_sm") +nlp.disable_pipe("parser") +nlp.enable_pipe("senter") +``` + +The `senter` component is ~10× faster than the parser and more accurate +than the rule-based `sentencizer`. 
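+
+As a quick sanity check that sentence boundaries are still set after the
+switch, you can iterate over `doc.sents`. This is only a sketch: it assumes
+`en_core_web_sm` is installed and uses a made-up example text.
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+nlp.disable_pipe("parser")
+nlp.enable_pipe("senter")
+
+doc = nlp("This is a sentence. This is another one.")
+# The senter sets sentence starts, so doc.sents is available
+assert doc.has_annotation("SENT_START")
+print([sent.text for sent in doc.sents])
+```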
+ +#### Switch from rule-based to lookup lemmatization + +For the Dutch, English, French, Greek, Macedonian, Norwegian and Spanish +pipelines, you can switch from the default rule-based lemmatizer to a lookup +lemmatizer: + +```python +# Requirements: pip install spacy-lookups-data +nlp = spacy.load("en_core_web_sm") +nlp.remove_pipe("lemmatizer") +nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize() +``` + +#### Disable everything except NER + +For the non-transformer models, the `ner` component is independent, so you can +disable everything else: + +```python +nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]) +``` + +In the transformer models, `ner` listens to the `transformer` layer, so you can +disable all components related tagging, parsing, and lemmatization. + +```python +nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"]) +``` + +#### Move NER to the end of the pipeline + +For access to `POS` and `LEMMA` features in an `entity_ruler`, move `ner` to the +end of the pipeline after `attribute_ruler` and `lemmatizer`: + +```python +# load without NER +nlp = spacy.load("en_core_web_sm", exclude=["ner"]) + +# source NER from the same pipeline package as the last component +nlp.add_pipe("ner", source=spacy.load("en_core_web_sm")) + +# insert the entity ruler +nlp.add_pipe("entity_ruler", before="ner") +``` From 5da323fd86064046f185b972d87718cbdd41e0ab Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 17 Mar 2021 12:59:05 +0100 Subject: [PATCH 03/18] Minor edits --- website/docs/models/index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 2ca1bf6b3..1d03b4c3d 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -63,7 +63,7 @@ it's easy to swap out or disable the lemmatizer. The pipelines are designed to be efficient in terms of speed and size and work well when the pipeline is run in full. -When modifying a pretrained v3 pipeline, it's important to understand how the +When modifying a pretrained pipeline, it's important to understand how the components **depend on** each other. Unlike spaCy v2, where the `tagger`, `parser` and `ner` components were all independent, some v3 components depend on earlier components in the pipeline. As a result, disabling or reordering @@ -175,8 +175,8 @@ disable everything else: nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]) ``` -In the transformer models, `ner` listens to the `transformer` layer, so you can -disable all components related tagging, parsing, and lemmatization. +In the transformer models, `ner` listens to the `transformer` compoinent, so you +can disable all components related tagging, parsing, and lemmatization. ```python nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"]) From 40bc01e66823c82a5319497ad46675b83bc7878f Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 17 Mar 2021 22:41:41 +0900 Subject: [PATCH 04/18] Proactively remove unused listeners With this the changes in initialize.py might be unecessary. Requires testing. 
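For reference, the kind of setup that leaves a listener pointing at a
component that was never added looks roughly like the sketch below. It is
illustrative only (it assumes en_core_web_sm is installed), and the exact
contents of `listening_components` depend on the sourced pipeline:

```python
import spacy

source = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
# Source the shared tok2vec, but only one of the components that listen to it
nlp.add_pipe("tok2vec", source=source)
nlp.add_pipe("tagger", source=source)
# The sourced tok2vec may still list components like "parser" here, even
# though they were never added to this pipeline
print(nlp.get_pipe("tok2vec").listening_components)
```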
--- spacy/language.py | 24 +++++++++++++++--------- spacy/training/initialize.py | 19 +++++++++---------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 871dfafaa..04a5e843e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1686,15 +1686,21 @@ class Language: ) # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: - if getattr(proc, "listening_components", None): # e.g. tok2vec/transformer - for listener in proc.listening_components: - # If it's a component sourced from another pipeline, we check if - # the tok2vec listeners should be replaced with standalone tok2vec - # models (e.g. so component can be frozen without its performance - # degrading when other components/tok2vec are updated) - paths = sourced.get(listener, {}).get("replace_listeners", []) - if paths: - nlp.replace_listeners(name, listener, paths) + # Remove listeners not in the pipeline + listener_names = getattr(proc, "listening_components", []) + unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names] + for listener_name in unused_listener_names: + for listener in proc.listener_map.get(listener_name, []): + proc.remove_listener(listener, listener_name) + + for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer + # If it's a component sourced from another pipeline, we check if + # the tok2vec listeners should be replaced with standalone tok2vec + # models (e.g. so component can be frozen without its performance + # degrading when other components/tok2vec are updated) + paths = sourced.get(listener, {}).get("replace_listeners", []) + if paths: + nlp.replace_listeners(name, listener, paths) return nlp def replace_listeners( diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index d017aa909..f623627eb 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -72,17 +72,16 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": logger.info(f"Initialized pipeline components: {nlp.pipe_names}") # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: - if getattr(proc, "listening_components", None): # e.g. tok2vec/transformer - for listener in proc.listening_components: - # Don't warn about components not in the pipeline - if listener not in nlp.pipeline: - continue + for listener in getattr(proc, "listening_components", []): # e.g. 
tok2vec/transformer + # Don't warn about components not in the pipeline + if listener not in nlp.pipe_names: + continue - if listener in frozen_components and name not in frozen_components: - logger.warning(Warnings.W087.format(name=name, listener=listener)) - # We always check this regardless, in case user freezes tok2vec - if listener not in frozen_components and name in frozen_components: - logger.warning(Warnings.W086.format(name=name, listener=listener)) + if listener in frozen_components and name not in frozen_components: + logger.warning(Warnings.W087.format(name=name, listener=listener)) + # We always check this regardless, in case user freezes tok2vec + if listener not in frozen_components and name in frozen_components: + logger.warning(Warnings.W086.format(name=name, listener=listener)) return nlp From 9fd41d674296bcfffc064cb7bcae8f0b5dcb6880 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 17 Mar 2021 14:54:04 +0100 Subject: [PATCH 05/18] Remove Language.pipe cleanup arg --- website/docs/api/language.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/website/docs/api/language.md b/website/docs/api/language.md index a90476dab..ca87cbb16 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -198,7 +198,6 @@ more efficient than processing texts one-by-one. | `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ | | `batch_size` | The number of texts to buffer. ~~Optional[int]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ | -| `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ | | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | | `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ | | **YIELDS** | Documents in the order of the original text. ~~Doc~~ | @@ -872,10 +871,10 @@ when loading a config with > replace_listeners = ["model.tok2vec"] > ``` -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec_name` | Name of the token-to-vector component, typically `"tok2vec"` or `"transformer"`.~~str~~ | -| `pipe_name` | Name of pipeline component to replace listeners for. ~~str~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec_name` | Name of the token-to-vector component, typically `"tok2vec"` or `"transformer"`.~~str~~ | +| `pipe_name` | Name of pipeline component to replace listeners for. 
~~str~~ | | `listeners` | The paths to the listeners, relative to the component config, e.g. `["model.tok2vec"]`. Typically, implementations will only connect to one tok2vec component, `model.tok2vec`, but in theory, custom models can use multiple listeners. The value here can either be an empty list to not replace any listeners, or a _complete_ list of the paths to all listener layers used by the model that should be replaced.~~Iterable[str]~~ | ## Language.meta {#meta tag="property"} From 83c1b919a7f35452a23a1016fd862e6034107cfb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 17 Mar 2021 14:54:40 +0100 Subject: [PATCH 06/18] Fix positional/option in CLI types --- website/docs/api/cli.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 16e84e53f..73a03cba8 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -77,7 +77,7 @@ $ python -m spacy info [model] [--markdown] [--silent] [--exclude] | Name | Description | | ------------------------------------------------ | --------------------------------------------------------------------------------------------- | -| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ | +| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~ | | `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ | | `--silent`, `-s` 2.0.12 | Don't print anything, just return the values. ~~bool (flag)~~ | | `--exclude`, `-e` | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ | @@ -259,7 +259,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] | Name | Description | | ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | | `input_file` | Input file. ~~Path (positional)~~ | -| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~ | +| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ | | `--converter`, `-c` 2 | Name of converter to use (see below). ~~str (option)~~ | | `--file-type`, `-t` 2.1 | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ | | `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ | @@ -642,7 +642,7 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts] | Name | Description | | ----------------- | ---------------------------------------------------------------------------------- | | `model` | A loadable spaCy pipeline (package name or path). ~~str (positional)~~ | -| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ | +| `inputs` | Path to input file, or `-` for standard input. ~~Path (positional)~~ | | `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **PRINTS** | Profiling information for the pipeline. 
| @@ -1191,14 +1191,14 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] > $ python -m spacy project dvc all > ``` -| Name | Description | -| ----------------- | ----------------------------------------------------------------------------------------------------------------- | -| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | -| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(positional)~~ | -| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ | -| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. | +| Name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ | +| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ | +| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. | ## ray {#ray new="3"} @@ -1236,7 +1236,7 @@ $ python -m spacy ray train [config_path] [--code] [--output] [--n-workers] [--a | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ | +| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | | `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ | | `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. 
~~int (option)~~ | From 9a254d39956ecee8dd124c6223711732324a35e4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 17 Mar 2021 15:05:22 +0100 Subject: [PATCH 07/18] Include all en_core_web_sm components in examples --- website/docs/usage/processing-pipelines.md | 29 +++++++++++----------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 909a9c7de..25eaf6558 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -54,9 +54,8 @@ texts = ["This is a text", "These are lots of texts", "..."] In this example, we're using [`nlp.pipe`](/api/language#pipe) to process a (potentially very large) iterable of texts as a stream. Because we're only accessing the named entities in `doc.ents` (set by the `ner` component), we'll -disable all other statistical components (the `tagger` and `parser`) during -processing. `nlp.pipe` yields `Doc` objects, so we can iterate over them and -access the named entity predictions: +disable all other components during processing. `nlp.pipe` yields `Doc` +objects, so we can iterate over them and access the named entity predictions: > #### ✏️ Things to try > @@ -73,7 +72,7 @@ texts = [ ] nlp = spacy.load("en_core_web_sm") -for doc in nlp.pipe(texts, disable=["tagger", "parser"]): +for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]): # Do something with the doc here print([(ent.text, ent.label_) for ent in doc.ents]) ``` @@ -144,10 +143,12 @@ nlp = spacy.load("en_core_web_sm") ``` ... the pipeline's `config.cfg` tells spaCy to use the language `"en"` and the -pipeline `["tok2vec", "tagger", "parser", "ner"]`. spaCy will then initialize -`spacy.lang.en.English`, and create each pipeline component and add it to the -processing pipeline. It'll then load in the model data from the data directory -and return the modified `Language` class for you to use as the `nlp` object. +pipeline +`["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]`. spaCy +will then initialize `spacy.lang.en.English`, and create each pipeline component +and add it to the processing pipeline. It'll then load in the model data from +the data directory and return the modified `Language` class for you to use as +the `nlp` object. @@ -171,7 +172,7 @@ the binary data: ```python ### spacy.load under the hood lang = "en" -pipeline = ["tok2vec", "tagger", "parser", "ner"] +pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"] data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0" cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English @@ -186,7 +187,7 @@ component** on the `Doc`, in order. Since the model data is loaded, the components can access it to assign annotations to the `Doc` object, and subsequently to the `Token` and `Span` which are only views of the `Doc`, and don't own any data themselves. All components return the modified document, -which is then processed by the component next in the pipeline. +which is then processed by the next component in the pipeline. ```python ### The pipeline under the hood @@ -201,9 +202,9 @@ list of human-readable component names. 
```python print(nlp.pipeline) -# [('tok2vec', ), ('tagger', ), ('parser', ), ('ner', )] +# [('tok2vec', ), ('tagger', ), ('parser', ), ('ner', ), ('attribute_ruler', ), ('lemmatizer', )] print(nlp.pipe_names) -# ['tok2vec', 'tagger', 'parser', 'ner'] +# ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'] ``` ### Built-in pipeline components {#built-in} @@ -300,7 +301,7 @@ blocks. ```python ### Disable for block # 1. Use as a context manager -with nlp.select_pipes(disable=["tagger", "parser"]): +with nlp.select_pipes(disable=["tagger", "parser", "lemmatizer"]): doc = nlp("I won't be tagged and parsed") doc = nlp("I will be tagged and parsed") @@ -324,7 +325,7 @@ The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword argument if you only want to disable components during processing: ```python -for doc in nlp.pipe(texts, disable=["tagger", "parser"]): +for doc in nlp.pipe(texts, disable=["tagger", "parser", "lemmatizer"]): # Do something with the doc here ``` From c9e1a9ac174abe4c8113518955e56af6ea2c5a8d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 17 Mar 2021 21:28:04 +0100 Subject: [PATCH 08/18] Add multiprocessing section --- website/docs/usage/processing-pipelines.md | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 25eaf6558..9e8e87239 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -91,6 +91,55 @@ have to call `list()` on it first: +### Multiprocessing + +spaCy includes built-in support for multiprocessing with +[`nlp.pipe`](/api/language#pipe) using the `n_process` option: + +```python +# Multiprocessing with 4 processes +docs = nlp.pipe(texts, n_process=4) + +# With as many processes as CPUs (use with caution!) +docs = nlp.pipe(texts, n_process=-1) +``` + +Depending on your platform, starting many processes with multiprocessing can +add a lot of overhead. In particular, the default start method `spawn` used in +macOS/OS X (as of Python 3.8) and in Windows can be slow for larger models +because the model data is copied in memory for each new process. See the +[Python docs on +multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) +for further details. + +For shorter tasks and in particular with `spawn`, it can be faster to use a +smaller number of processes with a larger batch size. The optimal `batch_size` +setting will depend on the pipeline components, the length of your documents, +the number of processes and how much memory is available. + +```python +# Default batch size is `nlp.batch_size` (typically 1000) +docs = nlp.pipe(texts, n_process=2, batch_size=2000) +``` + + + +Multiprocessing is not generally recommended on GPU because RAM is too limited. +If you want to try it out, be aware that it is only possible using `spawn` due +to limitations in CUDA. + + + + + +In Linux, transformer models may hang or deadlock with multiprocessing due to an +[issue in PyTorch](https://github.com/pytorch/pytorch/issues/17199). One +suggested workaround is to use `spawn` instead of `fork` and another is to +limit the number of threads before loading any models using +`torch.set_num_threads(1)`. 
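+
+As a rough sketch of applying both workarounds (assuming PyTorch and the
+`en_core_web_trf` pipeline are installed; treat it as an illustration rather
+than a recommended configuration):
+
+```python
+import multiprocessing
+
+import spacy
+import torch
+
+if __name__ == "__main__":
+    # Apply both workarounds before any pipelines are loaded
+    multiprocessing.set_start_method("spawn", force=True)
+    torch.set_num_threads(1)
+
+    nlp = spacy.load("en_core_web_trf")
+    texts = ["This is a text.", "This is another text."]
+    for doc in nlp.pipe(texts, n_process=2):
+        print([(ent.text, ent.label_) for ent in doc.ents])
+```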
+ + + ## Pipelines and built-in components {#pipelines} spaCy makes it very easy to create your own pipelines consisting of reusable From acc58719da2f0b7584eedc913fd691a8ab0c750f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 18 Mar 2021 12:49:20 +0100 Subject: [PATCH 09/18] Update custom similarity hooks example --- website/docs/usage/processing-pipelines.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 9e8e87239..836bdac67 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1547,24 +1547,33 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`. | Name | Customizes | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `user_hooks` | [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) | +| `user_hooks` | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) | | `user_token_hooks` | [`Token.similarity`](/api/token#similarity), [`Token.vector`](/api/token#vector), [`Token.has_vector`](/api/token#has_vector), [`Token.vector_norm`](/api/token#vector_norm), [`Token.conjuncts`](/api/token#conjuncts) | | `user_span_hooks` | [`Span.similarity`](/api/span#similarity), [`Span.vector`](/api/span#vector), [`Span.has_vector`](/api/span#has_vector), [`Span.vector_norm`](/api/span#vector_norm), [`Span.root`](/api/span#root) | ```python ### Add custom similarity hooks +from spacy.language import Language + + class SimilarityModel: - def __init__(self, model): - self._model = model + def __init__(self, name: str, index: int): + self.name = name + self.index = index def __call__(self, doc): doc.user_hooks["similarity"] = self.similarity doc.user_span_hooks["similarity"] = self.similarity doc.user_token_hooks["similarity"] = self.similarity + return doc def similarity(self, obj1, obj2): - y = self._model([obj1.vector, obj2.vector]) - return float(y[0]) + return obj1.vector[self.index] + obj2.vector[self.index] + + +@Language.factory("similarity_component", default_config={"index": 0}) +def create_similarity_component(nlp, name, index: int): + return SimilarityModel(name, index) ``` ## Developing plugins and wrappers {#plugins} From 0fb1881f36f68b42b6b096915c153ef189b21ff2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 18 Mar 2021 13:29:51 +0100 Subject: [PATCH 10/18] Reformat processing pipelines --- website/docs/usage/processing-pipelines.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 836bdac67..a669bda7d 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -54,8 +54,8 @@ texts = ["This is a text", "These are lots of texts", "..."] In this example, we're using [`nlp.pipe`](/api/language#pipe) to process a (potentially very large) iterable of texts as a stream. Because we're only accessing the named entities in `doc.ents` (set by the `ner` component), we'll -disable all other components during processing. 
`nlp.pipe` yields `Doc` -objects, so we can iterate over them and access the named entity predictions: +disable all other components during processing. `nlp.pipe` yields `Doc` objects, +so we can iterate over them and access the named entity predictions: > #### ✏️ Things to try > @@ -104,12 +104,11 @@ docs = nlp.pipe(texts, n_process=4) docs = nlp.pipe(texts, n_process=-1) ``` -Depending on your platform, starting many processes with multiprocessing can -add a lot of overhead. In particular, the default start method `spawn` used in +Depending on your platform, starting many processes with multiprocessing can add +a lot of overhead. In particular, the default start method `spawn` used in macOS/OS X (as of Python 3.8) and in Windows can be slow for larger models because the model data is copied in memory for each new process. See the -[Python docs on -multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) +[Python docs on multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) for further details. For shorter tasks and in particular with `spawn`, it can be faster to use a @@ -134,8 +133,8 @@ to limitations in CUDA. In Linux, transformer models may hang or deadlock with multiprocessing due to an [issue in PyTorch](https://github.com/pytorch/pytorch/issues/17199). One -suggested workaround is to use `spawn` instead of `fork` and another is to -limit the number of threads before loading any models using +suggested workaround is to use `spawn` instead of `fork` and another is to limit +the number of threads before loading any models using `torch.set_num_threads(1)`. @@ -1547,7 +1546,7 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`. | Name | Customizes | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `user_hooks` | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) | +| `user_hooks` | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) | | `user_token_hooks` | [`Token.similarity`](/api/token#similarity), [`Token.vector`](/api/token#vector), [`Token.has_vector`](/api/token#has_vector), [`Token.vector_norm`](/api/token#vector_norm), [`Token.conjuncts`](/api/token#conjuncts) | | `user_span_hooks` | [`Span.similarity`](/api/span#similarity), [`Span.vector`](/api/span#vector), [`Span.has_vector`](/api/span#has_vector), [`Span.vector_norm`](/api/span#vector_norm), [`Span.root`](/api/span#root) | From 40e5d3a980886548dd0c692654f00dd26bac519a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 18 Mar 2021 16:56:10 +0100 Subject: [PATCH 11/18] Update saving/loading example --- website/docs/usage/saving-loading.md | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index f15493fd7..9dad077e7 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -19,9 +19,8 @@ import Serialization101 from 'usage/101/\_serialization.md' When serializing the pipeline, keep in mind that this will only save out the 
**binary data for the individual components** to allow spaCy to restore them – not the entire objects. This is a good thing, because it makes serialization -safe. But it also means that you have to take care of storing the language name -and pipeline component names as well, and restoring them separately before you -can load in the data. +safe. But it also means that you have to take care of storing the config, which +contains the pipeline configuration and all the relevant settings. > #### Saving the meta and config > @@ -33,24 +32,21 @@ can load in the data. ```python ### Serialize +config = nlp.config bytes_data = nlp.to_bytes() -lang = nlp.config["nlp"]["lang"] # "en" -pipeline = nlp.config["nlp"]["pipeline"] # ["tagger", "parser", "ner"] ``` ```python ### Deserialize -nlp = spacy.blank(lang) -for pipe_name in pipeline: - nlp.add_pipe(pipe_name) +lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"]) +nlp = lang_cls.from_config(config) nlp.from_bytes(bytes_data) ``` This is also how spaCy does it under the hood when loading a pipeline: it loads the `config.cfg` containing the language and pipeline information, initializes -the language class, creates and adds the pipeline components based on the -defined [factories](/usage/processing-pipeline#custom-components-factories) and -_then_ loads in the binary data. You can read more about this process +the language class, creates and adds the pipeline components based on the config +and _then_ loads in the binary data. You can read more about this process [here](/usage/processing-pipelines#pipelines). ## Serializing Doc objects efficiently {#docs new="2.2"} From 6354b642c5dcc806e4704f9b0caa0c0fe2543e13 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 18 Mar 2021 19:01:10 +0100 Subject: [PATCH 12/18] Fix typo --- website/docs/models/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 1d03b4c3d..d37e9471d 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -175,7 +175,7 @@ disable everything else: nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]) ``` -In the transformer models, `ner` listens to the `transformer` compoinent, so you +In the transformer models, `ner` listens to the `transformer` component, so you can disable all components related tagging, parsing, and lemmatization. 
```python From 6a9a46776661c32ae9a95d9abddf81f7b905a118 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Mar 2021 08:12:49 +0100 Subject: [PATCH 13/18] Update website/docs/usage/processing-pipelines.md Co-authored-by: Ines Montani --- website/docs/usage/processing-pipelines.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index a669bda7d..52568658d 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -91,7 +91,7 @@ have to call `list()` on it first: -### Multiprocessing +### Multiprocessing {#multiprocessing} spaCy includes built-in support for multiprocessing with [`nlp.pipe`](/api/language#pipe) using the `n_process` option: From 48b90c8e1cf3942862ebc14e61842148060fa784 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Mar 2021 09:43:52 +0100 Subject: [PATCH 14/18] Update deprecated doc.is_sentenced in Corpus --- spacy/training/corpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index ae7b89f15..079b872d6 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -155,7 +155,7 @@ class Corpus: continue elif self.max_length == 0 or len(reference) < self.max_length: yield self._make_example(nlp, reference, False) - elif reference.is_sentenced: + elif reference.has_annotation("SENT_START"): for ref_sent in reference.sents: if len(ref_sent) == 0: continue @@ -166,7 +166,7 @@ class Corpus: self, nlp: "Language", reference_docs: Iterable[Doc] ) -> Iterator[Example]: for reference in reference_docs: - if reference.is_sentenced: + if reference.has_annotation("SENT_START"): ref_sents = [sent.as_doc() for sent in reference.sents] else: ref_sents = [reference] From c771ec22f05385f0089eeb724a44ff3ec0a7815d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Mar 2021 10:11:10 +0100 Subject: [PATCH 15/18] Update matcher errors and docs * Mention `tagger+attribute_ruler` in `POS`/`MORPH` error messages for `Matcher` and `PhraseMatcher` * Document `Matcher.__call__(allow_missing=)` --- spacy/matcher/matcher.pyx | 4 +++- spacy/matcher/phrasematcher.pyx | 2 +- website/docs/api/matcher.md | 13 +++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index ec5d72f9e..26dca05eb 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -202,6 +202,8 @@ cdef class Matcher: doclike (Doc or Span): The document to match over. as_spans (bool): Return Span objects with labels instead of (match_id, start, end) tuples. + allow_missing (bool): Whether to skip checks for missing annotation for + attributes included in patterns. Defaults to False. RETURNS (list): A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is an integer. 
If as_spans is set @@ -222,7 +224,7 @@ cdef class Matcher: if attr == TAG: pipe = "tagger" elif attr in (POS, MORPH): - pipe = "morphologizer" + pipe = "morphologizer or tagger+attribute_ruler" elif attr == LEMMA: pipe = "lemmatizer" elif attr == DEP: diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 088456b9a..e5ff2202c 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -194,7 +194,7 @@ cdef class PhraseMatcher: if attr == TAG: pipe = "tagger" elif attr in (POS, MORPH): - pipe = "morphologizer" + pipe = "morphologizer or tagger+attribute_ruler" elif attr == LEMMA: pipe = "lemmatizer" elif attr == DEP: diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 7c39d9caf..95a76586a 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -120,12 +120,13 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`. > matches = matcher(doc) > ``` -| Name | Description | -| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | -| _keyword-only_ | | -| `as_spans` 3 | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ | -| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ | +| Name | Description | +| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | +| _keyword-only_ | | +| `as_spans` 3 | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ | +| `allow_missing` 3 | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~ | +| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. 
~~Union[List[Tuple[int, int, int]], List[Span]]~~ | ## Matcher.\_\_len\_\_ {#len tag="method" new="2"} From 39153ef90f54fc92f96673789a719a4e0a8fc0b3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Mar 2021 10:45:16 +0100 Subject: [PATCH 16/18] Update lexeme_norm checks * Add util method for check * Add new languages to list with lexeme norm tables * Add check to all relevant components * Add config details to warning message Note that we're not actually inspecting the model config to see if `NORM` is used as an attribute, so it may warn in cases where it's not relevant. --- spacy/errors.py | 9 +++++++-- spacy/pipeline/morphologizer.pyx | 1 + spacy/pipeline/senter.pyx | 1 + spacy/pipeline/tagger.pyx | 1 + spacy/pipeline/transition_parser.pyx | 5 +---- spacy/util.py | 26 ++++++++++++++++++++------ 6 files changed, 31 insertions(+), 12 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 4f9e90b57..d8c5cc3a8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -73,8 +73,13 @@ class Warnings: "degree. If this is intentional or the language you're using " "doesn't have a normalization table, please ignore this warning. " "If this is surprising, make sure you have the spacy-lookups-data " - "package installed. The languages with lexeme normalization tables " - "are currently: {langs}") + "package installed and load the table in your config. The " + "languages with lexeme normalization tables are currently: " + "{langs}\n\nLoad the table in your config with:\n\n" + "[initialize.lookups]\n" + "@misc = \"spacy.LookupsDataLoader.v1\"\n" + "lang = ${{nlp.lang}}\n" + "tables = [\"lexeme_norm\"]\n") W035 = ('Discarding subpattern "{pattern}" due to an unrecognized ' "attribute or operator.") diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 38da71ec7..cd0081346 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -137,6 +137,7 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#initialize """ validate_get_examples(get_examples, "Morphologizer.initialize") + util.check_lexeme_norms(self.vocab, "morphologizer") if labels is not None: self.cfg["labels_morph"] = labels["morph"] self.cfg["labels_pos"] = labels["pos"] diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index c03ec0462..83cd06739 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger): DOCS: https://spacy.io/api/sentencerecognizer#initialize """ validate_get_examples(get_examples, "SentenceRecognizer.initialize") + util.check_lexeme_norms(self.vocab, "senter") doc_sample = [] label_sample = [] assert self.labels, Errors.E924.format(name=self.name) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 204308dcc..9af5245c1 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -249,6 +249,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#initialize """ validate_get_examples(get_examples, "Tagger.initialize") + util.check_lexeme_norms(self.vocab, "tagger") if labels is not None: for tag in labels: self.add_label(tag) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 816870a3e..4de57d311 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe): def initialize(self, get_examples, nlp=None, labels=None): validate_get_examples(get_examples, "Parser.initialize") - 
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) - if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: - langs = ", ".join(util.LEXEME_NORM_LANGS) - util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs)) + util.check_lexeme_norms(self.vocab, "parser or NER") if labels is not None: actions = dict(labels) else: diff --git a/spacy/util.py b/spacy/util.py index 389e3504f..9915de935 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -59,7 +59,7 @@ if TYPE_CHECKING: OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 -LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"] +LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config.cfg. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. @@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co logger = logging.getLogger("spacy") logger_stream_handler = logging.StreamHandler() -logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")) +logger_stream_handler.setFormatter( + logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s") +) logger.addHandler(logger_stream_handler) @@ -1454,10 +1456,13 @@ def is_cython_func(func: Callable) -> bool: if hasattr(func, attr): # function or class instance return True # https://stackoverflow.com/a/55767059 - if hasattr(func, "__qualname__") and hasattr(func, "__module__") \ - and func.__module__ in sys.modules: # method - cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] - return hasattr(cls_func, attr) + if ( + hasattr(func, "__qualname__") + and hasattr(func, "__module__") + and func.__module__ in sys.modules + ): # method + cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] + return hasattr(cls_func, attr) return False @@ -1508,7 +1513,16 @@ def warn_if_jupyter_cupy(): """ if is_in_jupyter(): from thinc.backends.cupy_ops import CupyOps + if CupyOps.xp is not None: from thinc.backends import contextvars_eq_thread_ops + if not contextvars_eq_thread_ops(): warnings.warn(Warnings.W111) + + +def check_lexeme_norms(vocab, component_name): + lexeme_norms = vocab.lookups.get_table("lexeme_norm", {}) + if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS: + langs = ", ".join(LEXEME_NORM_LANGS) + logger.debug(Warnings.W033.format(model=component_name, langs=langs)) From e39c0dcf336170ee0358f1c86a7146dcc54862c4 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sat, 20 Mar 2021 18:40:00 +0900 Subject: [PATCH 17/18] Fix mismatched backtick in Lexeme docs --- website/docs/api/lexeme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index c1837fd05..c99f19482 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -133,8 +133,8 @@ The L2 norm of the lexeme's vector representation. | `norm_` | The lexemes's norm, i.e. a normalized form of the lexeme text. ~~str~~ | | `lower` | Lowercase form of the word. ~~int~~ | | `lower_` | Lowercase form of the word. ~~str~~ | -| `shape` | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. 
~~int~~ | -| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | +| `shape` | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | +| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | | `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ | | `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ | | `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ | From 0d2b723e8d1ae02dcdf06500188f06172b098420 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sat, 20 Mar 2021 11:38:55 +0100 Subject: [PATCH 18/18] Update entity setting section --- website/docs/usage/linguistic-features.md | 26 ++++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index fd76c6e4d..40ea2bf9c 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -599,18 +599,27 @@ ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] print('Before', ents) # The model didn't recognize "fb" as an entity :( -fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity +# Create a span for the new entity +fb_ent = Span(doc, 0, 1, label="ORG") + +# Option 1: Modify the provided entity spans, leaving the rest unmodified +doc.set_ents([fb_ent], default="unmodified") + +# Option 2: Assign a complete list of ents to doc.ents doc.ents = list(doc.ents) + [fb_ent] -ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] +ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents] print('After', ents) -# [('fb', 0, 2, 'ORG')] 🎉 +# [('fb', 0, 1, 'ORG')] 🎉 ``` -Keep in mind that you need to create a `Span` with the start and end index of -the **token**, not the start and end index of the entity in the document. In -this case, "fb" is token `(0, 1)` – but at the document level, the entity will -have the start and end indices `(0, 2)`. +Keep in mind that `Span` is initialized with the start and end **token** +indices, not the character offsets. To create a span from character offsets, use +[`Doc.char_span`](/api/doc#char_span): + +```python +fb_ent = doc.char_span(0, 2, label="ORG") +``` #### Setting entity annotations from array {#setting-from-array} @@ -645,9 +654,10 @@ write efficient native code. ```python # cython: infer_types=True +from spacy.typedefs cimport attr_t from spacy.tokens.doc cimport Doc -cpdef set_entity(Doc doc, int start, int end, int ent_type): +cpdef set_entity(Doc doc, int start, int end, attr_t ent_type): for i in range(start, end): doc.c[i].ent_type = ent_type doc.c[start].ent_iob = 3