Recommend lookups tables from URLs or other loaders (#12283)

* Recommend lookup tables from URLs or other loaders

Shift away from the `lookups` extra (which isn't removed, just no longer
mentioned) and recommend loading data from the `spacy-lookups-data` repo
or other sources rather than the `spacy-lookups-data` package.

If the tables can't be loaded from the `lookups` registry in the
lemmatizer, show how to specify the tables in `[initialize]` rather than
recommending the `spacy-lookups-data` package.

* Add tests for some rule-based lemmatizers

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
Adriane Boyd 2023-07-31 15:54:35 +02:00 committed by GitHub
parent eaaac5a08c
commit 2702db9fef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 139 additions and 28 deletions

View File

@ -5,3 +5,5 @@ __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects" __projects__ = "https://github.com/explosion/projects"
__projects_branch__ = "v3" __projects_branch__ = "v3"
__lookups_tag__ = "v1.0.3"
__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"

View File

@ -1,6 +1,8 @@
import warnings import warnings
from typing import Literal from typing import Literal
from . import about
class ErrorsWithCodes(type): class ErrorsWithCodes(type):
def __getattribute__(self, code): def __getattribute__(self, code):
@ -103,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
"table. This may degrade the performance of the model to some " "table. This may degrade the performance of the model to some "
"degree. If this is intentional or the language you're using " "degree. If this is intentional or the language you're using "
"doesn't have a normalization table, please ignore this warning. " "doesn't have a normalization table, please ignore this warning. "
"If this is surprising, make sure you have the spacy-lookups-data " "If this is surprising, make sure you are loading the table in "
"package installed and load the table in your config. The " "your config. The languages with lexeme normalization tables are "
"languages with lexeme normalization tables are currently: " "currently: {langs}\n\nAn example of how to load a table in "
"{langs}\n\nLoad the table in your config with:\n\n" "your config :\n\n"
"[initialize.lookups]\n" "[initialize.lookups]\n"
"@misc = \"spacy.LookupsDataLoader.v1\"\n" "@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
"lang = ${{nlp.lang}}\n" "lang = ${{nlp.lang}}\n"
f'url = "{about.__lookups_url__}"\n'
"tables = [\"lexeme_norm\"]\n") "tables = [\"lexeme_norm\"]\n")
W035 = ("Discarding subpattern '{pattern}' due to an unrecognized " W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
"attribute or operator.") "attribute or operator.")
@ -983,6 +986,18 @@ class Errors(metaclass=ErrorsWithCodes):
"{existing_value}.") "{existing_value}.")
E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.") E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.") E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
"[initialize] or in registered lookups (spacy-lookups-data). An "
"example for how to load lemmatizer tables in [initialize]:\n\n"
"[initialize.components]\n\n"
"[initialize.components.{pipe_name}]\n\n"
"[initialize.components.{pipe_name}.lookups]\n"
'@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
"lang = ${{nlp.lang}}\n"
f'url = "{about.__lookups_url__}"\n'
"tables = {tables}\n"
"# or required tables only: tables = {required_tables}\n")
E4011 = ("Server error ({status_code}), couldn't fetch {url}")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

View File

@ -128,13 +128,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory return tokenizer_factory
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
    # Load the named lookups tables for `lang` from the registered lookups
    # data (usually provided by the spacy-lookups-data package).
    # NOTE(review): this copy is removed from language.py by this commit and
    # re-registered in lookups.py — confirm no other module imports it here.
    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
    lookups = load_lookups(lang=lang, tables=tables)
    return lookups
class Language: class Language:
"""A text-processing pipeline. Usually you'll load this once per process, """A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application. and pass the instance around your application.

View File

@ -2,16 +2,40 @@ from collections import OrderedDict
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
import requests
import srsly import srsly
from preshed.bloom import BloomFilter from preshed.bloom import BloomFilter
from .errors import Errors from .errors import Errors
from .strings import get_string_id from .strings import get_string_id
from .util import SimpleFrozenDict, ensure_path, load_language_data, registry from .util import SimpleFrozenDict, ensure_path, load_language_data, logger, registry
UNSET = object() UNSET = object()
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
    """Load the given lookups tables for a language from the registered
    lookups data (usually provided by the spacy-lookups-data package).

    lang (str): The language code.
    tables (List[str]): The names of the tables to load.
    RETURNS (Lookups): The loaded lookups container.
    """
    # Lazy %-style args instead of an f-string: the message is only built
    # when DEBUG logging is enabled (matches the style used elsewhere in
    # the project for this same message).
    logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
    return load_lookups(lang=lang, tables=tables)
@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
def load_lookups_data_from_url(lang, tables, url):
    """Load lookups tables from a URL directory containing per-table JSON
    files named "{lang}_{table}.json" (the layout used by the
    spacy-lookups-data repository).

    lang (str): The language code.
    tables (List[str]): The names of the tables to load.
    url (str): The URL of the directory containing the table files.
        NOTE(review): assumed to end with a trailing slash — confirm, since
        the table URL is built by plain concatenation.
    RETURNS (Lookups): The loaded lookups container.
    RAISES (ValueError): E4011 if a table fetch returns a non-200 status.
    """
    # Lazy %-style args so the message is only formatted when DEBUG is on.
    logger.debug("Loading lookups from %s: %s", url, tables)
    lookups = Lookups()
    for table in tables:
        table_url = f"{url}{lang}_{table}.json"
        r = requests.get(table_url)
        if r.status_code != 200:
            raise ValueError(
                Errors.E4011.format(status_code=r.status_code, url=table_url)
            )
        lookups.add_table(table, r.json())
    return lookups
def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups": def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
"""Load the data from the spacy-lookups-data package for a given language, """Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty `Lookups` container if there's no data or if the package if available. Returns an empty `Lookups` container if there's no data or if the package

View File

@ -2,6 +2,7 @@ import warnings
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import srsly
from thinc.api import Model from thinc.api import Model
from .. import util from .. import util
@ -155,8 +156,24 @@ class Lemmatizer(Pipe):
""" """
required_tables, optional_tables = self.get_lookups_config(self.mode) required_tables, optional_tables = self.get_lookups_config(self.mode)
if lookups is None: if lookups is None:
logger.debug("Lemmatizer: loading tables from spacy-lookups-data") logger.debug(
lookups = load_lookups(lang=self.vocab.lang, tables=required_tables) "Lemmatizer: no lemmatizer lookups tables provided, "
"trying to load tables from registered lookups (usually "
"spacy-lookups-data)"
)
lookups = load_lookups(
lang=self.vocab.lang, tables=required_tables, strict=False
)
missing_tables = set(required_tables) - set(lookups.tables)
if len(missing_tables) > 0:
raise ValueError(
Errors.E4010.format(
missing_tables=list(missing_tables),
pipe_name=self.name,
required_tables=srsly.json_dumps(required_tables),
tables=srsly.json_dumps(required_tables + optional_tables),
)
)
optional_lookups = load_lookups( optional_lookups = load_lookups(
lang=self.vocab.lang, tables=optional_tables, strict=False lang=self.vocab.lang, tables=optional_tables, strict=False
) )

View File

@ -2,9 +2,11 @@ import pickle
import pytest import pytest
import spacy
from spacy import registry, util from spacy import registry, util
from spacy.about import __lookups_url__
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lookups import Lookups from spacy.lookups import Lookups, load_lookups_data_from_url
from ..util import make_tempdir from ..util import make_tempdir
@ -113,3 +115,15 @@ def test_lemmatizer_serialize(nlp):
# Make sure that lemmatizer cache can be pickled # Make sure that lemmatizer cache can be pickled
pickle.dumps(lemmatizer2) pickle.dumps(lemmatizer2)
@pytest.mark.parametrize("lang", ("ca", "en"))
def test_lemmatizer_load_lookups_from_url(lang):
    # Verify that rule-based lemmatizer tables can be fetched from the
    # spacy-lookups-data URL and used to initialize the lemmatizer.
    # NOTE(review): this test downloads the tables over the network from
    # __lookups_url__, so it requires internet access and may be slow or
    # flaky in CI — consider a network marker or mocked responses.
    nlp = spacy.blank(lang)
    lemmatizer = nlp.add_pipe("lemmatizer")
    # Fetch both the required and the optional tables for the current mode.
    req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
    lookups = load_lookups_data_from_url(
        nlp.lang, req_tables + opt_tables, __lookups_url__
    )
    lemmatizer.initialize(lookups=lookups)
    # Every requested table should be present after initialization.
    assert set(lemmatizer.lookups.tables) == set(req_tables + opt_tables)

View File

@ -14,7 +14,7 @@ implement their own lemmatizer components via
[language-specific factories](/usage/processing-pipelines#factories-language). [language-specific factories](/usage/processing-pipelines#factories-language).
The default data used is provided by the The default data used is provided by the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
extension package. repository.
For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer). For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).
@ -174,6 +174,8 @@ training. At runtime, all data is loaded from disk.
> >
> ```python > ```python
> lemmatizer = nlp.add_pipe("lemmatizer") > lemmatizer = nlp.add_pipe("lemmatizer")
> req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
> lookups = load_lookups(nlp.lang, req_tables + opt_tables)
> lemmatizer.initialize(lookups=lookups) > lemmatizer.initialize(lookups=lookups)
> ``` > ```
> >

View File

@ -9,6 +9,7 @@ menu:
- ['Batchers', 'batchers'] - ['Batchers', 'batchers']
- ['Augmenters', 'augmenters'] - ['Augmenters', 'augmenters']
- ['Callbacks', 'callbacks'] - ['Callbacks', 'callbacks']
- ['Miscellaneous', 'misc']
- ['Training & Alignment', 'gold'] - ['Training & Alignment', 'gold']
- ['Utility Functions', 'util'] - ['Utility Functions', 'util']
--- ---
@ -934,6 +935,54 @@ methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
| `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ | | `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ |
| **CREATES** | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~ | | **CREATES** | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~ |
## Miscellaneous {id="misc"}
### spacy.LookupsDataLoader.v1 {id="lookups_data_reader",tag="registered function",version="3"}
> #### Example config
>
> ```ini
> [initialize.lookups]
> @misc = "spacy.LookupsDataLoader.v1"
> lang = ${nlp.lang}
> tables = ["lexeme_prob"]
> ```
Load the specified tables from the [`lookups` registry](#registry), which are
provided by a package such as
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------- |
| `lang` | The language. ~~str~~ |
| `tables` | The tables to load. ~~List[str]~~ |
| **CREATES** | A function that loads the specified tables from the lookups registry. ~~Callable[[], Lookups]~~ |
### spacy.LookupsDataLoaderFromURL.v1 {id="lookups_data_reader_from_url",tag="registered function",version="4"}
> #### Example config
>
> ```ini
> [initialize.components.lemmatizer.lookups]
> @misc = "spacy.LookupsDataLoaderFromURL.v1"
> lang = ${nlp.lang}
> url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
> tables = ["lemma_rules","lemma_exc","lemma_index"]
> ```
Load the specified tables from the provided URL. The individual tables are
expected to have filenames in the format `{lang}_{table}.json` under the
specified URL directory as in the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data)
repository.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------- |
| `lang` | The language. ~~str~~ |
| `url` | The URL for the directory where the tables can be downloaded. ~~str~~ |
| `tables` | The tables to load. ~~List[str]~~ |
| **CREATES** | A function that loads the specified tables from the provided URL. ~~Callable[[], Lookups]~~ |
## Training data and alignment {id="gold",source="spacy/training"} ## Training data and alignment {id="gold",source="spacy/training"}
### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"} ### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"}

View File

@ -59,19 +59,18 @@ $ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
``` ```
spaCy also lets you install extra dependencies by specifying the following spaCy also lets you install extra dependencies by specifying the following
keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with keywords in brackets, e.g. `spacy[ja]` or `spacy[apple,transformers]` (with
multiple comma-separated extras). See the `[options.extras_require]` section in multiple comma-separated extras). See the `[options.extras_require]` section in
spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included. spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
> #### Example > #### Example
> >
> ```bash > ```bash
> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS > $ pip install %%SPACY_PKG_NAME[apple,transformers]%%SPACY_PKG_FLAGS
> ``` > ```
| Name | Description | | Name | Description |
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. | | `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. | | `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
| `apple` | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1. | | `apple` | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1. |
@ -174,7 +173,7 @@ $ pip install --no-build-isolation --editable . # compile and install spaCy
To install with extras: To install with extras:
```bash ```bash
$ pip install --no-build-isolation --editable .[lookups,cuda102] $ pip install --no-build-isolation --editable .[ja,cuda102]
``` ```
How to install compilers and related build tools: How to install compilers and related build tools:

View File

@ -148,11 +148,11 @@ component.
</Infobox> </Infobox>
The data for spaCy's lemmatizers is distributed in the package The data for spaCy's lemmatizers is distributed in the repository
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
provided trained pipelines already include all the required tables, but if you provided trained pipelines already include all the required tables, but if you
are creating new pipelines, you'll probably want to install `spacy-lookups-data` are creating new pipelines, you can load data from the repository in the
to provide the data when the lemmatizer is initialized. lemmatizer initialization.
### Lookup lemmatizer {id="lemmatizer-lookup"} ### Lookup lemmatizer {id="lemmatizer-lookup"}

View File

@ -46,7 +46,6 @@ const QuickstartInstall = ({ id, title }) => {
const pipExtras = [ const pipExtras = [
hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda, hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda,
train && 'transformers', train && 'transformers',
train && 'lookups',
apple && 'apple', apple && 'apple',
...modelExtras, ...modelExtras,
] ]
@ -210,9 +209,6 @@ const QuickstartInstall = ({ id, title }) => {
<QS config="train" package="conda" comment prompt={false}> <QS config="train" package="conda" comment prompt={false}>
# packages only available via pip # packages only available via pip
</QS> </QS>
<QS config="train" package="conda">
pip install spacy-lookups-data
</QS>
{languages.map(({ code, models: modelOptions }) => { {languages.map(({ code, models: modelOptions }) => {
const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1] const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]