Recommend lookups tables from URLs or other loaders

Shift away from the `lookups` extra (which isn't removed, just no longer
mentioned) and recommend loading data from the `spacy-lookups-data` repo
or other sources rather than the `spacy-lookups-data` package.

If the tables can't be loaded from the `lookups` registry in the
lemmatizer, show how to specify the tables in `[initialize]` rather than
recommending the `spacy-lookups-data` package.
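
For example, a config can now point the lemmatizer at the hosted tables directly. This is a sketch mirroring the example added to the docs in this commit; the URL pins the `v1.0.3` tag of `spacy-lookups-data`:

```ini
[initialize.components.lemmatizer.lookups]
@misc = "spacy.LookupsDataLoaderFromURL.v1"
lang = ${nlp.lang}
url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
tables = ["lemma_rules","lemma_exc","lemma_index"]
```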
Adriane Boyd 2023-02-15 18:17:43 +01:00
parent cbc2ae933e
commit e4af62c89f
11 changed files with 125 additions and 27 deletions

View File

@@ -5,3 +5,5 @@ __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
 __projects_branch__ = "v3"
+__lookups_tag__ = "v1.0.3"
+__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"

View File

@@ -1,6 +1,8 @@
 from typing import Literal
 import warnings
 
+from . import about
+
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
@@ -103,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
             "table. This may degrade the performance of the model to some "
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
-            "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed and load the table in your config. The "
-            "languages with lexeme normalization tables are currently: "
-            "{langs}\n\nLoad the table in your config with:\n\n"
+            "If this is surprising, make sure you are loading the table in "
+            "your config. The languages with lexeme normalization tables are "
+            "currently: {langs}\n\nAn example of how to load a table in "
+            "your config:\n\n"
             "[initialize.lookups]\n"
-            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
             "lang = ${{nlp.lang}}\n"
+            f'url = "{about.__lookups_url__}"\n'
             "tables = [\"lexeme_norm\"]\n")
     W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
             "attribute or operator.")
@@ -961,6 +964,18 @@ class Errors(metaclass=ErrorsWithCodes):
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
+    E4005 = ("Required lemmatizer table(s) {missing_tables} not found in "
+             "[initialize] or in registered lookups (spacy-lookups-data). An "
+             "example for how to load lemmatizer tables in [initialize]:\n\n"
+             "[initialize.components]\n\n"
+             "[initialize.components.{pipe_name}]\n\n"
+             "[initialize.components.{pipe_name}.lookups]\n"
+             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
+             "lang = ${{nlp.lang}}\n"
+             f'url = "{about.__lookups_url__}"\n'
+             "tables = {tables}\n"
+             "# or required tables only: tables = {required_tables}\n")
+    E4006 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

View File

@@ -104,13 +104,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory
 
 
-@registry.misc("spacy.LookupsDataLoader.v1")
-def load_lookups_data(lang, tables):
-    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
-    lookups = load_lookups(lang=lang, tables=tables)
-    return lookups
-
-
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.

View File

@@ -1,17 +1,42 @@
 from typing import Any, List, Union, Optional, Dict
 from pathlib import Path
+import requests
 import srsly
 from preshed.bloom import BloomFilter
 from collections import OrderedDict
 
 from .errors import Errors
 from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
+from .util import logger
 from .strings import get_string_id
 
 UNSET = object()
 
 
+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
+@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
+def load_lookups_data_from_url(lang, tables, url):
+    logger.debug(f"Loading lookups from {url}: {tables}")
+    lookups = Lookups()
+    for table in tables:
+        table_url = url + lang + "_" + table + ".json"
+        r = requests.get(table_url)
+        if r.status_code != 200:
+            raise ValueError(
+                Errors.E4006.format(status_code=r.status_code, url=table_url)
+            )
+        table_data = r.json()
+        lookups.add_table(table, table_data)
+    return lookups
+
+
 def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
     """Load the data from the spacy-lookups-data package for a given language,
     if available. Returns an empty `Lookups` container if there's no data or if the package
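
As a usage sketch (not part of the diff), the new registered loader can also be resolved and called directly; the language and table names here are illustrative:

```python
# Sketch: resolve the new registered loader and fetch a table over HTTP.
# Assumes a spaCy build containing this commit and network access.
from spacy.util import registry

load_from_url = registry.misc.get("spacy.LookupsDataLoaderFromURL.v1")
lookups = load_from_url(
    lang="en",
    tables=["lexeme_norm"],
    url="https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/",
)
print(lookups.tables)  # ['lexeme_norm']
```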

View File

@@ -1,5 +1,6 @@
 from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
 from thinc.api import Model
+import srsly
 from pathlib import Path
 import warnings
@@ -155,8 +156,24 @@ class Lemmatizer(Pipe):
         """
         required_tables, optional_tables = self.get_lookups_config(self.mode)
         if lookups is None:
-            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
-            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            logger.debug(
+                "Lemmatizer: no lemmatizer lookups tables provided, "
+                "trying to load tables from registered lookups (usually "
+                "spacy-lookups-data)"
+            )
+            lookups = load_lookups(
+                lang=self.vocab.lang, tables=required_tables, strict=False
+            )
+            missing_tables = set(required_tables) - set(lookups.tables)
+            if len(missing_tables) > 0:
+                raise ValueError(
+                    Errors.E4005.format(
+                        missing_tables=list(missing_tables),
+                        pipe_name=self.name,
+                        required_tables=srsly.json_dumps(required_tables),
+                        tables=srsly.json_dumps(required_tables + optional_tables),
+                    )
+                )
         optional_lookups = load_lookups(
             lang=self.vocab.lang, tables=optional_tables, strict=False
         )
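
To illustrate the new behavior, here is a hypothetical sketch of the failure mode this `raise` introduces (it assumes `spacy-lookups-data` is not installed, so no lookups are registered):

```python
# Sketch: initializing a rule lemmatizer with no lookups available
# now fails fast with E4005 instead of producing a broken pipeline.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
try:
    nlp.initialize()
except ValueError as err:
    print(err)  # E4005 message with a ready-to-adapt [initialize] example
```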

View File

@@ -55,7 +55,7 @@ if TYPE_CHECKING:
 # fmt: off
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
 # Default order of sections in the config file. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.

View File

@@ -14,7 +14,7 @@ implement their own lemmatizer components via
 [language-specific factories](/usage/processing-pipelines#factories-language).
 The default data used is provided by the
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
-extension package.
+repository.
 
 For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).
@@ -174,6 +174,8 @@ training. At runtime, all data is loaded from disk.
 >
 > ```python
 > lemmatizer = nlp.add_pipe("lemmatizer")
+> req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
+> lookups = load_lookups(nlp.lang, req_tables + opt_tables)
 > lemmatizer.initialize(lookups=lookups)
 > ```
 >

View File

@@ -9,6 +9,7 @@ menu:
   - ['Batchers', 'batchers']
   - ['Augmenters', 'augmenters']
   - ['Callbacks', 'callbacks']
+  - ['Miscellaneous', 'misc']
   - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
@@ -931,6 +932,54 @@ methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
 | `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ |
 | **CREATES**                 | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~ |
 
+## Miscellaneous {id="misc",version="3"}
+
+### spacy.LookupsDataLoader.v1 {id="lookups_data_reader",tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.lookups]
+> @misc = "spacy.LookupsDataLoader.v1"
+> lang = ${nlp.lang}
+> tables = ["lexeme_prob"]
+> ```
+
+Load the specified tables from the [`lookups` registry](#registry), which are
+provided by a package such as
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
+
+| Name        | Description |
+| ----------- | ------------------------------------------------------------------------------------------------ |
+| `lang`      | The language. ~~str~~ |
+| `tables`    | The tables to load. ~~List[str]~~ |
+| **CREATES** | A function that loads the specified tables from the lookups registry. ~~Callable[[], Lookups]~~ |
+
+### spacy.LookupsDataLoaderFromURL.v1 {id="lookups_data_reader_from_url",tag="registered function",version="4"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.lemmatizer.lookups]
+> @misc = "spacy.LookupsDataLoaderFromURL.v1"
+> lang = ${nlp.lang}
+> url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
+> tables = ["lemma_rules","lemma_exc","lemma_index"]
+> ```
+
+Load the specified tables from the provided URL. The individual tables are
+expected to have filenames in the format `{lang}_{table}.json` under the
+specified URL directory as in the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data)
+repository.
+
+| Name        | Description |
+| ----------- | --------------------------------------------------------------------------------------------- |
+| `lang`      | The language. ~~str~~ |
+| `url`       | The URL for the directory where the tables can be downloaded. ~~str~~ |
+| `tables`    | The tables to load. ~~List[str]~~ |
+| **CREATES** | A function that loads the specified tables from the provided URL. ~~Callable[[], Lookups]~~ |
+
 ## Training data and alignment {id="gold",source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"}

View File

@@ -59,19 +59,18 @@ $ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 ```
 
 spaCy also lets you install extra dependencies by specifying the following
-keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
+keywords in brackets, e.g. `spacy[ja]` or `spacy[apple,transformers]` (with
 multiple comma-separated extras). See the `[options.extras_require]` section in
 spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 
 > #### Example
 >
 > ```bash
-> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
+> $ pip install %%SPACY_PKG_NAME[apple,transformers]%%SPACY_PKG_FLAGS
 > ```
 
 | Name             | Description |
 | ---------------- | ----------- |
-| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
 | `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
 | `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
 | `apple`          | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1. |
@@ -174,7 +173,7 @@ $ pip install --no-build-isolation --editable . # compile and install spaCy
 To install with extras:
 
 ```bash
-$ pip install --no-build-isolation --editable .[lookups,cuda102]
+$ pip install --no-build-isolation --editable .[ja,cuda102]
 ```
 
 How to install compilers and related build tools:

View File

@@ -148,11 +148,11 @@ component.
 
 </Infobox>
 
-The data for spaCy's lemmatizers is distributed in the package
+The data for spaCy's lemmatizers is distributed in the repository
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
 provided trained pipelines already include all the required tables, but if you
-are creating new pipelines, you'll probably want to install `spacy-lookups-data`
-to provide the data when the lemmatizer is initialized.
+are creating new pipelines, you can load data from the repository in the
+lemmatizer initialization.
 
 ### Lookup lemmatizer {id="lemmatizer-lookup"}
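
For a lookup lemmatizer, such an `[initialize]` block might look like the following sketch (the `lemma_lookup` table name is an assumption based on the lookup mode's defaults):

```ini
[initialize.components.lemmatizer.lookups]
@misc = "spacy.LookupsDataLoaderFromURL.v1"
lang = ${nlp.lang}
url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
tables = ["lemma_lookup"]
```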

View File

@@ -46,7 +46,6 @@ const QuickstartInstall = ({ id, title }) => {
     const pipExtras = [
         hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda,
         train && 'transformers',
-        train && 'lookups',
         apple && 'apple',
         ...modelExtras,
     ]
@@ -210,9 +209,6 @@ const QuickstartInstall = ({ id, title }) => {
                     <QS config="train" package="conda" comment prompt={false}>
                         # packages only available via pip
                     </QS>
-                    <QS config="train" package="conda">
-                        pip install spacy-lookups-data
-                    </QS>
                     {languages.map(({ code, models: modelOptions }) => {
                         const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]