Recommend lookups tables from URLs or other loaders (#12283)

* Recommend lookups tables from URLs or other loaders

Shift away from the `lookups` extra (which isn't removed, just no longer
mentioned) and recommend loading data from the `spacy-lookups-data` repo
or other sources rather than the `spacy-lookups-data` package.

If the tables can't be loaded from the `lookups` registry in the
lemmatizer, show how to specify the tables in `[initialize]` rather than
recommending the `spacy-lookups-data` package.
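
In practice, the recommended flow looks like the new test added in this PR: fetch the tables from the repo by URL and pass them to `initialize`. A minimal sketch (same calls as the test, using the new `about.__lookups_url__` constant and `load_lookups_data_from_url`):

```python
# Sketch: load lemmatizer tables from the spacy-lookups-data repo by URL
# (mirrors the test added in this PR).
import spacy
from spacy.about import __lookups_url__
from spacy.lookups import load_lookups_data_from_url

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer")
req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
lookups = load_lookups_data_from_url(
    nlp.lang, req_tables + opt_tables, __lookups_url__
)
lemmatizer.initialize(lookups=lookups)
```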

* Add tests for some rule-based lemmatizers

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Adriane Boyd 2023-07-31 15:54:35 +02:00 committed by GitHub
parent eaaac5a08c
commit 2702db9fef
11 changed files with 139 additions and 28 deletions

View File

@@ -5,3 +5,5 @@ __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
__projects_branch__ = "v3"
__lookups_tag__ = "v1.0.3"
__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"

View File

@@ -1,6 +1,8 @@
import warnings
from typing import Literal
from . import about
class ErrorsWithCodes(type):
def __getattribute__(self, code):
@@ -103,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
"table. This may degrade the performance of the model to some "
"degree. If this is intentional or the language you're using "
"doesn't have a normalization table, please ignore this warning. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed and load the table in your config. The "
"languages with lexeme normalization tables are currently: "
"{langs}\n\nLoad the table in your config with:\n\n"
"If this is surprising, make sure you are loading the table in "
"your config. The languages with lexeme normalization tables are "
"currently: {langs}\n\nAn example of how to load a table in "
"your config :\n\n"
"[initialize.lookups]\n"
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
"@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
"lang = ${{nlp.lang}}\n"
f'url = "{about.__lookups_url__}"\n'
"tables = [\"lexeme_norm\"]\n")
W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
"attribute or operator.")
@@ -983,6 +986,18 @@ class Errors(metaclass=ErrorsWithCodes):
"{existing_value}.")
E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
"[initialize] or in registered lookups (spacy-lookups-data). An "
"example for how to load lemmatizer tables in [initialize]:\n\n"
"[initialize.components]\n\n"
"[initialize.components.{pipe_name}]\n\n"
"[initialize.components.{pipe_name}.lookups]\n"
'@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
"lang = ${{nlp.lang}}\n"
f'url = "{about.__lookups_url__}"\n'
"tables = {tables}\n"
"# or required tables only: tables = {required_tables}\n")
E4011 = ("Server error ({status_code}), couldn't fetch {url}")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

View File

@@ -128,13 +128,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
lookups = load_lookups(lang=lang, tables=tables)
return lookups
class Language:
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.

View File

@@ -2,16 +2,40 @@ from collections import OrderedDict
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
import requests
import srsly
from preshed.bloom import BloomFilter
from .errors import Errors
from .strings import get_string_id
from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
from .util import SimpleFrozenDict, ensure_path, load_language_data, logger, registry
UNSET = object()
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
lookups = load_lookups(lang=lang, tables=tables)
return lookups
@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
def load_lookups_data_from_url(lang, tables, url):
logger.debug(f"Loading lookups from {url}: {tables}")
lookups = Lookups()
for table in tables:
table_url = url + lang + "_" + table + ".json"
r = requests.get(table_url)
if r.status_code != 200:
raise ValueError(
Errors.E4011.format(status_code=r.status_code, url=table_url)
)
table_data = r.json()
lookups.add_table(table, table_data)
return lookups
def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
"""Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty `Lookups` container if there's no data or if the package
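
Both loaders are `@misc` registered functions, which is how the `[initialize.lookups]` and `[initialize.components.*.lookups]` config blocks reference them. A sketch of resolving one by hand (assumes catalogue's standard `registry.misc.get` lookup):

```python
# Sketch: resolve the new URL-based loader by its registered name and
# call it directly; equivalent to what config resolution does.
from spacy import registry
from spacy.about import __lookups_url__

loader = registry.misc.get("spacy.LookupsDataLoaderFromURL.v1")
lookups = loader("en", ["lexeme_norm"], __lookups_url__)
assert "lexeme_norm" in lookups.tables
```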

View File

@@ -2,6 +2,7 @@ import warnings
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import srsly
from thinc.api import Model
from .. import util
@@ -155,8 +156,24 @@ class Lemmatizer(Pipe):
"""
required_tables, optional_tables = self.get_lookups_config(self.mode)
if lookups is None:
logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
logger.debug(
"Lemmatizer: no lemmatizer lookups tables provided, "
"trying to load tables from registered lookups (usually "
"spacy-lookups-data)"
)
lookups = load_lookups(
lang=self.vocab.lang, tables=required_tables, strict=False
)
missing_tables = set(required_tables) - set(lookups.tables)
if len(missing_tables) > 0:
raise ValueError(
Errors.E4010.format(
missing_tables=list(missing_tables),
pipe_name=self.name,
required_tables=srsly.json_dumps(required_tables),
tables=srsly.json_dumps(required_tables + optional_tables),
)
)
optional_lookups = load_lookups(
lang=self.vocab.lang, tables=optional_tables, strict=False
)
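
The effect of the new fallback: if no lookups are passed, the lemmatizer first tries the registered lookups non-strictly, then raises the new E4010 error (which includes a copy-pasteable `[initialize]` block) for any still-missing required tables. A sketch, assuming `spacy-lookups-data` is not installed so the lookups registry has no tables for the language:

```python
# Sketch: E4010 is raised when required tables are found neither in
# [initialize] nor in registered lookups. Assumes spacy-lookups-data is
# NOT installed; otherwise its registered tables may satisfy the check.
import spacy

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer")
try:
    lemmatizer.initialize()
except ValueError as err:
    print(err)  # E4010: Required lemmatizer table(s) ... not found ...
```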

View File

@@ -2,9 +2,11 @@ import pickle
import pytest
import spacy
from spacy import registry, util
from spacy.about import __lookups_url__
from spacy.lang.en import English
from spacy.lookups import Lookups
from spacy.lookups import Lookups, load_lookups_data_from_url
from ..util import make_tempdir
@@ -113,3 +115,15 @@ def test_lemmatizer_serialize(nlp):
# Make sure that lemmatizer cache can be pickled
pickle.dumps(lemmatizer2)
@pytest.mark.parametrize("lang", ("ca", "en"))
def test_lemmatizer_load_lookups_from_url(lang):
nlp = spacy.blank(lang)
lemmatizer = nlp.add_pipe("lemmatizer")
req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
lookups = load_lookups_data_from_url(
nlp.lang, req_tables + opt_tables, __lookups_url__
)
lemmatizer.initialize(lookups=lookups)
assert set(lemmatizer.lookups.tables) == set(req_tables + opt_tables)

View File

@@ -14,7 +14,7 @@ implement their own lemmatizer components via
[language-specific factories](/usage/processing-pipelines#factories-language).
The default data used is provided by the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
extension package.
repository.
For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).
@@ -174,6 +174,8 @@ training. At runtime, all data is loaded from disk.
>
> ```python
> lemmatizer = nlp.add_pipe("lemmatizer")
> req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
> lookups = load_lookups(nlp.lang, req_tables + opt_tables)
> lemmatizer.initialize(lookups=lookups)
> ```
>

View File

@@ -9,6 +9,7 @@ menu:
- ['Batchers', 'batchers']
- ['Augmenters', 'augmenters']
- ['Callbacks', 'callbacks']
- ['Miscellaneous', 'misc']
- ['Training & Alignment', 'gold']
- ['Utility Functions', 'util']
---
@@ -934,6 +935,54 @@ methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
| `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ |
| **CREATES** | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~ |
## Miscellaneous {id="misc"}
### spacy.LookupsDataLoader.v1 {id="lookups_data_reader",tag="registered function",version="3"}
> #### Example config
>
> ```ini
> [initialize.lookups]
> @misc = "spacy.LookupsDataLoader.v1"
> lang = ${nlp.lang}
> tables = ["lexeme_prob"]
> ```
Load the specified tables from the [`lookups` registry](#registry), which are
provided by a package such as
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------- |
| `lang` | The language. ~~str~~ |
| `tables` | The tables to load. ~~List[str]~~ |
| **CREATES** | A function that loads the specified tables from the lookups registry. ~~Callable[[], Lookups]~~ |
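
The registered function is a thin wrapper around `load_lookups`; a sketch of the equivalent direct call, assuming a package such as `spacy-lookups-data` is installed so the table is available in the lookups registry:

```python
# Sketch: the direct equivalent of the example config above; requires a
# package (e.g. spacy-lookups-data) that registers tables in the
# lookups registry.
from spacy.lookups import load_lookups

lookups = load_lookups(lang="en", tables=["lexeme_prob"])
```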
### spacy.LookupsDataLoaderFromURL.v1 {id="lookups_data_reader_from_url",tag="registered function",version="4"}
> #### Example config
>
> ```ini
> [initialize.components.lemmatizer.lookups]
> @misc = "spacy.LookupsDataLoaderFromURL.v1"
> lang = ${nlp.lang}
> url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
> tables = ["lemma_rules","lemma_exc","lemma_index"]
> ```
Load the specified tables from the provided URL. The individual tables are
expected to have filenames in the format `{lang}_{table}.json` under the
specified URL directory as in the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data)
repository.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------- |
| `lang` | The language. ~~str~~ |
| `url` | The URL for the directory where the tables can be downloaded. ~~str~~ |
| `tables` | The tables to load. ~~List[str]~~ |
| **CREATES** | A function that loads the specified tables from the provided URL. ~~Callable[[], Lookups]~~ |
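
As a quick illustration of the filename convention, these are the URLs the loader fetches for the example config above (not an exhaustive list of available tables):

```python
# Sketch: the per-table URLs implied by the example config above, using
# the {lang}_{table}.json filename convention.
base = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
for table in ["lemma_rules", "lemma_exc", "lemma_index"]:
    print(base + "en" + "_" + table + ".json")
# .../data/en_lemma_rules.json, en_lemma_exc.json, en_lemma_index.json
```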
## Training data and alignment {id="gold",source="spacy/training"}
### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"}

View File

@@ -59,19 +59,18 @@ $ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
```
spaCy also lets you install extra dependencies by specifying the following
keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
keywords in brackets, e.g. `spacy[ja]` or `spacy[apple,transformers]` (with
multiple comma-separated extras). See the `[options.extras_require]` section in
spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
> #### Example
>
> ```bash
> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
> $ pip install %%SPACY_PKG_NAME[apple,transformers]%%SPACY_PKG_FLAGS
> ```
| Name | Description |
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
| `apple` | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1. |
@@ -174,7 +173,7 @@ $ pip install --no-build-isolation --editable . # compile and install spaCy
To install with extras:
```bash
$ pip install --no-build-isolation --editable .[lookups,cuda102]
$ pip install --no-build-isolation --editable .[ja,cuda102]
```
How to install compilers and related build tools:

View File

@@ -148,11 +148,11 @@ component.
</Infobox>
The data for spaCy's lemmatizers is distributed in the package
The data for spaCy's lemmatizers is distributed in the repository
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
provided trained pipelines already include all the required tables, but if you
are creating new pipelines, you'll probably want to install `spacy-lookups-data`
to provide the data when the lemmatizer is initialized.
are creating new pipelines, you can load data from the repository during
lemmatizer initialization.
### Lookup lemmatizer {id="lemmatizer-lookup"}

View File

@@ -46,7 +46,6 @@ const QuickstartInstall = ({ id, title }) => {
const pipExtras = [
hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda,
train && 'transformers',
train && 'lookups',
apple && 'apple',
...modelExtras,
]
@@ -210,9 +209,6 @@
<QS config="train" package="conda" comment prompt={false}>
# packages only available via pip
</QS>
<QS config="train" package="conda">
pip install spacy-lookups-data
</QS>
{languages.map(({ code, models: modelOptions }) => {
const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]