diff --git a/spacy/about.py b/spacy/about.py
index eddbeea09..eb85e6af3 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -5,3 +5,5 @@ __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
__projects_branch__ = "v3"
+__lookups_tag__ = "v1.0.3"
+__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"
diff --git a/spacy/errors.py b/spacy/errors.py
index eadbf63d6..56cdde409 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,8 @@
from typing import Literal
import warnings
+from . import about
+
class ErrorsWithCodes(type):
def __getattribute__(self, code):
@@ -103,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
"table. This may degrade the performance of the model to some "
"degree. If this is intentional or the language you're using "
"doesn't have a normalization table, please ignore this warning. "
- "If this is surprising, make sure you have the spacy-lookups-data "
- "package installed and load the table in your config. The "
- "languages with lexeme normalization tables are currently: "
- "{langs}\n\nLoad the table in your config with:\n\n"
+ "If this is surprising, make sure you are loading the table in "
+ "your config. The languages with lexeme normalization tables are "
+ "currently: {langs}\n\nAn example of how to load a table in "
+ "your config:\n\n"
"[initialize.lookups]\n"
- "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+ "@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
"lang = ${{nlp.lang}}\n"
+ f'url = "{about.__lookups_url__}"\n'
"tables = [\"lexeme_norm\"]\n")
W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
"attribute or operator.")
@@ -961,6 +964,18 @@ class Errors(metaclass=ErrorsWithCodes):
E4003 = ("Training examples for distillation must have the exact same tokens in the "
"reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.")
+ E4005 = ("Required lemmatizer table(s) {missing_tables} not found in "
+ "[initialize] or in registered lookups (spacy-lookups-data). An "
+ "example of how to load lemmatizer tables in [initialize]:\n\n"
+ "[initialize.components]\n\n"
+ "[initialize.components.{pipe_name}]\n\n"
+ "[initialize.components.{pipe_name}.lookups]\n"
+ '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
+ "lang = ${{nlp.lang}}\n"
+ f'url = "{about.__lookups_url__}"\n'
+ "tables = {tables}\n"
+ "# or required tables only: tables = {required_tables}\n")
+ E4006 = ("Server error ({status_code}), couldn't fetch {url}")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/language.py b/spacy/language.py
index 13a3d101a..c5750ea85 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -104,13 +104,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory
-@registry.misc("spacy.LookupsDataLoader.v1")
-def load_lookups_data(lang, tables):
- util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
- lookups = load_lookups(lang=lang, tables=tables)
- return lookups
-
-
class Language:
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
diff --git a/spacy/lookups.py b/spacy/lookups.py
index d7cc44fb3..0e6fb3b7c 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,17 +1,42 @@
from typing import Any, List, Union, Optional, Dict
from pathlib import Path
+import requests
import srsly
from preshed.bloom import BloomFilter
from collections import OrderedDict
from .errors import Errors
from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
+from .util import logger
from .strings import get_string_id
UNSET = object()
+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+ logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+ lookups = load_lookups(lang=lang, tables=tables)
+ return lookups
+
+
+@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
+def load_lookups_data_from_url(lang, tables, url):
+ logger.debug(f"Loading lookups from {url}: {tables}")
+ lookups = Lookups()
+ for table in tables:
+ table_url = url + lang + "_" + table + ".json"
+ r = requests.get(table_url)
+ if r.status_code != 200:
+ raise ValueError(
+ Errors.E4006.format(status_code=r.status_code, url=table_url)
+ )
+ table_data = r.json()
+ lookups.add_table(table, table_data)
+ return lookups
+
+
def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
"""Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty `Lookups` container if there's no data or if the package
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 9c2fc2f09..03495ba74 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -1,5 +1,6 @@
from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
from thinc.api import Model
+import srsly
from pathlib import Path
import warnings
@@ -155,8 +156,24 @@ class Lemmatizer(Pipe):
"""
required_tables, optional_tables = self.get_lookups_config(self.mode)
if lookups is None:
- logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
- lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+ logger.debug(
+ "Lemmatizer: no lemmatizer lookups tables provided, "
+ "trying to load tables from registered lookups (usually "
+ "spacy-lookups-data)"
+ )
+ lookups = load_lookups(
+ lang=self.vocab.lang, tables=required_tables, strict=False
+ )
+ missing_tables = set(required_tables) - set(lookups.tables)
+ if len(missing_tables) > 0:
+ raise ValueError(
+ Errors.E4005.format(
+ missing_tables=list(missing_tables),
+ pipe_name=self.name,
+ required_tables=srsly.json_dumps(required_tables),
+ tables=srsly.json_dumps(required_tables + optional_tables),
+ )
+ )
optional_lookups = load_lookups(
lang=self.vocab.lang, tables=optional_tables, strict=False
)
diff --git a/spacy/util.py b/spacy/util.py
index e2ca0e6a4..d653e0305 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -55,7 +55,7 @@ if TYPE_CHECKING:
# fmt: off
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config file. Not all sections needs to exist,
# and additional sections are added at the end, in alphabetical order.
diff --git a/website/docs/api/lemmatizer.mdx b/website/docs/api/lemmatizer.mdx
index f6657dbf4..5bd0112e2 100644
--- a/website/docs/api/lemmatizer.mdx
+++ b/website/docs/api/lemmatizer.mdx
@@ -14,7 +14,7 @@ implement their own lemmatizer components via
[language-specific factories](/usage/processing-pipelines#factories-language).
The default data used is provided by the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
-extension package.
+repository.
For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).
@@ -174,6 +174,8 @@ training. At runtime, all data is loaded from disk.
>
> ```python
> lemmatizer = nlp.add_pipe("lemmatizer")
+> req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
+> lookups = load_lookups(nlp.lang, req_tables + opt_tables)
> lemmatizer.initialize(lookups=lookups)
> ```
>
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index b13a6d28b..01690f161 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -9,6 +9,7 @@ menu:
- ['Batchers', 'batchers']
- ['Augmenters', 'augmenters']
- ['Callbacks', 'callbacks']
+ - ['Miscellaneous', 'misc']
- ['Training & Alignment', 'gold']
- ['Utility Functions', 'util']
---
@@ -931,6 +932,54 @@ methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
| `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ |
| **CREATES** | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~ |
+## Miscellaneous {id="misc",version="3"}
+
+### spacy.LookupsDataLoader.v1 {id="lookups_data_reader",tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.lookups]
+> @misc = "spacy.LookupsDataLoader.v1"
+> lang = ${nlp.lang}
+> tables = ["lexeme_prob"]
+> ```
+
+Load the specified tables from the [`lookups` registry](#registry), which are
+provided by a package such as
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
+
+| Name | Description |
+| ----------- | ----------------------------------------------------------------------------------------------- |
+| `lang` | The language. ~~str~~ |
+| `tables` | The tables to load. ~~List[str]~~ |
+| **CREATES** | A function that loads the specified tables from the lookups registry. ~~Callable[[], Lookups]~~ |
+
+### spacy.LookupsDataLoaderFromURL.v1 {id="lookups_data_reader_from_url",tag="registered function",version="4"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.lemmatizer.lookups]
+> @misc = "spacy.LookupsDataLoaderFromURL.v1"
+> lang = ${nlp.lang}
+> url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
+> tables = ["lemma_rules","lemma_exc","lemma_index"]
+> ```
+
+Load the specified tables from the provided URL. The individual tables are
+expected to have filenames in the format `{lang}_{table}.json` under the
+specified URL directory as in the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data)
+repository.
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------------------------------------- |
+| `lang` | The language. ~~str~~ |
+| `url` | The URL for the directory where the tables can be downloaded. ~~str~~ |
+| `tables` | The tables to load. ~~List[str]~~ |
+| **CREATES** | A function that loads the specified tables from the provided URL. ~~Callable[[], Lookups]~~ |
+
## Training data and alignment {id="gold",source="spacy/training"}
### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"}
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index 07f2bd282..b283d117e 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -59,19 +59,18 @@ $ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
```
spaCy also lets you install extra dependencies by specifying the following
-keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
+keywords in brackets, e.g. `spacy[ja]` or `spacy[apple,transformers]` (with
multiple comma-separated extras). See the `[options.extras_require]` section in
spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
> #### Example
>
> ```bash
-> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
+> $ pip install %%SPACY_PKG_NAME[apple,transformers]%%SPACY_PKG_FLAGS
> ```
| Name | Description |
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
| `apple` | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1. |
@@ -174,7 +173,7 @@ $ pip install --no-build-isolation --editable . # compile and install spaCy
To install with extras:
```bash
-$ pip install --no-build-isolation --editable .[lookups,cuda102]
+$ pip install --no-build-isolation --editable .[ja,cuda102]
```
How to install compilers and related build tools:
diff --git a/website/docs/usage/linguistic-features.mdx b/website/docs/usage/linguistic-features.mdx
index 55d5680fe..add27de07 100644
--- a/website/docs/usage/linguistic-features.mdx
+++ b/website/docs/usage/linguistic-features.mdx
@@ -148,11 +148,11 @@ component.
-The data for spaCy's lemmatizers is distributed in the package
+The data for spaCy's lemmatizers is distributed in the repository
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
provided trained pipelines already include all the required tables, but if you
-are creating new pipelines, you'll probably want to install `spacy-lookups-data`
-to provide the data when the lemmatizer is initialized.
+are creating new pipelines, you can load data from the repository in the
+lemmatizer initialization.
### Lookup lemmatizer {id="lemmatizer-lookup"}
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index b6c8b9b4c..081040ceb 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -46,7 +46,6 @@ const QuickstartInstall = ({ id, title }) => {
const pipExtras = [
hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda,
train && 'transformers',
- train && 'lookups',
apple && 'apple',
...modelExtras,
]
@@ -210,9 +209,6 @@ const QuickstartInstall = ({ id, title }) => {
# packages only available via pip
-
- pip install spacy-lookups-data
-
{languages.map(({ code, models: modelOptions }) => {
const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]