diff --git a/spacy/about.py b/spacy/about.py
index eddbeea09..eb85e6af3 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -5,3 +5,5 @@ __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
 __projects_branch__ = "v3"
+__lookups_tag__ = "v1.0.3"
+__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"
diff --git a/spacy/errors.py b/spacy/errors.py
index eadbf63d6..56cdde409 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,8 @@
 from typing import Literal
 import warnings
 
+from . import about
+
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
@@ -103,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
             "table. This may degrade the performance of the model to some "
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
-            "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed and load the table in your config. The "
-            "languages with lexeme normalization tables are currently: "
-            "{langs}\n\nLoad the table in your config with:\n\n"
+            "If this is surprising, make sure you are loading the table in "
+            "your config. The languages with lexeme normalization tables are "
+            "currently: {langs}\n\nAn example of how to load a table in "
+            "your config:\n\n"
             "[initialize.lookups]\n"
-            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
             "lang = ${{nlp.lang}}\n"
+            f'url = "{about.__lookups_url__}"\n'
             "tables = [\"lexeme_norm\"]\n")
     W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
             "attribute or operator.")
@@ -961,6 +964,18 @@ class Errors(metaclass=ErrorsWithCodes):
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
+    E4005 = ("Required lemmatizer table(s) {missing_tables} not found in "
+             "[initialize] or in registered lookups (spacy-lookups-data). An "
+             "example of how to load lemmatizer tables in [initialize]:\n\n"
+             "[initialize.components]\n\n"
+             "[initialize.components.{pipe_name}]\n\n"
+             "[initialize.components.{pipe_name}.lookups]\n"
+             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
+             "lang = ${{nlp.lang}}\n"
+             f'url = "{about.__lookups_url__}"\n'
+             "tables = {tables}\n"
+             "# or required tables only: tables = {required_tables}\n")
+    E4006 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/language.py b/spacy/language.py
index 13a3d101a..c5750ea85 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -104,13 +104,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory
 
 
-@registry.misc("spacy.LookupsDataLoader.v1")
-def load_lookups_data(lang, tables):
-    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
-    lookups = load_lookups(lang=lang, tables=tables)
-    return lookups
-
-
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
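Taken together, the `about.py` and `errors.py` changes point the W033 hint at a pinned download URL instead of requiring the `spacy-lookups-data` package. A minimal sketch (not part of the patch) of how the new constants compose into a concrete table URL, following the `url + lang + "_" + table + ".json"` scheme used by the loader introduced below; `"en"` and `"lexeme_norm"` are example inputs:

```python
# Sketch: how about.__lookups_tag__ / __lookups_url__ expand into a table URL.
# The filename scheme mirrors load_lookups_data_from_url() in spacy/lookups.py.
lookups_tag = "v1.0.3"
lookups_url = (
    "https://raw.githubusercontent.com/explosion/spacy-lookups-data/"
    f"{lookups_tag}/spacy_lookups_data/data/"
)
table_url = lookups_url + "en" + "_" + "lexeme_norm" + ".json"
print(table_url)
# https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/en_lexeme_norm.json
```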
diff --git a/spacy/lookups.py b/spacy/lookups.py
index d7cc44fb3..0e6fb3b7c 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,17 +1,42 @@
 from typing import Any, List, Union, Optional, Dict
 from pathlib import Path
+import requests
 import srsly
 from preshed.bloom import BloomFilter
 from collections import OrderedDict
 
 from .errors import Errors
 from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
+from .util import logger
 from .strings import get_string_id
 
 
 UNSET = object()
 
 
+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
+@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
+def load_lookups_data_from_url(lang, tables, url):
+    logger.debug(f"Loading lookups from {url}: {tables}")
+    lookups = Lookups()
+    for table in tables:
+        table_url = url + lang + "_" + table + ".json"
+        r = requests.get(table_url)
+        if r.status_code != 200:
+            raise ValueError(
+                Errors.E4006.format(status_code=r.status_code, url=table_url)
+            )
+        table_data = r.json()
+        lookups.add_table(table, table_data)
+    return lookups
+
+
 def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
     """Load the data from the spacy-lookups-data package for a given language,
     if available. Returns an empty `Lookups` container if there's no data or if the package
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 9c2fc2f09..03495ba74 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -1,5 +1,6 @@
 from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
 from thinc.api import Model
+import srsly
 from pathlib import Path
 import warnings
 
@@ -155,8 +156,24 @@
         """
         required_tables, optional_tables = self.get_lookups_config(self.mode)
         if lookups is None:
-            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
-            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            logger.debug(
+                "Lemmatizer: no lemmatizer lookups tables provided, "
+                "trying to load tables from registered lookups (usually "
+                "spacy-lookups-data)"
+            )
+            lookups = load_lookups(
+                lang=self.vocab.lang, tables=required_tables, strict=False
+            )
+            missing_tables = set(required_tables) - set(lookups.tables)
+            if len(missing_tables) > 0:
+                raise ValueError(
+                    Errors.E4005.format(
+                        missing_tables=list(missing_tables),
+                        pipe_name=self.name,
+                        required_tables=srsly.json_dumps(required_tables),
+                        tables=srsly.json_dumps(required_tables + optional_tables),
+                    )
+                )
         optional_lookups = load_lookups(
             lang=self.vocab.lang, tables=optional_tables, strict=False
         )
diff --git a/spacy/util.py b/spacy/util.py
index e2ca0e6a4..d653e0305 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -55,7 +55,7 @@ if TYPE_CHECKING:
 # fmt: off
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
 
 # Default order of sections in the config file. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
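A hedged usage sketch of the new registered loader, calling it directly rather than through the config system. It assumes this patch is applied and that network access is available; the URL value mirrors `about.__lookups_url__`:

```python
# Sketch: direct call to the new @registry.misc loader. A non-200 response for
# any requested table raises ValueError via the new E4006 error code.
from spacy.lookups import load_lookups_data_from_url

lookups = load_lookups_data_from_url(
    lang="en",
    tables=["lexeme_norm"],
    url=(
        "https://raw.githubusercontent.com/explosion/spacy-lookups-data/"
        "v1.0.3/spacy_lookups_data/data/"
    ),
)
print(lookups.tables)  # ['lexeme_norm']
```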
diff --git a/website/docs/api/lemmatizer.mdx b/website/docs/api/lemmatizer.mdx
index f6657dbf4..5bd0112e2 100644
--- a/website/docs/api/lemmatizer.mdx
+++ b/website/docs/api/lemmatizer.mdx
@@ -14,7 +14,7 @@ implement their own lemmatizer components via
 [language-specific factories](/usage/processing-pipelines#factories-language).
 The default data used is provided by the
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
-extension package.
+repository.
 
 For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).
 
@@ -174,6 +174,8 @@ training. At runtime, all data is loaded from disk.
 >
 > ```python
 > lemmatizer = nlp.add_pipe("lemmatizer")
+> req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
+> lookups = load_lookups(nlp.lang, req_tables + opt_tables)
 > lemmatizer.initialize(lookups=lookups)
 > ```
 >
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index b13a6d28b..01690f161 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -9,6 +9,7 @@ menu:
   - ['Batchers', 'batchers']
   - ['Augmenters', 'augmenters']
   - ['Callbacks', 'callbacks']
+  - ['Miscellaneous', 'misc']
   - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
@@ -931,6 +932,54 @@ methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
 | `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ |
 | **CREATES**                 | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~                                    |
 
+## Miscellaneous {id="misc",version="3"}
+
+### spacy.LookupsDataLoader.v1 {id="lookups_data_reader",tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.lookups]
+> @misc = "spacy.LookupsDataLoader.v1"
+> lang = ${nlp.lang}
+> tables = ["lexeme_prob"]
+> ```
+
+Load the specified tables from the [`lookups` registry](#registry), which are
+provided by a package such as
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
+
+| Name        | Description                                                                                       |
+| ----------- | ------------------------------------------------------------------------------------------------ |
+| `lang`      | The language. ~~str~~                                                                             |
+| `tables`    | The tables to load. ~~List[str]~~                                                                 |
+| **CREATES** | A function that loads the specified tables from the lookups registry. ~~Callable[[], Lookups]~~   |
+
+### spacy.LookupsDataLoaderFromURL.v1 {id="lookups_data_reader_from_url",tag="registered function",version="4"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.lemmatizer.lookups]
+> @misc = "spacy.LookupsDataLoaderFromURL.v1"
+> lang = ${nlp.lang}
+> url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
+> tables = ["lemma_rules","lemma_exc","lemma_index"]
+> ```
+
+Load the specified tables from the provided URL. The individual tables are
+expected to have filenames in the format `{lang}_{table}.json` under the
+specified URL directory, as in the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data)
+repository.
+
+| Name        | Description                                                                                   |
+| ----------- | --------------------------------------------------------------------------------------------- |
+| `lang`      | The language. ~~str~~                                                                          |
+| `url`       | The URL for the directory where the tables can be downloaded. ~~str~~                          |
+| `tables`    | The tables to load. ~~List[str]~~                                                              |
+| **CREATES** | A function that loads the specified tables from the provided URL. ~~Callable[[], Lookups]~~    |
+
 ## Training data and alignment {id="gold",source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"}
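The registered function documented above can also be resolved by hand through the registry, which is what the config system does while resolving `[initialize]`. A sketch, assuming this patch is installed; the table names match the lemmatizer example config:

```python
# Sketch: resolve the @misc function the same way the config system would.
from spacy.util import registry

loader = registry.misc.get("spacy.LookupsDataLoaderFromURL.v1")
lookups = loader(
    lang="en",
    tables=["lemma_rules", "lemma_exc", "lemma_index"],
    url=(
        "https://raw.githubusercontent.com/explosion/spacy-lookups-data/"
        "v1.0.3/spacy_lookups_data/data/"
    ),
)
```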
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index 07f2bd282..b283d117e 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -59,19 +59,18 @@ $ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 ```
 
 spaCy also lets you install extra dependencies by specifying the following
-keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
+keywords in brackets, e.g. `spacy[ja]` or `spacy[apple,transformers]` (with
 multiple comma-separated extras). See the `[options.extras_require]` section in
 spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 
 > #### Example
 >
 > ```bash
-> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
+> $ pip install %%SPACY_PKG_NAME[apple,transformers]%%SPACY_PKG_FLAGS
 > ```
 
 | Name             | Description                                                                                                                                                                                                                                                    |
 | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
 | `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
 | `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                |
 | `apple`          | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1.                                                                                                                                               |
@@ -174,7 +173,7 @@ $ pip install --no-build-isolation --editable .  # compile and install spaCy
 To install with extras:
 
 ```bash
-$ pip install --no-build-isolation --editable .[lookups,cuda102]
+$ pip install --no-build-isolation --editable .[ja,cuda102]
 ```
 
 How to install compilers and related build tools:
diff --git a/website/docs/usage/linguistic-features.mdx b/website/docs/usage/linguistic-features.mdx
index 55d5680fe..add27de07 100644
--- a/website/docs/usage/linguistic-features.mdx
+++ b/website/docs/usage/linguistic-features.mdx
@@ -148,11 +148,11 @@ component.
 
-The data for spaCy's lemmatizers is distributed in the package
+The data for spaCy's lemmatizers is distributed in the repository
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
 provided trained pipelines already include all the required tables, but if you
-are creating new pipelines, you'll probably want to install `spacy-lookups-data`
-to provide the data when the lemmatizer is initialized.
+are creating new pipelines, you can load data from the repository in the
+lemmatizer initialization.
 
 ### Lookup lemmatizer {id="lemmatizer-lookup"}
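To make the revised usage docs concrete, a sketch of initializing a lookup lemmatizer on a fresh pipeline with tables fetched at initialization time, combining `get_lookups_config` (as in the `lemmatizer.mdx` example) with the URL loader. Assumes the patch is applied and the network is reachable:

```python
# Sketch: initialize a lookup lemmatizer without spacy-lookups-data installed.
import spacy
from spacy.lookups import load_lookups_data_from_url

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
lookups = load_lookups_data_from_url(
    nlp.lang,
    req_tables + opt_tables,
    url=(
        "https://raw.githubusercontent.com/explosion/spacy-lookups-data/"
        "v1.0.3/spacy_lookups_data/data/"
    ),
)
lemmatizer.initialize(lookups=lookups)
print(nlp("cats")[0].lemma_)
```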
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index b6c8b9b4c..081040ceb 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -46,7 +46,6 @@ const QuickstartInstall = ({ id, title }) => {
     const pipExtras = [
         hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda,
         train && 'transformers',
-        train && 'lookups',
        apple && 'apple',
         ...modelExtras,
     ]
@@ -210,9 +209,6 @@ const QuickstartInstall = ({ id, title }) => {
                         # packages only available via pip
                     </QS>
-                    <QS>
-                        pip install spacy-lookups-data
-                    </QS>
                     {languages.map(({ code, models: modelOptions }) => {
                         const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]
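Finally, the failure path guarded by the new E4006 code is worth illustrating: requesting a table that does not exist under the URL directory surfaces the HTTP status and the offending URL instead of failing silently. A sketch with a deliberately bogus table name (hypothetical, chosen to force a 404):

```python
# Sketch: the E4006 path. A missing table raises ValueError with status + URL.
from spacy.lookups import load_lookups_data_from_url

try:
    load_lookups_data_from_url(
        lang="en",
        tables=["no_such_table"],  # hypothetical table name to force a 404
        url=(
            "https://raw.githubusercontent.com/explosion/spacy-lookups-data/"
            "v1.0.3/spacy_lookups_data/data/"
        ),
    )
except ValueError as err:
    print(err)  # Server error (404), couldn't fetch .../en_no_such_table.json
```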