mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Recommend lookups tables from URLs or other loaders (#12283)
* Recommend lookups tables from URLs or other loaders Shift away from the `lookups` extra (which isn't removed, just no longer mentioned) and recommend loading data from the `spacy-lookups-data` repo or other sources rather than the `spacy-lookups-data` package. If the tables can't be loaded from the `lookups` registry in the lemmatizer, show how to specify the tables in `[initialize]` rather than recommending the `spacy-lookups-data` package. * Add tests for some rule-based lemmatizers * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
eaaac5a08c
commit
2702db9fef
|
@ -5,3 +5,5 @@ __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
|||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
__projects_branch__ = "v3"
|
||||
__lookups_tag__ = "v1.0.3"
|
||||
__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
import warnings
|
||||
from typing import Literal
|
||||
|
||||
from . import about
|
||||
|
||||
|
||||
class ErrorsWithCodes(type):
|
||||
def __getattribute__(self, code):
|
||||
|
@ -103,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"table. This may degrade the performance of the model to some "
|
||||
"degree. If this is intentional or the language you're using "
|
||||
"doesn't have a normalization table, please ignore this warning. "
|
||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||
"package installed and load the table in your config. The "
|
||||
"languages with lexeme normalization tables are currently: "
|
||||
"{langs}\n\nLoad the table in your config with:\n\n"
|
||||
"If this is surprising, make sure you are loading the table in "
|
||||
"your config. The languages with lexeme normalization tables are "
|
||||
"currently: {langs}\n\nAn example of how to load a table in "
|
||||
"your config :\n\n"
|
||||
"[initialize.lookups]\n"
|
||||
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
|
||||
"@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
|
||||
"lang = ${{nlp.lang}}\n"
|
||||
f'url = "{about.__lookups_url__}"\n'
|
||||
"tables = [\"lexeme_norm\"]\n")
|
||||
W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
|
||||
"attribute or operator.")
|
||||
|
@ -983,6 +986,18 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"{existing_value}.")
|
||||
E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
|
||||
E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
|
||||
E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
|
||||
"[initialize] or in registered lookups (spacy-lookups-data). An "
|
||||
"example for how to load lemmatizer tables in [initialize]:\n\n"
|
||||
"[initialize.components]\n\n"
|
||||
"[initialize.components.{pipe_name}]\n\n"
|
||||
"[initialize.components.{pipe_name}.lookups]\n"
|
||||
'@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
|
||||
"lang = ${{nlp.lang}}\n"
|
||||
f'url = "{about.__lookups_url__}"\n'
|
||||
"tables = {tables}\n"
|
||||
"# or required tables only: tables = {required_tables}\n")
|
||||
E4011 = ("Server error ({status_code}), couldn't fetch {url}")
|
||||
|
||||
|
||||
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
|
||||
|
|
|
@ -128,13 +128,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
|||
return tokenizer_factory
|
||||
|
||||
|
||||
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
    """Registered loader that fetches the requested lookup tables.

    Logs the requested table names at debug level and delegates the actual
    loading to `load_lookups`.

    lang: The language passed through to `load_lookups`.
    tables: The table names passed through to `load_lookups`.
    RETURNS: Whatever `load_lookups` returns for these arguments.
    """
    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
    return load_lookups(lang=lang, tables=tables)
|
||||
|
||||
|
||||
class Language:
|
||||
"""A text-processing pipeline. Usually you'll load this once per process,
|
||||
and pass the instance around your application.
|
||||
|
|
|
@ -2,16 +2,40 @@ from collections import OrderedDict
|
|||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import requests
|
||||
import srsly
|
||||
from preshed.bloom import BloomFilter
|
||||
|
||||
from .errors import Errors
|
||||
from .strings import get_string_id
|
||||
from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
|
||||
from .util import SimpleFrozenDict, ensure_path, load_language_data, logger, registry
|
||||
|
||||
UNSET = object()
|
||||
|
||||
|
||||
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
    """Registered loader that fetches the requested lookup tables.

    Logs the requested table names at debug level and delegates the actual
    loading to `load_lookups`, using that function's default `strict` setting.

    lang: The language passed through to `load_lookups`.
    tables: The table names passed through to `load_lookups`.
    RETURNS: Whatever `load_lookups` returns for these arguments.
    """
    # Hand the arguments to the logger instead of pre-formatting an f-string,
    # so the message is only rendered when debug logging is enabled.
    logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
    return load_lookups(lang=lang, tables=tables)
|
||||
|
||||
|
||||
@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
def load_lookups_data_from_url(lang, tables, url):
    """Download lookup tables from a URL directory into a `Lookups` container.

    Each table is requested from `{url}/{lang}_{table}.json`, parsed as JSON
    and added to the container under the table's name.

    lang: Language code used as the filename prefix.
    tables: Names of the tables to download.
    url: Base URL of the directory that holds the JSON table files. A missing
        trailing slash is tolerated.
    RETURNS: A `Lookups` object with one table per entry in `tables`.
    RAISES: ValueError (E4011) if the server answers with a status code other
        than 200.
    """
    # Hand the arguments to the logger instead of pre-formatting an f-string,
    # so the message is only rendered when debug logging is enabled.
    logger.debug("Loading lookups from %s: %s", url, tables)
    # Without a separating slash the directory and the file name would be
    # fused into a single (wrong) path segment.
    base_url = url if url.endswith("/") else url + "/"
    lookups = Lookups()
    for table in tables:
        table_url = f"{base_url}{lang}_{table}.json"
        # requests waits indefinitely unless a timeout is given; bound the
        # wait so a stalled server cannot hang initialization forever.
        r = requests.get(table_url, timeout=30)
        if r.status_code != 200:
            raise ValueError(
                Errors.E4011.format(status_code=r.status_code, url=table_url)
            )
        lookups.add_table(table, r.json())
    return lookups
|
||||
|
||||
|
||||
def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
|
||||
"""Load the data from the spacy-lookups-data package for a given language,
|
||||
if available. Returns an empty `Lookups` container if there's no data or if the package
|
||||
|
|
|
@ -2,6 +2,7 @@ import warnings
|
|||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import srsly
|
||||
from thinc.api import Model
|
||||
|
||||
from .. import util
|
||||
|
@ -155,8 +156,24 @@ class Lemmatizer(Pipe):
|
|||
"""
|
||||
required_tables, optional_tables = self.get_lookups_config(self.mode)
|
||||
if lookups is None:
|
||||
logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
|
||||
lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
|
||||
logger.debug(
|
||||
"Lemmatizer: no lemmatizer lookups tables provided, "
|
||||
"trying to load tables from registered lookups (usually "
|
||||
"spacy-lookups-data)"
|
||||
)
|
||||
lookups = load_lookups(
|
||||
lang=self.vocab.lang, tables=required_tables, strict=False
|
||||
)
|
||||
missing_tables = set(required_tables) - set(lookups.tables)
|
||||
if len(missing_tables) > 0:
|
||||
raise ValueError(
|
||||
Errors.E4010.format(
|
||||
missing_tables=list(missing_tables),
|
||||
pipe_name=self.name,
|
||||
required_tables=srsly.json_dumps(required_tables),
|
||||
tables=srsly.json_dumps(required_tables + optional_tables),
|
||||
)
|
||||
)
|
||||
optional_lookups = load_lookups(
|
||||
lang=self.vocab.lang, tables=optional_tables, strict=False
|
||||
)
|
||||
|
|
|
@ -2,9 +2,11 @@ import pickle
|
|||
|
||||
import pytest
|
||||
|
||||
import spacy
|
||||
from spacy import registry, util
|
||||
from spacy.about import __lookups_url__
|
||||
from spacy.lang.en import English
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.lookups import Lookups, load_lookups_data_from_url
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
@ -113,3 +115,15 @@ def test_lemmatizer_serialize(nlp):
|
|||
|
||||
# Make sure that lemmatizer cache can be pickled
|
||||
pickle.dumps(lemmatizer2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lang", ("ca", "en"))
def test_lemmatizer_load_lookups_from_url(lang):
    """Initialize a blank pipeline's lemmatizer from tables fetched by URL.

    Loads every required and optional table for the lemmatizer's mode via
    `load_lookups_data_from_url` and checks that exactly those tables end up
    in the lemmatizer's lookups.
    """
    pipeline = spacy.blank(lang)
    lemmatizer = pipeline.add_pipe("lemmatizer")
    required, optional = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
    all_tables = required + optional
    lemmatizer.initialize(
        lookups=load_lookups_data_from_url(pipeline.lang, all_tables, __lookups_url__)
    )
    assert set(lemmatizer.lookups.tables) == set(all_tables)
|
||||
|
|
|
@ -14,7 +14,7 @@ implement their own lemmatizer components via
|
|||
[language-specific factories](/usage/processing-pipelines#factories-language).
|
||||
The default data used is provided by the
|
||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
|
||||
extension package.
|
||||
repository.
|
||||
|
||||
For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).
|
||||
|
||||
|
@ -174,6 +174,8 @@ training. At runtime, all data is loaded from disk.
|
|||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||
> req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
|
||||
> lookups = load_lookups(nlp.lang, req_tables + opt_tables)
|
||||
> lemmatizer.initialize(lookups=lookups)
|
||||
> ```
|
||||
>
|
||||
|
|
|
@ -9,6 +9,7 @@ menu:
|
|||
- ['Batchers', 'batchers']
|
||||
- ['Augmenters', 'augmenters']
|
||||
- ['Callbacks', 'callbacks']
|
||||
- ['Miscellaneous', 'misc']
|
||||
- ['Training & Alignment', 'gold']
|
||||
- ['Utility Functions', 'util']
|
||||
---
|
||||
|
@ -934,6 +935,54 @@ methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
|
|||
| `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ |
|
||||
| **CREATES** | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~ |
|
||||
|
||||
## Miscellaneous {id="misc"}
|
||||
|
||||
### spacy.LookupsDataLoader.v1 {id="lookups_data_reader",tag="registered function",version="3"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [initialize.lookups]
|
||||
> @misc = "spacy.LookupsDataLoader.v1"
|
||||
> lang = ${nlp.lang}
|
||||
> tables = ["lexeme_prob"]
|
||||
> ```
|
||||
|
||||
Load the specified tables from the [`lookups` registry](#registry), which are
|
||||
provided by a package such as
|
||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `lang` | The language. ~~str~~ |
|
||||
| `tables` | The tables to load. ~~List[str]~~ |
|
||||
| **CREATES** | A function that loads the specified tables from the lookups registry. ~~Callable[[], Lookups]~~ |
|
||||
|
||||
### spacy.LookupsDataLoaderFromURL.v1 {id="lookups_data_reader_from_url",tag="registered function",version="4"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [initialize.components.lemmatizer.lookups]
|
||||
> @misc = "spacy.LookupsDataLoaderFromURL.v1"
|
||||
> lang = ${nlp.lang}
|
||||
> url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
|
||||
> tables = ["lemma_rules","lemma_exc","lemma_index"]
|
||||
> ```
|
||||
|
||||
Load the specified tables from the provided URL. The individual tables are
|
||||
expected to have filenames in the format `{lang}_{table}.json` under the
|
||||
specified URL directory as in the
|
||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data/)
|
||||
repository.
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------- |
|
||||
| `lang` | The language. ~~str~~ |
|
||||
| `url` | The URL for the directory where the tables can be downloaded. ~~str~~ |
|
||||
| `tables` | The tables to load. ~~List[str]~~ |
|
||||
| **CREATES** | A function that loads the specified tables from the provided URL. ~~Callable[[], Lookups]~~ |
|
||||
|
||||
## Training data and alignment {id="gold",source="spacy/training"}
|
||||
|
||||
### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"}
|
||||
|
|
|
@ -59,19 +59,18 @@ $ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
|
|||
```
|
||||
|
||||
spaCy also lets you install extra dependencies by specifying the following
|
||||
keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
|
||||
keywords in brackets, e.g. `spacy[ja]` or `spacy[apple,transformers]` (with
|
||||
multiple comma-separated extras). See the `[options.extras_require]` section in
|
||||
spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```bash
|
||||
> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
|
||||
> $ pip install %%SPACY_PKG_NAME[apple,transformers]%%SPACY_PKG_FLAGS
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
|
||||
| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
|
||||
| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
|
||||
| `apple` | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1. |
|
||||
|
@ -174,7 +173,7 @@ $ pip install --no-build-isolation --editable . # compile and install spaCy
|
|||
To install with extras:
|
||||
|
||||
```bash
|
||||
$ pip install --no-build-isolation --editable .[lookups,cuda102]
|
||||
$ pip install --no-build-isolation --editable .[ja,cuda102]
|
||||
```
|
||||
|
||||
How to install compilers and related build tools:
|
||||
|
|
|
@ -148,11 +148,11 @@ component.
|
|||
|
||||
</Infobox>
|
||||
|
||||
The data for spaCy's lemmatizers is distributed in the package
|
||||
The data for spaCy's lemmatizers is distributed in the repository
|
||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
|
||||
provided trained pipelines already include all the required tables, but if you
|
||||
are creating new pipelines, you'll probably want to install `spacy-lookups-data`
|
||||
to provide the data when the lemmatizer is initialized.
|
||||
are creating new pipelines, you can load data from the repository in the
|
||||
lemmatizer initialization.
|
||||
|
||||
### Lookup lemmatizer {id="lemmatizer-lookup"}
|
||||
|
||||
|
|
|
@ -46,7 +46,6 @@ const QuickstartInstall = ({ id, title }) => {
|
|||
const pipExtras = [
|
||||
hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda,
|
||||
train && 'transformers',
|
||||
train && 'lookups',
|
||||
apple && 'apple',
|
||||
...modelExtras,
|
||||
]
|
||||
|
@ -210,9 +209,6 @@ const QuickstartInstall = ({ id, title }) => {
|
|||
<QS config="train" package="conda" comment prompt={false}>
|
||||
# packages only available via pip
|
||||
</QS>
|
||||
<QS config="train" package="conda">
|
||||
pip install spacy-lookups-data
|
||||
</QS>
|
||||
|
||||
{languages.map(({ code, models: modelOptions }) => {
|
||||
const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]
|
||||
|
|
Loading…
Reference in New Issue
Block a user