mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
Allow IETF language codes, aliases, and close matches (#9342)
* use language-matching to allow language code aliases
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
* link to "IETF language tags" in docs
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
* Make requirements consistent
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
* change "two-letter language ID" to "IETF language tag" in language docs
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
* use langcodes 3.2 and handle language-tag errors better
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
* all unknown language codes are ImportErrors
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
This commit is contained in:
parent
1ee6541ab0
commit
53b5f245ed
|
@ -30,3 +30,4 @@ pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
flake8>=3.8.0,<3.10.0
|
flake8>=3.8.0,<3.10.0
|
||||||
hypothesis>=3.27.0,<7.0.0
|
hypothesis>=3.27.0,<7.0.0
|
||||||
|
langcodes>=3.2.0,<4.0.0
|
||||||
|
|
|
@ -62,6 +62,7 @@ install_requires =
|
||||||
setuptools
|
setuptools
|
||||||
packaging>=20.0
|
packaging>=20.0
|
||||||
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
|
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
|
||||||
|
langcodes>=3.2.0,<4.0.0
|
||||||
|
|
||||||
[options.entry_points]
|
[options.entry_points]
|
||||||
console_scripts =
|
console_scripts =
|
||||||
|
|
|
@ -282,7 +282,7 @@ class Errors:
|
||||||
"you forget to call the `set_extension` method?")
|
"you forget to call the `set_extension` method?")
|
||||||
E047 = ("Can't assign a value to unregistered extension attribute "
|
E047 = ("Can't assign a value to unregistered extension attribute "
|
||||||
"'{name}'. Did you forget to call the `set_extension` method?")
|
"'{name}'. Did you forget to call the `set_extension` method?")
|
||||||
E048 = ("Can't import language {lang} from spacy.lang: {err}")
|
E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
|
||||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
||||||
"package or a valid path to a data directory.")
|
"package or a valid path to a data directory.")
|
||||||
E052 = ("Can't find model directory: {path}")
|
E052 = ("Can't find model directory: {path}")
|
||||||
|
|
|
@ -105,7 +105,7 @@ class Language:
|
||||||
|
|
||||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||||
object and processing pipeline.
|
object and processing pipeline.
|
||||||
lang (str): Two-letter language ID, i.e. ISO code.
|
lang (str): IETF language code, such as 'en'.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language
|
DOCS: https://spacy.io/api/language
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -8,7 +8,7 @@ from spacy.vocab import Vocab
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
from spacy.util import registry, ignore_error, raise_error
|
from spacy.util import registry, ignore_error, raise_error, find_matching_language
|
||||||
import spacy
|
import spacy
|
||||||
from thinc.api import NumpyOps, get_current_ops
|
from thinc.api import NumpyOps, get_current_ops
|
||||||
|
|
||||||
|
@ -502,6 +502,55 @@ def test_spacy_blank():
|
||||||
assert nlp.meta["name"] == "my_custom_model"
|
assert nlp.meta["name"] == "my_custom_model"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "lang,target",
    [
        ("en", "en"),
        ("fra", "fr"),
        ("fre", "fr"),
        ("iw", "he"),
        ("mo", "ro"),
        ("mul", "xx"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "xx"),
        ("zh-Hans", "zh"),
        # Codes expected to have no sufficiently close supported language.
        ("zh-Hant", None),
        ("zxx", None),
    ],
)
def test_language_matching(lang, target):
    """Check that `find_matching_language` resolves equivalent or
    nearly-equivalent language codes to the expected spaCy code, and
    returns None when nothing matches.
    """
    matched = find_matching_language(lang)
    assert matched == target
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "lang,target",
    [
        ("en", "en"),
        ("fra", "fr"),
        ("fre", "fr"),
        ("iw", "he"),
        ("mo", "ro"),
        ("mul", "xx"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "xx"),
        ("zh-Hans", "zh"),
    ],
)
def test_blank_languages(lang, target):
    """Check that `spacy.blank` accepts codes that are defined to be
    equivalent, or that match by CLDR language matching, and that the
    resulting pipeline reports the expected language code.
    """
    blank_pipeline = spacy.blank(lang)
    assert blank_pipeline.lang == target
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
|
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
|
||||||
def test_language_init_invalid_vocab(value):
|
def test_language_init_invalid_vocab(value):
|
||||||
err_fragment = "invalid value"
|
err_fragment = "invalid value"
|
||||||
|
|
|
@ -139,6 +139,12 @@ def test_load_model_blank_shortcut():
|
||||||
nlp = util.load_model("blank:en")
|
nlp = util.load_model("blank:en")
|
||||||
assert nlp.lang == "en"
|
assert nlp.lang == "en"
|
||||||
assert nlp.pipeline == []
|
assert nlp.pipeline == []
|
||||||
|
|
||||||
|
# ImportError for loading an unsupported language
|
||||||
|
with pytest.raises(ImportError):
|
||||||
|
util.load_model("blank:zxx")
|
||||||
|
|
||||||
|
# ImportError for requesting an invalid language code that isn't registered
|
||||||
with pytest.raises(ImportError):
|
with pytest.raises(ImportError):
|
||||||
util.load_model("blank:fjsfijsdof")
|
util.load_model("blank:fjsfijsdof")
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,7 @@ import numpy
|
||||||
import srsly
|
import srsly
|
||||||
import catalogue
|
import catalogue
|
||||||
from catalogue import RegistryError, Registry
|
from catalogue import RegistryError, Registry
|
||||||
|
import langcodes
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
||||||
|
@ -28,6 +29,7 @@ import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
import shlex
|
import shlex
|
||||||
import inspect
|
import inspect
|
||||||
|
import pkgutil
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -256,19 +258,88 @@ def lang_class_is_loaded(lang: str) -> bool:
|
||||||
return lang in registry.languages
|
return lang in registry.languages
|
||||||
|
|
||||||
|
|
||||||
|
def find_matching_language(lang: str) -> Optional[str]:
    """Resolve an IETF language code to a supported spaCy language code.

    The candidates are the modules found in the `spacy.lang` package. The
    closest one is chosen with `langcodes.closest_supported_match`, i.e. by
    Unicode CLDR language-matching rules, so aliases, ISO 639-2 codes, more
    detailed language tags and close matches are all accepted.

    lang (str): IETF language code, such as 'en'.
    RETURNS (Optional[str]): The matching spaCy language code, or None if no
        supported language is close enough.

    Tag-parsing errors raised by `langcodes` for `lang` are not caught here
    and propagate to the caller.

    >>> find_matching_language('en')
    'en'
    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
    'pt'
    >>> find_matching_language('fra')  # an ISO 639-2 code for French
    'fr'
    >>> find_matching_language('iw')  # obsolete alias for Hebrew
    'he'
    >>> find_matching_language('no')  # Norwegian
    'nb'
    >>> find_matching_language('mo')  # old code for ro-MD
    'ro'
    >>> find_matching_language('zh-Hans')  # Simplified Chinese
    'zh'
    >>> find_matching_language('zxx') is None
    True
    """
    import spacy.lang  # noqa: F401

    # spaCy's own multi-language code is answered directly, without matching.
    if lang == "xx":
        return "xx"

    # Collect the language modules we have. 'xx' is temporarily swapped for
    # 'mul' so that it takes part in matching as a valid language code; any
    # other module whose name is not a valid tag is left out.
    module_names = (info.name for info in pkgutil.iter_modules(spacy.lang.__path__))
    candidates = [
        "mul" if name == "xx" else name
        for name in module_names
        if name == "xx" or langcodes.tag_is_valid(name)
    ]

    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
    # more possibilities, like variants of Chinese like 'wuu', but text that
    # is labeled that way is probably trying to be distinct from 'zh' and
    # shouldn't automatically match.
    best = langcodes.closest_supported_match(lang, candidates, max_distance=9)

    # Convert 'mul' back to spaCy's 'xx'; anything else (including None) is
    # returned unchanged.
    return "xx" if best == "mul" else best
|
||||||
|
|
||||||
|
|
||||||
def get_lang_class(lang: str) -> "Language":
    """Import and load a Language class.

    The code is looked up in the language registry first, then imported as
    `spacy.lang.{lang}`. If that import fails, the closest supported language
    found by `find_matching_language` is imported instead (for example, 'no'
    loads `spacy.lang.nb`). A class loaded through a match is registered and
    returned under the matched code, not the requested one.

    lang (str): IETF language code, such as 'en'.
    RETURNS (Language): Language class.
    RAISES (ImportError): If neither the requested language nor a matching
        language can be imported from spacy.lang.
    """
    # Check if language is registered / entry point is available
    if lang in registry.languages:
        return registry.languages.get(lang)

    # Find the language in the spacy.lang subpackage
    try:
        module = importlib.import_module(f".lang.{lang}", "spacy")
    except ImportError as err:
        # Find a matching language. For example, if the language 'no' is
        # requested, we can use language-matching to load `spacy.lang.nb`.
        try:
            match = find_matching_language(lang)
        except langcodes.tag_parser.LanguageTagError:
            # Not a parseable language tag: proceed to raising an import error
            match = None

        # A match identical to the requested code means the module exists but
        # failed to import. Importing it again would only repeat the same
        # failure without the E048 context, so report the original error.
        if not match or match == lang:
            raise ImportError(Errors.E048.format(lang=lang, err=err)) from err

        lang = match
        module = importlib.import_module(f".lang.{lang}", "spacy")

    # By convention here, the first name in `__all__` is the Language subclass.
    set_lang_class(lang, getattr(module, module.__all__[0]))
    return registry.languages.get(lang)
|
||||||
|
|
|
@ -203,7 +203,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
|
||||||
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
|
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
|
||||||
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||||
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||||
|
|
|
@ -1039,7 +1039,7 @@ available to the loaded object.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
|
| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
|
||||||
| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ |
|
| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
|
||||||
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
|
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
|
||||||
|
|
||||||
## Defaults {#defaults}
|
## Defaults {#defaults}
|
||||||
|
|
|
@ -83,7 +83,7 @@ Create a blank pipeline of a given language class. This function is the twin of
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
|
| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user