Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
Elia Robyn Lake (Robyn Speer) 2021-10-05 03:52:22 -04:00 committed by GitHub
parent 1ee6541ab0
commit 53b5f245ed
10 changed files with 136 additions and 8 deletions
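
Taken together, these changes let spacy.blank() and get_lang_class() accept IETF language tags, aliases, and close matches rather than only spaCy's own two-letter codes. A minimal usage sketch (not part of the diff; it mirrors the test cases added below):

    import spacy

    # Aliases and equivalent codes now resolve to supported languages
    nlp = spacy.blank("fra")  # ISO 639-2 code for French
    assert nlp.lang == "fr"
    nlp = spacy.blank("iw")   # obsolete alias for Hebrew
    assert nlp.lang == "he"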

View File

@@ -30,3 +30,4 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0
+langcodes>=3.2.0,<4.0.0
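
For context, the two langcodes calls this new dependency provides, and which spacy/util.py below relies on, are langcodes.tag_is_valid and langcodes.closest_supported_match (the latter added in langcodes 3.2, hence the lower bound). A small illustrative sketch; the expected values follow from the tests in this commit:

    import langcodes

    assert langcodes.tag_is_valid("en")
    # A ten-letter primary subtag is not a well-formed BCP 47 tag
    assert not langcodes.tag_is_valid("fjsfijsdof")
    # Brazilian Portuguese is a close match for 'pt' under CLDR rules
    assert langcodes.closest_supported_match("pt-BR", ["pt", "fr"], max_distance=9) == "pt"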

View File

@@ -62,6 +62,7 @@ install_requires =
     setuptools
     packaging>=20.0
     typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
+    langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
 console_scripts =

View File

@@ -282,7 +282,7 @@ class Errors:
             "you forget to call the `set_extension` method?")
     E047 = ("Can't assign a value to unregistered extension attribute "
             "'{name}'. Did you forget to call the `set_extension` method?")
-    E048 = ("Can't import language {lang} from spacy.lang: {err}")
+    E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
     E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
             "package or a valid path to a data directory.")
     E052 = ("Can't find model directory: {path}")
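
A sketch of how the updated E048 surfaces to users (per the tests below; 'zxx', the tag for "no linguistic content", has no matching language and is rejected):

    import pytest
    import spacy

    with pytest.raises(ImportError):
        spacy.blank("zxx")  # no supported language matches -> E048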

View File

@@ -105,7 +105,7 @@ class Language:
     Defaults (class): Settings, data and factory methods for creating the `nlp`
         object and processing pipeline.
-    lang (str): Two-letter language ID, i.e. ISO code.
+    lang (str): IETF language code, such as 'en'.
 
     DOCS: https://spacy.io/api/language
     """

View File

@@ -8,7 +8,7 @@ from spacy.vocab import Vocab
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
 import spacy
 from thinc.api import NumpyOps, get_current_ops
 
@@ -502,6 +502,55 @@ def test_spacy_blank():
     assert nlp.meta["name"] == "my_custom_model"
 
 
+@pytest.mark.parametrize(
+    "lang,target",
+    [
+        ('en', 'en'),
+        ('fra', 'fr'),
+        ('fre', 'fr'),
+        ('iw', 'he'),
+        ('mo', 'ro'),
+        ('mul', 'xx'),
+        ('no', 'nb'),
+        ('pt-BR', 'pt'),
+        ('xx', 'xx'),
+        ('zh-Hans', 'zh'),
+        ('zh-Hant', None),
+        ('zxx', None)
+    ]
+)
+def test_language_matching(lang, target):
+    """
+    Test that we can look up languages by equivalent or nearly-equivalent
+    language codes.
+    """
+    assert find_matching_language(lang) == target
+
+
+@pytest.mark.parametrize(
+    "lang,target",
+    [
+        ('en', 'en'),
+        ('fra', 'fr'),
+        ('fre', 'fr'),
+        ('iw', 'he'),
+        ('mo', 'ro'),
+        ('mul', 'xx'),
+        ('no', 'nb'),
+        ('pt-BR', 'pt'),
+        ('xx', 'xx'),
+        ('zh-Hans', 'zh'),
+    ]
+)
+def test_blank_languages(lang, target):
+    """
+    Test that we can get spacy.blank in various languages, including codes
+    that are defined to be equivalent or that match by CLDR language matching.
+    """
+    nlp = spacy.blank(lang)
+    assert nlp.lang == target
+
+
 @pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"

View File

@@ -139,6 +139,12 @@ def test_load_model_blank_shortcut():
     nlp = util.load_model("blank:en")
     assert nlp.lang == "en"
     assert nlp.pipeline == []
+
+    # ImportError for loading an unsupported language
+    with pytest.raises(ImportError):
+        util.load_model("blank:zxx")
+
+    # ImportError for requesting an invalid language code that isn't registered
     with pytest.raises(ImportError):
         util.load_model("blank:fjsfijsdof")

View File

@@ -16,6 +16,7 @@ import numpy
 import srsly
 import catalogue
 from catalogue import RegistryError, Registry
+import langcodes
 import sys
 import warnings
 from packaging.specifiers import SpecifierSet, InvalidSpecifier
@@ -28,6 +29,7 @@ import tempfile
 import shutil
 import shlex
 import inspect
+import pkgutil
 import logging
 
 try:
@@ -256,20 +258,89 @@ def lang_class_is_loaded(lang: str) -> bool:
     return lang in registry.languages
 
 
+def find_matching_language(lang: str) -> Optional[str]:
+    """
+    Given an IETF language code, find a supported spaCy language that is a
+    close match for it (according to Unicode CLDR language-matching rules).
+    This allows for language aliases, ISO 639-2 codes, more detailed language
+    tags, and close matches.
+
+    Returns the language code if a matching language is available, or None
+    if there is no matching language.
+
+    >>> find_matching_language('en')
+    'en'
+    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
+    'pt'
+    >>> find_matching_language('fra')  # an ISO 639-2 code for French
+    'fr'
+    >>> find_matching_language('iw')  # obsolete alias for Hebrew
+    'he'
+    >>> find_matching_language('no')  # Norwegian
+    'nb'
+    >>> find_matching_language('mo')  # old code for ro-MD
+    'ro'
+    >>> find_matching_language('zh-Hans')  # Simplified Chinese
+    'zh'
+    >>> find_matching_language('zxx')
+    None
+    """
+    import spacy.lang  # noqa: F401
+
+    if lang == 'xx':
+        return 'xx'
+
+    # Find out which language modules we have
+    possible_languages = []
+    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):
+        code = modinfo.name
+        if code == 'xx':
+            # Temporarily make 'xx' into a valid language code
+            possible_languages.append('mul')
+        elif langcodes.tag_is_valid(code):
+            possible_languages.append(code)
+
+    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
+    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
+    # more possibilities, like variants of Chinese like 'wuu', but text that
+    # is labeled that way is probably trying to be distinct from 'zh' and
+    # shouldn't automatically match.
+    match = langcodes.closest_supported_match(
+        lang, possible_languages, max_distance=9
+    )
+    if match == 'mul':
+        # Convert 'mul' back to spaCy's 'xx'
+        return 'xx'
+    else:
+        return match
+
+
 def get_lang_class(lang: str) -> "Language":
     """Import and load a Language class.
 
-    lang (str): Two-letter language code, e.g. 'en'.
+    lang (str): IETF language code, such as 'en'.
     RETURNS (Language): Language class.
     """
     # Check if language is registered / entry point is available
     if lang in registry.languages:
         return registry.languages.get(lang)
     else:
+        # Find the language in the spacy.lang subpackage
         try:
             module = importlib.import_module(f".lang.{lang}", "spacy")
         except ImportError as err:
-            raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
+            # Find a matching language. For example, if the language 'no' is
+            # requested, we can use language-matching to load `spacy.lang.nb`.
+            try:
+                match = find_matching_language(lang)
+            except langcodes.tag_parser.LanguageTagError:
+                # proceed to raising an import error
+                match = None
+
+            if match:
+                lang = match
+                module = importlib.import_module(f".lang.{lang}", "spacy")
+            else:
+                raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
         set_lang_class(lang, getattr(module, module.__all__[0]))
         return registry.languages.get(lang)
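
A usage sketch for the new matching path, based on the doctests above and the tests in this commit (cls.lang is the class attribute spaCy language classes define):

    from spacy.util import find_matching_language, get_lang_class

    # Direct lookups through CLDR language matching
    assert find_matching_language("no") == "nb"       # Norwegian -> Bokmål
    assert find_matching_language("zh-Hans") == "zh"  # Simplified Chinese
    assert find_matching_language("zxx") is None      # no linguistic content

    # get_lang_class now falls back to the closest match before raising E048
    cls = get_lang_class("fre")  # ISO 639-2 bibliographic code for French
    assert cls.lang == "fr"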

View File

@@ -203,7 +203,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | Name | Description |
 | ------------------ | ----------- |
-| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
+| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
 | `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
 | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |

View File

@@ -1039,7 +1039,7 @@ available to the loaded object.
 | Name | Description |
 | ---------------- | ----------- |
 | `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
-| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ |
+| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
 | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
 
 ## Defaults {#defaults}

View File

@@ -83,7 +83,7 @@ Create a blank pipeline of a given language class. This function is the twin of
 | Name | Description |
 | ----------------------------------- | ----------- |
-| `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
+| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
 | _keyword-only_ | |
 | `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
 | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
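
To round out the docs change, a short example of spacy.blank with the broader tags now accepted (it mirrors the commit's tests):

    import spacy

    nlp = spacy.blank("mo")   # deprecated code for Moldavian (ro-MD)
    assert nlp.lang == "ro"
    nlp = spacy.blank("mul")  # 'multiple languages'; maps to spaCy's 'xx'
    assert nlp.lang == "xx"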