Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases Signed-off-by: Elia Robyn Speer <elia@explosion.ai> * link to "IETF language tags" in docs Signed-off-by: Elia Robyn Speer <elia@explosion.ai> * Make requirements consistent Signed-off-by: Elia Robyn Speer <elia@explosion.ai> * change "two-letter language ID" to "IETF language tag" in language docs Signed-off-by: Elia Robyn Speer <elia@explosion.ai> * use langcodes 3.2 and handle language-tag errors better Signed-off-by: Elia Robyn Speer <elia@explosion.ai> * all unknown language codes are ImportErrors Signed-off-by: Elia Robyn Speer <elia@explosion.ai> Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
2025-07-14 18:22:27 +03:00 · 2021-10-05 03:52:22 -04:00 · 2021-10-05 03:52:22 -04:00 · 53b5f245ed
commit 53b5f245ed
parent 1ee6541ab0
10 changed files with 136 additions and 8 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -30,3 +30,4 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0
 langcodes>=3.2.0,<4.0.0
--- a/setup.cfg
+++ b/setup.cfg
@ -62,6 +62,7 @@ install_requires =
    setuptools
    packaging>=20.0
    typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
    langcodes>=3.2.0,<4.0.0
 [options.entry_points]
 console_scripts =
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -282,7 +282,7 @@ class Errors:
            "you forget to call the `set_extension` method?")
    E047 = ("Can't assign a value to unregistered extension attribute "
            "'{name}'. Did you forget to call the `set_extension` method?")
-    E048 = ("Can't import language {lang} from spacy.lang: {err}")
+    E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
    E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
            "package or a valid path to a data directory.")
    E052 = ("Can't find model directory: {path}")
--- a/spacy/language.py
+++ b/spacy/language.py
@ -105,7 +105,7 @@ class Language:
    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
-    lang (str): Two-letter language ID, i.e. ISO code.
+    lang (str): IETF language code, such as 'en'.
    DOCS: https://spacy.io/api/language
    """
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@ -8,7 +8,7 @@ from spacy.vocab import Vocab
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
 import spacy
 from thinc.api import NumpyOps, get_current_ops
@ -502,6 +502,55 @@ def test_spacy_blank():
    assert nlp.meta["name"] == "my_custom_model"
@pytest.mark.parametrize(
    "lang,target",
    [
        ("en", "en"),
        ("fra", "fr"),  # an ISO 639-2 code for French
        ("fre", "fr"),
        ("iw", "he"),  # obsolete alias for Hebrew
        ("mo", "ro"),  # old code for ro-MD
        ("mul", "xx"),  # spaCy's 'xx' is matched under the code 'mul'
        ("no", "nb"),  # Norwegian
        ("pt-BR", "pt"),  # Brazilian Portuguese
        ("xx", "xx"),
        ("zh-Hans", "zh"),  # Simplified Chinese
        ("zh-Hant", None),
        ("zxx", None),
    ],
)
def test_language_matching(lang, target):
    """Check that equivalent or nearly-equivalent language codes resolve to
    the expected spaCy language, and that unmatched codes give None.
    """
    resolved = find_matching_language(lang)
    assert resolved == target
@pytest.mark.parametrize(
    "lang,target",
    [
        ("en", "en"),
        ("fra", "fr"),
        ("fre", "fr"),
        ("iw", "he"),
        ("mo", "ro"),
        ("mul", "xx"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "xx"),
        ("zh-Hans", "zh"),
    ],
)
def test_blank_languages(lang, target):
    """Check that spacy.blank accepts codes that are defined to be equivalent
    to a supported language, or that match one by CLDR language matching, and
    that the resulting pipeline reports the matched language code.
    """
    pipeline = spacy.blank(lang)
    assert pipeline.lang == target
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
 def test_language_init_invalid_vocab(value):
    err_fragment = "invalid value"
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@ -139,6 +139,12 @@ def test_load_model_blank_shortcut():
    nlp = util.load_model("blank:en")
    assert nlp.lang == "en"
    assert nlp.pipeline == []
    # ImportError for loading an unsupported language
    with pytest.raises(ImportError):
        util.load_model("blank:zxx")
    # ImportError for requesting an invalid language code that isn't registered
    with pytest.raises(ImportError):
        util.load_model("blank:fjsfijsdof")
--- a/spacy/util.py
+++ b/spacy/util.py
@ -16,6 +16,7 @@ import numpy
 import srsly
 import catalogue
 from catalogue import RegistryError, Registry
 import langcodes
 import sys
 import warnings
 from packaging.specifiers import SpecifierSet, InvalidSpecifier
@ -28,6 +29,7 @@ import tempfile
 import shutil
 import shlex
 import inspect
 import pkgutil
 import logging
 try:
@ -256,19 +258,88 @@ def lang_class_is_loaded(lang: str) -> bool:
    return lang in registry.languages
 def find_matching_language(lang: str) -> Optional[str]:
     """Resolve a language tag to the closest language available in spaCy.

     Matching follows Unicode CLDR language-matching rules, so language
     aliases, ISO 639-2 codes, more detailed language tags and close matches
     can all resolve to one of the modules found in `spacy.lang`.

     lang (str): IETF language code, such as 'en'.
     RETURNS (Optional[str]): The code of the matching spaCy language, or None
         if no available language is a close enough match.

     NOTE(review): a tag that langcodes cannot parse appears to surface as a
     langcodes LanguageTagError rather than None -- confirm against callers.

     >>> find_matching_language('en')
     'en'
     >>> find_matching_language('pt-BR')  # Brazilian Portuguese
     'pt'
     >>> find_matching_language('fra')  # an ISO 639-2 code for French
     'fr'
     >>> find_matching_language('iw')  # obsolete alias for Hebrew
     'he'
     >>> find_matching_language('no')  # Norwegian
     'nb'
     >>> find_matching_language('mo')  # old code for ro-MD
     'ro'
     >>> find_matching_language('zh-Hans')  # Simplified Chinese
     'zh'
     >>> find_matching_language('zxx') is None
     True
     """
     import spacy.lang  # noqa: F401

     # spaCy's own 'xx' code is swapped for 'mul' while matching, so that it
     # counts as a valid language code, and swapped back on the way out.
     spacy_multi, matchable_multi = 'xx', 'mul'
     if lang == spacy_multi:
         return spacy_multi

     # Every module under spacy.lang whose name is a valid tag is a candidate.
     module_names = (info.name for info in pkgutil.iter_modules(spacy.lang.__path__))
     candidates = [
         matchable_multi if name == spacy_multi else name
         for name in module_names
         if name == spacy_multi or langcodes.tag_is_valid(name)
     ]

     # A maximum distance of 9 admits near misses such as Bosnian -> Croatian
     # and Norwegian -> Norwegian Bokmål. Going up to 10 would add several
     # more, e.g. Chinese variants like 'wuu', but text labeled that way is
     # probably meant to be distinct from 'zh' and shouldn't match by default.
     best = langcodes.closest_supported_match(lang, candidates, max_distance=9)
     return spacy_multi if best == matchable_multi else best
 def get_lang_class(lang: str) -> "Language":
    """Import and load a Language class.
-    lang (str): Two-letter language code, e.g. 'en'.
+    lang (str): IETF language code, such as 'en'.
    RETURNS (Language): Language class.
    RAISES (ImportError): If neither `spacy.lang.{lang}` nor a module for a
        matching language can be imported.
    """
    # Check if language is registered / entry point is available
    if lang in registry.languages:
        return registry.languages.get(lang)
    else:
        # Find the language in the spacy.lang subpackage
        try:
            module = importlib.import_module(f".lang.{lang}", "spacy")
        except ImportError as err:
            # Find a matching language. For example, if the language 'no' is
            # requested, we can use language-matching to load `spacy.lang.nb`.
            try:
                match = find_matching_language(lang)
            except langcodes.tag_parser.LanguageTagError:
                # proceed to raising an import error
                # (an unparseable tag is treated the same as "no match found")
                match = None
            if match:
                # Rebind `lang` so the class below is registered and looked up
                # under the matched code rather than the code that was requested.
                lang = match
                module = importlib.import_module(f".lang.{lang}", "spacy")
            else:
                # Chain the original import failure so the root cause stays visible.
                raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
        # The class handed to set_lang_class is the first name listed in the
        # module's `__all__`; the registry lookup below is expected to find it.
        set_lang_class(lang, getattr(module, module.__all__[0]))
    return registry.languages.get(lang)
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -203,7 +203,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | Name               | Description                                                                                                                                                                                                                                                         |
 | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang`             | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~                                                                                                                                                |
+| `lang`             | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~                                                                                                                                 |
 | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  |
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@ -1039,7 +1039,7 @@ available to the loaded object.
 | Name             | Description                                                                                                                                                                       |
 | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~                                                                            |
-| `lang`           | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~                                                                           |
+| `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en` for English. ~~str~~                                                                  |
 | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
 ## Defaults {#defaults}
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -83,7 +83,7 @@ Create a blank pipeline of a given language class. This function is the twin of
 | Name                                | Description                                                                                                                                                        |
 | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `name`                              | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~                                                           |
+| `name`                              | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`, of the language class to load. ~~str~~                                |
 | _keyword-only_                      |                                                                                                                                                                    |
 | `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                              |
 | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |