Remove dependency on langcodes

2025-10-31 16:07:41 +03:00 · 2025-02-28 14:00:17 +08:00 · 2025-02-28 14:00:17 +08:00 · 5b6412e88b
commit 5b6412e88b
parent b3c46c315e
8 changed files with 101 additions and 63 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
--- a/setup.cfg
+++ b/setup.cfg
@ -65,7 +65,6 @@ install_requires =
    # Official Python utilities
    setuptools
    packaging>=20.0
    langcodes>=3.2.0,<4.0.0
 [options.entry_points]
 console_scripts =
--- a/spacy/language.py
+++ b/spacy/language.py
@ -143,7 +143,7 @@ class Language:
    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
-    lang (str): IETF language code, such as 'en'.
+    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' or 'eng'.
    DOCS: https://spacy.io/api/language
    """
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@ -656,17 +656,12 @@ def test_spacy_blank():
@pytest.mark.parametrize(
    "lang,target",
    [
        ("en", "en"),
        ("fra", "fr"),
        ("fre", "fr"),
        ("iw", "he"),
        ("mo", "ro"),
        ("scc", "sr"),
        ("mul", "xx"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "xx"),
        ("zh-Hans", "zh"),
        ("zh-Hant", None),
        ("zxx", None),
    ],
 )
@ -686,11 +681,9 @@ def test_language_matching(lang, target):
        ("fre", "fr"),
        ("iw", "he"),
        ("mo", "ro"),
        ("scc", "sr"),
        ("mul", "xx"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "xx"),
        ("zh-Hans", "zh"),
    ],
 )
 def test_blank_languages(lang, target):
--- a/spacy/util.py
+++ b/spacy/util.py
@ -5,7 +5,6 @@ import inspect
 import itertools
 import logging
 import os
 import pkgutil
 import re
 import shlex
 import shutil
@ -40,7 +39,6 @@ from typing import (
 )
 import catalogue
 import langcodes
 import numpy
 import srsly
 import thinc
@ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
 # Default order of sections in the config file. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
 CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
 # Maps each supported spaCy language code (the key) to the alternative codes
 # that should resolve to it: ISO 639-3 / ISO 639-2/T codes, ISO 639-2/B
 # bibliographic codes, and a few obsolete or deprecated codes.
 # `find_matching_language` returns the key whose alias list contains the
 # requested code, so every alias must appear under exactly one key.
 LANG_ALIASES = {
    "af": ["afr"],
    "am": ["amh"],
    "ar": ["ara"],
    "az": ["aze"],
    "bg": ["bul"],
    "bn": ["ben"],
    "bo": ["bod", "tib"],
    "ca": ["cat"],
    "cs": ["ces", "cze"],
    "da": ["dan"],
    "de": ["deu", "ger"],
    "el": ["ell", "gre"],
    "en": ["eng"],
    "es": ["spa"],
    "et": ["est"],
    "eu": ["eus", "baq"],
    "fa": ["fas", "per"],
    "fi": ["fin"],
    "fo": ["fao"],
    "fr": ["fra", "fre"],
    "ga": ["gle"],
    "gd": ["gla"],
    "gu": ["guj"],
    "he": ["heb", "iw"],  # "iw" is the obsolete ISO 639-1 code for Hebrew
    "hi": ["hin"],
    "hr": ["hrv", "scr"],  # "scr" is the deprecated ISO 639-2/B code for Croatian
    "hu": ["hun"],
    "hy": ["hye"],
    "id": ["ind", "in"],  # "in" is the obsolete ISO 639-1 code for Indonesian
    "is": ["isl", "ice"],
    "it": ["ita"],
    "ja": ["jpn"],
    "kn": ["kan"],
    "ko": ["kor"],
    "ky": ["kir"],
    "la": ["lat"],
    "lb": ["ltz"],
    "lg": ["lug"],
    "lt": ["lit"],
    "lv": ["lav"],
    "mk": ["mkd", "mac"],
    "ml": ["mal"],
    "mr": ["mar"],
    "ms": ["msa", "may"],
    "nb": ["nob"],
    "ne": ["nep"],
    "nl": ["nld", "dut"],
    "nn": ["nno"],
    "pl": ["pol"],
    "pt": ["por"],
    # "rum" is the ISO 639-2/B code for Romanian; "mo" and "mol" are deprecated
    # codes for Moldavian.
    # NOTE(review): "rom" is the ISO 639-2 code for Romany, not Romanian. It is
    # kept only for backward compatibility - confirm whether it should go.
    "ro": ["ron", "rom", "rum", "mo", "mol"],
    "ru": ["rus"],
    "sa": ["san"],
    "si": ["sin"],
    "sk": ["slk", "slo"],
    "sl": ["slv"],
    "sq": ["sqi", "alb"],
    "sr": ["srp", "scc"],  # "scc" is the deprecated ISO 639-2/B code for Serbian
    "sv": ["swe"],
    "ta": ["tam"],
    "te": ["tel"],
    "th": ["tha"],
    "ti": ["tir"],
    "tl": ["tgl"],
    "tn": ["tsn"],
    "tr": ["tur"],
    "tt": ["tat"],
    "uk": ["ukr"],
    "ur": ["urd"],
    "vi": ["vie"],
    "yo": ["yor"],
    "zh": ["zho", "chi"],
    "xx": ["mul"],  # spaCy's multi-language code; "mul" is the ISO 639 equivalent
 }
 # fmt: on
 logger = logging.getLogger("spacy")
@ -293,63 +368,39 @@ def lang_class_is_loaded(lang: str) -> bool:
 def find_matching_language(lang: str) -> Optional[str]:
    """
-    Given an IETF language code, find a supported spaCy language that is a
+    Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code,
-    close match for it (according to Unicode CLDR language-matching rules).
+    find a supported spaCy language.
    This allows for language aliases, ISO 639-2 codes, more detailed language
    tags, and close matches.
    Returns the language code if a matching language is available, or None
    if there is no matching language.
-    >>> find_matching_language('en')
+    >>> find_matching_language('fra')  # ISO 639-3 code for French
    'en'
    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
    'pt'
    >>> find_matching_language('fra')  # an ISO 639-2 code for French
    'fr'
-    >>> find_matching_language('iw')  # obsolete alias for Hebrew
+    >>> find_matching_language('fre')  # ISO 639-2/B code for French
    'fr'
    >>> find_matching_language('iw')  # Obsolete ISO 639-1 code for Hebrew
    'he'
-    >>> find_matching_language('no')  # Norwegian
+    >>> find_matching_language('mo')  # Deprecated code for Moldavian
    'nb'
    >>> find_matching_language('mo')  # old code for ro-MD
    'ro'
-    >>> find_matching_language('zh-Hans')  # Simplified Chinese
+    >>> find_matching_language('scc')  # Deprecated ISO 639-2/B code for Serbian
-    'zh'
+    'sr'
    >>> find_matching_language('zxx')
    None
    """
    import spacy.lang  # noqa: F401
-    if lang == "xx":
+    # Check aliases
-        return "xx"
+    for lang_code, aliases in LANG_ALIASES.items():
        if lang in aliases:
            return lang_code
-    # Find out which language modules we have
+    return None
    possible_languages = []
    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore[attr-defined]
        code = modinfo.name
        if code == "xx":
            # Temporarily make 'xx' into a valid language code
            possible_languages.append("mul")
        elif langcodes.tag_is_valid(code):
            possible_languages.append(code)
    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
    # more possibilities, like variants of Chinese like 'wuu', but text that
    # is labeled that way is probably trying to be distinct from 'zh' and
    # shouldn't automatically match.
    match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
    if match == "mul":
        # Convert 'mul' back to spaCy's 'xx'
        return "xx"
    else:
        return match
 def get_lang_class(lang: str) -> Type["Language"]:
    """Import and load a Language class.
-    lang (str): IETF language code, such as 'en'.
+    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' or 'eng'.
    RETURNS (Language): Language class.
    """
    # Check if language is registered / entry point is available
@ -360,13 +411,9 @@ def get_lang_class(lang: str) -> Type["Language"]:
        try:
            module = importlib.import_module(f".lang.{lang}", "spacy")
        except ImportError as err:
-            # Find a matching language. For example, if the language 'no' is
+            # Find a matching language. For example, if the language 'eng' is
-            # requested, we can use language-matching to load `spacy.lang.nb`.
+            # requested, we can use language-matching to load `spacy.lang.en`.
-            try:
+            match = find_matching_language(lang)
                match = find_matching_language(lang)
            except langcodes.tag_parser.LanguageTagError:
                # proceed to raising an import error
                match = None
            if match:
                lang = match
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | Name               | Description                                                                                                                                                                                                                                                         |
 | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang`             | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~                                                                                                                                 |
+| `lang`             | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~                                                                                                                                 |
 | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  |
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@ -1078,7 +1078,7 @@ details.
 | Name             | Description                                                                                                                                                                       |
 | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~                                                                            |
-| `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~                                                                  |
+| `lang`           | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng' for English. ~~str~~                                                                  |
 | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
 ## Defaults {id="defaults"}
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of
 | Name                                | Description                                                                                                                                                        |
 | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `name`                              | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~                                |
+| `name`                              | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng', of the language class to load. ~~str~~                                |
 | _keyword-only_                      |                                                                                                                                                                    |
 | `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                              |
 | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |