mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Remove dependency on langcodes
This commit is contained in:
		
							parent
							
								
									b3c46c315e
								
							
						
					
					
						commit
						5b6412e88b
					
				|  | @ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0 | |||
| tqdm>=4.38.0,<5.0.0 | ||||
| pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 | ||||
| jinja2 | ||||
| langcodes>=3.2.0,<4.0.0 | ||||
| # Official Python utilities | ||||
| setuptools | ||||
| packaging>=20.0 | ||||
|  |  | |||
|  | @ -65,7 +65,6 @@ install_requires = | |||
|     # Official Python utilities | ||||
|     setuptools | ||||
|     packaging>=20.0 | ||||
|     langcodes>=3.2.0,<4.0.0 | ||||
| 
 | ||||
| [options.entry_points] | ||||
| console_scripts = | ||||
|  |  | |||
|  | @ -143,7 +143,7 @@ class Language: | |||
| 
 | ||||
|     Defaults (class): Settings, data and factory methods for creating the `nlp` | ||||
|         object and processing pipeline. | ||||
|     lang (str): IETF language code, such as 'en'. | ||||
|     lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/language | ||||
|     """ | ||||
|  |  | |||
|  | @ -656,17 +656,12 @@ def test_spacy_blank(): | |||
| @pytest.mark.parametrize( | ||||
|     "lang,target", | ||||
|     [ | ||||
|         ("en", "en"), | ||||
|         ("fra", "fr"), | ||||
|         ("fre", "fr"), | ||||
|         ("iw", "he"), | ||||
|         ("mo", "ro"), | ||||
|         ("scc", "sr"), | ||||
|         ("mul", "xx"), | ||||
|         ("no", "nb"), | ||||
|         ("pt-BR", "pt"), | ||||
|         ("xx", "xx"), | ||||
|         ("zh-Hans", "zh"), | ||||
|         ("zh-Hant", None), | ||||
|         ("zxx", None), | ||||
|     ], | ||||
| ) | ||||
|  | @ -686,11 +681,9 @@ def test_language_matching(lang, target): | |||
|         ("fre", "fr"), | ||||
|         ("iw", "he"), | ||||
|         ("mo", "ro"), | ||||
|         ("scc", "sr"), | ||||
|         ("mul", "xx"), | ||||
|         ("no", "nb"), | ||||
|         ("pt-BR", "pt"), | ||||
|         ("xx", "xx"), | ||||
|         ("zh-Hans", "zh"), | ||||
|     ], | ||||
| ) | ||||
| def test_blank_languages(lang, target): | ||||
|  |  | |||
							
								
								
									
										143
									
								
								spacy/util.py
									
									
									
									
									
								
							
							
						
						
									
										143
									
								
								spacy/util.py
									
									
									
									
									
								
							|  | @ -5,7 +5,6 @@ import inspect | |||
| import itertools | ||||
| import logging | ||||
| import os | ||||
| import pkgutil | ||||
| import re | ||||
| import shlex | ||||
| import shutil | ||||
|  | @ -40,7 +39,6 @@ from typing import ( | |||
| ) | ||||
| 
 | ||||
| import catalogue | ||||
| import langcodes | ||||
| import numpy | ||||
| import srsly | ||||
| import thinc | ||||
|  | @ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt" | |||
| # Default order of sections in the config file. Not all sections needs to exist, | ||||
| # and additional sections are added at the end, in alphabetical order. | ||||
| CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] | ||||
| 
 | ||||
| LANG_ALIASES = { | ||||
|     "af": ["afr"], | ||||
|     "am": ["amh"], | ||||
|     "ar": ["ara"], | ||||
|     "az": ["aze"], | ||||
|     "bg": ["bul"], | ||||
|     "bn": ["ben"], | ||||
|     "bo": ["bod", "tib"], | ||||
|     "ca": ["cat"], | ||||
|     "cs": ["ces", "cze"], | ||||
|     "da": ["dan"], | ||||
|     "de": ["deu", "ger"], | ||||
|     "el": ["ell", "gre"], | ||||
|     "en": ["eng"], | ||||
|     "es": ["spa"], | ||||
|     "et": ["est"], | ||||
|     "eu": ["eus", "baq"], | ||||
|     "fa": ["fas", "per"], | ||||
|     "fi": ["fin"], | ||||
|     "fo": ["fao"], | ||||
|     "fr": ["fra", "fre"], | ||||
|     "ga": ["gle"], | ||||
|     "gd": ["gla"], | ||||
|     "gu": ["guj"], | ||||
|     "he": ["heb", "iw"], # "iw" is the obsolete ISO 639-1 code for Hebrew | ||||
|     "hi": ["hin"], | ||||
|     "hr": ["hrv", "scr"], # "scr" is the deprecated ISO 639-2/B code for Croatian | ||||
|     "hu": ["hun"], | ||||
|     "hy": ["hye"], | ||||
|     "id": ["ind", "in"], # "in" is the obsolete ISO 639-1 code for Indonesian | ||||
|     "is": ["isl", "ice"], | ||||
|     "it": ["ita"], | ||||
|     "ja": ["jpn"], | ||||
|     "kn": ["kan"], | ||||
|     "ko": ["kor"], | ||||
|     "ky": ["kir"], | ||||
|     "la": ["lat"], | ||||
|     "lb": ["ltz"], | ||||
|     "lg": ["lug"], | ||||
|     "lt": ["lit"], | ||||
|     "lv": ["lav"], | ||||
|     "mk": ["mkd", "mac"], | ||||
|     "ml": ["mal"], | ||||
|     "mr": ["mar"], | ||||
|     "ms": ["msa", "may"], | ||||
|     "nb": ["nob"], | ||||
|     "ne": ["nep"], | ||||
|     "nl": ["nld", "dut"], | ||||
|     "nn": ["nno"], | ||||
|     "pl": ["pol"], | ||||
|     "pt": ["por"], | ||||
|     "ro": ["ron", "rum", "mo", "mol"], # "rum" is the ISO 639-2/B code for Romanian; "mo" and "mol" are deprecated codes for Moldavian | ||||
|     "ru": ["rus"], | ||||
|     "sa": ["san"], | ||||
|     "si": ["sin"], | ||||
|     "sk": ["slk", "slo"], | ||||
|     "sl": ["slv"], | ||||
|     "sq": ["sqi", "alb"], | ||||
|     "sr": ["srp", "scc"], # "scc" is the deprecated ISO 639-2/B code for Serbian | ||||
|     "sv": ["swe"], | ||||
|     "ta": ["tam"], | ||||
|     "te": ["tel"], | ||||
|     "th": ["tha"], | ||||
|     "ti": ["tir"], | ||||
|     "tl": ["tgl"], | ||||
|     "tn": ["tsn"], | ||||
|     "tr": ["tur"], | ||||
|     "tt": ["tat"], | ||||
|     "uk": ["ukr"], | ||||
|     "ur": ["urd"], | ||||
|     "vi": ["vie"], | ||||
|     "yo": ["yor"], | ||||
|     "zh": ["zho", "chi"], | ||||
| 
 | ||||
|     "xx": ["mul"], | ||||
| } | ||||
| # fmt: on | ||||
| 
 | ||||
| logger = logging.getLogger("spacy") | ||||
|  | @ -293,63 +368,39 @@ def lang_class_is_loaded(lang: str) -> bool: | |||
| 
 | ||||
| def find_matching_language(lang: str) -> Optional[str]: | ||||
|     """ | ||||
|     Given an IETF language code, find a supported spaCy language that is a | ||||
|     close match for it (according to Unicode CLDR language-matching rules). | ||||
|     This allows for language aliases, ISO 639-2 codes, more detailed language | ||||
|     tags, and close matches. | ||||
|     Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code, | ||||
|     find a supported spaCy language. | ||||
| 
 | ||||
|     Returns the language code if a matching language is available, or None | ||||
|     if there is no matching language. | ||||
| 
 | ||||
|     >>> find_matching_language('en') | ||||
|     'en' | ||||
|     >>> find_matching_language('pt-BR')  # Brazilian Portuguese | ||||
|     'pt' | ||||
|     >>> find_matching_language('fra')  # an ISO 639-2 code for French | ||||
|     >>> find_matching_language('fra')  # ISO 639-3 code for French | ||||
|     'fr' | ||||
|     >>> find_matching_language('iw')  # obsolete alias for Hebrew | ||||
|     >>> find_matching_language('fre')  # ISO 639-2/B code for French | ||||
|     'fr' | ||||
|     >>> find_matching_language('iw')  # Obsolete ISO 639-1 code for Hebrew | ||||
|     'he' | ||||
|     >>> find_matching_language('no')  # Norwegian | ||||
|     'nb' | ||||
|     >>> find_matching_language('mo')  # old code for ro-MD | ||||
|     >>> find_matching_language('mo')  # Deprecated code for Moldavian | ||||
|     'ro' | ||||
|     >>> find_matching_language('zh-Hans')  # Simplified Chinese | ||||
|     'zh' | ||||
|     >>> find_matching_language('scc')  # Deprecated ISO 639-2/B code for Serbian | ||||
|     'sr' | ||||
|     >>> find_matching_language('zxx') | ||||
|     None | ||||
|     """ | ||||
|     import spacy.lang  # noqa: F401 | ||||
| 
 | ||||
|     if lang == "xx": | ||||
|         return "xx" | ||||
|     # Check aliases | ||||
|     for lang_code, aliases in LANG_ALIASES.items(): | ||||
|         if lang in aliases: | ||||
|             return lang_code | ||||
| 
 | ||||
|     # Find out which language modules we have | ||||
|     possible_languages = [] | ||||
|     for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore[attr-defined] | ||||
|         code = modinfo.name | ||||
|         if code == "xx": | ||||
|             # Temporarily make 'xx' into a valid language code | ||||
|             possible_languages.append("mul") | ||||
|         elif langcodes.tag_is_valid(code): | ||||
|             possible_languages.append(code) | ||||
| 
 | ||||
|     # Distances from 1-9 allow near misses like Bosnian -> Croatian and | ||||
|     # Norwegian -> Norwegian Bokmål. A distance of 10 would include several | ||||
|     # more possibilities, like variants of Chinese like 'wuu', but text that | ||||
|     # is labeled that way is probably trying to be distinct from 'zh' and | ||||
|     # shouldn't automatically match. | ||||
|     match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) | ||||
|     if match == "mul": | ||||
|         # Convert 'mul' back to spaCy's 'xx' | ||||
|         return "xx" | ||||
|     else: | ||||
|         return match | ||||
|     return None | ||||
| 
 | ||||
| 
 | ||||
| def get_lang_class(lang: str) -> Type["Language"]: | ||||
|     """Import and load a Language class. | ||||
| 
 | ||||
|     lang (str): IETF language code, such as 'en'. | ||||
|     lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' and 'eng'. | ||||
|     RETURNS (Language): Language class. | ||||
|     """ | ||||
|     # Check if language is registered / entry point is available | ||||
|  | @ -360,13 +411,9 @@ def get_lang_class(lang: str) -> Type["Language"]: | |||
|         try: | ||||
|             module = importlib.import_module(f".lang.{lang}", "spacy") | ||||
|         except ImportError as err: | ||||
|             # Find a matching language. For example, if the language 'no' is | ||||
|             # requested, we can use language-matching to load `spacy.lang.nb`. | ||||
|             try: | ||||
|                 match = find_matching_language(lang) | ||||
|             except langcodes.tag_parser.LanguageTagError: | ||||
|                 # proceed to raising an import error | ||||
|                 match = None | ||||
|             # Find a matching language. For example, if the language 'eng' is | ||||
|             # requested, we can use language-matching to load `spacy.lang.en`. | ||||
|             match = find_matching_language(lang) | ||||
| 
 | ||||
|             if match: | ||||
|                 lang = match | ||||
|  |  | |||
|  | @ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | |||
| 
 | ||||
| | Name               | Description                                                                                                                                                                                                                                                         | | ||||
| | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `lang`             | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~                                                                                                                                 | | ||||
| | `lang`             | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~                                                                                                                                 | | ||||
| | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | ||||
| | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               | | ||||
| | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  | | ||||
|  |  | |||
|  | @ -1078,7 +1078,7 @@ details. | |||
| | Name             | Description                                                                                                                                                                       | | ||||
| | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~                                                                            | | ||||
| | `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~                                                                  | | ||||
| | `lang`           | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng' for English. ~~str~~                                                                  | | ||||
| | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | | ||||
| 
 | ||||
| ## Defaults {id="defaults"} | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of | |||
| 
 | ||||
| | Name                                | Description                                                                                                                                                        | | ||||
| | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `name`                              | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~                                | | ||||
| | `name`                              | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng', of the language class to load. ~~str~~                                | | ||||
| | _keyword-only_                      |                                                                                                                                                                    | | ||||
| | `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                              | | ||||
| | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user