Remove dependency on langcodes

BLKSerene 2025-02-28 14:00:17 +08:00
parent b3c46c315e
commit 5b6412e88b
8 changed files with 101 additions and 63 deletions

View File

@@ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0

View File

@@ -65,7 +65,6 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
langcodes>=3.2.0,<4.0.0
[options.entry_points]
console_scripts =

View File

@@ -143,7 +143,7 @@ class Language:
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (str): IETF language code, such as 'en'.
lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' or 'eng'.
DOCS: https://spacy.io/api/language
"""

View File

@@ -656,17 +656,12 @@ def test_spacy_blank():
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("scc", "sr"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
("zh-Hant", None),
("zxx", None),
],
)
@@ -686,11 +681,9 @@ def test_language_matching(lang, target):
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("scc", "sr"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
],
)
def test_blank_languages(lang, target):
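
The body of `test_blank_languages` falls outside the diff context; the expectation it encodes is presumably along these lines (a hedged sketch, not the actual test body):

```python
import spacy

# Each (lang, target) pair above should satisfy this: e.g. the
# deprecated ISO 639-2/B code "scc" resolves to Serbian ("sr").
nlp = spacy.blank("scc")
assert nlp.lang == "sr"
```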

View File

@@ -5,7 +5,6 @@ import inspect
import itertools
import logging
import os
import pkgutil
import re
import shlex
import shutil
@@ -40,7 +39,6 @@ from typing import (
)
import catalogue
import langcodes
import numpy
import srsly
import thinc
@@ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
# Default order of sections in the config file. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
LANG_ALIASES = {
"af": ["afr"],
"am": ["amh"],
"ar": ["ara"],
"az": ["aze"],
"bg": ["bul"],
"bn": ["ben"],
"bo": ["bod", "tib"],
"ca": ["cat"],
"cs": ["ces", "cze"],
"da": ["dan"],
"de": ["deu", "ger"],
"el": ["ell", "gre"],
"en": ["eng"],
"es": ["spa"],
"et": ["est"],
"eu": ["eus", "baq"],
"fa": ["fas", "per"],
"fi": ["fin"],
"fo": ["fao"],
"fr": ["fra", "fre"],
"ga": ["gle"],
"gd": ["gla"],
"gu": ["guj"],
"he": ["heb", "iw"], # "iw" is the obsolete ISO 639-1 code for Hebrew
"hi": ["hin"],
"hr": ["hrv", "scr"], # "scr" is the deprecated ISO 639-2/B for Croatian
"hu": ["hun"],
"hy": ["hye"],
"id": ["ind", "in"], # "in" is the obsolete ISO 639-1 code for Hebrew
"is": ["isl", "ice"],
"it": ["ita"],
"ja": ["jpn"],
"kn": ["kan"],
"ko": ["kor"],
"ky": ["kir"],
"la": ["lat"],
"lb": ["ltz"],
"lg": ["lug"],
"lt": ["lit"],
"lv": ["lav"],
"mk": ["mkd", "mac"],
"ml": ["mal"],
"mr": ["mar"],
"ms": ["msa", "may"],
"nb": ["nob"],
"ne": ["nep"],
"nl": ["nld", "dut"],
"nn": ["nno"],
"pl": ["pol"],
"pt": ["por"],
"ro": ["ron", "rom", "mo", "mol"], # "mo" and "mol" are deprecated codes for Moldavian
"ru": ["rus"],
"sa": ["san"],
"si": ["sin"],
"sk": ["slk", "slo"],
"sl": ["slv"],
"sq": ["sqi", "alb"],
"sr": ["srp", "scc"], # "scc" is the deprecated ISO 639-2/B code for Serbian
"sv": ["swe"],
"ta": ["tam"],
"te": ["tel"],
"th": ["tha"],
"ti": ["tir"],
"tl": ["tgl"],
"tn": ["tsn"],
"tr": ["tur"],
"tt": ["tat"],
"uk": ["ukr"],
"ur": ["urd"],
"vi": ["viw"],
"yo": ["yor"],
"zh": ["zho", "chi"],
"xx": ["mul"],
}
# fmt: on
logger = logging.getLogger("spacy")
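
Since `LANG_ALIASES` is now maintained by hand and the lookup below returns the first canonical code whose alias list contains the input, a duplicate alias would make resolution order-dependent. A standalone sanity check along these lines (not part of the commit) can guard against that:

```python
from collections import Counter

from spacy.util import LANG_ALIASES  # module-level table added above

# Flatten all alias lists and flag any alias claimed by more than
# one canonical language code.
counts = Counter(alias for aliases in LANG_ALIASES.values() for alias in aliases)
duplicates = [alias for alias, n in counts.items() if n > 1]
assert not duplicates, f"ambiguous aliases: {duplicates}"
```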
@@ -293,63 +368,39 @@ def lang_class_is_loaded(lang: str) -> bool:
def find_matching_language(lang: str) -> Optional[str]:
"""
Given an IETF language code, find a supported spaCy language that is a
close match for it (according to Unicode CLDR language-matching rules).
This allows for language aliases, ISO 639-2 codes, more detailed language
tags, and close matches.
Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code,
find a supported spaCy language.
Returns the language code if a matching language is available, or None
if there is no matching language.
>>> find_matching_language('en')
'en'
>>> find_matching_language('pt-BR') # Brazilian Portuguese
'pt'
>>> find_matching_language('fra') # an ISO 639-2 code for French
>>> find_matching_language('fra') # ISO 639-3 code for French
'fr'
>>> find_matching_language('iw') # obsolete alias for Hebrew
>>> find_matching_language('fre') # ISO 639-2/B code for French
'fr'
>>> find_matching_language('iw') # Obsolete ISO 639-1 code for Hebrew
'he'
>>> find_matching_language('no') # Norwegian
'nb'
>>> find_matching_language('mo') # old code for ro-MD
>>> find_matching_language('mo') # Deprecated code for Moldavian
'ro'
>>> find_matching_language('zh-Hans') # Simplified Chinese
'zh'
>>> find_matching_language('scc') # Deprecated ISO 639-2/B code for Serbian
'sr'
>>> find_matching_language('zxx')
None
"""
import spacy.lang # noqa: F401
if lang == "xx":
return "xx"
# Check aliases
for lang_code, aliases in LANG_ALIASES.items():
if lang in aliases:
return lang_code
# Find out which language modules we have
possible_languages = []
for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore[attr-defined]
code = modinfo.name
if code == "xx":
# Temporarily make 'xx' into a valid language code
possible_languages.append("mul")
elif langcodes.tag_is_valid(code):
possible_languages.append(code)
# Distances from 1-9 allow near misses like Bosnian -> Croatian and
# Norwegian -> Norwegian Bokmål. A distance of 10 would include several
# more possibilities, like variants of Chinese like 'wuu', but text that
# is labeled that way is probably trying to be distinct from 'zh' and
# shouldn't automatically match.
match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
if match == "mul":
# Convert 'mul' back to spaCy's 'xx'
return "xx"
else:
return match
return None
def get_lang_class(lang: str) -> Type["Language"]:
"""Import and load a Language class.
lang (str): IETF language code, such as 'en'.
lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' and 'eng'.
RETURNS (Language): Language class.
"""
# Check if language is registered / entry point is available
@@ -360,13 +411,9 @@ def get_lang_class(lang: str) -> Type["Language"]:
try:
module = importlib.import_module(f".lang.{lang}", "spacy")
except ImportError as err:
# Find a matching language. For example, if the language 'no' is
# requested, we can use language-matching to load `spacy.lang.nb`.
try:
match = find_matching_language(lang)
except langcodes.tag_parser.LanguageTagError:
# proceed to raising an import error
match = None
# Find a matching language. For example, if the language 'eng' is
# requested, we can use language-matching to load `spacy.lang.en`.
match = find_matching_language(lang)
if match:
lang = match
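
A short illustration of this fallback path, assuming a build with this change: there is no `spacy.lang.fre` module, so the `ImportError` branch consults `find_matching_language` and loads `spacy.lang.fr` instead.

```python
from spacy.util import get_lang_class

# "fre" (ISO 639-2/B for French) has no module of its own, so the
# alias table maps it to "fr" before the import is retried.
cls = get_lang_class("fre")
assert cls.lang == "fr"
```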

View File

@@ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| Name | Description |
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
| `lang` | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~ |
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
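
With this change, either form of the code works for the `lang` argument; the paths below are hypothetical:

```bash
$ python -m spacy init vectors en ./vectors.txt ./output_dir
$ python -m spacy init vectors eng ./vectors.txt ./output_dir
```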

View File

@@ -1078,7 +1078,7 @@ details.
| Name | Description |
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
| `lang` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language code, such as 'en' or 'eng' for English. ~~str~~ |
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
## Defaults {id="defaults"}

View File

@@ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of
| Name | Description |
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
| `name` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language code, such as 'en' or 'eng', of the language class to load. ~~str~~ |
| _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
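
A brief sketch of the `config` override described above, combined with the new alias handling (assuming a build with this change):

```python
import spacy

# "eng" resolves to English via the alias table; the dot-notation
# key overrides a nested value in the [nlp] config section.
nlp = spacy.blank("eng", config={"nlp.batch_size": 64})
assert nlp.lang == "en"
```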