diff --git a/requirements.txt b/requirements.txt index bfdcf0d96..8ab1534ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 jinja2 -langcodes>=3.2.0,<4.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index daba8865f..3412ea9d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,7 +65,6 @@ install_requires = # Official Python utilities setuptools packaging>=20.0 - langcodes>=3.2.0,<4.0.0 [options.entry_points] console_scripts = diff --git a/spacy/language.py b/spacy/language.py index 93840c922..b1d061294 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -143,7 +143,7 @@ class Language: Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. - lang (str): IETF language code, such as 'en'. + lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'. DOCS: https://spacy.io/api/language """ diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index ee707f793..dbfd69f14 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -656,17 +656,12 @@ def test_spacy_blank(): @pytest.mark.parametrize( "lang,target", [ - ("en", "en"), ("fra", "fr"), ("fre", "fr"), ("iw", "he"), ("mo", "ro"), + ("scc", "sr"), ("mul", "xx"), - ("no", "nb"), - ("pt-BR", "pt"), - ("xx", "xx"), - ("zh-Hans", "zh"), - ("zh-Hant", None), ("zxx", None), ], ) @@ -686,11 +681,9 @@ def test_language_matching(lang, target): ("fre", "fr"), ("iw", "he"), ("mo", "ro"), + ("scc", "sr"), ("mul", "xx"), - ("no", "nb"), - ("pt-BR", "pt"), ("xx", "xx"), - ("zh-Hans", "zh"), ], ) def test_blank_languages(lang, target): diff --git a/spacy/util.py b/spacy/util.py index c127be03c..9b4ced988 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -5,7 +5,6 @@ import inspect import itertools import logging import os -import pkgutil import re 
 import shlex
 import shutil
@@ -40,7 +39,6 @@ from typing import (
 )
 
 import catalogue
-import langcodes
 import numpy
 import srsly
 import thinc
@@ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
 # Default order of sections in the config file. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
 CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
+
+LANG_ALIASES = {
+    "af": ["afr"],
+    "am": ["amh"],
+    "ar": ["ara"],
+    "az": ["aze"],
+    "bg": ["bul"],
+    "bn": ["ben"],
+    "bo": ["bod", "tib"],
+    "ca": ["cat"],
+    "cs": ["ces", "cze"],
+    "da": ["dan"],
+    "de": ["deu", "ger"],
+    "el": ["ell", "gre"],
+    "en": ["eng"],
+    "es": ["spa"],
+    "et": ["est"],
+    "eu": ["eus", "baq"],
+    "fa": ["fas", "per"],
+    "fi": ["fin"],
+    "fo": ["fao"],
+    "fr": ["fra", "fre"],
+    "ga": ["gle"],
+    "gd": ["gla"],
+    "gu": ["guj"],
+    "he": ["heb", "iw"],  # "iw" is the obsolete ISO 639-1 code for Hebrew
+    "hi": ["hin"],
+    "hr": ["hrv", "scr"],  # "scr" is the deprecated ISO 639-2/B code for Croatian
+    "hu": ["hun"],
+    "hy": ["hye"],
+    "id": ["ind", "in"],  # "in" is the obsolete ISO 639-1 code for Indonesian
+    "is": ["isl", "ice"],
+    "it": ["ita"],
+    "ja": ["jpn"],
+    "kn": ["kan"],
+    "ko": ["kor"],
+    "ky": ["kir"],
+    "la": ["lat"],
+    "lb": ["ltz"],
+    "lg": ["lug"],
+    "lt": ["lit"],
+    "lv": ["lav"],
+    "mk": ["mkd", "mac"],
+    "ml": ["mal"],
+    "mr": ["mar"],
+    "ms": ["msa", "may"],
+    "nb": ["nob"],
+    "ne": ["nep"],
+    "nl": ["nld", "dut"],
+    "nn": ["nno"],
+    "pl": ["pol"],
+    "pt": ["por"],
+    "ro": ["ron", "rum", "mo", "mol"],  # "mo" and "mol" are deprecated codes for Moldavian
+    "ru": ["rus"],
+    "sa": ["san"],
+    "si": ["sin"],
+    "sk": ["slk", "slo"],
+    "sl": ["slv"],
+    "sq": ["sqi", "alb"],
+    "sr": ["srp", "scc"],  # "scc" is the deprecated ISO 639-2/B code for Serbian
+    "sv": ["swe"],
+    "ta": ["tam"],
+    "te": ["tel"],
+    "th": ["tha"],
+    "ti": ["tir"],
+    "tl": ["tgl"],
+    "tn": ["tsn"],
+    "tr": ["tur"],
+    "tt": ["tat"],
+    "uk": ["ukr"],
+    "ur": ["urd"],
+    "vi": ["vie"],
+    "yo": ["yor"],
+    "zh": ["zho", "chi"],
+
+    "xx": ["mul"],
+}
 # fmt: on
 
 logger = logging.getLogger("spacy")
@@ -293,63 +368,39 @@ def lang_class_is_loaded(lang: str) -> bool:
 
 def find_matching_language(lang: str) -> Optional[str]:
     """
-    Given an IETF language code, find a supported spaCy language that is a
-    close match for it (according to Unicode CLDR language-matching rules).
-    This allows for language aliases, ISO 639-2 codes, more detailed language
-    tags, and close matches.
+    Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code,
+    find a supported spaCy language.
 
     Returns the language code if a matching language is available, or None
     if there is no matching language.
 
-    >>> find_matching_language('en')
-    'en'
-    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
-    'pt'
-    >>> find_matching_language('fra')  # an ISO 639-2 code for French
+    >>> find_matching_language('fra')  # ISO 639-3 code for French
     'fr'
-    >>> find_matching_language('iw')  # obsolete alias for Hebrew
+    >>> find_matching_language('fre')  # ISO 639-2/B code for French
+    'fr'
+    >>> find_matching_language('iw')  # Obsolete ISO 639-1 code for Hebrew
     'he'
-    >>> find_matching_language('no')  # Norwegian
-    'nb'
-    >>> find_matching_language('mo')  # old code for ro-MD
+    >>> find_matching_language('mo')  # Deprecated code for Moldavian
     'ro'
-    >>> find_matching_language('zh-Hans')  # Simplified Chinese
-    'zh'
+    >>> find_matching_language('scc')  # Deprecated ISO 639-2/B code for Serbian
+    'sr'
     >>> find_matching_language('zxx')
     None
     """
     import spacy.lang  # noqa: F401
 
-    if lang == "xx":
-        return "xx"
+    # Check aliases
+    for lang_code, aliases in LANG_ALIASES.items():
+        if lang in aliases:
+            return lang_code
 
-    # Find out which language modules we have
-    possible_languages = []
-    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore[attr-defined]
-
code = modinfo.name - if code == "xx": - # Temporarily make 'xx' into a valid language code - possible_languages.append("mul") - elif langcodes.tag_is_valid(code): - possible_languages.append(code) - - # Distances from 1-9 allow near misses like Bosnian -> Croatian and - # Norwegian -> Norwegian Bokmål. A distance of 10 would include several - # more possibilities, like variants of Chinese like 'wuu', but text that - # is labeled that way is probably trying to be distinct from 'zh' and - # shouldn't automatically match. - match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) - if match == "mul": - # Convert 'mul' back to spaCy's 'xx' - return "xx" - else: - return match + return None def get_lang_class(lang: str) -> Type["Language"]: """Import and load a Language class. - lang (str): IETF language code, such as 'en'. + lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' and 'eng'. RETURNS (Language): Language class. """ # Check if language is registered / entry point is available @@ -360,13 +411,9 @@ def get_lang_class(lang: str) -> Type["Language"]: try: module = importlib.import_module(f".lang.{lang}", "spacy") except ImportError as err: - # Find a matching language. For example, if the language 'no' is - # requested, we can use language-matching to load `spacy.lang.nb`. - try: - match = find_matching_language(lang) - except langcodes.tag_parser.LanguageTagError: - # proceed to raising an import error - match = None + # Find a matching language. For example, if the language 'eng' is + # requested, we can use language-matching to load `spacy.lang.en`. 
+ match = find_matching_language(lang) if match: lang = match diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 6c47c8f1e..09a978259 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | Name | Description | | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ | +| `lang` | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~ | | `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx index b969ddc53..a1c6601ab 100644 --- a/website/docs/api/language.mdx +++ b/website/docs/api/language.mdx @@ -1078,7 +1078,7 @@ details. 
| Name | Description | | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ | -| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ | +| `lang` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng' for English. ~~str~~ | | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | ## Defaults {id="defaults"} diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 9cdc0c8ab..340f10f77 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of | Name | Description | | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ | +| `name` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng', of the language class to load. ~~str~~ | | _keyword-only_ | | | `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. 
~~Union[Vocab, bool]~~ | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |