diff --git a/requirements.txt b/requirements.txt
index 6f9addbe9..85de453b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,3 +30,4 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0
+langcodes>=3.2.0,<4.0.0
diff --git a/setup.cfg b/setup.cfg
index da9944a5e..4313612d4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -62,6 +62,7 @@ install_requires =
     setuptools
     packaging>=20.0
     typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
+    langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
 console_scripts =
diff --git a/spacy/errors.py b/spacy/errors.py
index 064f33f31..120828fd6 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -282,7 +282,7 @@ class Errors:
             "you forget to call the `set_extension` method?")
     E047 = ("Can't assign a value to unregistered extension attribute "
             "'{name}'. Did you forget to call the `set_extension` method?")
-    E048 = ("Can't import language {lang} from spacy.lang: {err}")
+    E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
     E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
             "package or a valid path to a data directory.")
     E052 = ("Can't find model directory: {path}")
diff --git a/spacy/language.py b/spacy/language.py
index 512306796..d87f86bd3 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -105,7 +105,7 @@ class Language:
 
     Defaults (class): Settings, data and factory methods for creating the
         `nlp` object and processing pipeline.
-    lang (str): Two-letter language ID, i.e. ISO code.
+    lang (str): IETF language code, such as 'en'.
 
     DOCS: https://spacy.io/api/language
     """
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index e3c25fece..7a9021af0 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -8,7 +8,7 @@ from spacy.vocab import Vocab
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
 import spacy
 from thinc.api import NumpyOps, get_current_ops
 
@@ -502,6 +502,55 @@ def test_spacy_blank():
     assert nlp.meta["name"] == "my_custom_model"
 
 
+@pytest.mark.parametrize(
+    "lang,target",
+    [
+        ('en', 'en'),
+        ('fra', 'fr'),
+        ('fre', 'fr'),
+        ('iw', 'he'),
+        ('mo', 'ro'),
+        ('mul', 'xx'),
+        ('no', 'nb'),
+        ('pt-BR', 'pt'),
+        ('xx', 'xx'),
+        ('zh-Hans', 'zh'),
+        ('zh-Hant', None),
+        ('zxx', None)
+    ]
+)
+def test_language_matching(lang, target):
+    """
+    Test that we can look up languages by equivalent or nearly-equivalent
+    language codes.
+    """
+    assert find_matching_language(lang) == target
+
+
+@pytest.mark.parametrize(
+    "lang,target",
+    [
+        ('en', 'en'),
+        ('fra', 'fr'),
+        ('fre', 'fr'),
+        ('iw', 'he'),
+        ('mo', 'ro'),
+        ('mul', 'xx'),
+        ('no', 'nb'),
+        ('pt-BR', 'pt'),
+        ('xx', 'xx'),
+        ('zh-Hans', 'zh'),
+    ]
+)
+def test_blank_languages(lang, target):
+    """
+    Test that we can get spacy.blank in various languages, including codes
+    that are defined to be equivalent or that match by CLDR language matching.
+    """
+    nlp = spacy.blank(lang)
+    assert nlp.lang == target
+
+
 @pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 45cbdf45b..f17d5e62e 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -139,6 +139,12 @@ def test_load_model_blank_shortcut():
     nlp = util.load_model("blank:en")
     assert nlp.lang == "en"
     assert nlp.pipeline == []
+
+    # ImportError for loading an unsupported language
+    with pytest.raises(ImportError):
+        util.load_model("blank:zxx")
+
+    # ImportError for requesting an invalid language code that isn't registered
     with pytest.raises(ImportError):
         util.load_model("blank:fjsfijsdof")
 
diff --git a/spacy/util.py b/spacy/util.py
index e747d5fbc..fc1c0e76d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -16,6 +16,7 @@ import numpy
 import srsly
 import catalogue
 from catalogue import RegistryError, Registry
+import langcodes
 import sys
 import warnings
 from packaging.specifiers import SpecifierSet, InvalidSpecifier
@@ -28,6 +29,7 @@ import tempfile
 import shutil
 import shlex
 import inspect
+import pkgutil
 import logging
 
 try:
@@ -256,20 +258,89 @@ def lang_class_is_loaded(lang: str) -> bool:
     return lang in registry.languages
 
 
+def find_matching_language(lang: str) -> Optional[str]:
+    """
+    Given an IETF language code, find a supported spaCy language that is a
+    close match for it (according to Unicode CLDR language-matching rules).
+    This allows for language aliases, ISO 639-2 codes, more detailed language
+    tags, and close matches.
+
+    Returns the language code if a matching language is available, or None
+    if there is no matching language.
+
+    >>> find_matching_language('en')
+    'en'
+    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
+    'pt'
+    >>> find_matching_language('fra')  # an ISO 639-2 code for French
+    'fr'
+    >>> find_matching_language('iw')  # obsolete alias for Hebrew
+    'he'
+    >>> find_matching_language('no')  # Norwegian
+    'nb'
+    >>> find_matching_language('mo')  # old code for ro-MD
+    'ro'
+    >>> find_matching_language('zh-Hans')  # Simplified Chinese
+    'zh'
+    >>> find_matching_language('zxx') is None
+    True
+    """
+    import spacy.lang  # noqa: F401
+
+    if lang == 'xx':
+        return 'xx'
+
+    # Find out which language modules we have
+    possible_languages = []
+    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):
+        code = modinfo.name
+        if code == 'xx':
+            # Temporarily make 'xx' into a valid language code
+            possible_languages.append('mul')
+        elif langcodes.tag_is_valid(code):
+            possible_languages.append(code)
+
+    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
+    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
+    # more possibilities, like variants of Chinese like 'wuu', but text that
+    # is labeled that way is probably trying to be distinct from 'zh' and
+    # shouldn't automatically match.
+    match = langcodes.closest_supported_match(
+        lang, possible_languages, max_distance=9
+    )
+    if match == 'mul':
+        # Convert 'mul' back to spaCy's 'xx'
+        return 'xx'
+    else:
+        return match
+
+
 def get_lang_class(lang: str) -> "Language":
     """Import and load a Language class.
 
-    lang (str): Two-letter language code, e.g. 'en'.
+    lang (str): IETF language code, such as 'en'.
     RETURNS (Language): Language class.
     """
     # Check if language is registered / entry point is available
     if lang in registry.languages:
         return registry.languages.get(lang)
    else:
+        # Find the language in the spacy.lang subpackage
         try:
             module = importlib.import_module(f".lang.{lang}", "spacy")
         except ImportError as err:
+            # Find a matching language. For example, if the language 'no' is
+            # requested, we can use language-matching to load `spacy.lang.nb`.
+            try:
+                match = find_matching_language(lang)
+            except langcodes.tag_parser.LanguageTagError:
+                # proceed to raising an import error
+                match = None
+
+            if match:
+                lang = match
+                module = importlib.import_module(f".lang.{lang}", "spacy")
+            else:
+                raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
         set_lang_class(lang, getattr(module, module.__all__[0]))
     return registry.languages.get(lang)
 
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 10ab2083e..aadeebd77 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -203,7 +203,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 
 | Name               | Description |
 | ------------------ | ----------- |
-| `lang`             | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
+| `lang`             | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
 | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 0aa33b281..4cf063fcc 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -1039,7 +1039,7 @@ available to the loaded object.
 | Name             | Description |
 | ---------------- | ----------- |
 | `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
-| `lang`           | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ |
+| `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
 | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
 
 ## Defaults {#defaults}
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 48c16e559..b48cd47f3 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -83,7 +83,7 @@ Create a blank pipeline of a given language class. This function is the twin of
 
 | Name                                | Description |
 | ----------------------------------- | ----------- |
-| `name`                              | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
+| `name`                              | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
 | _keyword-only_                      | |
 | `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
 | `config` 3                          | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |