Allow IETF language codes, aliases, and close matches (#9342)

* use language-matching to allow language code aliases

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* link to "IETF language tags" in docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Make requirements consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* change "two-letter language ID" to "IETF language tag" in language docs

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use langcodes 3.2 and handle language-tag errors better

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
Authored by Elia Robyn Lake (Robyn Speer) on 2021-10-05 03:52:22 -04:00; committed by GitHub
parent 1ee6541ab0
commit 53b5f245ed
10 changed files with 136 additions and 8 deletions
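
In short, `spacy.blank()` and `util.get_lang_class()` now accept IETF language tags, aliases, and close matches, and any unresolvable code raises an `ImportError`. A quick sketch of the new behavior, using codes taken from the tests added below:

```python
import spacy

# 'fra' is an ISO 639-2 code for French; it resolves to spaCy's 'fr'
nlp = spacy.blank("fra")
assert nlp.lang == "fr"

# 'iw' is an obsolete alias for Hebrew; it resolves to 'he'
nlp = spacy.blank("iw")
assert nlp.lang == "he"

# 'zxx' ("no linguistic content") has no matching language and raises
try:
    spacy.blank("zxx")
except ImportError as err:
    print(err)
```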


@@ -30,3 +30,4 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0
+langcodes>=3.2.0,<4.0.0


@@ -62,6 +62,7 @@ install_requires =
     setuptools
     packaging>=20.0
     typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
+    langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
 console_scripts =


@@ -282,7 +282,7 @@ class Errors:
             "you forget to call the `set_extension` method?")
     E047 = ("Can't assign a value to unregistered extension attribute "
             "'{name}'. Did you forget to call the `set_extension` method?")
-    E048 = ("Can't import language {lang} from spacy.lang: {err}")
+    E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
     E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
             "package or a valid path to a data directory.")
     E052 = ("Can't find model directory: {path}")
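
For illustration, the reworded E048 template fills in the requested code and the underlying import error; a small sketch (the `err` text here is invented for the example — in practice it comes from the failed import):

```python
from spacy.errors import Errors

# Hypothetical err text, standing in for a real ImportError message
msg = Errors.E048.format(lang="zxx", err="No module named 'spacy.lang.zxx'")
print(msg)
# Can't import language zxx or any matching language from spacy.lang: No module named 'spacy.lang.zxx'
```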


@@ -105,7 +105,7 @@ class Language:
     Defaults (class): Settings, data and factory methods for creating the `nlp`
         object and processing pipeline.
-    lang (str): Two-letter language ID, i.e. ISO code.
+    lang (str): IETF language code, such as 'en'.
 
     DOCS: https://spacy.io/api/language
     """


@@ -8,7 +8,7 @@ from spacy.vocab import Vocab
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
 import spacy
 from thinc.api import NumpyOps, get_current_ops
@@ -502,6 +502,55 @@ def test_spacy_blank():
     assert nlp.meta["name"] == "my_custom_model"
 
 
+@pytest.mark.parametrize(
+    "lang,target",
+    [
+        ('en', 'en'),
+        ('fra', 'fr'),
+        ('fre', 'fr'),
+        ('iw', 'he'),
+        ('mo', 'ro'),
+        ('mul', 'xx'),
+        ('no', 'nb'),
+        ('pt-BR', 'pt'),
+        ('xx', 'xx'),
+        ('zh-Hans', 'zh'),
+        ('zh-Hant', None),
+        ('zxx', None)
+    ]
+)
+def test_language_matching(lang, target):
+    """
+    Test that we can look up languages by equivalent or nearly-equivalent
+    language codes.
+    """
+    assert find_matching_language(lang) == target
+
+
+@pytest.mark.parametrize(
+    "lang,target",
+    [
+        ('en', 'en'),
+        ('fra', 'fr'),
+        ('fre', 'fr'),
+        ('iw', 'he'),
+        ('mo', 'ro'),
+        ('mul', 'xx'),
+        ('no', 'nb'),
+        ('pt-BR', 'pt'),
+        ('xx', 'xx'),
+        ('zh-Hans', 'zh'),
+    ]
+)
+def test_blank_languages(lang, target):
+    """
+    Test that we can get spacy.blank in various languages, including codes
+    that are defined to be equivalent or that match by CLDR language matching.
+    """
+    nlp = spacy.blank(lang)
+    assert nlp.lang == target
+
+
 @pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"


@@ -139,6 +139,12 @@ def test_load_model_blank_shortcut():
     nlp = util.load_model("blank:en")
     assert nlp.lang == "en"
     assert nlp.pipeline == []
+    # ImportError for loading an unsupported language
+    with pytest.raises(ImportError):
+        util.load_model("blank:zxx")
+    # ImportError for requesting an invalid language code that isn't registered
+    with pytest.raises(ImportError):
+        util.load_model("blank:fjsfijsdof")


@@ -16,6 +16,7 @@ import numpy
 import srsly
 import catalogue
 from catalogue import RegistryError, Registry
+import langcodes
 import sys
 import warnings
 from packaging.specifiers import SpecifierSet, InvalidSpecifier
@@ -28,6 +29,7 @@ import tempfile
 import shutil
 import shlex
 import inspect
+import pkgutil
 import logging
 
 try:
@@ -256,20 +258,89 @@ def lang_class_is_loaded(lang: str) -> bool:
     return lang in registry.languages
 
 
+def find_matching_language(lang: str) -> Optional[str]:
+    """
+    Given an IETF language code, find a supported spaCy language that is a
+    close match for it (according to Unicode CLDR language-matching rules).
+    This allows for language aliases, ISO 639-2 codes, more detailed language
+    tags, and close matches.
+
+    Returns the language code if a matching language is available, or None
+    if there is no matching language.
+
+    >>> find_matching_language('en')
+    'en'
+    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
+    'pt'
+    >>> find_matching_language('fra')  # an ISO 639-2 code for French
+    'fr'
+    >>> find_matching_language('iw')  # obsolete alias for Hebrew
+    'he'
+    >>> find_matching_language('no')  # Norwegian
+    'nb'
+    >>> find_matching_language('mo')  # old code for ro-MD
+    'ro'
+    >>> find_matching_language('zh-Hans')  # Simplified Chinese
+    'zh'
+    >>> find_matching_language('zxx')
+    None
+    """
+    import spacy.lang  # noqa: F401
+
+    if lang == 'xx':
+        return 'xx'
+
+    # Find out which language modules we have
+    possible_languages = []
+    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):
+        code = modinfo.name
+        if code == 'xx':
+            # Temporarily make 'xx' into a valid language code
+            possible_languages.append('mul')
+        elif langcodes.tag_is_valid(code):
+            possible_languages.append(code)
+
+    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
+    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
+    # more possibilities, like variants of Chinese like 'wuu', but text that
+    # is labeled that way is probably trying to be distinct from 'zh' and
+    # shouldn't automatically match.
+    match = langcodes.closest_supported_match(
+        lang, possible_languages, max_distance=9
+    )
+    if match == 'mul':
+        # Convert 'mul' back to spaCy's 'xx'
+        return 'xx'
+    else:
+        return match
+
+
 def get_lang_class(lang: str) -> "Language":
     """Import and load a Language class.
 
-    lang (str): Two-letter language code, e.g. 'en'.
+    lang (str): IETF language code, such as 'en'.
     RETURNS (Language): Language class.
     """
     # Check if language is registered / entry point is available
     if lang in registry.languages:
         return registry.languages.get(lang)
     else:
+        # Find the language in the spacy.lang subpackage
         try:
             module = importlib.import_module(f".lang.{lang}", "spacy")
         except ImportError as err:
-            raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
+            # Find a matching language. For example, if the language 'no' is
+            # requested, we can use language-matching to load `spacy.lang.nb`.
+            try:
+                match = find_matching_language(lang)
+            except langcodes.tag_parser.LanguageTagError:
+                # proceed to raising an import error
+                match = None
+
+            if match:
+                lang = match
+                module = importlib.import_module(f".lang.{lang}", "spacy")
+            else:
+                raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
     set_lang_class(lang, getattr(module, module.__all__[0]))
     return registry.languages.get(lang)
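
The distance threshold of 9 is the heart of the matching behavior. A hedged sketch of the underlying `langcodes.closest_supported_match` call in isolation (assuming `langcodes>=3.2.0`, per the pins added above; the `supported` list here is a toy subset of spaCy's languages):

```python
import langcodes

supported = ["en", "fr", "he", "nb", "zh"]  # toy subset, not spaCy's full list

# Norwegian ('no') is within distance 9 of Norwegian Bokmål ('nb'), so it matches
print(langcodes.closest_supported_match("no", supported, max_distance=9))       # nb

# Traditional Chinese is farther than distance 9 from 'zh', so there is no match
print(langcodes.closest_supported_match("zh-Hant", supported, max_distance=9))  # None
```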


@@ -203,7 +203,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | Name               | Description |
 | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang`             | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
+| `lang`             | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
 | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |


@@ -1039,7 +1039,7 @@ available to the loaded object.
 | Name             | Description |
 | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
-| `lang`           | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ |
+| `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
 | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
 
 ## Defaults {#defaults}


@@ -83,7 +83,7 @@ Create a blank pipeline of a given language class. This function is the twin of
 | Name                                | Description |
 | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `name`                              | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
+| `name`                              | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
 | _keyword-only_                      | |
 | `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
 | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |