Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 09:26:27 +03:00
Allow IETF language codes, aliases, and close matches (#9342)
* use language-matching to allow language code aliases
* link to "IETF language tags" in docs
* Make requirements consistent
* change "two-letter language ID" to "IETF language tag" in language docs
* use langcodes 3.2 and handle language-tag errors better
* all unknown language codes are ImportErrors

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
parent 1ee6541ab0
commit 53b5f245ed
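The mechanism behind all of the changes below is Unicode CLDR language matching as implemented by the langcodes library (hence the new langcodes>=3.2.0 requirement). As a rough sketch of the behavior the patch builds on — the supported list here is a toy stand-in for spaCy's actual language modules, and the expected values mirror the tests added in this commit:

    import langcodes

    # Toy stand-in for the codes spaCy ships language modules for
    supported = ['en', 'fr', 'he', 'nb', 'pt', 'zh']

    # Aliases and equivalent codes resolve to a supported tag
    print(langcodes.closest_supported_match('iw', supported, max_distance=9))     # 'he'
    print(langcodes.closest_supported_match('fra', supported, max_distance=9))    # 'fr'
    print(langcodes.closest_supported_match('pt-BR', supported, max_distance=9))  # 'pt'

    # Codes with no sufficiently close match come back as None
    print(langcodes.closest_supported_match('zxx', supported, max_distance=9))    # None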
@@ -30,3 +30,4 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0
+langcodes>=3.2.0,<4.0.0
@@ -62,6 +62,7 @@ install_requires =
     setuptools
     packaging>=20.0
     typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
+    langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
 console_scripts =
@@ -282,7 +282,7 @@ class Errors:
             "you forget to call the `set_extension` method?")
     E047 = ("Can't assign a value to unregistered extension attribute "
             "'{name}'. Did you forget to call the `set_extension` method?")
-    E048 = ("Can't import language {lang} from spacy.lang: {err}")
+    E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
     E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
             "package or a valid path to a data directory.")
     E052 = ("Can't find model directory: {path}")
@@ -105,7 +105,7 @@ class Language:
 
     Defaults (class): Settings, data and factory methods for creating the `nlp`
         object and processing pipeline.
-    lang (str): Two-letter language ID, i.e. ISO code.
+    lang (str): IETF language code, such as 'en'.
 
     DOCS: https://spacy.io/api/language
     """
@@ -8,7 +8,7 @@ from spacy.vocab import Vocab
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
 import spacy
 from thinc.api import NumpyOps, get_current_ops
 
@@ -502,6 +502,55 @@ def test_spacy_blank():
     assert nlp.meta["name"] == "my_custom_model"
 
 
+@pytest.mark.parametrize(
+    "lang,target",
+    [
+        ('en', 'en'),
+        ('fra', 'fr'),
+        ('fre', 'fr'),
+        ('iw', 'he'),
+        ('mo', 'ro'),
+        ('mul', 'xx'),
+        ('no', 'nb'),
+        ('pt-BR', 'pt'),
+        ('xx', 'xx'),
+        ('zh-Hans', 'zh'),
+        ('zh-Hant', None),
+        ('zxx', None)
+    ]
+)
+def test_language_matching(lang, target):
+    """
+    Test that we can look up languages by equivalent or nearly-equivalent
+    language codes.
+    """
+    assert find_matching_language(lang) == target
+
+
+@pytest.mark.parametrize(
+    "lang,target",
+    [
+        ('en', 'en'),
+        ('fra', 'fr'),
+        ('fre', 'fr'),
+        ('iw', 'he'),
+        ('mo', 'ro'),
+        ('mul', 'xx'),
+        ('no', 'nb'),
+        ('pt-BR', 'pt'),
+        ('xx', 'xx'),
+        ('zh-Hans', 'zh'),
+    ]
+)
+def test_blank_languages(lang, target):
+    """
+    Test that we can get spacy.blank in various languages, including codes
+    that are defined to be equivalent or that match by CLDR language matching.
+    """
+    nlp = spacy.blank(lang)
+    assert nlp.lang == target
+
+
 @pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"
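One detail in the table above: 'zh-Hans' matches 'zh' while 'zh-Hant' maps to None. spaCy's 'zh' pipeline targets Simplified Chinese, and the CLDR tables put Traditional Chinese far enough away that the max_distance=9 cutoff rejects it rather than silently loading the wrong tokenizer. The distances can be inspected directly (a sketch; exact values depend on the CLDR data bundled with your langcodes version):

    import langcodes

    print(langcodes.tag_distance('no', 'nb'))       # small (<= 9): near-equivalent
    print(langcodes.tag_distance('zh-Hans', 'zh'))  # small: 'zh' implies Simplified
    print(langcodes.tag_distance('zh-Hant', 'zh'))  # large (> 9): script mismatch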
@@ -139,6 +139,12 @@ def test_load_model_blank_shortcut():
     nlp = util.load_model("blank:en")
     assert nlp.lang == "en"
     assert nlp.pipeline == []
 
+    # ImportError for loading an unsupported language
+    with pytest.raises(ImportError):
+        util.load_model("blank:zxx")
+
+    # ImportError for requesting an invalid language code that isn't registered
+    with pytest.raises(ImportError):
+        util.load_model("blank:fjsfijsdof")
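Since "blank:..." shortcuts funnel into the same get_lang_class lookup, near-match codes should also work through load_model — a sketch of a positive case these tests don't cover (inferred from the shared code path, not asserted by this commit):

    from spacy import util

    nlp = util.load_model("blank:fra")  # 'fra' should resolve via spacy.lang.fr
    assert nlp.lang == "fr"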
@@ -16,6 +16,7 @@ import numpy
 import srsly
 import catalogue
 from catalogue import RegistryError, Registry
+import langcodes
 import sys
 import warnings
 from packaging.specifiers import SpecifierSet, InvalidSpecifier
@@ -28,6 +29,7 @@ import tempfile
 import shutil
 import shlex
 import inspect
+import pkgutil
 import logging
 
 try:
@@ -256,20 +258,89 @@ def lang_class_is_loaded(lang: str) -> bool:
     return lang in registry.languages
 
 
+def find_matching_language(lang: str) -> Optional[str]:
+    """
+    Given an IETF language code, find a supported spaCy language that is a
+    close match for it (according to Unicode CLDR language-matching rules).
+    This allows for language aliases, ISO 639-2 codes, more detailed language
+    tags, and close matches.
+
+    Returns the language code if a matching language is available, or None
+    if there is no matching language.
+
+    >>> find_matching_language('en')
+    'en'
+    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
+    'pt'
+    >>> find_matching_language('fra')  # an ISO 639-2 code for French
+    'fr'
+    >>> find_matching_language('iw')  # obsolete alias for Hebrew
+    'he'
+    >>> find_matching_language('no')  # Norwegian
+    'nb'
+    >>> find_matching_language('mo')  # old code for ro-MD
+    'ro'
+    >>> find_matching_language('zh-Hans')  # Simplified Chinese
+    'zh'
+    >>> find_matching_language('zxx')
+    None
+    """
+    import spacy.lang  # noqa: F401
+    if lang == 'xx':
+        return 'xx'
+
+    # Find out which language modules we have
+    possible_languages = []
+    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):
+        code = modinfo.name
+        if code == 'xx':
+            # Temporarily make 'xx' into a valid language code
+            possible_languages.append('mul')
+        elif langcodes.tag_is_valid(code):
+            possible_languages.append(code)
+
+    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
+    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
+    # more possibilities, like variants of Chinese like 'wuu', but text that
+    # is labeled that way is probably trying to be distinct from 'zh' and
+    # shouldn't automatically match.
+    match = langcodes.closest_supported_match(
+        lang, possible_languages, max_distance=9
+    )
+    if match == 'mul':
+        # Convert 'mul' back to spaCy's 'xx'
+        return 'xx'
+    else:
+        return match
+
+
 def get_lang_class(lang: str) -> "Language":
     """Import and load a Language class.
 
-    lang (str): Two-letter language code, e.g. 'en'.
+    lang (str): IETF language code, such as 'en'.
     RETURNS (Language): Language class.
     """
     # Check if language is registered / entry point is available
     if lang in registry.languages:
         return registry.languages.get(lang)
     else:
         # Find the language in the spacy.lang subpackage
         try:
             module = importlib.import_module(f".lang.{lang}", "spacy")
         except ImportError as err:
-            raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
+            # Find a matching language. For example, if the language 'no' is
+            # requested, we can use language-matching to load `spacy.lang.nb`.
+            try:
+                match = find_matching_language(lang)
+            except langcodes.tag_parser.LanguageTagError:
+                # proceed to raising an import error
+                match = None
+
+            if match:
+                lang = match
+                module = importlib.import_module(f".lang.{lang}", "spacy")
+            else:
+                raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
         set_lang_class(lang, getattr(module, module.__all__[0]))
     return registry.languages.get(lang)
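Taken together, the user-visible effect of the new function and the get_lang_class fallback (expected results follow the parametrized tests above):

    import spacy

    assert spacy.blank('fra').lang == 'fr'  # ISO 639-2 code for French
    assert spacy.blank('iw').lang == 'he'   # obsolete alias for Hebrew
    assert spacy.blank('no').lang == 'nb'   # Norwegian -> Norwegian Bokmål
    assert spacy.blank('xx').lang == 'xx'   # spaCy's multi-language code still works

    try:
        spacy.blank('zxx')  # 'no linguistic content': nothing within distance 9
    except ImportError as err:
        print(err)  # E048: Can't import language zxx or any matching language ...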
@@ -203,7 +203,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 
 | Name               | Description |
 | ------------------ | ----------- |
-| `lang`             | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
+| `lang`             | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
 | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
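The CLI presumably resolves its lang argument through the same lookup, so the broader set of tags documented here can be checked without running a full command (a sketch; assumes a spaCy build containing this commit):

    from spacy.util import get_lang_class

    cls = get_lang_class('fra')  # ISO 639-2 code, resolved by language matching
    print(cls.lang)              # 'fr'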
@@ -1039,7 +1039,7 @@ available to the loaded object.
 | Name             | Description |
 | ---------------- | ----------- |
 | `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
-| `lang`           | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ |
+| `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
 | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
 
 ## Defaults {#defaults}
@@ -83,7 +83,7 @@ Create a blank pipeline of a given language class. This function is the twin of
 
 | Name                                | Description |
 | ----------------------------------- | ----------- |
-| `name`                              | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
+| `name`                              | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
 | _keyword-only_                      | |
 | `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
 | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
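For completeness, the documented parameters combined in one call (a sketch; the config override value is illustrative, and the alias resolution follows the tests added in this commit):

    import spacy

    # 'name' accepts any IETF tag with a close-enough supported match
    nlp = spacy.blank('pt-BR', config={"nlp": {"batch_size": 64}})
    assert nlp.lang == 'pt'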