mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-12 17:22:25 +03:00
Remove dependency on langcodes (#13760)
This PR removes the dependency on langcodes introduced in #9342. While the introduction of langcodes allows a significantly wider range of language codes, there are some unexpected side effects: zh-Hant (Traditional Chinese) should be mapped to zh instead of None, as spaCy's Chinese model is based on pkuseg which supports tokenization of both Simplified and Traditional Chinese. Since it is possible that spaCy may have a model for Norwegian Nynorsk in the future, mapping no (macrolanguage Norwegian) to nb (Norwegian Bokmål) might be misleading. In that case, the user should be asked to specify nb or nn (Norwegian Nynorsk) specifically or consult the doc. Same as above for regional variants of languages such as en_gb and en_us. Overall, IMHO, introducing an extra dependency just for the conversion of language codes is overkill. It is possible that most users just need the conversion between 2/3-letter ISO codes and a simple dictionary lookup should suffice. With this PR, ISO 639-1 and ISO 639-3 codes are supported. ISO 639-2/B (bibliographic codes which are not favored and used in ISO 639-3) and deprecated ISO 639-1/2 codes are also supported to maximize backward compatibility.
This commit is contained in:
parent
864c2f3b51
commit
7b1d6e58ff
|
@ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
|
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
|
||||||
jinja2
|
jinja2
|
||||||
langcodes>=3.2.0,<4.0.0
|
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
packaging>=20.0
|
packaging>=20.0
|
||||||
|
|
|
@ -65,7 +65,6 @@ install_requires =
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
setuptools
|
setuptools
|
||||||
packaging>=20.0
|
packaging>=20.0
|
||||||
langcodes>=3.2.0,<4.0.0
|
|
||||||
|
|
||||||
[options.entry_points]
|
[options.entry_points]
|
||||||
console_scripts =
|
console_scripts =
|
||||||
|
|
|
@ -141,7 +141,7 @@ class Language:
|
||||||
|
|
||||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||||
object and processing pipeline.
|
object and processing pipeline.
|
||||||
lang (str): IETF language code, such as 'en'.
|
lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language
|
DOCS: https://spacy.io/api/language
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -656,17 +656,12 @@ def test_spacy_blank():
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"lang,target",
|
"lang,target",
|
||||||
[
|
[
|
||||||
("en", "en"),
|
|
||||||
("fra", "fr"),
|
("fra", "fr"),
|
||||||
("fre", "fr"),
|
("fre", "fr"),
|
||||||
("iw", "he"),
|
("iw", "he"),
|
||||||
("mo", "ro"),
|
("mo", "ro"),
|
||||||
|
("scc", "sr"),
|
||||||
("mul", "xx"),
|
("mul", "xx"),
|
||||||
("no", "nb"),
|
|
||||||
("pt-BR", "pt"),
|
|
||||||
("xx", "xx"),
|
|
||||||
("zh-Hans", "zh"),
|
|
||||||
("zh-Hant", None),
|
|
||||||
("zxx", None),
|
("zxx", None),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -686,11 +681,9 @@ def test_language_matching(lang, target):
|
||||||
("fre", "fr"),
|
("fre", "fr"),
|
||||||
("iw", "he"),
|
("iw", "he"),
|
||||||
("mo", "ro"),
|
("mo", "ro"),
|
||||||
|
("scc", "sr"),
|
||||||
("mul", "xx"),
|
("mul", "xx"),
|
||||||
("no", "nb"),
|
|
||||||
("pt-BR", "pt"),
|
|
||||||
("xx", "xx"),
|
("xx", "xx"),
|
||||||
("zh-Hans", "zh"),
|
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_blank_languages(lang, target):
|
def test_blank_languages(lang, target):
|
||||||
|
|
143
spacy/util.py
143
spacy/util.py
|
@ -5,7 +5,6 @@ import inspect
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import pkgutil
|
|
||||||
import re
|
import re
|
||||||
import shlex
|
import shlex
|
||||||
import shutil
|
import shutil
|
||||||
|
@ -40,7 +39,6 @@ from typing import (
|
||||||
)
|
)
|
||||||
|
|
||||||
import catalogue
|
import catalogue
|
||||||
import langcodes
|
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
import thinc
|
import thinc
|
||||||
|
@ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
|
||||||
# Default order of sections in the config file. Not all sections needs to exist,
|
# Default order of sections in the config file. Not all sections needs to exist,
|
||||||
# and additional sections are added at the end, in alphabetical order.
|
# and additional sections are added at the end, in alphabetical order.
|
||||||
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
|
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
|
||||||
|
|
||||||
|
LANG_ALIASES = {
|
||||||
|
"af": ["afr"],
|
||||||
|
"am": ["amh"],
|
||||||
|
"ar": ["ara"],
|
||||||
|
"az": ["aze"],
|
||||||
|
"bg": ["bul"],
|
||||||
|
"bn": ["ben"],
|
||||||
|
"bo": ["bod", "tib"],
|
||||||
|
"ca": ["cat"],
|
||||||
|
"cs": ["ces", "cze"],
|
||||||
|
"da": ["dan"],
|
||||||
|
"de": ["deu", "ger"],
|
||||||
|
"el": ["ell", "gre"],
|
||||||
|
"en": ["eng"],
|
||||||
|
"es": ["spa"],
|
||||||
|
"et": ["est"],
|
||||||
|
"eu": ["eus", "baq"],
|
||||||
|
"fa": ["fas", "per"],
|
||||||
|
"fi": ["fin"],
|
||||||
|
"fo": ["fao"],
|
||||||
|
"fr": ["fra", "fre"],
|
||||||
|
"ga": ["gle"],
|
||||||
|
"gd": ["gla"],
|
||||||
|
"gu": ["guj"],
|
||||||
|
"he": ["heb", "iw"], # "iw" is the obsolete ISO 639-1 code for Hebrew
|
||||||
|
"hi": ["hin"],
|
||||||
|
"hr": ["hrv", "scr"], # "scr" is the deprecated ISO 639-2/B for Croatian
|
||||||
|
"hu": ["hun"],
|
||||||
|
"hy": ["hye"],
|
||||||
|
"id": ["ind", "in"], # "in" is the obsolete ISO 639-1 code for Hebrew
|
||||||
|
"is": ["isl", "ice"],
|
||||||
|
"it": ["ita"],
|
||||||
|
"ja": ["jpn"],
|
||||||
|
"kn": ["kan"],
|
||||||
|
"ko": ["kor"],
|
||||||
|
"ky": ["kir"],
|
||||||
|
"la": ["lat"],
|
||||||
|
"lb": ["ltz"],
|
||||||
|
"lg": ["lug"],
|
||||||
|
"lt": ["lit"],
|
||||||
|
"lv": ["lav"],
|
||||||
|
"mk": ["mkd", "mac"],
|
||||||
|
"ml": ["mal"],
|
||||||
|
"mr": ["mar"],
|
||||||
|
"ms": ["msa", "may"],
|
||||||
|
"nb": ["nob"],
|
||||||
|
"ne": ["nep"],
|
||||||
|
"nl": ["nld", "dut"],
|
||||||
|
"nn": ["nno"],
|
||||||
|
"pl": ["pol"],
|
||||||
|
"pt": ["por"],
|
||||||
|
"ro": ["ron", "rom", "mo", "mol"], # "mo" and "mol" are deprecated codes for Moldavian
|
||||||
|
"ru": ["rus"],
|
||||||
|
"sa": ["san"],
|
||||||
|
"si": ["sin"],
|
||||||
|
"sk": ["slk", "slo"],
|
||||||
|
"sl": ["slv"],
|
||||||
|
"sq": ["sqi", "alb"],
|
||||||
|
"sr": ["srp", "scc"], # "scc" is the deprecated ISO 639-2/B code for Serbian
|
||||||
|
"sv": ["swe"],
|
||||||
|
"ta": ["tam"],
|
||||||
|
"te": ["tel"],
|
||||||
|
"th": ["tha"],
|
||||||
|
"ti": ["tir"],
|
||||||
|
"tl": ["tgl"],
|
||||||
|
"tn": ["tsn"],
|
||||||
|
"tr": ["tur"],
|
||||||
|
"tt": ["tat"],
|
||||||
|
"uk": ["ukr"],
|
||||||
|
"ur": ["urd"],
|
||||||
|
"vi": ["viw"],
|
||||||
|
"yo": ["yor"],
|
||||||
|
"zh": ["zho", "chi"],
|
||||||
|
|
||||||
|
"xx": ["mul"],
|
||||||
|
}
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
logger = logging.getLogger("spacy")
|
logger = logging.getLogger("spacy")
|
||||||
|
@ -305,63 +380,39 @@ def lang_class_is_loaded(lang: str) -> bool:
|
||||||
|
|
||||||
def find_matching_language(lang: str) -> Optional[str]:
|
def find_matching_language(lang: str) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Given an IETF language code, find a supported spaCy language that is a
|
Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code,
|
||||||
close match for it (according to Unicode CLDR language-matching rules).
|
find a supported spaCy language.
|
||||||
This allows for language aliases, ISO 639-2 codes, more detailed language
|
|
||||||
tags, and close matches.
|
|
||||||
|
|
||||||
Returns the language code if a matching language is available, or None
|
Returns the language code if a matching language is available, or None
|
||||||
if there is no matching language.
|
if there is no matching language.
|
||||||
|
|
||||||
>>> find_matching_language('en')
|
>>> find_matching_language('fra') # ISO 639-3 code for French
|
||||||
'en'
|
|
||||||
>>> find_matching_language('pt-BR') # Brazilian Portuguese
|
|
||||||
'pt'
|
|
||||||
>>> find_matching_language('fra') # an ISO 639-2 code for French
|
|
||||||
'fr'
|
'fr'
|
||||||
>>> find_matching_language('iw') # obsolete alias for Hebrew
|
>>> find_matching_language('fre') # ISO 639-2/B code for French
|
||||||
|
'fr'
|
||||||
|
>>> find_matching_language('iw') # Obsolete ISO 639-1 code for Hebrew
|
||||||
'he'
|
'he'
|
||||||
>>> find_matching_language('no') # Norwegian
|
>>> find_matching_language('mo') # Deprecated code for Moldavian
|
||||||
'nb'
|
|
||||||
>>> find_matching_language('mo') # old code for ro-MD
|
|
||||||
'ro'
|
'ro'
|
||||||
>>> find_matching_language('zh-Hans') # Simplified Chinese
|
>>> find_matching_language('scc') # Deprecated ISO 639-2/B code for Serbian
|
||||||
'zh'
|
'sr'
|
||||||
>>> find_matching_language('zxx')
|
>>> find_matching_language('zxx')
|
||||||
None
|
None
|
||||||
"""
|
"""
|
||||||
import spacy.lang # noqa: F401
|
import spacy.lang # noqa: F401
|
||||||
|
|
||||||
if lang == "xx":
|
# Check aliases
|
||||||
return "xx"
|
for lang_code, aliases in LANG_ALIASES.items():
|
||||||
|
if lang in aliases:
|
||||||
|
return lang_code
|
||||||
|
|
||||||
# Find out which language modules we have
|
return None
|
||||||
possible_languages = []
|
|
||||||
for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore[attr-defined]
|
|
||||||
code = modinfo.name
|
|
||||||
if code == "xx":
|
|
||||||
# Temporarily make 'xx' into a valid language code
|
|
||||||
possible_languages.append("mul")
|
|
||||||
elif langcodes.tag_is_valid(code):
|
|
||||||
possible_languages.append(code)
|
|
||||||
|
|
||||||
# Distances from 1-9 allow near misses like Bosnian -> Croatian and
|
|
||||||
# Norwegian -> Norwegian Bokmål. A distance of 10 would include several
|
|
||||||
# more possibilities, like variants of Chinese like 'wuu', but text that
|
|
||||||
# is labeled that way is probably trying to be distinct from 'zh' and
|
|
||||||
# shouldn't automatically match.
|
|
||||||
match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
|
|
||||||
if match == "mul":
|
|
||||||
# Convert 'mul' back to spaCy's 'xx'
|
|
||||||
return "xx"
|
|
||||||
else:
|
|
||||||
return match
|
|
||||||
|
|
||||||
|
|
||||||
def get_lang_class(lang: str) -> Type["Language"]:
|
def get_lang_class(lang: str) -> Type["Language"]:
|
||||||
"""Import and load a Language class.
|
"""Import and load a Language class.
|
||||||
|
|
||||||
lang (str): IETF language code, such as 'en'.
|
lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' and 'eng'.
|
||||||
RETURNS (Language): Language class.
|
RETURNS (Language): Language class.
|
||||||
"""
|
"""
|
||||||
# Check if language is registered / entry point is available
|
# Check if language is registered / entry point is available
|
||||||
|
@ -372,13 +423,9 @@ def get_lang_class(lang: str) -> Type["Language"]:
|
||||||
try:
|
try:
|
||||||
module = importlib.import_module(f".lang.{lang}", "spacy")
|
module = importlib.import_module(f".lang.{lang}", "spacy")
|
||||||
except ImportError as err:
|
except ImportError as err:
|
||||||
# Find a matching language. For example, if the language 'no' is
|
# Find a matching language. For example, if the language 'eng' is
|
||||||
# requested, we can use language-matching to load `spacy.lang.nb`.
|
# requested, we can use language-matching to load `spacy.lang.en`.
|
||||||
try:
|
match = find_matching_language(lang)
|
||||||
match = find_matching_language(lang)
|
|
||||||
except langcodes.tag_parser.LanguageTagError:
|
|
||||||
# proceed to raising an import error
|
|
||||||
match = None
|
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
lang = match
|
lang = match
|
||||||
|
|
|
@ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
|
| `lang` | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~ |
|
||||||
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
|
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
|
||||||
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||||
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||||
|
|
|
@ -1078,7 +1078,7 @@ details.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
|
| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
|
||||||
| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
|
| `lang` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng' for English. ~~str~~ |
|
||||||
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
|
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
|
||||||
|
|
||||||
## Defaults {id="defaults"}
|
## Defaults {id="defaults"}
|
||||||
|
|
|
@ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
|
| `name` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng', of the language class to load. ~~str~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user