Mirror of https://github.com/explosion/spaCy.git
Remove dependency on langcodes

commit 5b6412e88b
parent b3c46c315e
requirements.txt

@@ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
setup.cfg

@@ -65,7 +65,6 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
langcodes>=3.2.0,<4.0.0

[options.entry_points]
console_scripts =
spacy/language.py

@@ -143,7 +143,7 @@ class Language:

    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
    lang (str): IETF language code, such as 'en'.
    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'.

    DOCS: https://spacy.io/api/language
    """
@@ -656,17 +656,12 @@ def test_spacy_blank():
@pytest.mark.parametrize(
    "lang,target",
    [
        ("en", "en"),
        ("fra", "fr"),
        ("fre", "fr"),
        ("iw", "he"),
        ("mo", "ro"),
        ("scc", "sr"),
        ("mul", "xx"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "xx"),
        ("zh-Hans", "zh"),
        ("zh-Hant", None),
        ("zxx", None),
    ],
)
@@ -686,11 +681,9 @@ def test_language_matching(lang, target):
        ("fre", "fr"),
        ("iw", "he"),
        ("mo", "ro"),
        ("scc", "sr"),
        ("mul", "xx"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "xx"),
        ("zh-Hans", "zh"),
    ],
)
def test_blank_languages(lang, target):
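For readers unfamiliar with the test style above, a minimal sketch of how such a parametrized test is typically structured around `spacy.blank` (the test body below is a hypothetical reconstruction, not the commit's actual code):

```python
import pytest
import spacy

@pytest.mark.parametrize("lang,target", [("fra", "fr"), ("iw", "he"), ("scc", "sr")])
def test_blank_languages(lang, target):
    # Hypothetical body: an aliased or three-letter code should yield a
    # pipeline whose .lang is the canonical two-letter spaCy code.
    nlp = spacy.blank(lang)
    assert nlp.lang == target
```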
spacy/util.py
@@ -5,7 +5,6 @@ import inspect
import itertools
import logging
import os
import pkgutil
import re
import shlex
import shutil
@@ -40,7 +39,6 @@ from typing import (
)

import catalogue
import langcodes
import numpy
import srsly
import thinc
@@ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
# Default order of sections in the config file. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]

LANG_ALIASES = {
    "af": ["afr"],
    "am": ["amh"],
    "ar": ["ara"],
    "az": ["aze"],
    "bg": ["bul"],
    "bn": ["ben"],
    "bo": ["bod", "tib"],
    "ca": ["cat"],
    "cs": ["ces", "cze"],
    "da": ["dan"],
    "de": ["deu", "ger"],
    "el": ["ell", "gre"],
    "en": ["eng"],
    "es": ["spa"],
    "et": ["est"],
    "eu": ["eus", "baq"],
    "fa": ["fas", "per"],
    "fi": ["fin"],
    "fo": ["fao"],
    "fr": ["fra", "fre"],
    "ga": ["gle"],
    "gd": ["gla"],
    "gu": ["guj"],
    "he": ["heb", "iw"],  # "iw" is the obsolete ISO 639-1 code for Hebrew
    "hi": ["hin"],
    "hr": ["hrv", "scr"],  # "scr" is the deprecated ISO 639-2/B code for Croatian
    "hu": ["hun"],
    "hy": ["hye"],
    "id": ["ind", "in"],  # "in" is the obsolete ISO 639-1 code for Indonesian
    "is": ["isl", "ice"],
    "it": ["ita"],
    "ja": ["jpn"],
    "kn": ["kan"],
    "ko": ["kor"],
    "ky": ["kir"],
    "la": ["lat"],
    "lb": ["ltz"],
    "lg": ["lug"],
    "lt": ["lit"],
    "lv": ["lav"],
    "mk": ["mkd", "mac"],
    "ml": ["mal"],
    "mr": ["mar"],
    "ms": ["msa", "may"],
    "nb": ["nob"],
    "ne": ["nep"],
    "nl": ["nld", "dut"],
    "nn": ["nno"],
    "pl": ["pol"],
    "pt": ["por"],
    "ro": ["ron", "rum", "mo", "mol"],  # "mo" and "mol" are deprecated codes for Moldavian
    "ru": ["rus"],
    "sa": ["san"],
    "si": ["sin"],
    "sk": ["slk", "slo"],
    "sl": ["slv"],
    "sq": ["sqi", "alb"],
    "sr": ["srp", "scc"],  # "scc" is the deprecated ISO 639-2/B code for Serbian
    "sv": ["swe"],
    "ta": ["tam"],
    "te": ["tel"],
    "th": ["tha"],
    "ti": ["tir"],
    "tl": ["tgl"],
    "tn": ["tsn"],
    "tr": ["tur"],
    "tt": ["tat"],
    "uk": ["ukr"],
    "ur": ["urd"],
    "vi": ["vie"],
    "yo": ["yor"],
    "zh": ["zho", "chi"],

    "xx": ["mul"],
}
# fmt: on

logger = logging.getLogger("spacy")
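As an aside on the data structure above: the table is keyed canonical code → list of aliases, so a lookup scans the dict entries. A minimal sketch of how it could be inverted into a flat alias → canonical mapping for single-lookup access (the `ALIAS_TO_LANG` name and the inversion step are illustrative, not part of this commit):

```python
# Illustrative only: invert the canonical -> aliases table into a flat
# alias -> canonical mapping so each lookup is one dict access.
LANG_ALIASES = {
    "fr": ["fra", "fre"],
    "he": ["heb", "iw"],
    "ro": ["ron", "rum", "mo", "mol"],
    "xx": ["mul"],
}

ALIAS_TO_LANG = {
    alias: lang_code
    for lang_code, aliases in LANG_ALIASES.items()
    for alias in aliases
}

assert ALIAS_TO_LANG["fre"] == "fr"
assert ALIAS_TO_LANG["iw"] == "he"
assert ALIAS_TO_LANG.get("zxx") is None
```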
@@ -293,63 +368,39 @@ def lang_class_is_loaded(lang: str) -> bool:

def find_matching_language(lang: str) -> Optional[str]:
    """
    Given an IETF language code, find a supported spaCy language that is a
    close match for it (according to Unicode CLDR language-matching rules).
    This allows for language aliases, ISO 639-2 codes, more detailed language
    tags, and close matches.
    Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code,
    find a supported spaCy language.

    Returns the language code if a matching language is available, or None
    if there is no matching language.

    >>> find_matching_language('en')
    'en'
    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
    'pt'
    >>> find_matching_language('fra')  # an ISO 639-2 code for French
    >>> find_matching_language('fra')  # ISO 639-3 code for French
    'fr'
    >>> find_matching_language('iw')  # obsolete alias for Hebrew
    >>> find_matching_language('fre')  # ISO 639-2/B code for French
    'fr'
    >>> find_matching_language('iw')  # Obsolete ISO 639-1 code for Hebrew
    'he'
    >>> find_matching_language('no')  # Norwegian
    'nb'
    >>> find_matching_language('mo')  # old code for ro-MD
    >>> find_matching_language('mo')  # Deprecated code for Moldavian
    'ro'
    >>> find_matching_language('zh-Hans')  # Simplified Chinese
    'zh'
    >>> find_matching_language('scc')  # Deprecated ISO 639-2/B code for Serbian
    'sr'
    >>> find_matching_language('zxx')
    None
    """
    import spacy.lang  # noqa: F401

    if lang == "xx":
        return "xx"
    # Check aliases
    for lang_code, aliases in LANG_ALIASES.items():
        if lang in aliases:
            return lang_code

    # Find out which language modules we have
    possible_languages = []
    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore[attr-defined]
        code = modinfo.name
        if code == "xx":
            # Temporarily make 'xx' into a valid language code
            possible_languages.append("mul")
        elif langcodes.tag_is_valid(code):
            possible_languages.append(code)

    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
    # more possibilities, like variants of Chinese like 'wuu', but text that
    # is labeled that way is probably trying to be distinct from 'zh' and
    # shouldn't automatically match.
    match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
    if match == "mul":
        # Convert 'mul' back to spaCy's 'xx'
        return "xx"
    else:
        return match
    return None


def get_lang_class(lang: str) -> Type["Language"]:
    """Import and load a Language class.

    lang (str): IETF language code, such as 'en'.
    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' and 'eng'.
    RETURNS (Language): Language class.
    """
    # Check if language is registered / entry point is available
@@ -360,13 +411,9 @@ def get_lang_class(lang: str) -> Type["Language"]:
    try:
        module = importlib.import_module(f".lang.{lang}", "spacy")
    except ImportError as err:
        # Find a matching language. For example, if the language 'no' is
        # requested, we can use language-matching to load `spacy.lang.nb`.
        try:
            match = find_matching_language(lang)
        except langcodes.tag_parser.LanguageTagError:
            # proceed to raising an import error
            match = None
        # Find a matching language. For example, if the language 'eng' is
        # requested, we can use language-matching to load `spacy.lang.en`.
        match = find_matching_language(lang)

        if match:
            lang = match
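To illustrate the behaviour the two utilities above aim for after this change, here is a short, hedged usage sketch (the expected outputs assume the alias table added in this commit and that the corresponding language modules ship with spaCy):

```python
from spacy.util import find_matching_language, get_lang_class

# Deprecated ISO 639-2/B code for French resolves through LANG_ALIASES.
print(find_matching_language("fre"))  # expected: "fr"

# Obsolete ISO 639-1 code for Hebrew.
print(find_matching_language("iw"))   # expected: "he"

# No alias and no matching language module: nothing to return.
print(find_matching_language("zxx"))  # expected: None

# get_lang_class falls back to the alias match when the direct import fails.
English = get_lang_class("eng")
print(English.lang)                   # expected: "en"
```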
@@ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr

| Name               | Description |
| ------------------ | ----------- |
| `lang`             | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
| `lang`             | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~ |
| `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
| `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
@@ -1078,7 +1078,7 @@ details.

| Name             | Description |
| ---------------- | ----------- |
| `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
| `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
| `lang`           | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng' for English. ~~str~~ |
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |

## Defaults {id="defaults"}
@@ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of

| Name                                | Description |
| ----------------------------------- | ----------- |
| `name`                              | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
| `name`                              | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng', of the language class to load. ~~str~~ |
| _keyword-only_                      | |
| `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
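Finally, a brief end-user sketch of what the `spacy.blank` rows above describe (expected outputs assume the alias handling introduced by this commit):

```python
import spacy

# Three-letter ISO 639-3 code for English resolves to the "en" language class.
nlp = spacy.blank("eng")
print(nlp.lang)  # expected: "en"

# Deprecated ISO 639-2/B code for Serbian.
nlp = spacy.blank("scc")
print(nlp.lang)  # expected: "sr"
```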