mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Remove dependency on langcodes
This commit is contained in:
		
							parent
							
								
									b3c46c315e
								
							
						
					
					
						commit
						5b6412e88b
					
				|  | @ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0 | ||||||
| tqdm>=4.38.0,<5.0.0 | tqdm>=4.38.0,<5.0.0 | ||||||
| pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 | pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 | ||||||
| jinja2 | jinja2 | ||||||
| langcodes>=3.2.0,<4.0.0 |  | ||||||
| # Official Python utilities | # Official Python utilities | ||||||
| setuptools | setuptools | ||||||
| packaging>=20.0 | packaging>=20.0 | ||||||
|  |  | ||||||
|  | @ -65,7 +65,6 @@ install_requires = | ||||||
|     # Official Python utilities |     # Official Python utilities | ||||||
|     setuptools |     setuptools | ||||||
|     packaging>=20.0 |     packaging>=20.0 | ||||||
|     langcodes>=3.2.0,<4.0.0 |  | ||||||
| 
 | 
 | ||||||
| [options.entry_points] | [options.entry_points] | ||||||
| console_scripts = | console_scripts = | ||||||
|  |  | ||||||
|  | @ -143,7 +143,7 @@ class Language: | ||||||
| 
 | 
 | ||||||
|     Defaults (class): Settings, data and factory methods for creating the `nlp` |     Defaults (class): Settings, data and factory methods for creating the `nlp` | ||||||
|         object and processing pipeline. |         object and processing pipeline. | ||||||
|     lang (str): IETF language code, such as 'en'. |     lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' or 'eng'. | ||||||
| 
 | 
 | ||||||
|     DOCS: https://spacy.io/api/language |     DOCS: https://spacy.io/api/language | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|  | @ -656,17 +656,12 @@ def test_spacy_blank(): | ||||||
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||||
|     "lang,target", |     "lang,target", | ||||||
|     [ |     [ | ||||||
|         ("en", "en"), |  | ||||||
|         ("fra", "fr"), |         ("fra", "fr"), | ||||||
|         ("fre", "fr"), |         ("fre", "fr"), | ||||||
|         ("iw", "he"), |         ("iw", "he"), | ||||||
|         ("mo", "ro"), |         ("mo", "ro"), | ||||||
|  |         ("scc", "sr"), | ||||||
|         ("mul", "xx"), |         ("mul", "xx"), | ||||||
|         ("no", "nb"), |  | ||||||
|         ("pt-BR", "pt"), |  | ||||||
|         ("xx", "xx"), |  | ||||||
|         ("zh-Hans", "zh"), |  | ||||||
|         ("zh-Hant", None), |  | ||||||
|         ("zxx", None), |         ("zxx", None), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
|  | @ -686,11 +681,9 @@ def test_language_matching(lang, target): | ||||||
|         ("fre", "fr"), |         ("fre", "fr"), | ||||||
|         ("iw", "he"), |         ("iw", "he"), | ||||||
|         ("mo", "ro"), |         ("mo", "ro"), | ||||||
|  |         ("scc", "sr"), | ||||||
|         ("mul", "xx"), |         ("mul", "xx"), | ||||||
|         ("no", "nb"), |  | ||||||
|         ("pt-BR", "pt"), |  | ||||||
|         ("xx", "xx"), |         ("xx", "xx"), | ||||||
|         ("zh-Hans", "zh"), |  | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_blank_languages(lang, target): | def test_blank_languages(lang, target): | ||||||
|  |  | ||||||
							
								
								
									
										143
									
								
								spacy/util.py
									
									
									
									
									
								
							
							
						
						
									
										143
									
								
								spacy/util.py
									
									
									
									
									
								
							|  | @ -5,7 +5,6 @@ import inspect | ||||||
| import itertools | import itertools | ||||||
| import logging | import logging | ||||||
| import os | import os | ||||||
| import pkgutil |  | ||||||
| import re | import re | ||||||
| import shlex | import shlex | ||||||
| import shutil | import shutil | ||||||
|  | @ -40,7 +39,6 @@ from typing import ( | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| import catalogue | import catalogue | ||||||
| import langcodes |  | ||||||
| import numpy | import numpy | ||||||
| import srsly | import srsly | ||||||
| import thinc | import thinc | ||||||
|  | @ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt" | ||||||
| # Default order of sections in the config file. Not all sections need to exist, | # Default order of sections in the config file. Not all sections need to exist, | ||||||
| # and additional sections are added at the end, in alphabetical order. | # and additional sections are added at the end, in alphabetical order. | ||||||
| CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] | CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] | ||||||
|  | 
 | ||||||
|  | LANG_ALIASES = { | ||||||
|  |     "af": ["afr"], | ||||||
|  |     "am": ["amh"], | ||||||
|  |     "ar": ["ara"], | ||||||
|  |     "az": ["aze"], | ||||||
|  |     "bg": ["bul"], | ||||||
|  |     "bn": ["ben"], | ||||||
|  |     "bo": ["bod", "tib"], | ||||||
|  |     "ca": ["cat"], | ||||||
|  |     "cs": ["ces", "cze"], | ||||||
|  |     "da": ["dan"], | ||||||
|  |     "de": ["deu", "ger"], | ||||||
|  |     "el": ["ell", "gre"], | ||||||
|  |     "en": ["eng"], | ||||||
|  |     "es": ["spa"], | ||||||
|  |     "et": ["est"], | ||||||
|  |     "eu": ["eus", "baq"], | ||||||
|  |     "fa": ["fas", "per"], | ||||||
|  |     "fi": ["fin"], | ||||||
|  |     "fo": ["fao"], | ||||||
|  |     "fr": ["fra", "fre"], | ||||||
|  |     "ga": ["gle"], | ||||||
|  |     "gd": ["gla"], | ||||||
|  |     "gu": ["guj"], | ||||||
|  |     "he": ["heb", "iw"], # "iw" is the obsolete ISO 639-1 code for Hebrew | ||||||
|  |     "hi": ["hin"], | ||||||
|  |     "hr": ["hrv", "scr"], # "scr" is the deprecated ISO 639-2/B code for Croatian | ||||||
|  |     "hu": ["hun"], | ||||||
|  |     "hy": ["hye"], | ||||||
|  |     "id": ["ind", "in"], # "in" is the obsolete ISO 639-1 code for Indonesian | ||||||
|  |     "is": ["isl", "ice"], | ||||||
|  |     "it": ["ita"], | ||||||
|  |     "ja": ["jpn"], | ||||||
|  |     "kn": ["kan"], | ||||||
|  |     "ko": ["kor"], | ||||||
|  |     "ky": ["kir"], | ||||||
|  |     "la": ["lat"], | ||||||
|  |     "lb": ["ltz"], | ||||||
|  |     "lg": ["lug"], | ||||||
|  |     "lt": ["lit"], | ||||||
|  |     "lv": ["lav"], | ||||||
|  |     "mk": ["mkd", "mac"], | ||||||
|  |     "ml": ["mal"], | ||||||
|  |     "mr": ["mar"], | ||||||
|  |     "ms": ["msa", "may"], | ||||||
|  |     "nb": ["nob"], | ||||||
|  |     "ne": ["nep"], | ||||||
|  |     "nl": ["nld", "dut"], | ||||||
|  |     "nn": ["nno"], | ||||||
|  |     "pl": ["pol"], | ||||||
|  |     "pt": ["por"], | ||||||
|  |     "ro": ["ron", "rom", "mo", "mol"], # "mo" and "mol" are deprecated codes for Moldavian | ||||||
|  |     "ru": ["rus"], | ||||||
|  |     "sa": ["san"], | ||||||
|  |     "si": ["sin"], | ||||||
|  |     "sk": ["slk", "slo"], | ||||||
|  |     "sl": ["slv"], | ||||||
|  |     "sq": ["sqi", "alb"], | ||||||
|  |     "sr": ["srp", "scc"], # "scc" is the deprecated ISO 639-2/B code for Serbian | ||||||
|  |     "sv": ["swe"], | ||||||
|  |     "ta": ["tam"], | ||||||
|  |     "te": ["tel"], | ||||||
|  |     "th": ["tha"], | ||||||
|  |     "ti": ["tir"], | ||||||
|  |     "tl": ["tgl"], | ||||||
|  |     "tn": ["tsn"], | ||||||
|  |     "tr": ["tur"], | ||||||
|  |     "tt": ["tat"], | ||||||
|  |     "uk": ["ukr"], | ||||||
|  |     "ur": ["urd"], | ||||||
|  |     "vi": ["vie"], | ||||||
|  |     "yo": ["yor"], | ||||||
|  |     "zh": ["zho", "chi"], | ||||||
|  | 
 | ||||||
|  |     "xx": ["mul"], | ||||||
|  | } | ||||||
| # fmt: on | # fmt: on | ||||||
| 
 | 
 | ||||||
| logger = logging.getLogger("spacy") | logger = logging.getLogger("spacy") | ||||||
|  | @ -293,63 +368,39 @@ def lang_class_is_loaded(lang: str) -> bool: | ||||||
| 
 | 
 | ||||||
| def find_matching_language(lang: str) -> Optional[str]: | def find_matching_language(lang: str) -> Optional[str]: | ||||||
|     """ |     """ | ||||||
|     Given an IETF language code, find a supported spaCy language that is a |     Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code, | ||||||
|     close match for it (according to Unicode CLDR language-matching rules). |     find a supported spaCy language. | ||||||
|     This allows for language aliases, ISO 639-2 codes, more detailed language |  | ||||||
|     tags, and close matches. |  | ||||||
| 
 | 
 | ||||||
|     Returns the language code if a matching language is available, or None |     Returns the language code if a matching language is available, or None | ||||||
|     if there is no matching language. |     if there is no matching language. | ||||||
| 
 | 
 | ||||||
|     >>> find_matching_language('en') |     >>> find_matching_language('fra')  # ISO 639-3 code for French | ||||||
|     'en' |  | ||||||
|     >>> find_matching_language('pt-BR')  # Brazilian Portuguese |  | ||||||
|     'pt' |  | ||||||
|     >>> find_matching_language('fra')  # an ISO 639-2 code for French |  | ||||||
|     'fr' |     'fr' | ||||||
|     >>> find_matching_language('iw')  # obsolete alias for Hebrew |     >>> find_matching_language('fre')  # ISO 639-2/B code for French | ||||||
|  |     'fr' | ||||||
|  |     >>> find_matching_language('iw')  # Obsolete ISO 639-1 code for Hebrew | ||||||
|     'he' |     'he' | ||||||
|     >>> find_matching_language('no')  # Norwegian |     >>> find_matching_language('mo')  # Deprecated code for Moldavian | ||||||
|     'nb' |  | ||||||
|     >>> find_matching_language('mo')  # old code for ro-MD |  | ||||||
|     'ro' |     'ro' | ||||||
|     >>> find_matching_language('zh-Hans')  # Simplified Chinese |     >>> find_matching_language('scc')  # Deprecated ISO 639-2/B code for Serbian | ||||||
|     'zh' |     'sr' | ||||||
|     >>> find_matching_language('zxx') |     >>> find_matching_language('zxx') | ||||||
|     None |     None | ||||||
|     """ |     """ | ||||||
|     import spacy.lang  # noqa: F401 |     import spacy.lang  # noqa: F401 | ||||||
| 
 | 
 | ||||||
|     if lang == "xx": |     # Check aliases | ||||||
|         return "xx" |     for lang_code, aliases in LANG_ALIASES.items(): | ||||||
|  |         if lang in aliases: | ||||||
|  |             return lang_code | ||||||
| 
 | 
 | ||||||
|     # Find out which language modules we have |     return None | ||||||
|     possible_languages = [] |  | ||||||
|     for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore[attr-defined] |  | ||||||
|         code = modinfo.name |  | ||||||
|         if code == "xx": |  | ||||||
|             # Temporarily make 'xx' into a valid language code |  | ||||||
|             possible_languages.append("mul") |  | ||||||
|         elif langcodes.tag_is_valid(code): |  | ||||||
|             possible_languages.append(code) |  | ||||||
| 
 |  | ||||||
|     # Distances from 1-9 allow near misses like Bosnian -> Croatian and |  | ||||||
|     # Norwegian -> Norwegian Bokmål. A distance of 10 would include several |  | ||||||
|     # more possibilities, like variants of Chinese like 'wuu', but text that |  | ||||||
|     # is labeled that way is probably trying to be distinct from 'zh' and |  | ||||||
|     # shouldn't automatically match. |  | ||||||
|     match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) |  | ||||||
|     if match == "mul": |  | ||||||
|         # Convert 'mul' back to spaCy's 'xx' |  | ||||||
|         return "xx" |  | ||||||
|     else: |  | ||||||
|         return match |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_lang_class(lang: str) -> Type["Language"]: | def get_lang_class(lang: str) -> Type["Language"]: | ||||||
|     """Import and load a Language class. |     """Import and load a Language class. | ||||||
| 
 | 
 | ||||||
|     lang (str): IETF language code, such as 'en'. |     lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' or 'eng'. | ||||||
|     RETURNS (Language): Language class. |     RETURNS (Language): Language class. | ||||||
|     """ |     """ | ||||||
|     # Check if language is registered / entry point is available |     # Check if language is registered / entry point is available | ||||||
|  | @ -360,13 +411,9 @@ def get_lang_class(lang: str) -> Type["Language"]: | ||||||
|         try: |         try: | ||||||
|             module = importlib.import_module(f".lang.{lang}", "spacy") |             module = importlib.import_module(f".lang.{lang}", "spacy") | ||||||
|         except ImportError as err: |         except ImportError as err: | ||||||
|             # Find a matching language. For example, if the language 'no' is |             # Find a matching language. For example, if the language 'eng' is | ||||||
|             # requested, we can use language-matching to load `spacy.lang.nb`. |             # requested, we can use language-matching to load `spacy.lang.en`. | ||||||
|             try: |             match = find_matching_language(lang) | ||||||
|                 match = find_matching_language(lang) |  | ||||||
|             except langcodes.tag_parser.LanguageTagError: |  | ||||||
|                 # proceed to raising an import error |  | ||||||
|                 match = None |  | ||||||
| 
 | 
 | ||||||
|             if match: |             if match: | ||||||
|                 lang = match |                 lang = match | ||||||
|  |  | ||||||
|  | @ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | ||||||
| 
 | 
 | ||||||
| | Name               | Description                                                                                                                                                                                                                                                         | | | Name               | Description                                                                                                                                                                                                                                                         | | ||||||
| | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `lang`             | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~                                                                                                                                 | | | `lang`             | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~                                                                                                                                 | | ||||||
| | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | ||||||
| | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               | | | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               | | ||||||
| | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  | | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  | | ||||||
|  |  | ||||||
|  | @ -1078,7 +1078,7 @@ details. | ||||||
| | Name             | Description                                                                                                                                                                       | | | Name             | Description                                                                                                                                                                       | | ||||||
| | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~                                                                            | | | `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~                                                                            | | ||||||
| | `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~                                                                  | | | `lang`           | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng' for English. ~~str~~                                                                  | | ||||||
| | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | | | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | | ||||||
| 
 | 
 | ||||||
| ## Defaults {id="defaults"} | ## Defaults {id="defaults"} | ||||||
|  |  | ||||||
|  | @ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of | ||||||
| 
 | 
 | ||||||
| | Name                                | Description                                                                                                                                                        | | | Name                                | Description                                                                                                                                                        | | ||||||
| | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||||
| | `name`                              | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~                                | | | `name`                              | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng', of the language class to load. ~~str~~                                | | ||||||
| | _keyword-only_                      |                                                                                                                                                                    | | | _keyword-only_                      |                                                                                                                                                                    | | ||||||
| | `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                              | | | `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                              | | ||||||
| | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | | | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user