mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Remove dependency on langcodes
This commit is contained in:
		
							parent
							
								
									b3c46c315e
								
							
						
					
					
						commit
						5b6412e88b
					
				|  | @ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0 | |||
| tqdm>=4.38.0,<5.0.0 | ||||
| pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 | ||||
| jinja2 | ||||
| langcodes>=3.2.0,<4.0.0 | ||||
| # Official Python utilities | ||||
| setuptools | ||||
| packaging>=20.0 | ||||
|  |  | |||
|  | @ -65,7 +65,6 @@ install_requires = | |||
|     # Official Python utilities | ||||
|     setuptools | ||||
|     packaging>=20.0 | ||||
|     langcodes>=3.2.0,<4.0.0 | ||||
| 
 | ||||
| [options.entry_points] | ||||
| console_scripts = | ||||
|  |  | |||
|  | @ -143,7 +143,7 @@ class Language: | |||
| 
 | ||||
|     Defaults (class): Settings, data and factory methods for creating the `nlp` | ||||
|         object and processing pipeline. | ||||
|     lang (str): IETF language code, such as 'en'. | ||||
|     lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/language | ||||
|     """ | ||||
|  |  | |||
|  | @ -656,17 +656,12 @@ def test_spacy_blank(): | |||
| @pytest.mark.parametrize( | ||||
|     "lang,target", | ||||
|     [ | ||||
|         ("en", "en"), | ||||
|         ("fra", "fr"), | ||||
|         ("fre", "fr"), | ||||
|         ("iw", "he"), | ||||
|         ("mo", "ro"), | ||||
|         ("scc", "sr"), | ||||
|         ("mul", "xx"), | ||||
|         ("no", "nb"), | ||||
|         ("pt-BR", "pt"), | ||||
|         ("xx", "xx"), | ||||
|         ("zh-Hans", "zh"), | ||||
|         ("zh-Hant", None), | ||||
|         ("zxx", None), | ||||
|     ], | ||||
| ) | ||||
|  | @ -686,11 +681,9 @@ def test_language_matching(lang, target): | |||
|         ("fre", "fr"), | ||||
|         ("iw", "he"), | ||||
|         ("mo", "ro"), | ||||
|         ("scc", "sr"), | ||||
|         ("mul", "xx"), | ||||
|         ("no", "nb"), | ||||
|         ("pt-BR", "pt"), | ||||
|         ("xx", "xx"), | ||||
|         ("zh-Hans", "zh"), | ||||
|     ], | ||||
| ) | ||||
| def test_blank_languages(lang, target): | ||||
|  |  | |||
							
								
								
									
										143
									
								
								spacy/util.py
									
									
									
									
									
								
							
							
						
						
									
										143
									
								
								spacy/util.py
									
									
									
									
									
								
							|  | @ -5,7 +5,6 @@ import inspect | |||
| import itertools | ||||
| import logging | ||||
| import os | ||||
| import pkgutil | ||||
| import re | ||||
| import shlex | ||||
| import shutil | ||||
|  | @ -40,7 +39,6 @@ from typing import ( | |||
| ) | ||||
| 
 | ||||
| import catalogue | ||||
| import langcodes | ||||
| import numpy | ||||
| import srsly | ||||
| import thinc | ||||
|  | @ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt" | |||
| # Default order of sections in the config file. Not all sections needs to exist, | ||||
| # and additional sections are added at the end, in alphabetical order. | ||||
| CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] | ||||
| 
 | ||||
| LANG_ALIASES = { | ||||
|     "af": ["afr"], | ||||
|     "am": ["amh"], | ||||
|     "ar": ["ara"], | ||||
|     "az": ["aze"], | ||||
|     "bg": ["bul"], | ||||
|     "bn": ["ben"], | ||||
|     "bo": ["bod", "tib"], | ||||
|     "ca": ["cat"], | ||||
|     "cs": ["ces", "cze"], | ||||
|     "da": ["dan"], | ||||
|     "de": ["deu", "ger"], | ||||
|     "el": ["ell", "gre"], | ||||
|     "en": ["eng"], | ||||
|     "es": ["spa"], | ||||
|     "et": ["est"], | ||||
|     "eu": ["eus", "baq"], | ||||
|     "fa": ["fas", "per"], | ||||
|     "fi": ["fin"], | ||||
|     "fo": ["fao"], | ||||
|     "fr": ["fra", "fre"], | ||||
|     "ga": ["gle"], | ||||
|     "gd": ["gla"], | ||||
|     "gu": ["guj"], | ||||
|     "he": ["heb", "iw"], # "iw" is the obsolete ISO 639-1 code for Hebrew | ||||
|     "hi": ["hin"], | ||||
|     "hr": ["hrv", "scr"], # "scr" is the deprecated ISO 639-2/B code for Croatian | ||||
|     "hu": ["hun"], | ||||
|     "hy": ["hye"], | ||||
|     "id": ["ind", "in"], # "in" is the obsolete ISO 639-1 code for Indonesian | ||||
|     "is": ["isl", "ice"], | ||||
|     "it": ["ita"], | ||||
|     "ja": ["jpn"], | ||||
|     "kn": ["kan"], | ||||
|     "ko": ["kor"], | ||||
|     "ky": ["kir"], | ||||
|     "la": ["lat"], | ||||
|     "lb": ["ltz"], | ||||
|     "lg": ["lug"], | ||||
|     "lt": ["lit"], | ||||
|     "lv": ["lav"], | ||||
|     "mk": ["mkd", "mac"], | ||||
|     "ml": ["mal"], | ||||
|     "mr": ["mar"], | ||||
|     "ms": ["msa", "may"], | ||||
|     "nb": ["nob"], | ||||
|     "ne": ["nep"], | ||||
|     "nl": ["nld", "dut"], | ||||
|     "nn": ["nno"], | ||||
|     "pl": ["pol"], | ||||
|     "pt": ["por"], | ||||
|     "ro": ["ron", "rum", "mo", "mol"], # "rum" is the ISO 639-2/B code for Romanian; "mo" and "mol" are deprecated codes for Moldavian | ||||
|     "ru": ["rus"], | ||||
|     "sa": ["san"], | ||||
|     "si": ["sin"], | ||||
|     "sk": ["slk", "slo"], | ||||
|     "sl": ["slv"], | ||||
|     "sq": ["sqi", "alb"], | ||||
|     "sr": ["srp", "scc"], # "scc" is the deprecated ISO 639-2/B code for Serbian | ||||
|     "sv": ["swe"], | ||||
|     "ta": ["tam"], | ||||
|     "te": ["tel"], | ||||
|     "th": ["tha"], | ||||
|     "ti": ["tir"], | ||||
|     "tl": ["tgl"], | ||||
|     "tn": ["tsn"], | ||||
|     "tr": ["tur"], | ||||
|     "tt": ["tat"], | ||||
|     "uk": ["ukr"], | ||||
|     "ur": ["urd"], | ||||
|     "vi": ["vie"], | ||||
|     "yo": ["yor"], | ||||
|     "zh": ["zho", "chi"], | ||||
| 
 | ||||
|     "xx": ["mul"], | ||||
| } | ||||
| # fmt: on | ||||
| 
 | ||||
| logger = logging.getLogger("spacy") | ||||
|  | @ -293,63 +368,39 @@ def lang_class_is_loaded(lang: str) -> bool: | |||
| 
 | ||||
| def find_matching_language(lang: str) -> Optional[str]: | ||||
|     """ | ||||
|     Given an IETF language code, find a supported spaCy language that is a | ||||
|     close match for it (according to Unicode CLDR language-matching rules). | ||||
|     This allows for language aliases, ISO 639-2 codes, more detailed language | ||||
|     tags, and close matches. | ||||
|     Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code, | ||||
|     find a supported spaCy language. | ||||
| 
 | ||||
|     Returns the language code if a matching language is available, or None | ||||
|     if there is no matching language. | ||||
| 
 | ||||
|     >>> find_matching_language('en') | ||||
|     'en' | ||||
|     >>> find_matching_language('pt-BR')  # Brazilian Portuguese | ||||
|     'pt' | ||||
|     >>> find_matching_language('fra')  # an ISO 639-2 code for French | ||||
|     >>> find_matching_language('fra')  # ISO 639-3 code for French | ||||
|     'fr' | ||||
|     >>> find_matching_language('iw')  # obsolete alias for Hebrew | ||||
|     >>> find_matching_language('fre')  # ISO 639-2/B code for French | ||||
|     'fr' | ||||
|     >>> find_matching_language('iw')  # Obsolete ISO 639-1 code for Hebrew | ||||
|     'he' | ||||
|     >>> find_matching_language('no')  # Norwegian | ||||
|     'nb' | ||||
|     >>> find_matching_language('mo')  # old code for ro-MD | ||||
|     >>> find_matching_language('mo')  # Deprecated code for Moldavian | ||||
|     'ro' | ||||
|     >>> find_matching_language('zh-Hans')  # Simplified Chinese | ||||
|     'zh' | ||||
|     >>> find_matching_language('scc')  # Deprecated ISO 639-2/B code for Serbian | ||||
|     'sr' | ||||
|     >>> find_matching_language('zxx') | ||||
|     None | ||||
|     """ | ||||
|     import spacy.lang  # noqa: F401 | ||||
| 
 | ||||
|     if lang == "xx": | ||||
|         return "xx" | ||||
|     # Check aliases | ||||
|     for lang_code, aliases in LANG_ALIASES.items(): | ||||
|         if lang in aliases: | ||||
|             return lang_code | ||||
| 
 | ||||
|     # Find out which language modules we have | ||||
|     possible_languages = [] | ||||
|     for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore[attr-defined] | ||||
|         code = modinfo.name | ||||
|         if code == "xx": | ||||
|             # Temporarily make 'xx' into a valid language code | ||||
|             possible_languages.append("mul") | ||||
|         elif langcodes.tag_is_valid(code): | ||||
|             possible_languages.append(code) | ||||
| 
 | ||||
|     # Distances from 1-9 allow near misses like Bosnian -> Croatian and | ||||
|     # Norwegian -> Norwegian Bokmål. A distance of 10 would include several | ||||
|     # more possibilities, like variants of Chinese like 'wuu', but text that | ||||
|     # is labeled that way is probably trying to be distinct from 'zh' and | ||||
|     # shouldn't automatically match. | ||||
|     match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) | ||||
|     if match == "mul": | ||||
|         # Convert 'mul' back to spaCy's 'xx' | ||||
|         return "xx" | ||||
|     else: | ||||
|         return match | ||||
|     return None | ||||
| 
 | ||||
| 
 | ||||
| def get_lang_class(lang: str) -> Type["Language"]: | ||||
|     """Import and load a Language class. | ||||
| 
 | ||||
|     lang (str): IETF language code, such as 'en'. | ||||
|     lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' and 'eng'. | ||||
|     RETURNS (Language): Language class. | ||||
|     """ | ||||
|     # Check if language is registered / entry point is available | ||||
|  | @ -360,13 +411,9 @@ def get_lang_class(lang: str) -> Type["Language"]: | |||
|         try: | ||||
|             module = importlib.import_module(f".lang.{lang}", "spacy") | ||||
|         except ImportError as err: | ||||
|             # Find a matching language. For example, if the language 'no' is | ||||
|             # requested, we can use language-matching to load `spacy.lang.nb`. | ||||
|             try: | ||||
|                 match = find_matching_language(lang) | ||||
|             except langcodes.tag_parser.LanguageTagError: | ||||
|                 # proceed to raising an import error | ||||
|                 match = None | ||||
|             # Find a matching language. For example, if the language 'eng' is | ||||
|             # requested, we can use language-matching to load `spacy.lang.en`. | ||||
|             match = find_matching_language(lang) | ||||
| 
 | ||||
|             if match: | ||||
|                 lang = match | ||||
|  |  | |||
|  | @ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | |||
| 
 | ||||
| | Name               | Description                                                                                                                                                                                                                                                         | | ||||
| | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `lang`             | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~                                                                                                                                 | | ||||
| | `lang`             | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~                                                                                                                                 | | ||||
| | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | ||||
| | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               | | ||||
| | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  | | ||||
|  |  | |||
|  | @ -1078,7 +1078,7 @@ details. | |||
| | Name             | Description                                                                                                                                                                       | | ||||
| | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~                                                                            | | ||||
| | `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~                                                                  | | ||||
| | `lang`           | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng' for English. ~~str~~                                                                  | | ||||
| | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | | ||||
| 
 | ||||
| ## Defaults {id="defaults"} | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of | |||
| 
 | ||||
| | Name                                | Description                                                                                                                                                        | | ||||
| | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `name`                              | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~                                | | ||||
| | `name`                              | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng', of the language class to load. ~~str~~                                | | ||||
| | _keyword-only_                      |                                                                                                                                                                    | | ||||
| | `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                              | | ||||
| | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user