From 432db3d299bfad783a8dc91e97c5dfc6d81e45a1 Mon Sep 17 00:00:00 2001 From: thomashacker Date: Wed, 9 Nov 2022 13:47:16 +0100 Subject: [PATCH] Rename lang codes --- spacy/lang/{is => isl}/__init__.py | 2 +- spacy/lang/{is => isl}/stop_words.py | 0 spacy/lang/{xx => mul}/__init__.py | 4 +- spacy/lang/{xx => mul}/examples.py | 0 spacy/scorer.py | 2 +- spacy/tests/conftest.py | 10 ++-- spacy/tests/doc/test_doc_api.py | 2 +- spacy/tests/lang/{is => isl}/__init__.py | 0 spacy/tests/lang/{is => isl}/test_text.py | 8 +-- .../tests/lang/{is => isl}/test_tokenizer.py | 8 +-- spacy/tests/lang/{xx => mul}/__init__.py | 0 spacy/tests/lang/{xx => mul}/test_text.py | 4 +- .../tests/lang/{xx => mul}/test_tokenizer.py | 8 +-- spacy/tests/lang/test_initialize.py | 4 +- spacy/tests/pipeline/test_span_ruler.py | 52 +++++++++---------- spacy/tests/test_language.py | 8 +-- spacy/tests/tokenizer/test_explain.py | 2 +- .../training/converters/conll_ner_to_docs.py | 4 +- spacy/training/converters/json_to_docs.py | 2 +- spacy/util.py | 8 +-- website/docs/usage/models.md | 8 +-- website/meta/languages.json | 4 +- 22 files changed, 68 insertions(+), 72 deletions(-) rename spacy/lang/{is => isl}/__init__.py (93%) rename spacy/lang/{is => isl}/stop_words.py (100%) rename spacy/lang/{xx => mul}/__init__.py (67%) rename spacy/lang/{xx => mul}/examples.py (100%) rename spacy/tests/lang/{is => isl}/__init__.py (100%) rename spacy/tests/lang/{is => isl}/test_text.py (85%) rename spacy/tests/lang/{is => isl}/test_tokenizer.py (72%) rename spacy/tests/lang/{xx => mul}/__init__.py (100%) rename spacy/tests/lang/{xx => mul}/test_text.py (96%) rename spacy/tests/lang/{xx => mul}/test_tokenizer.py (68%) diff --git a/spacy/lang/is/__init__.py b/spacy/lang/isl/__init__.py similarity index 93% rename from spacy/lang/is/__init__.py rename to spacy/lang/isl/__init__.py index 318363beb..16d1f7957 100644 --- a/spacy/lang/is/__init__.py +++ b/spacy/lang/isl/__init__.py @@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults): class Icelandic(Language): - lang = "is" + lang = "isl" Defaults = IcelandicDefaults diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/isl/stop_words.py similarity index 100% rename from spacy/lang/is/stop_words.py rename to spacy/lang/isl/stop_words.py diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/mul/__init__.py similarity index 67% rename from spacy/lang/xx/__init__.py rename to spacy/lang/mul/__init__.py index aff8403ff..5170f1e86 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/mul/__init__.py @@ -3,10 +3,10 @@ from ...language import Language class MultiLanguage(Language): """Language class to be used for models that support multiple languages. - This module allows models to specify their language ID as 'xx'. + This module allows models to specify their language ID as 'mul'. 
""" - lang = "xx" + lang = "mul" __all__ = ["MultiLanguage"] diff --git a/spacy/lang/xx/examples.py b/spacy/lang/mul/examples.py similarity index 100% rename from spacy/lang/xx/examples.py rename to spacy/lang/mul/examples.py diff --git a/spacy/scorer.py b/spacy/scorer.py index 16fc303a0..5902ab0da 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -104,7 +104,7 @@ class Scorer: def __init__( self, nlp: Optional["Language"] = None, - default_lang: str = "xx", + default_lang: str = "mul", default_pipeline: Iterable[str] = DEFAULT_PIPELINE, **cfg, ) -> None: diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b1dc77ef0..65b376cda 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -52,7 +52,7 @@ def pytest_runtest_setup(item): @pytest.fixture(scope="module") def tokenizer(): - return get_lang_class("xx")().tokenizer + return get_lang_class("mul")().tokenizer @pytest.fixture(scope="session") @@ -212,8 +212,8 @@ def id_tokenizer(): @pytest.fixture(scope="session") -def is_tokenizer(): - return get_lang_class("is")().tokenizer +def isl_tokenizer(): + return get_lang_class("isl")().tokenizer @pytest.fixture(scope="session") @@ -465,8 +465,8 @@ def vi_tokenizer(): @pytest.fixture(scope="session") -def xx_tokenizer(): - return get_lang_class("xx")().tokenizer +def mul_tokenizer(): + return get_lang_class("mul")().tokenizer @pytest.fixture(scope="session") diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 38003dea9..06ef96b84 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -9,7 +9,7 @@ from thinc.api import NumpyOps, get_current_ops from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS from spacy.attrs import SENT_START, TAG from spacy.lang.en import English -from spacy.lang.xx import MultiLanguage +from spacy.lang.mul import MultiLanguage from spacy.language import Language from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, SpanGroup, Token diff --git a/spacy/tests/lang/is/__init__.py b/spacy/tests/lang/isl/__init__.py similarity index 100% rename from spacy/tests/lang/is/__init__.py rename to spacy/tests/lang/isl/__init__.py diff --git a/spacy/tests/lang/is/test_text.py b/spacy/tests/lang/isl/test_text.py similarity index 85% rename from spacy/tests/lang/is/test_text.py rename to spacy/tests/lang/isl/test_text.py index 6e3654a6e..9e177485d 100644 --- a/spacy/tests/lang/is/test_text.py +++ b/spacy/tests/lang/isl/test_text.py @@ -1,7 +1,7 @@ import pytest -def test_long_text(is_tokenizer): +def test_long_text(isl_tokenizer): # Excerpt: European Convention on Human Rights text = """ hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja @@ -15,12 +15,12 @@ réttlætis og friðar í heiminum og best er tryggt, annars vegar með virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins; """ - tokens = is_tokenizer(text) + tokens = isl_tokenizer(text) assert len(tokens) == 120 @pytest.mark.xfail -def test_ordinal_number(is_tokenizer): +def test_ordinal_number(isl_tokenizer): text = "10. 
desember 1948" - tokens = is_tokenizer(text) + tokens = isl_tokenizer(text) assert len(tokens) == 3 diff --git a/spacy/tests/lang/is/test_tokenizer.py b/spacy/tests/lang/isl/test_tokenizer.py similarity index 72% rename from spacy/tests/lang/is/test_tokenizer.py rename to spacy/tests/lang/isl/test_tokenizer.py index 0c05a6050..ba534aaf6 100644 --- a/spacy/tests/lang/is/test_tokenizer.py +++ b/spacy/tests/lang/isl/test_tokenizer.py @@ -1,6 +1,6 @@ import pytest -IS_BASIC_TOKENIZATION_TESTS = [ +ISL_BASIC_TOKENIZATION_TESTS = [ ( "Enginn maður skal sæta pyndingum eða ómannlegri eða " "vanvirðandi meðferð eða refsingu. ", @@ -23,8 +23,8 @@ IS_BASIC_TOKENIZATION_TESTS = [ ] -@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS) -def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens): - tokens = is_tokenizer(text) +@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS) +def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens): + tokens = isl_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list diff --git a/spacy/tests/lang/xx/__init__.py b/spacy/tests/lang/mul/__init__.py similarity index 100% rename from spacy/tests/lang/xx/__init__.py rename to spacy/tests/lang/mul/__init__.py diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/mul/test_text.py similarity index 96% rename from spacy/tests/lang/xx/test_text.py rename to spacy/tests/lang/mul/test_text.py index 477f0ebe2..6e4262d66 100644 --- a/spacy/tests/lang/xx/test_text.py +++ b/spacy/tests/lang/mul/test_text.py @@ -1,7 +1,7 @@ import pytest -def test_long_text(xx_tokenizer): +def test_long_text(mul_tokenizer): # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi text = """ Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest. @@ -20,5 +20,5 @@ vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuu Sääʹmteʹǧǧ. """ - tokens = xx_tokenizer(text) + tokens = mul_tokenizer(text) assert len(tokens) == 179 diff --git a/spacy/tests/lang/xx/test_tokenizer.py b/spacy/tests/lang/mul/test_tokenizer.py similarity index 68% rename from spacy/tests/lang/xx/test_tokenizer.py rename to spacy/tests/lang/mul/test_tokenizer.py index 15c760a6b..3d06dc11c 100644 --- a/spacy/tests/lang/xx/test_tokenizer.py +++ b/spacy/tests/lang/mul/test_tokenizer.py @@ -1,6 +1,6 @@ import pytest -XX_BASIC_TOKENIZATION_TESTS = [ +MUL_BASIC_TOKENIZATION_TESTS = [ ( "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. 
Seeʹst pâʹjjel", [ @@ -18,8 +18,8 @@ XX_BASIC_TOKENIZATION_TESTS = [ ] -@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS) -def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens): - tokens = xx_tokenizer(text) +@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS) +def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens): + tokens = mul_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 36f4a75e0..df76307f1 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -7,10 +7,10 @@ from spacy.util import get_lang_class # excluded: ja, ko, th, vi, zh LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi", - "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv", + "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv", "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", - "tr", "tt", "uk", "ur", "xx", "yo"] + "tr", "tt", "uk", "ur", "mul", "yo"] # fmt: on diff --git a/spacy/tests/pipeline/test_span_ruler.py b/spacy/tests/pipeline/test_span_ruler.py index 794815359..fe3bdd1bf 100644 --- a/spacy/tests/pipeline/test_span_ruler.py +++ b/spacy/tests/pipeline/test_span_ruler.py @@ -47,7 +47,7 @@ def person_org_date_patterns(person_org_patterns): def test_span_ruler_add_empty(patterns): """Test that patterns don't get added excessively.""" - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"validate": True}) ruler.add_patterns(patterns) pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) @@ -58,7 +58,7 @@ def test_span_ruler_add_empty(patterns): def test_span_ruler_init(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler) == len(patterns) @@ -74,7 +74,7 @@ def test_span_ruler_init(patterns): def test_span_ruler_no_patterns_warns(): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") assert len(ruler) == 0 assert len(ruler.labels) == 0 @@ -86,7 +86,7 @@ def test_span_ruler_no_patterns_warns(): def test_span_ruler_init_patterns(patterns): # initialize with patterns - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") assert len(ruler.labels) == 0 ruler.initialize(lambda: [], patterns=patterns) @@ -110,7 +110,7 @@ def test_span_ruler_init_patterns(patterns): def test_span_ruler_init_clear(patterns): """Test that initialization clears patterns.""" - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 @@ -119,7 +119,7 @@ def test_span_ruler_init_clear(patterns): def test_span_ruler_clear(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 @@ -133,7 +133,7 @@ def test_span_ruler_clear(patterns): def test_span_ruler_existing(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"overwrite": False}) ruler.add_patterns(patterns) doc = nlp.make_doc("OH HELLO WORLD bye bye") @@ -148,7 
+148,7 @@ def test_span_ruler_existing(patterns): def test_span_ruler_existing_overwrite(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"overwrite": True}) ruler.add_patterns(patterns) doc = nlp.make_doc("OH HELLO WORLD bye bye") @@ -161,13 +161,13 @@ def test_span_ruler_existing_overwrite(patterns): def test_span_ruler_serialize_bytes(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 ruler_bytes = ruler.to_bytes() - new_nlp = spacy.blank("xx") + new_nlp = spacy.blank("mul") new_ruler = new_nlp.add_pipe("span_ruler") assert len(new_ruler) == 0 assert len(new_ruler.labels) == 0 @@ -181,7 +181,7 @@ def test_span_ruler_serialize_bytes(patterns): def test_span_ruler_validate(): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") validated_ruler = nlp.add_pipe( "span_ruler", name="validated_span_ruler", config={"validate": True} @@ -203,14 +203,14 @@ def test_span_ruler_validate(): def test_span_ruler_properties(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"overwrite": True}) ruler.add_patterns(patterns) assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns])) def test_span_ruler_overlapping_spans(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(overlapping_patterns) doc = ruler(nlp.make_doc("foo bar baz")) @@ -220,7 +220,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns): def test_span_ruler_scorer(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(overlapping_patterns) text = "foo bar baz" @@ -243,7 +243,7 @@ def test_span_ruler_multiprocessing(n_process): patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}] - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) @@ -253,7 +253,7 @@ def test_span_ruler_multiprocessing(n_process): def test_span_ruler_serialize_dir(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) with make_tempdir() as d: @@ -264,7 +264,7 @@ def test_span_ruler_serialize_dir(patterns): def test_span_ruler_remove_basic(person_org_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_patterns) doc = ruler(nlp.make_doc("Dina went to school")) @@ -279,7 +279,7 @@ def test_span_ruler_remove_basic(person_org_patterns): def test_span_ruler_remove_nonexisting_pattern(person_org_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_patterns) assert len(ruler.patterns) == 3 @@ -290,7 +290,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns): def test_span_ruler_remove_several_patterns(person_org_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_patterns) doc = ruler(nlp.make_doc("Dina founded the company ACME.")) @@ -314,7 +314,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns): def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns): - nlp = spacy.blank("xx") + nlp = 
spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_date_patterns) doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th")) @@ -332,7 +332,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns): def test_span_ruler_remove_all_patterns(person_org_date_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_date_patterns) assert len(ruler.patterns) == 4 @@ -348,7 +348,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns): def test_span_ruler_remove_and_add(): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") patterns1 = [{"label": "DATE1", "pattern": "last time"}] ruler.add_patterns(patterns1) @@ -404,7 +404,7 @@ def test_span_ruler_remove_and_add(): def test_span_ruler_spans_filter(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe( "span_ruler", config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}}, @@ -416,7 +416,7 @@ def test_span_ruler_spans_filter(overlapping_patterns): def test_span_ruler_ents_default_filter(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True}) ruler.add_patterns(overlapping_patterns) doc = ruler(nlp.make_doc("foo bar baz")) @@ -425,7 +425,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns): def test_span_ruler_ents_overwrite_filter(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe( "span_ruler", config={ @@ -452,7 +452,7 @@ def test_span_ruler_ents_bad_filter(overlapping_patterns): return pass_through_filter - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe( "span_ruler", config={ diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 03a98d32f..c8f477a0b 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -552,10 +552,10 @@ def test_spacy_blank(): ("fre", "fr"), ("iw", "he"), ("mo", "ro"), - ("mul", "xx"), ("no", "nb"), + ("is", "isl"), ("pt-BR", "pt"), - ("xx", "xx"), + ("xx", "mul"), ("zh-Hans", "zh"), ("zh-Hant", None), ("zxx", None), @@ -577,10 +577,10 @@ def test_language_matching(lang, target): ("fre", "fr"), ("iw", "he"), ("mo", "ro"), - ("mul", "xx"), + ("is", "isl"), + ("xx", "mul"), ("no", "nb"), ("pt-BR", "pt"), - ("xx", "xx"), ("zh-Hans", "zh"), ], ) diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 5b4eeca16..187c7a9c1 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -10,7 +10,6 @@ from spacy.tokenizer import Tokenizer from spacy.util import get_lang_class # Only include languages with no external dependencies -# "is" seems to confuse importlib, so we're also excluding it for now # excluded: ja, ru, th, uk, vi, zh, is LANGUAGES = [ pytest.param("fr", marks=pytest.mark.slow()), @@ -36,6 +35,7 @@ LANGUAGES = [ "hu", pytest.param("id", marks=pytest.mark.slow()), pytest.param("it", marks=pytest.mark.slow()), + pytest.param("isl", marks=pytest.mark.slow()), pytest.param("kn", marks=pytest.mark.slow()), pytest.param("lb", marks=pytest.mark.slow()), pytest.param("lt", marks=pytest.mark.slow()), diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index 28b21c5f0..259f5fa8c 100644 --- a/spacy/training/converters/conll_ner_to_docs.py 
+++ b/spacy/training/converters/conll_ner_to_docs.py @@ -86,7 +86,7 @@ def conll_ner_to_docs( if model: nlp = load_model(model) else: - nlp = get_lang_class("xx")() + nlp = get_lang_class("mul")() for conll_doc in input_data.strip().split(doc_delimiter): conll_doc = conll_doc.strip() if not conll_doc: @@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None): "Segmenting sentences with sentencizer. (Use `-b model` for " "improved parser-based sentence segmentation.)" ) - nlp = get_lang_class("xx")() + nlp = get_lang_class("mul")() sentencizer = nlp.create_pipe("sentencizer") lines = doc.strip().split("\n") words = [line.strip().split()[0] for line in lines] diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py index 4123839f2..1ff7a64e0 100644 --- a/spacy/training/converters/json_to_docs.py +++ b/spacy/training/converters/json_to_docs.py @@ -3,7 +3,7 @@ from ..gold_io import json_iterate, json_to_annotations from ..example import annotations_to_doc from ..example import _fix_legacy_dict_data, _parse_example_dict_data from ...util import load_model -from ...lang.xx import MultiLanguage +from ...lang.mul import MultiLanguage def json_to_docs(input_data, model=None, **kwargs): diff --git a/spacy/util.py b/spacy/util.py index 809bc1814..7a0b28665 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -282,7 +282,7 @@ def find_matching_language(lang: str) -> Optional[str]: import spacy.lang # noqa: F401 if lang == "xx": - return "xx" + return "mul" # Find out which language modules we have possible_languages = [] @@ -300,11 +300,7 @@ def find_matching_language(lang: str) -> Optional[str]: # is labeled that way is probably trying to be distinct from 'zh' and # shouldn't automatically match. match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) - if match == "mul": - # Convert 'mul' back to spaCy's 'xx' - return "xx" - else: - return match + return match def get_lang_class(lang: str) -> Type["Language"]: diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 3b1558bd8..8e02ddd74 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -74,11 +74,11 @@ import Languages from 'widgets/languages.js' > ```python > # Standard import -> from spacy.lang.xx import MultiLanguage +> from spacy.lang.mul import MultiLanguage > nlp = MultiLanguage() > > # With lazy-loading -> nlp = spacy.blank("xx") +> nlp = spacy.blank("mul") > ``` spaCy also supports pipelines trained on more than one language. This is @@ -88,9 +88,9 @@ generic subclass containing only the base language data, can be found in [`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx). To train a pipeline using the neutral multi-language class, you can set -`lang = "xx"` in your [training config](/usage/training#config). You can also +`lang = "mul"` in your [training config](/usage/training#config). You can also import the `MultiLanguage` class directly, or call -[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading. +[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading. 
### Chinese language support {#chinese new="2.3"} diff --git a/website/meta/languages.json b/website/meta/languages.json index bd1535c90..33c0fb712 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -207,7 +207,7 @@ "has_examples": true }, { - "code": "is", + "code": "isl", "name": "Icelandic" }, { @@ -530,7 +530,7 @@ ] }, { - "code": "xx", + "code": "mul", "name": "Multi-language", "models": [ "xx_ent_wiki_sm",
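
---

A minimal sketch of the user-visible behavior after this rename, for anyone reviewing the patch above. It assumes a spaCy checkout with this patch applied (released wheels without it still register the old `"xx"` and `"is"` codes); the assertions mirror the updated `find_matching_language` logic in `spacy/util.py` and the parametrized cases in `spacy/tests/test_language.py`.

```python
# Sketch only: assumes a spaCy build that includes this patch.
import spacy
from spacy.util import find_matching_language

# The multi-language class is now registered under the IETF code "mul"
# (spacy.lang.mul) instead of spaCy's custom code "xx".
nlp = spacy.blank("mul")
assert nlp.lang == "mul"

# Icelandic moved from the two-letter code "is" to the three-letter "isl".
assert spacy.blank("isl").lang == "isl"

# The old codes still resolve to the new ones, per the updated tests
# in spacy/tests/test_language.py.
assert find_matching_language("xx") == "mul"
assert find_matching_language("is") == "isl"
```

Note that `find_matching_language` no longer special-cases `mul` back to `xx`: the post-patch code returns the `langcodes.closest_supported_match` result directly, so `mul` is now a first-class language code throughout.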