Rename lang codes

thomashacker, 2022-11-09 13:47:16 +01:00
commit 432db3d299 (parent d0fc871a1c)
22 changed files with 68 additions and 72 deletions


@@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):
 class Icelandic(Language):
-    lang = "is"
+    lang = "isl"
     Defaults = IcelandicDefaults


@@ -3,10 +3,10 @@ from ...language import Language
 class MultiLanguage(Language):
     """Language class to be used for models that support multiple languages.
-    This module allows models to specify their language ID as 'xx'.
+    This module allows models to specify their language ID as 'mul'.
     """
-    lang = "xx"
+    lang = "mul"
 __all__ = ["MultiLanguage"]


@@ -104,7 +104,7 @@ class Scorer:
     def __init__(
         self,
         nlp: Optional["Language"] = None,
-        default_lang: str = "xx",
+        default_lang: str = "mul",
         default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
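When no `nlp` object is passed in, the `Scorer` builds a blank pipeline from `default_lang`, so the default scoring language quietly moves from `xx` to `mul` with this change. A sketch of the two equivalent calls (the fallback behaviour is inferred from the signature above, not shown in the hunk):

```python
from spacy.scorer import Scorer

# No nlp given: the scorer constructs a blank pipeline from default_lang,
# which is now "mul" rather than "xx".
scorer = Scorer()

# The same thing, spelled out explicitly:
scorer = Scorer(default_lang="mul")
```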


@@ -52,7 +52,7 @@ def pytest_runtest_setup(item):
 @pytest.fixture(scope="module")
 def tokenizer():
-    return get_lang_class("xx")().tokenizer
+    return get_lang_class("mul")().tokenizer
 @pytest.fixture(scope="session")
@@ -212,8 +212,8 @@ def id_tokenizer():
 @pytest.fixture(scope="session")
-def is_tokenizer():
-    return get_lang_class("is")().tokenizer
+def isl_tokenizer():
+    return get_lang_class("isl")().tokenizer
 @pytest.fixture(scope="session")
@@ -465,8 +465,8 @@ def vi_tokenizer():
 @pytest.fixture(scope="session")
-def xx_tokenizer():
-    return get_lang_class("xx")().tokenizer
+def mul_tokenizer():
+    return get_lang_class("mul")().tokenizer
 @pytest.fixture(scope="session")


@@ -9,7 +9,7 @@ from thinc.api import NumpyOps, get_current_ops
 from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
 from spacy.attrs import SENT_START, TAG
 from spacy.lang.en import English
-from spacy.lang.xx import MultiLanguage
+from spacy.lang.mul import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token


@@ -1,7 +1,7 @@
 import pytest
-def test_long_text(is_tokenizer):
+def test_long_text(isl_tokenizer):
     # Excerpt: European Convention on Human Rights
     text = """
 hafa í huga, yfirlýsing þessi hefur það markmið tryggja
@@ -15,12 +15,12 @@ réttlætis og friðar í heiminum og best er tryggt, annars vegar með
 virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
 og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
 """
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 120
 @pytest.mark.xfail
-def test_ordinal_number(is_tokenizer):
+def test_ordinal_number(isl_tokenizer):
     text = "10. desember 1948"
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 3


@@ -1,6 +1,6 @@
 import pytest
-IS_BASIC_TOKENIZATION_TESTS = [
+ISL_BASIC_TOKENIZATION_TESTS = [
     (
         "Enginn maður skal sæta pyndingum eða ómannlegri eða "
         "vanvirðandi meðferð eða refsingu. ",
@@ -23,8 +23,8 @@ IS_BASIC_TOKENIZATION_TESTS = [
 ]
-@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
-def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
-    tokens = is_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS)
+def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens):
+    tokens = isl_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list


@@ -1,7 +1,7 @@
 import pytest
-def test_long_text(xx_tokenizer):
+def test_long_text(mul_tokenizer):
     # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
     text = """
 Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.
@@ -20,5 +20,5 @@ vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuu
 Sääʹmteʹǧǧ.
 """
-    tokens = xx_tokenizer(text)
+    tokens = mul_tokenizer(text)
     assert len(tokens) == 179


@@ -1,6 +1,6 @@
 import pytest
-XX_BASIC_TOKENIZATION_TESTS = [
+MUL_BASIC_TOKENIZATION_TESTS = [
     (
         "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
         [
@@ -18,8 +18,8 @@ XX_BASIC_TOKENIZATION_TESTS = [
 ]
-@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
-def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
-    tokens = xx_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
+def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
+    tokens = mul_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list


@@ -7,10 +7,10 @@ from spacy.util import get_lang_class
 # excluded: ja, ko, th, vi, zh
 LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
-             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
+             "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
              "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "mul", "yo"]
 # fmt: on
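The test body that consumes this list is not part of the hunk; presumably it just instantiates each language class by its code, along the lines of this sketch (a hypothetical reconstruction, with the `capfd` print-capture check assumed from spaCy's initialization tests):

```python
import pytest
from spacy.util import get_lang_class

@pytest.mark.parametrize("lang", LANGUAGES)
def test_lang_initialize(lang, capfd):
    # Instantiating by code exercises the renamed "isl" and "mul" modules
    # through the same registry lookup as every other language.
    nlp = get_lang_class(lang)()
    doc = nlp("test")  # noqa: F401
    # Guard against stray print statements in language modules
    captured = capfd.readouterr()
    assert not captured.out
```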


@@ -47,7 +47,7 @@ def person_org_date_patterns(person_org_patterns):
 def test_span_ruler_add_empty(patterns):
     """Test that patterns don't get added excessively."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
@@ -58,7 +58,7 @@ def test_span_ruler_add_empty(patterns):
 def test_span_ruler_init(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
@@ -74,7 +74,7 @@ def test_span_ruler_init(patterns):
 def test_span_ruler_no_patterns_warns():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
@@ -86,7 +86,7 @@ def test_span_ruler_no_patterns_warns():
 def test_span_ruler_init_patterns(patterns):
     # initialize with patterns
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
@@ -110,7 +110,7 @@ def test_span_ruler_init_patterns(patterns):
 def test_span_ruler_init_clear(patterns):
     """Test that initialization clears patterns."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -119,7 +119,7 @@ def test_span_ruler_init_clear(patterns):
 def test_span_ruler_clear(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -133,7 +133,7 @@ def test_span_ruler_clear(patterns):
 def test_span_ruler_existing(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": False})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -148,7 +148,7 @@ def test_span_ruler_existing(patterns):
 def test_span_ruler_existing_overwrite(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -161,13 +161,13 @@ def test_span_ruler_existing_overwrite(patterns):
 def test_span_ruler_serialize_bytes(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_nlp = spacy.blank("xx")
+    new_nlp = spacy.blank("mul")
     new_ruler = new_nlp.add_pipe("span_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
@@ -181,7 +181,7 @@ def test_span_ruler_serialize_bytes(patterns):
 def test_span_ruler_validate():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     validated_ruler = nlp.add_pipe(
         "span_ruler", name="validated_span_ruler", config={"validate": True}
@@ -203,14 +203,14 @@ def test_span_ruler_validate():
 def test_span_ruler_properties(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns]))
 def test_span_ruler_overlapping_spans(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -220,7 +220,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns):
 def test_span_ruler_scorer(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     text = "foo bar baz"
@@ -243,7 +243,7 @@ def test_span_ruler_multiprocessing(n_process):
     patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}]
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
@@ -253,7 +253,7 @@ def test_span_ruler_multiprocessing(n_process):
 def test_span_ruler_serialize_dir(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
@@ -264,7 +264,7 @@ def test_span_ruler_serialize_dir(patterns):
 def test_span_ruler_remove_basic(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina went to school"))
@@ -279,7 +279,7 @@ def test_span_ruler_remove_basic(person_org_patterns):
 def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     assert len(ruler.patterns) == 3
@@ -290,7 +290,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
 def test_span_ruler_remove_several_patterns(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME."))
@@ -314,7 +314,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns):
 def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th"))
@@ -332,7 +332,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
 def test_span_ruler_remove_all_patterns(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     assert len(ruler.patterns) == 4
@@ -348,7 +348,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns):
 def test_span_ruler_remove_and_add():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     patterns1 = [{"label": "DATE1", "pattern": "last time"}]
     ruler.add_patterns(patterns1)
@@ -404,7 +404,7 @@ def test_span_ruler_remove_and_add():
 def test_span_ruler_spans_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}},
@@ -416,7 +416,7 @@ def test_span_ruler_spans_filter(overlapping_patterns):
 def test_span_ruler_ents_default_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True})
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -425,7 +425,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns):
 def test_span_ruler_ents_overwrite_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
@@ -452,7 +452,7 @@ def test_span_ruler_ents_bad_filter(overlapping_patterns):
         return pass_through_filter
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={


@@ -552,10 +552,10 @@ def test_spacy_blank():
         ("fre", "fr"),
         ("iw", "he"),
         ("mo", "ro"),
-        ("mul", "xx"),
         ("no", "nb"),
+        ("is", "isl"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
+        ("xx", "mul"),
         ("zh-Hans", "zh"),
         ("zh-Hant", None),
         ("zxx", None),
@@ -577,10 +577,10 @@ def test_language_matching(lang, target):
         ("fre", "fr"),
         ("iw", "he"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("is", "isl"),
+        ("xx", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
         ("zh-Hans", "zh"),
     ],
 )


@@ -10,7 +10,6 @@ from spacy.tokenizer import Tokenizer
 from spacy.util import get_lang_class
 # Only include languages with no external dependencies
-# "is" seems to confuse importlib, so we're also excluding it for now
 # excluded: ja, ru, th, uk, vi, zh, is
 LANGUAGES = [
     pytest.param("fr", marks=pytest.mark.slow()),
@@ -36,6 +35,7 @@ LANGUAGES = [
     "hu",
     pytest.param("id", marks=pytest.mark.slow()),
     pytest.param("it", marks=pytest.mark.slow()),
+    pytest.param("isl", marks=pytest.mark.slow()),
     pytest.param("kn", marks=pytest.mark.slow()),
     pytest.param("lb", marks=pytest.mark.slow()),
     pytest.param("lt", marks=pytest.mark.slow()),


@@ -86,7 +86,7 @@ def conll_ner_to_docs(
     if model:
         nlp = load_model(model)
     else:
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
     for conll_doc in input_data.strip().split(doc_delimiter):
         conll_doc = conll_doc.strip()
         if not conll_doc:
@@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
             "Segmenting sentences with sentencizer. (Use `-b model` for "
            "improved parser-based sentence segmentation.)"
         )
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
         sentencizer = nlp.create_pipe("sentencizer")
     lines = doc.strip().split("\n")
     words = [line.strip().split()[0] for line in lines]
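Both converter paths fall back to a blank multi-language pipeline when no model is supplied. A runnable sketch of that fallback under the new code, using the v3 `add_pipe` idiom rather than the converter's older `create_pipe` call (the sample text is illustrative):

```python
from spacy.util import get_lang_class

# Blank multi-language pipeline, now looked up as "mul" instead of "xx",
# plus a rule-based sentencizer for model-free sentence segmentation.
nlp = get_lang_class("mul")()
nlp.add_pipe("sentencizer")

doc = nlp("Dina founded ACME. She runs it today.")
print([sent.text for sent in doc.sents])
```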


@@ -3,7 +3,7 @@ from ..gold_io import json_iterate, json_to_annotations
 from ..example import annotations_to_doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
-from ...lang.xx import MultiLanguage
+from ...lang.mul import MultiLanguage
 def json_to_docs(input_data, model=None, **kwargs):


@@ -282,7 +282,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     import spacy.lang  # noqa: F401
     if lang == "xx":
-        return "xx"
+        return "mul"
     # Find out which language modules we have
     possible_languages = []
@@ -300,11 +300,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     # is labeled that way is probably trying to be distinct from 'zh' and
     # shouldn't automatically match.
     match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
-    if match == "mul":
-        # Convert 'mul' back to spaCy's 'xx'
-        return "xx"
-    else:
-        return match
+    return match
 def get_lang_class(lang: str) -> Type["Language"]:
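With the special case removed, `find_matching_language` passes langcodes' result straight through. The expected behaviour, taken from the parametrized cases in the test file above:

```python
from spacy.util import find_matching_language

assert find_matching_language("iw") == "he"    # deprecated code for Hebrew
assert find_matching_language("is") == "isl"   # Icelandic's module is now "isl"
assert find_matching_language("xx") == "mul"   # old spaCy code, special-cased above
assert find_matching_language("zxx") is None   # "no linguistic content"
```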


@@ -74,11 +74,11 @@ import Languages from 'widgets/languages.js'
 > ```python
 > # Standard import
-> from spacy.lang.xx import MultiLanguage
+> from spacy.lang.mul import MultiLanguage
 > nlp = MultiLanguage()
 >
 > # With lazy-loading
-> nlp = spacy.blank("xx")
+> nlp = spacy.blank("mul")
 > ```
 spaCy also supports pipelines trained on more than one language. This is
@@ -88,9 +88,9 @@ generic subclass containing only the base language data, can be found in
-[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx).
+[`lang/mul`](%%GITHUB_SPACY/spacy/lang/mul).
 To train a pipeline using the neutral multi-language class, you can set
-`lang = "xx"` in your [training config](/usage/training#config). You can also
+`lang = "mul"` in your [training config](/usage/training#config). You can also
 import the `MultiLanguage` class directly, or call
-[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
+[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading.
 ### Chinese language support {#chinese new="2.3"}


@@ -207,7 +207,7 @@
     {
-        "code": "is",
+        "code": "isl",
         "name": "Icelandic"
     },
     {
@@ -530,7 +530,7 @@
         ]
     },
     {
-        "code": "xx",
+        "code": "mul",
         "name": "Multi-language",
         "models": [
            "xx_ent_wiki_sm",