Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-05 04:40:20 +03:00)

Rename lang codes

parent d0fc871a1c
commit 432db3d299
@@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):


 class Icelandic(Language):
-    lang = "is"
+    lang = "isl"
     Defaults = IcelandicDefaults

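The effect of this rename on user code, as a minimal sketch (the handling of the old `is` code is inferred from the language-matching tests further down in this diff):

```python
import spacy
from spacy.util import get_lang_class

# The Icelandic class now registers under "isl" instead of "is"
nlp = get_lang_class("isl")()
assert nlp.lang == "isl"

# Blank pipelines use the new code; per the matching tests below,
# the two-letter IANA code "is" is resolved to "isl" as well.
nlp = spacy.blank("isl")
```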
@@ -3,10 +3,10 @@ from ...language import Language

 class MultiLanguage(Language):
     """Language class to be used for models that support multiple languages.

-    This module allows models to specify their language ID as 'xx'.
+    This module allows models to specify their language ID as 'mul'.
     """

-    lang = "xx"
+    lang = "mul"


 __all__ = ["MultiLanguage"]
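The corresponding usage change is spelled out in the website docs later in this diff; in short:

```python
import spacy
from spacy.lang.mul import MultiLanguage  # previously spacy.lang.xx

# Standard import
nlp = MultiLanguage()

# With lazy-loading, via the renamed language code
nlp = spacy.blank("mul")  # previously spacy.blank("xx")
```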
@@ -104,7 +104,7 @@ class Scorer:
     def __init__(
         self,
         nlp: Optional["Language"] = None,
-        default_lang: str = "xx",
+        default_lang: str = "mul",
         default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
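Only the signature is shown in this hunk; presumably `default_lang` selects the blank pipeline the `Scorer` builds when no `nlp` is passed, so the default fallback simply moves from `"xx"` to `"mul"`. A hedged sketch:

```python
from spacy.scorer import Scorer

# Uses the default pipeline built from the "mul" blank language (formerly "xx")
scorer = Scorer()

# The fallback language can still be set explicitly
scorer_isl = Scorer(default_lang="isl")
```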
@@ -52,7 +52,7 @@ def pytest_runtest_setup(item):

 @pytest.fixture(scope="module")
 def tokenizer():
-    return get_lang_class("xx")().tokenizer
+    return get_lang_class("mul")().tokenizer


 @pytest.fixture(scope="session")


@@ -212,8 +212,8 @@ def id_tokenizer():


 @pytest.fixture(scope="session")
-def is_tokenizer():
-    return get_lang_class("is")().tokenizer
+def isl_tokenizer():
+    return get_lang_class("isl")().tokenizer


 @pytest.fixture(scope="session")


@@ -465,8 +465,8 @@ def vi_tokenizer():


 @pytest.fixture(scope="session")
-def xx_tokenizer():
-    return get_lang_class("xx")().tokenizer
+def mul_tokenizer():
+    return get_lang_class("mul")().tokenizer


 @pytest.fixture(scope="session")
@@ -9,7 +9,7 @@ from thinc.api import NumpyOps, get_current_ops
 from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
 from spacy.attrs import SENT_START, TAG
 from spacy.lang.en import English
-from spacy.lang.xx import MultiLanguage
+from spacy.lang.mul import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
@@ -1,7 +1,7 @@
 import pytest


-def test_long_text(is_tokenizer):
+def test_long_text(isl_tokenizer):
     # Excerpt: European Convention on Human Rights
     text = """
 hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja


@@ -15,12 +15,12 @@ réttlætis og friðar í heiminum og best er tryggt, annars vegar með
 virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
 og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
 """
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 120


 @pytest.mark.xfail
-def test_ordinal_number(is_tokenizer):
+def test_ordinal_number(isl_tokenizer):
     text = "10. desember 1948"
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 3
@@ -1,6 +1,6 @@
 import pytest

-IS_BASIC_TOKENIZATION_TESTS = [
+ISL_BASIC_TOKENIZATION_TESTS = [
     (
         "Enginn maður skal sæta pyndingum eða ómannlegri eða "
         "vanvirðandi meðferð eða refsingu. ",


@@ -23,8 +23,8 @@ IS_BASIC_TOKENIZATION_TESTS = [
 ]


-@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
-def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
-    tokens = is_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS)
+def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens):
+    tokens = isl_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
@@ -1,7 +1,7 @@
 import pytest


-def test_long_text(xx_tokenizer):
+def test_long_text(mul_tokenizer):
     # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
     text = """
 Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.


@@ -20,5 +20,5 @@ vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuu
 Sääʹmteʹǧǧ.
 """

-    tokens = xx_tokenizer(text)
+    tokens = mul_tokenizer(text)
     assert len(tokens) == 179
@@ -1,6 +1,6 @@
 import pytest

-XX_BASIC_TOKENIZATION_TESTS = [
+MUL_BASIC_TOKENIZATION_TESTS = [
     (
         "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
         [


@@ -18,8 +18,8 @@ XX_BASIC_TOKENIZATION_TESTS = [
 ]


-@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
-def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
-    tokens = xx_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
+def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
+    tokens = mul_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
@@ -7,10 +7,10 @@ from spacy.util import get_lang_class
 # excluded: ja, ko, th, vi, zh
 LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
-             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
+             "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
              "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "mul", "yo"]
 # fmt: on

@@ -47,7 +47,7 @@ def person_org_date_patterns(person_org_patterns):

 def test_span_ruler_add_empty(patterns):
     """Test that patterns don't get added excessively."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())


@@ -58,7 +58,7 @@ def test_span_ruler_add_empty(patterns):


 def test_span_ruler_init(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)


@@ -74,7 +74,7 @@ def test_span_ruler_init(patterns):


 def test_span_ruler_no_patterns_warns():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0


@@ -86,7 +86,7 @@ def test_span_ruler_no_patterns_warns():

 def test_span_ruler_init_patterns(patterns):
     # initialize with patterns
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)


@@ -110,7 +110,7 @@ def test_span_ruler_init_patterns(patterns):

 def test_span_ruler_init_clear(patterns):
     """Test that initialization clears patterns."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4


@@ -119,7 +119,7 @@ def test_span_ruler_init_clear(patterns):


 def test_span_ruler_clear(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4


@@ -133,7 +133,7 @@ def test_span_ruler_clear(patterns):


 def test_span_ruler_existing(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": False})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")


@@ -148,7 +148,7 @@ def test_span_ruler_existing(patterns):


 def test_span_ruler_existing_overwrite(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -161,13 +161,13 @@ def test_span_ruler_existing_overwrite(patterns):


 def test_span_ruler_serialize_bytes(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_nlp = spacy.blank("xx")
+    new_nlp = spacy.blank("mul")
     new_ruler = new_nlp.add_pipe("span_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0


@@ -181,7 +181,7 @@ def test_span_ruler_serialize_bytes(patterns):


 def test_span_ruler_validate():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     validated_ruler = nlp.add_pipe(
         "span_ruler", name="validated_span_ruler", config={"validate": True}


@@ -203,14 +203,14 @@ def test_span_ruler_validate():


 def test_span_ruler_properties(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns]))


 def test_span_ruler_overlapping_spans(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))


@@ -220,7 +220,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns):


 def test_span_ruler_scorer(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     text = "foo bar baz"


@@ -243,7 +243,7 @@ def test_span_ruler_multiprocessing(n_process):

     patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}]

-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)


@@ -253,7 +253,7 @@ def test_span_ruler_multiprocessing(n_process):


 def test_span_ruler_serialize_dir(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
@@ -264,7 +264,7 @@ def test_span_ruler_serialize_dir(patterns):


 def test_span_ruler_remove_basic(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina went to school"))


@@ -279,7 +279,7 @@ def test_span_ruler_remove_basic(person_org_patterns):


 def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     assert len(ruler.patterns) == 3


@@ -290,7 +290,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):


 def test_span_ruler_remove_several_patterns(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME."))


@@ -314,7 +314,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns):


 def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th"))


@@ -332,7 +332,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):


 def test_span_ruler_remove_all_patterns(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     assert len(ruler.patterns) == 4


@@ -348,7 +348,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns):


 def test_span_ruler_remove_and_add():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     patterns1 = [{"label": "DATE1", "pattern": "last time"}]
     ruler.add_patterns(patterns1)
@@ -404,7 +404,7 @@ def test_span_ruler_remove_and_add():


 def test_span_ruler_spans_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}},


@@ -416,7 +416,7 @@ def test_span_ruler_spans_filter(overlapping_patterns):


 def test_span_ruler_ents_default_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True})
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))


@@ -425,7 +425,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns):


 def test_span_ruler_ents_overwrite_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={


@@ -452,7 +452,7 @@ def test_span_ruler_ents_bad_filter(overlapping_patterns):

         return pass_through_filter

-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
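All of the `span_ruler` tests above share the same setup, only with `spacy.blank("mul")` in place of `spacy.blank("xx")`. A condensed sketch of that pattern (the pattern dict is taken from the multiprocessing test above; the `"ruler"` spans key is the component's default and is assumed here):

```python
import spacy

nlp = spacy.blank("mul")  # previously spacy.blank("xx")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns([{"label": "FASTFOOD", "pattern": "Pizza Hut"}])

doc = nlp("We ordered from Pizza Hut.")
# Matched spans land in doc.spans under the default "ruler" key
print([(span.text, span.label_) for span in doc.spans["ruler"]])
```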
@@ -552,10 +552,10 @@ def test_spacy_blank():
         ("fre", "fr"),
         ("iw", "he"),
         ("mo", "ro"),
-        ("mul", "xx"),
         ("no", "nb"),
+        ("is", "isl"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
+        ("xx", "mul"),
         ("zh-Hans", "zh"),
         ("zh-Hant", None),
         ("zxx", None),


@@ -577,10 +577,10 @@ def test_language_matching(lang, target):
         ("fre", "fr"),
         ("iw", "he"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("is", "isl"),
+        ("xx", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
         ("zh-Hans", "zh"),
     ],
 )
@@ -10,7 +10,6 @@ from spacy.tokenizer import Tokenizer
 from spacy.util import get_lang_class

 # Only include languages with no external dependencies
-# "is" seems to confuse importlib, so we're also excluding it for now
 # excluded: ja, ru, th, uk, vi, zh, is
 LANGUAGES = [
     pytest.param("fr", marks=pytest.mark.slow()),


@@ -36,6 +35,7 @@ LANGUAGES = [
     "hu",
     pytest.param("id", marks=pytest.mark.slow()),
     pytest.param("it", marks=pytest.mark.slow()),
+    pytest.param("isl", marks=pytest.mark.slow()),
     pytest.param("kn", marks=pytest.mark.slow()),
     pytest.param("lb", marks=pytest.mark.slow()),
     pytest.param("lt", marks=pytest.mark.slow()),
@@ -86,7 +86,7 @@ def conll_ner_to_docs(
     if model:
         nlp = load_model(model)
     else:
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
     for conll_doc in input_data.strip().split(doc_delimiter):
         conll_doc = conll_doc.strip()
         if not conll_doc:


@@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
         "Segmenting sentences with sentencizer. (Use `-b model` for "
         "improved parser-based sentence segmentation.)"
     )
-    nlp = get_lang_class("xx")()
+    nlp = get_lang_class("mul")()
     sentencizer = nlp.create_pipe("sentencizer")
     lines = doc.strip().split("\n")
     words = [line.strip().split()[0] for line in lines]
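Both converter code paths fall back to a blank pipeline when no `model` is supplied; a small sketch of that fallback after the rename (only the two renamed calls change, the surrounding converter logic is untouched):

```python
from spacy.util import get_lang_class

# Fallback used when no model is passed to the converter
nlp = get_lang_class("mul")()                 # previously get_lang_class("xx")()
sentencizer = nlp.create_pipe("sentencizer")  # sentence segmentation in segment_sents_and_docs
```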
@@ -3,7 +3,7 @@ from ..gold_io import json_iterate, json_to_annotations
 from ..example import annotations_to_doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
-from ...lang.xx import MultiLanguage
+from ...lang.mul import MultiLanguage


 def json_to_docs(input_data, model=None, **kwargs):
@@ -282,7 +282,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     import spacy.lang  # noqa: F401

     if lang == "xx":
-        return "xx"
+        return "mul"

     # Find out which language modules we have
     possible_languages = []


@@ -300,11 +300,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     # is labeled that way is probably trying to be distinct from 'zh' and
     # shouldn't automatically match.
     match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
-    if match == "mul":
-        # Convert 'mul' back to spaCy's 'xx'
-        return "xx"
-    else:
-        return match
+    return match


 def get_lang_class(lang: str) -> Type["Language"]:
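With the special-case conversion removed, `find_matching_language` can return `"mul"` directly. A sketch of the expected behavior, based on the parametrized matching tests earlier in this diff:

```python
from spacy.util import find_matching_language

assert find_matching_language("xx") == "mul"   # legacy code maps straight to "mul"
assert find_matching_language("is") == "isl"   # IANA code resolves to the renamed module
assert find_matching_language("zxx") is None   # "no linguistic content" still has no match
```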
@@ -74,11 +74,11 @@ import Languages from 'widgets/languages.js'

 > ```python
 > # Standard import
-> from spacy.lang.xx import MultiLanguage
+> from spacy.lang.mul import MultiLanguage
 > nlp = MultiLanguage()
 >
 > # With lazy-loading
-> nlp = spacy.blank("xx")
+> nlp = spacy.blank("mul")
 > ```

 spaCy also supports pipelines trained on more than one language. This is


@@ -88,9 +88,9 @@ generic subclass containing only the base language data, can be found in
 [`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx).

 To train a pipeline using the neutral multi-language class, you can set
-`lang = "xx"` in your [training config](/usage/training#config). You can also
+`lang = "mul"` in your [training config](/usage/training#config). You can also
 import the `MultiLanguage` class directly, or call
-[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
+[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading.

 ### Chinese language support {#chinese new="2.3"}
@@ -207,7 +207,7 @@
         "has_examples": true
     },
     {
-        "code": "is",
+        "code": "isl",
         "name": "Icelandic"
     },
     {


@@ -530,7 +530,7 @@
         ]
     },
     {
-        "code": "xx",
+        "code": "mul",
         "name": "Multi-language",
         "models": [
             "xx_ent_wiki_sm",