Rename language codes (Icelandic, multi-language) (#12149)
* Init
* fix tests
* Update spacy/errors.py (Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>)
* Fix test_blank_languages
* Rename xx to mul in docs
* Format _util with black
* prettier formatting

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent: c6cca4c00a
commit: 360ccf628a
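For reference, a minimal sketch of what the rename means for user code, assuming spaCy v4 with this commit applied; the `spacy.blank` calls mirror the updated tests and docs, and the CLI error text is the one added in `_handle_renamed_language_codes` below:

    # Illustrative sketch only, not part of the commit: the deprecated codes
    # "xx" (multi-language) and "is" (Icelandic) become "mul" and "isl".
    import spacy

    nlp_multi = spacy.blank("mul")      # previously: spacy.blank("xx")
    nlp_icelandic = spacy.blank("isl")  # previously: spacy.blank("is")

    # CLI commands now fail fast when given a renamed code, e.g. (output paraphrased):
    #   $ python -m spacy init config config.cfg --lang xx
    #   Renamed language code: Language code 'xx' was replaced with 'mul' in
    #   spaCy v4. Update the language code from 'xx' to 'mul'.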
@@ -19,6 +19,7 @@ import os
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
 from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
+from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about

 if TYPE_CHECKING:
@@ -134,6 +135,16 @@ def _parse_override(value: Any) -> Any:
     return str(value)


+def _handle_renamed_language_codes(lang: Optional[str]) -> None:
+    # Throw error for renamed language codes in v4
+    if lang in RENAMED_LANGUAGE_CODES:
+        msg.fail(
+            title="Renamed language code",
+            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
+            exits=1,
+        )
+
+
 def load_project_config(
     path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
 ) -> Dict[str, Any]:
@@ -7,7 +7,7 @@ import re
 import sys
 import itertools

-from ._util import app, Arg, Opt, walk_directory
+from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
 from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@@ -112,6 +112,10 @@ def convert(
     input_path = Path(input_path)
     if not msg:
         msg = Printer(no_print=silent)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(input_path, converter):
@@ -12,7 +12,7 @@ from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
 from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code
+from ._util import string_to_list, import_code, _handle_renamed_language_codes


 ROOT = Path(__file__).parent / "templates"
@@ -43,7 +43,7 @@ class InitValues:
 def init_config_cli(
     # fmt: off
     output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
-    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
+    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
     pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
@@ -169,6 +169,10 @@ def init_config(
     msg = Printer(no_print=silent)
     with TEMPLATE_PATH.open("r") as f:
         template = Template(f.read())
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     # Filter out duplicates since tok2vec and transformer are added by template
     pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
     defaults = RECOMMENDATIONS["__default__"]
@@ -9,7 +9,7 @@ from .. import util
 from ..training.initialize import init_nlp, convert_vectors
 from ..language import Language
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
+from ._util import import_code, setup_gpu, _handle_renamed_language_codes


 @init_cli.command("vectors")
@@ -31,6 +31,10 @@ def init_vectors_cli(
     a model with vectors.
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
@@ -962,6 +962,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")

+RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

 # fmt: on

@@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):


 class Icelandic(Language):
-    lang = "is"
+    lang = "isl"
     Defaults = IcelandicDefaults


@@ -3,10 +3,10 @@ from ...language import Language

 class MultiLanguage(Language):
     """Language class to be used for models that support multiple languages.
-    This module allows models to specify their language ID as 'xx'.
+    This module allows models to specify their language ID as 'mul'.
     """

-    lang = "xx"
+    lang = "mul"


 __all__ = ["MultiLanguage"]
@@ -104,7 +104,7 @@ class Scorer:
     def __init__(
         self,
         nlp: Optional["Language"] = None,
-        default_lang: str = "xx",
+        default_lang: str = "mul",
         default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
@@ -86,7 +86,7 @@ These are the main fixtures that are currently available:

 | Fixture                             | Description                                                                    |
 | ----------------------------------- | ------------------------------------------------------------------------------ |
-| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `xx` language class.  |
+| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `mul` language class. |
 | `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer.                                     |
 | `en_vocab`                          | Creates an instance of the English `Vocab`.                                    |

@@ -83,7 +83,7 @@ def register_cython_tests(cython_mod_name: str, test_mod_name: str):

 @pytest.fixture(scope="module")
 def tokenizer():
-    return get_lang_class("xx")().tokenizer
+    return get_lang_class("mul")().tokenizer


 @pytest.fixture(scope="session")
@@ -243,8 +243,8 @@ def id_tokenizer():


 @pytest.fixture(scope="session")
-def is_tokenizer():
-    return get_lang_class("is")().tokenizer
+def isl_tokenizer():
+    return get_lang_class("isl")().tokenizer


 @pytest.fixture(scope="session")
@@ -496,8 +496,8 @@ def vi_tokenizer():


 @pytest.fixture(scope="session")
-def xx_tokenizer():
-    return get_lang_class("xx")().tokenizer
+def mul_tokenizer():
+    return get_lang_class("mul")().tokenizer


 @pytest.fixture(scope="session")
@@ -9,7 +9,7 @@ from thinc.api import NumpyOps, get_current_ops
 from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
 from spacy.attrs import SENT_START, TAG
 from spacy.lang.en import English
-from spacy.lang.xx import MultiLanguage
+from spacy.lang.mul import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
@@ -1,7 +1,7 @@
 import pytest


-def test_long_text(is_tokenizer):
+def test_long_text(isl_tokenizer):
     # Excerpt: European Convention on Human Rights
     text = """
 hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja
@@ -15,12 +15,12 @@ réttlætis og friðar í heiminum og best er tryggt, annars vegar með
 virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
 og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
 """
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 120


 @pytest.mark.xfail
-def test_ordinal_number(is_tokenizer):
+def test_ordinal_number(isl_tokenizer):
     text = "10. desember 1948"
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 3
@@ -1,6 +1,6 @@
 import pytest

-IS_BASIC_TOKENIZATION_TESTS = [
+ISL_BASIC_TOKENIZATION_TESTS = [
     (
         "Enginn maður skal sæta pyndingum eða ómannlegri eða "
         "vanvirðandi meðferð eða refsingu. ",
@@ -23,8 +23,8 @@ IS_BASIC_TOKENIZATION_TESTS = [
 ]


-@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
-def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
-    tokens = is_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS)
+def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens):
+    tokens = isl_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
@@ -1,7 +1,7 @@
 import pytest


-def test_long_text(xx_tokenizer):
+def test_long_text(mul_tokenizer):
     # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
     text = """
 Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.
@@ -20,5 +20,5 @@ vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuu
 Sääʹmteʹǧǧ.
 """

-    tokens = xx_tokenizer(text)
+    tokens = mul_tokenizer(text)
     assert len(tokens) == 179
@@ -1,6 +1,6 @@
 import pytest

-XX_BASIC_TOKENIZATION_TESTS = [
+MUL_BASIC_TOKENIZATION_TESTS = [
     (
         "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
         [
@@ -18,8 +18,8 @@ XX_BASIC_TOKENIZATION_TESTS = [
 ]


-@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
-def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
-    tokens = xx_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
+def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
+    tokens = mul_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
@@ -7,10 +7,10 @@ from spacy.util import get_lang_class
 # excluded: ja, ko, th, vi, zh
 LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
-             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
-             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
+             "mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "yo"]
 # fmt: on


@@ -47,7 +47,7 @@ def person_org_date_patterns(person_org_patterns):

 def test_span_ruler_add_empty(patterns):
     """Test that patterns don't get added excessively."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
@@ -58,7 +58,7 @@ def test_span_ruler_add_empty(patterns):


 def test_span_ruler_init(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
@@ -74,7 +74,7 @@ def test_span_ruler_init(patterns):


 def test_span_ruler_no_patterns_warns():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
@@ -86,7 +86,7 @@ def test_span_ruler_no_patterns_warns():

 def test_span_ruler_init_patterns(patterns):
     # initialize with patterns
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
@@ -110,7 +110,7 @@ def test_span_ruler_init_patterns(patterns):

 def test_span_ruler_init_clear(patterns):
     """Test that initialization clears patterns."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -119,7 +119,7 @@ def test_span_ruler_init_clear(patterns):


 def test_span_ruler_clear(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -133,7 +133,7 @@ def test_span_ruler_clear(patterns):


 def test_span_ruler_existing(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": False})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -148,7 +148,7 @@ def test_span_ruler_existing(patterns):


 def test_span_ruler_existing_overwrite(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -161,13 +161,13 @@ def test_span_ruler_existing_overwrite(patterns):


 def test_span_ruler_serialize_bytes(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_nlp = spacy.blank("xx")
+    new_nlp = spacy.blank("mul")
     new_ruler = new_nlp.add_pipe("span_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
@@ -181,7 +181,7 @@ def test_span_ruler_serialize_bytes(patterns):


 def test_span_ruler_validate():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     validated_ruler = nlp.add_pipe(
         "span_ruler", name="validated_span_ruler", config={"validate": True}
@@ -203,14 +203,14 @@ def test_span_ruler_validate():


 def test_span_ruler_properties(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns]))


 def test_span_ruler_overlapping_spans(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -220,7 +220,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns):


 def test_span_ruler_scorer(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     text = "foo bar baz"
@@ -243,7 +243,7 @@ def test_span_ruler_multiprocessing(n_process):

     patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}]

-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)

@@ -253,7 +253,7 @@ def test_span_ruler_multiprocessing(n_process):


 def test_span_ruler_serialize_dir(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
@@ -264,7 +264,7 @@ def test_span_ruler_serialize_dir(patterns):


 def test_span_ruler_remove_basic(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina went to school"))
@@ -279,7 +279,7 @@ def test_span_ruler_remove_basic(person_org_patterns):


 def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     assert len(ruler.patterns) == 3
@@ -290,7 +290,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):


 def test_span_ruler_remove_several_patterns(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME."))
@@ -314,7 +314,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns):


 def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th"))
@@ -332,7 +332,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):


 def test_span_ruler_remove_all_patterns(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     assert len(ruler.patterns) == 4
@@ -348,7 +348,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns):


 def test_span_ruler_remove_and_add():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     patterns1 = [{"label": "DATE1", "pattern": "last time"}]
     ruler.add_patterns(patterns1)
@@ -404,7 +404,7 @@ def test_span_ruler_remove_and_add():


 def test_span_ruler_spans_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}},
@@ -416,7 +416,7 @@ def test_span_ruler_spans_filter(overlapping_patterns):


 def test_span_ruler_ents_default_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True})
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -425,7 +425,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns):


 def test_span_ruler_ents_overwrite_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
@@ -452,7 +452,7 @@ def test_span_ruler_ents_bad_filter(overlapping_patterns):

         return pass_through_filter

-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
@@ -664,11 +664,12 @@ def test_spacy_blank():
         ("fra", "fr"),
         ("fre", "fr"),
         ("iw", "he"),
+        ("is", "isl"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("mul", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
+        ("xx", "mul"),
         ("zh-Hans", "zh"),
         ("zh-Hant", None),
         ("zxx", None),
@@ -689,11 +690,11 @@ def test_language_matching(lang, target):
         ("fra", "fr"),
         ("fre", "fr"),
         ("iw", "he"),
+        ("is", "isl"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("xx", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
         ("zh-Hans", "zh"),
     ],
 )
@@ -36,6 +36,7 @@ LANGUAGES = [
     "hu",
     pytest.param("id", marks=pytest.mark.slow()),
     pytest.param("it", marks=pytest.mark.slow()),
+    pytest.param("isl", marks=pytest.mark.slow()),
     pytest.param("kn", marks=pytest.mark.slow()),
     pytest.param("lb", marks=pytest.mark.slow()),
     pytest.param("lt", marks=pytest.mark.slow()),
@@ -86,7 +86,7 @@ def conll_ner_to_docs(
     if model:
         nlp = load_model(model)
     else:
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
     for conll_doc in input_data.strip().split(doc_delimiter):
         conll_doc = conll_doc.strip()
         if not conll_doc:
@@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
             "Segmenting sentences with sentencizer. (Use `-b model` for "
             "improved parser-based sentence segmentation.)"
         )
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
         sentencizer = nlp.create_pipe("sentencizer")
         lines = doc.strip().split("\n")
         words = [line.strip().split()[0] for line in lines]
@@ -3,7 +3,7 @@ from ..gold_io import json_iterate, json_to_annotations
 from ..example import annotations_to_doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
-from ...lang.xx import MultiLanguage
+from ...lang.mul import MultiLanguage


 def json_to_docs(input_data, model=None, **kwargs):
@@ -283,7 +283,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     import spacy.lang  # noqa: F401

     if lang == "xx":
-        return "xx"
+        return "mul"

     # Find out which language modules we have
     possible_languages = []
@@ -301,11 +301,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     # is labeled that way is probably trying to be distinct from 'zh' and
     # shouldn't automatically match.
     match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
-    if match == "mul":
-        # Convert 'mul' back to spaCy's 'xx'
-        return "xx"
-    else:
-        return match
+    return match


 def get_lang_class(lang: str) -> Type["Language"]:
@@ -30,7 +30,7 @@ Create a new `Scorer`.

 | Name               | Description                                                                                                                                                                                                                               |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `nlp`              | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ |
-| `default_lang`     | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~                                                                                                                                            |
+| `default_lang`     | The language to use for a default pipeline if `nlp` is not provided. Defaults to `mul`. ~~str~~                                                                                                                                           |
 | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~                                                     |
 | _keyword-only_     |                                                                                                                                                                                                                                           |
 | `\*\*kwargs`       | Any additional settings to pass on to the individual scoring methods. ~~Any~~                                                                                                                                                             |
@@ -74,23 +74,23 @@ your data.

 > ```python
 > # Standard import
-> from spacy.lang.xx import MultiLanguage
+> from spacy.lang.mul import MultiLanguage
 > nlp = MultiLanguage()
 >
 > # With lazy-loading
-> nlp = spacy.blank("xx")
+> nlp = spacy.blank("mul")
 > ```

 spaCy also supports pipelines trained on more than one language. This is
 especially useful for named entity recognition. The language ID used for
-multi-language or language-neutral pipelines is `xx`. The language class, a
+multi-language or language-neutral pipelines is `mul`. The language class, a
 generic subclass containing only the base language data, can be found in
-[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx).
+[`lang/mul`](%%GITHUB_SPACY/spacy/lang/mul).

 To train a pipeline using the neutral multi-language class, you can set
-`lang = "xx"` in your [training config](/usage/training#config). You can also
+`lang = "mul"` in your [training config](/usage/training#config). You can also
 \import the `MultiLanguage` class directly, or call
-[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
+[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading.

 ### Chinese language support {id="chinese",version="2.3"}

@@ -165,7 +165,7 @@
       "has_examples": true
     },
     {
-      "code": "is",
+      "code": "isl",
       "name": "Icelandic"
     },
     {
@@ -434,9 +434,9 @@
       ]
     },
     {
-      "code": "xx",
+      "code": "mul",
       "name": "Multi-language",
-      "models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
+      "models": ["mul_ent_wiki_sm", "mul_sent_ud_sm"],
       "example": "This is a sentence about Facebook."
     },
     {
@@ -103,7 +103,7 @@ const QuickstartInstall = ({ id, title, description, children }) => {
             </QS>
             <QS config="example" prompt="python">
                 print([
-                    {code === 'xx'
+                    {code === 'mul'
                         ? '(ent.text, ent.label) for ent in doc.ents'
                         : '(w.text, w.pos_) for w in doc'}
                 ])