Mirror of https://github.com/explosion/spaCy.git
Synced: 2025-11-04 01:48:04 +03:00

Commit bd6353715a: Merge branch 'master' into fix/travis-tests

Changed hunks:

@@ -187,12 +187,17 @@ def debug_data(
         n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
         msg.warn(
             "{} words in training data without vectors ({:0.2f}%)".format(
-                n_missing_vectors,
-                n_missing_vectors / gold_train_data["n_words"],
+                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
             ),
         )
         msg.text(
-            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+            "10 most common words without vectors: {}".format(
+                _format_labels(
+                    gold_train_data["words_missing_vectors"].most_common(10),
+                    counts=True,
+                )
+            ),
+            show=verbose,
         )
     else:
         msg.info("No word vectors present in the model")

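The msg.warn / msg.text calls in this hunk come from the wasabi console printer that spaCy's CLI uses. A minimal standalone sketch of the same call shapes, with made-up placeholder counts (wasabi installed is the only assumption):

from wasabi import Printer  # console printer library used by spaCy's CLI

msg = Printer()
# Placeholder numbers, only to illustrate the calls used in the hunk above.
n_missing_vectors, n_words = 120, 4000
msg.warn(
    "{} words in training data without vectors ({:0.2f}%)".format(
        n_missing_vectors, n_missing_vectors / n_words,
    ),
)
msg.text("10 most common words without vectors: ...", show=True)
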
@@ -49,7 +49,12 @@ DEFAULT_OOV_PROB = -20
         str,
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
-    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
+    base_model=(
+        "Base model (for languages with custom tokenizers)",
+        "option",
+        "b",
+        str,
+    ),
 )
 def init_model(
     lang,

@@ -8,7 +8,7 @@ def add_codes(err_cls):
     class ErrorsWithCodes(err_cls):
         def __getattribute__(self, code):
             msg = super().__getattribute__(code)
-            if code.startswith('__'):  # python system attributes like __class__
+            if code.startswith("__"):  # python system attributes like __class__
                 return msg
             else:
                 return "[{code}] {msg}".format(code=code, msg=msg)

@@ -116,6 +116,7 @@ class Warnings(object):
             " to check the alignment. Misaligned entities ('-') will be "
             "ignored during training.")
 
+
 @add_codes
 class Errors(object):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

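For context, the two hunks above touch the error-code mechanism: add_codes wraps the class so that attribute access returns the message prefixed with its code. A minimal self-contained sketch of that pattern; only the __getattribute__ body and the decorator usage are shown in the diff, the closing return of an instance is filled in by assumption:

def add_codes(err_cls):
    """Wrap an errors class so lookups return '[CODE] message'."""

    class ErrorsWithCodes(err_cls):
        def __getattribute__(self, code):
            msg = super().__getattribute__(code)
            if code.startswith("__"):  # python system attributes like __class__
                return msg
            return "[{code}] {msg}".format(code=code, msg=msg)

    # Returning an instance makes every attribute lookup go through __getattribute__.
    return ErrorsWithCodes()


@add_codes
class Errors(object):
    E001 = "No component '{name}' found in pipeline. Available names: {opts}"


print(Errors.E001)
# [E001] No component '{name}' found in pipeline. Available names: {opts}
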
@@ -9,7 +9,6 @@ from .morph_rules import MORPH_RULES
 from ..tag_map import TAG_MAP
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG
 from ...util import update_exc

@@ -197,7 +197,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
 
         _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: word, NORM: word},
-            {ORTH: "d", NORM: "'d"}
+            {ORTH: "d", NORM: "'d"},
         ]
 
         _exc[orth + "'d've"] = [

@@ -5,7 +5,6 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 from ..char_classes import merge_chars
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
 
 _list_units = [u for u in LIST_UNITS if u != "%"]

@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 
 
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
 
 
 class ArmenianDefaults(Language.Defaults):

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-
 STOP_WORDS = set(
     """
 նա

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
+from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
 
 TAG_MAP = {

@@ -716,7 +716,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Dat",
-        "Number": "Coll",
+        # "Number": "Coll",
         "Number": "Sing",
         "Person": "1",
     },

@@ -815,7 +815,7 @@ TAG_MAP = {
         "Animacy": "Nhum",
         "Case": "Nom",
         "Definite": "Def",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Poss": "Yes",
     },

@@ -880,7 +880,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Nom",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Person": "2",
     },

@@ -1223,9 +1223,9 @@ TAG_MAP = {
     "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": {
         POS: PRON,
         "Case": "Nom",
-        "Number": "Sing",
+        # "Number": "Sing",
         "Number": "Plur",
-        "Person": "3",
+        # "Person": "3",
         "Person": "1",
         "PronType": "Emp",
     },

@@ -55,7 +55,7 @@ _num_words = [
     "തൊണ്ണൂറ് ",
     "നുറ് ",
     "ആയിരം ",
-    "പത്തുലക്ഷം"
+    "പത്തുലക്ഷം",
 ]
 
 

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 
 
 STOP_WORDS = set(
-
     """
 അത്
 ഇത്

@@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import add_lookups
 from ...lookups import Lookups
 
 

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
 from ...errors import Errors
 
 
 class PolishLemmatizer(Lemmatizer):

@@ -8,7 +8,9 @@ from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 
 _quotes = CONCAT_QUOTES.replace("'", "")
 
-_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+_prefixes = _prefixes = [
+    r"(długo|krótko|jedno|dwu|trzy|cztero)-"
+] + BASE_TOKENIZER_PREFIXES
 
 _infixes = (
     LIST_ELLIPSES

@@ -40,7 +40,7 @@ _num_words = [
     "miljard",
     "biljon",
     "biljard",
-    "kvadriljon"
+    "kvadriljon",
 ]
 
 

@@ -38,7 +38,6 @@ TAG_MAP = {
     "NNPC": {POS: PROPN},
     "NNC": {POS: NOUN},
     "PSP": {POS: ADP},
-
     ".": {POS: PUNCT},
     ",": {POS: PUNCT},
     "-LRB-": {POS: PUNCT},

@@ -79,7 +79,9 @@ class BaseDefaults(object):
             lookups=lookups,
         )
         vocab.lex_attr_getters[NORM] = util.add_lookups(
-            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
+            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+            BASE_NORMS,
+            vocab.lookups.get_table("lexeme_norm"),
         )
         for tag_str, exc in cls.morph_rules.items():
             for orth_str, attrs in exc.items():

@@ -974,7 +976,9 @@ class Language(object):
         serializers = OrderedDict()
         serializers["vocab"] = lambda: self.vocab.to_bytes()
         serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
-        serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items())))
+        serializers["meta.json"] = lambda: srsly.json_dumps(
+            OrderedDict(sorted(self.meta.items()))
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue

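The hunk above builds a registry of named zero-argument callables, each producing one serialized chunk, with srsly (Explosion's serialization helpers) supplying json_dumps. A rough standalone sketch of that idea, illustrative only and not spaCy's actual to_bytes implementation:

from collections import OrderedDict

import srsly  # serialization helpers: json_dumps, msgpack_dumps, ...

meta = {"lang": "en", "name": "demo", "version": "0.0.0"}

# Each entry maps a component name to a zero-argument callable returning bytes.
serializers = OrderedDict()
serializers["meta.json"] = lambda: srsly.json_dumps(
    OrderedDict(sorted(meta.items()))
).encode("utf8")
serializers["strings.json"] = lambda: srsly.json_dumps(["hello", "world"]).encode("utf8")

exclude = ["strings.json"]
blobs = OrderedDict(
    (name, getter()) for name, getter in serializers.items() if name not in exclude
)
# Bundle everything into a single byte string, e.g. with msgpack.
data = srsly.msgpack_dumps(blobs)
print(len(data), list(blobs.keys()))
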
@@ -112,6 +112,7 @@ def ga_tokenizer():
 def gu_tokenizer():
     return get_lang_class("gu").Defaults.create_tokenizer()
 
+
 @pytest.fixture(scope="session")
 def he_tokenizer():
     return get_lang_class("he").Defaults.create_tokenizer()

@@ -246,7 +247,9 @@ def yo_tokenizer():
 
 @pytest.fixture(scope="session")
 def zh_tokenizer_char():
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"use_jieba": False, "use_pkuseg": False}
+    )
 
 
 @pytest.fixture(scope="session")

@@ -258,7 +261,9 @@ def zh_tokenizer_jieba():
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+    )
 
 
 @pytest.fixture(scope="session")

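The pytest.importorskip call in the fixture above is standard pytest: it skips, rather than errors, every test that requests the fixture when the optional dependency is not installed. A minimal illustration of the pattern, with the module name taken from the hunk:

import pytest


@pytest.fixture(scope="session")
def optional_pkuseg():
    # Skips all dependent tests if pkuseg cannot be imported;
    # otherwise returns the imported module.
    return pytest.importorskip("pkuseg")
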
@@ -50,7 +50,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # partial whitespace in words
     words = ["  ", "'", "dogs", "'", "\n\n", "run", " "]

@@ -60,7 +62,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # non-standard whitespace tokens
     words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]

@@ -70,7 +74,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # mismatch between words and text
     with pytest.raises(ValueError):

@@ -181,6 +181,7 @@ def test_is_sent_start(en_tokenizer):
     doc.is_parsed = True
     assert len(list(doc.sents)) == 2
 
+
 def test_is_sent_end(en_tokenizer):
     doc = en_tokenizer("This is a sentence. This is another.")
     assert doc[4].is_sent_end is None

@@ -213,6 +214,7 @@ def test_token0_has_sent_start_true():
     assert doc[1].is_sent_start is None
     assert not doc.is_sentenced
 
+
 def test_tokenlast_has_sent_end_true():
     doc = Doc(Vocab(), words=["hello", "world"])
     assert doc[0].is_sent_end is None

@@ -3,17 +3,16 @@ from __future__ import unicode_literals
 
 import pytest
 
+
 def test_gu_tokenizer_handlers_long_text(gu_tokenizer):
     text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે"""
     tokens = gu_tokenizer(text)
     assert len(tokens) == 9
 
+
 @pytest.mark.parametrize(
     "text,length",
-    [
-        ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6),
-        ("ખેતરની ખેડ કરવામાં આવે છે.", 5),
-    ],
+    [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)],
 )
 def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length):
     tokens = gu_tokenizer(text)

@@ -10,7 +10,16 @@ def test_ml_tokenizer_handles_long_text(ml_tokenizer):
     assert len(tokens) == 5
 
 
-@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        (
+            "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു",
+            10,
+        ),
+        ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5),
+    ],
+)
 def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length):
     tokens = ml_tokenizer(text)
     assert len(tokens) == length

@@ -34,5 +34,15 @@ def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
 
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
-    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
+    nlp = Chinese(
+        meta={
+            "tokenizer": {
+                "config": {
+                    "use_jieba": False,
+                    "use_pkuseg": True,
+                    "pkuseg_model": "medicine",
+                }
+            }
+        }
+    )
     zh_tokenizer_serialize(nlp.tokenizer)

@@ -43,12 +43,16 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
 def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
     user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
     zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
-    updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    updated_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(user_dict) == len(updated_user_dict) - 1
 
     # reset user dict
     zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
-    reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    reset_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(reset_user_dict) == 0
 
 

@@ -272,8 +272,8 @@ def test_matcher_regex_shape(en_vocab):
         (">=", ["a"]),
         ("<=", ["aaa"]),
         (">", ["a", "aa"]),
-        ("<", ["aa", "aaa"])
-    ]
+        ("<", ["aa", "aaa"]),
+    ],
 )
 def test_matcher_compare_length(en_vocab, cmp, bad):
     matcher = Matcher(en_vocab)

@@ -106,7 +106,9 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
         ),
     ],
 )
-def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents):
+def test_sentencizer_custom_punct(
+    en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents
+):
     doc = Doc(en_vocab, words=words)
     sentencizer = Sentencizer(punct_chars=punct_chars)
     doc = sentencizer(doc)

@@ -56,9 +56,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
         assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
         assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
         if strings1 == strings2:
-            assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
+            assert [s for s in vocab1_d.strings if s != "_SP"] == [
+                s for s in vocab2_d.strings if s != "_SP"
+            ]
         else:
-            assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
+            assert [s for s in vocab1_d.strings if s != "_SP"] != [
+                s for s in vocab2_d.strings if s != "_SP"
+            ]
 
 
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)

@@ -76,7 +80,6 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
 def test_deserialize_vocab_seen_entries(strings, lex_attr):
     # Reported in #2153
     vocab = Vocab(strings=strings)
-    length = len(vocab)
     vocab.from_bytes(vocab.to_bytes())
     assert len(vocab.strings) == len(strings) + 1  # adds _SP
 
 

@@ -130,6 +133,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
         else:
             assert list(sstore1_d) != list(sstore2_d)
 
+
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
 def test_pickle_vocab(strings, lex_attr):
     vocab = Vocab(strings=strings)

@@ -112,7 +112,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the ₹54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "₹", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
             "entities": [(16, 19, "MONEY")],
         },
     )

@@ -122,7 +122,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the $54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "$", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
             "entities": [(16, 19, "MONEY")],
         },
     )

@@ -366,6 +366,7 @@ def test_vectors_serialize():
     assert row == row_r
     assert_equal(v.data, v_r.data)
 
+
 def test_vector_is_oov():
     vocab = Vocab(vectors_name="test_vocab_is_oov")
     data = numpy.ndarray((5, 3), dtype="f")

@@ -774,7 +774,7 @@ def get_words_and_spaces(words, text):
         except ValueError:
             raise ValueError(Errors.E194.format(text=text, words=words))
         if word_start > 0:
-            text_words.append(text[text_pos:text_pos+word_start])
+            text_words.append(text[text_pos : text_pos + word_start])
             text_spaces.append(False)
             text_pos += word_start
         text_words.append(word)

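The hunk above sits inside a helper that aligns a list of words with the original text, producing the parallel word/space lists that Doc(words=..., spaces=...) expects. A simplified standalone sketch of that alignment idea; this is not spaCy's exact function (the real get_words_and_spaces also validates its input and raises E194 on mismatch, as the hunk shows):

def words_and_spaces(words, text):
    """Simplified sketch: align words with text, returning (words, spaces)."""
    out_words, out_spaces = [], []
    pos = 0
    for word in words:
        if word.isspace():
            continue  # pure-whitespace tokens are recovered from the text itself
        start = text.index(word, pos)  # raises ValueError if the word isn't found
        if start > pos:
            out_words.append(text[pos:start])  # leftover whitespace becomes a token
            out_spaces.append(False)
        out_words.append(word)
        pos = start + len(word)
        if pos < len(text) and text[pos] == " ":
            out_spaces.append(True)  # a single trailing space folds into the token
            pos += 1
        else:
            out_spaces.append(False)
    if pos < len(text):
        out_words.append(text[pos:])
        out_spaces.append(False)
    return out_words, out_spaces


print(words_and_spaces(["hello", "world"], "hello  world"))
# (['hello', ' ', 'world'], [True, False, False])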