Mirror of https://github.com/explosion/spaCy.git
Synced 2025-11-04 09:57:26 +03:00

Tidy up and auto-format

This commit is contained in:
parent f2a131bd9a
commit d8f3190c0a
@@ -187,12 +187,17 @@ def debug_data(
         n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
         msg.warn(
             "{} words in training data without vectors ({:0.2f}%)".format(
-                n_missing_vectors,
-                n_missing_vectors / gold_train_data["n_words"],
+                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
             ),
         )
         msg.text(
-            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+            "10 most common words without vectors: {}".format(
+                _format_labels(
+                    gold_train_data["words_missing_vectors"].most_common(10),
+                    counts=True,
+                )
+            ),
+            show=verbose,
         )
     else:
         msg.info("No word vectors present in the model")
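A side note on the hunk above: gold_train_data["words_missing_vectors"] is used like a collections.Counter (it supports .values() and .most_common()). A minimal stand-in, with made-up counts, showing the two aggregations the CLI performs:

    from collections import Counter

    # Hypothetical stand-in for gold_train_data["words_missing_vectors"]:
    # a Counter of token text -> frequency for tokens without a vector.
    words_missing_vectors = Counter({"foo": 3, "bar": 2, "baz": 1})

    n_missing_vectors = sum(words_missing_vectors.values())  # 6
    most_common = words_missing_vectors.most_common(10)      # [("foo", 3), ("bar", 2), ("baz", 1)]
    print(n_missing_vectors, most_common)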
@@ -49,7 +49,12 @@ DEFAULT_OOV_PROB = -20
         str,
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
-    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
+    base_model=(
+        "Base model (for languages with custom tokenizers)",
+        "option",
+        "b",
+        str,
+    ),
 )
 def init_model(
     lang,

@@ -8,7 +8,7 @@ def add_codes(err_cls):
     class ErrorsWithCodes(err_cls):
         def __getattribute__(self, code):
             msg = super().__getattribute__(code)
-            if code.startswith('__'):  # python system attributes like __class__
+            if code.startswith("__"):  # python system attributes like __class__
                 return msg
             else:
                 return "[{code}] {msg}".format(code=code, msg=msg)
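The quoting fix above sits inside spaCy's add_codes decorator, which prefixes each error message with its attribute name. A minimal, self-contained sketch of that pattern (Python 3 only, slightly simplified from the code in the hunk, so not spaCy's exact implementation):

    def add_codes(err_cls):
        # Return an instance whose attribute access prefixes the attribute name
        # (the error code) onto the message text.
        class ErrorsWithCodes(err_cls):
            def __getattribute__(self, code):
                msg = super().__getattribute__(code)
                if code.startswith("__"):  # python system attributes like __class__
                    return msg
                return "[{code}] {msg}".format(code=code, msg=msg)

        return ErrorsWithCodes()


    @add_codes
    class Errors(object):
        E001 = "No component '{name}' found in pipeline. Available names: {opts}"


    print(Errors.E001)  # [E001] No component '{name}' found in pipeline. ...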
@@ -116,6 +116,7 @@ class Warnings(object):
             " to check the alignment. Misaligned entities ('-') will be "
             "ignored during training.")
 
+
 @add_codes
 class Errors(object):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

@@ -9,7 +9,6 @@ from .morph_rules import MORPH_RULES
 from ..tag_map import TAG_MAP
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG
 from ...util import update_exc

@@ -197,7 +197,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
 
         _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: word, NORM: word},
-            {ORTH: "d", NORM: "'d"}
+            {ORTH: "d", NORM: "'d"},
         ]
 
         _exc[orth + "'d've"] = [

@@ -5,7 +5,6 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 from ..char_classes import merge_chars
-from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 
 
 _list_units = [u for u in LIST_UNITS if u != "%"]

@@ -1,11 +1,12 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 
-
 from ...attrs import LANG
 from ...language import Language
-from ...tokens import Doc
 
 
 class ArmenianDefaults(Language.Defaults):

@@ -1,6 +1,6 @@
+# coding: utf8
 from __future__ import unicode_literals
 
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences

@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals
 
 from ...attrs import LIKE_NUM

@@ -1,6 +1,6 @@
+# coding: utf8
 from __future__ import unicode_literals
 
-
 STOP_WORDS = set(
     """
 նա

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
+from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
 
 TAG_MAP = {

@@ -716,7 +716,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Dat",
-        "Number": "Coll",
+        # "Number": "Coll",
         "Number": "Sing",
         "Person": "1",
     },

@@ -815,7 +815,7 @@ TAG_MAP = {
         "Animacy": "Nhum",
         "Case": "Nom",
         "Definite": "Def",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Poss": "Yes",
     },

@@ -880,7 +880,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Nom",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Person": "2",
     },

@@ -1223,9 +1223,9 @@ TAG_MAP = {
     "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": {
         POS: PRON,
         "Case": "Nom",
-        "Number": "Sing",
+        # "Number": "Sing",
         "Number": "Plur",
-        "Person": "3",
+        # "Person": "3",
         "Person": "1",
         "PronType": "Emp",
     },
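The commented-out entries in the TAG_MAP hunks above remove duplicate keys from dict literals. Python keeps only the last occurrence of a repeated key and silently discards the earlier one, so the first value never took effect. A two-line illustration:

    # A repeated key in a dict literal silently overwrites the earlier value,
    # which is why the duplicate "Number"/"Person" entries above are commented out.
    features = {"Case": "Nom", "Number": "Sing", "Number": "Plur"}
    print(features)  # {'Case': 'Nom', 'Number': 'Plur'} -- the "Sing" entry is gone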
@@ -55,7 +55,7 @@ _num_words = [
     "തൊണ്ണൂറ് ",
     "നുറ് ",
     "ആയിരം ",
-    "പത്തുലക്ഷം"
+    "പത്തുലക്ഷം",
 ]
 
 

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 
 
 STOP_WORDS = set(
-
     """
 അത്
 ഇത്

@@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import add_lookups
 from ...lookups import Lookups
 
 

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
-from ...errors import Errors
 
 
 class PolishLemmatizer(Lemmatizer):

@@ -8,7 +8,9 @@ from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 
 _quotes = CONCAT_QUOTES.replace("'", "")
 
-_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+_prefixes = _prefixes = [
+    r"(długo|krótko|jedno|dwu|trzy|cztero)-"
+] + BASE_TOKENIZER_PREFIXES
 
 _infixes = (
     LIST_ELLIPSES
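The _prefixes hunk above only re-wraps a regex entry for Polish compound-number prefixes. Independent of spaCy's tokenizer machinery, a quick check with the standard re module that the pattern behaves as intended:

    import re

    # The prefix pattern from the hunk above, compiled on its own for illustration.
    prefix_re = re.compile(r"(długo|krótko|jedno|dwu|trzy|cztero)-")

    print(bool(prefix_re.match("dwu-")))       # True: hyphenated prefix matches
    print(bool(prefix_re.match("trzyletni")))  # False: no hyphen, so no prefix match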
@@ -40,7 +40,7 @@ _num_words = [
     "miljard",
     "biljon",
     "biljard",
-    "kvadriljon"
+    "kvadriljon",
 ]
 
 

@@ -38,7 +38,6 @@ TAG_MAP = {
     "NNPC": {POS: PROPN},
     "NNC": {POS: NOUN},
     "PSP": {POS: ADP},
-
     ".": {POS: PUNCT},
     ",": {POS: PUNCT},
     "-LRB-": {POS: PUNCT},

@@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer):
             if reset:
                 try:
                     import pkuseg
+
                     self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
                 except ImportError:
                     if self.use_pkuseg:

@@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer):
                         )
                         raise ImportError(msg)
             for word in words:
-                self.pkuseg_seg.preprocesser.insert(word.strip(), '')
+                self.pkuseg_seg.preprocesser.insert(word.strip(), "")
 
     def _get_config(self):
         config = OrderedDict(
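The two ChineseTokenizer hunks above sit inside pkuseg_update_user_dict: a reset rebuilds the pkuseg preprocesser, and each given word is inserted into its trie. A hedged usage sketch, matching how the tests further down call it (requires the optional pkuseg package and its default model):

    from spacy.lang.zh import Chinese

    nlp = Chinese(
        meta={
            "tokenizer": {
                "config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "default"}
            }
        }
    )

    # Add custom entries to the user dictionary...
    nlp.tokenizer.pkuseg_update_user_dict(["nonsense_asdf"])

    # ...or clear it and start over, via the reset branch shown above.
    nlp.tokenizer.pkuseg_update_user_dict([], reset=True)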
@@ -79,7 +79,9 @@ class BaseDefaults(object):
             lookups=lookups,
         )
         vocab.lex_attr_getters[NORM] = util.add_lookups(
-            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
+            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+            BASE_NORMS,
+            vocab.lookups.get_table("lexeme_norm"),
         )
         for tag_str, exc in cls.morph_rules.items():
             for orth_str, attrs in exc.items():
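As the call above suggests, util.add_lookups combines a default lexical-attribute getter with additional lookup tables. A minimal sketch of one way such a fallback could work (not spaCy's actual implementation, just the idea):

    def add_lookups(default_func, *tables):
        # Try each table in order; fall back to the default getter.
        def get(string):
            for table in tables:
                if string in table:
                    return table[string]
            return default_func(string)

        return get


    norm_exceptions = {"whos": "who's"}
    get_norm = add_lookups(lambda s: s.lower(), norm_exceptions)
    print(get_norm("whos"))  # who's
    print(get_norm("Dogs"))  # dogs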
@@ -974,7 +976,9 @@ class Language(object):
         serializers = OrderedDict()
         serializers["vocab"] = lambda: self.vocab.to_bytes()
         serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
-        serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items())))
+        serializers["meta.json"] = lambda: srsly.json_dumps(
+            OrderedDict(sorted(self.meta.items()))
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue
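The Language.to_bytes hunk above uses a dict of no-argument callables so each component is serialized lazily and can be skipped via exclude. A simplified, standalone sketch of that pattern with srsly (only a meta dict here, no real pipeline):

    from collections import OrderedDict

    import srsly

    meta = {"lang": "en", "name": "example_model"}  # hypothetical meta

    serializers = OrderedDict()
    serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(meta.items())))

    exclude = []
    payload = {name: serialize() for name, serialize in serializers.items() if name not in exclude}
    print(payload["meta.json"])  # JSON string for the sorted meta dict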
@@ -112,6 +112,7 @@ def ga_tokenizer():
 def gu_tokenizer():
     return get_lang_class("gu").Defaults.create_tokenizer()
 
+
 @pytest.fixture(scope="session")
 def he_tokenizer():
     return get_lang_class("he").Defaults.create_tokenizer()

@@ -246,7 +247,9 @@ def yo_tokenizer():
 
 @pytest.fixture(scope="session")
 def zh_tokenizer_char():
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"use_jieba": False, "use_pkuseg": False}
+    )
 
 
 @pytest.fixture(scope="session")

@@ -258,7 +261,9 @@ def zh_tokenizer_jieba():
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+    )
 
 
 @pytest.fixture(scope="session")
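The next hunks reformat assertions in test_create_from_words_and_text. For context, a small sketch of the Doc words/spaces behaviour those assertions exercise (token texts preserved, trailing whitespace exposed via whitespace_); the example words are made up:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    words = ["'", "dogs", "'", "run"]
    spaces = [False, False, True, False]

    doc = Doc(Vocab(), words=words, spaces=spaces)
    assert [t.text for t in doc] == words
    assert [t.whitespace_ for t in doc] == ["", "", " ", ""]
    assert doc.text == "'dogs' run"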
@@ -50,7 +50,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # partial whitespace in words
     words = ["  ", "'", "dogs", "'", "\n\n", "run", " "]

@@ -60,7 +62,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # non-standard whitespace tokens
     words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]

@@ -70,7 +74,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]
 
     # mismatch between words and text
     with pytest.raises(ValueError):

@@ -181,6 +181,7 @@ def test_is_sent_start(en_tokenizer):
     doc.is_parsed = True
     assert len(list(doc.sents)) == 2
 
+
 def test_is_sent_end(en_tokenizer):
     doc = en_tokenizer("This is a sentence. This is another.")
     assert doc[4].is_sent_end is None

@@ -213,6 +214,7 @@ def test_token0_has_sent_start_true():
     assert doc[1].is_sent_start is None
     assert not doc.is_sentenced
 
+
 def test_tokenlast_has_sent_end_true():
     doc = Doc(Vocab(), words=["hello", "world"])
     assert doc[0].is_sent_end is None

@@ -3,17 +3,16 @@ from __future__ import unicode_literals
 
 import pytest
 
+
 def test_gu_tokenizer_handlers_long_text(gu_tokenizer):
     text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે"""
     tokens = gu_tokenizer(text)
     assert len(tokens) == 9
 
+
 @pytest.mark.parametrize(
     "text,length",
-    [
-        ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6),
-        ("ખેતરની ખેડ કરવામાં આવે છે.", 5),
-    ],
+    [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)],
 )
 def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length):
     tokens = gu_tokenizer(text)

@@ -10,7 +10,16 @@ def test_ml_tokenizer_handles_long_text(ml_tokenizer):
     assert len(tokens) == 5
 
 
-@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        (
+            "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു",
+            10,
+        ),
+        ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5),
+    ],
+)
 def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length):
     tokens = ml_tokenizer(text)
     assert len(tokens) == length

@@ -34,5 +34,15 @@ def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
 
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
-    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
+    nlp = Chinese(
+        meta={
+            "tokenizer": {
+                "config": {
+                    "use_jieba": False,
+                    "use_pkuseg": True,
+                    "pkuseg_model": "medicine",
+                }
+            }
+        }
+    )
     zh_tokenizer_serialize(nlp.tokenizer)

@@ -43,12 +43,16 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
 def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
     user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
     zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
-    updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    updated_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(user_dict) == len(updated_user_dict) - 1
 
     # reset user dict
     zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
-    reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    reset_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(reset_user_dict) == 0
 
 

@@ -272,8 +272,8 @@ def test_matcher_regex_shape(en_vocab):
         (">=", ["a"]),
         ("<=", ["aaa"]),
         (">", ["a", "aa"]),
-        ("<", ["aa", "aaa"])
-    ]
+        ("<", ["aa", "aaa"]),
+    ],
 )
 def test_matcher_compare_length(en_vocab, cmp, bad):
     matcher = Matcher(en_vocab)

@@ -106,7 +106,9 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
         ),
     ],
 )
-def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents):
+def test_sentencizer_custom_punct(
+    en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents
+):
     doc = Doc(en_vocab, words=words)
     sentencizer = Sentencizer(punct_chars=punct_chars)
     doc = sentencizer(doc)
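The sentencizer hunk above only re-wraps the test signature; the body shows the Sentencizer being applied directly to a Doc. A brief usage sketch with a custom punct_chars setting (the example words and expected count are made up):

    from spacy.pipeline import Sentencizer
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    words = ["Hello", "!", "How", "are", "you", "?"]
    doc = Doc(Vocab(), words=words)

    # Only "?" terminates a sentence here, so "!" does not split the text.
    sentencizer = Sentencizer(punct_chars=["?"])
    doc = sentencizer(doc)
    print(len(list(doc.sents)))  # 1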
@@ -56,9 +56,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
         assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
         assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
         if strings1 == strings2:
-            assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
+            assert [s for s in vocab1_d.strings if s != "_SP"] == [
+                s for s in vocab2_d.strings if s != "_SP"
+            ]
         else:
-            assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
+            assert [s for s in vocab1_d.strings if s != "_SP"] != [
+                s for s in vocab2_d.strings if s != "_SP"
+            ]
 
 
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)

@@ -76,7 +80,6 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
 def test_deserialize_vocab_seen_entries(strings, lex_attr):
     # Reported in #2153
     vocab = Vocab(strings=strings)
-    length = len(vocab)
     vocab.from_bytes(vocab.to_bytes())
     assert len(vocab.strings) == len(strings) + 1  # adds _SP
 

@@ -130,6 +133,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
         else:
             assert list(sstore1_d) != list(sstore2_d)
 
+
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
 def test_pickle_vocab(strings, lex_attr):
     vocab = Vocab(strings=strings)
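The vocab hunks above re-wrap roundtrip assertions and drop a leftover length variable; the serialization pattern they test, as a short sketch:

    from spacy.vocab import Vocab

    strings = ["apple", "orange"]
    vocab = Vocab(strings=strings)
    vocab.from_bytes(vocab.to_bytes())  # serialize and restore in place

    # The test above expects the given strings plus the reserved "_SP" entry.
    print(len(vocab.strings), len(strings) + 1)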
@@ -112,7 +112,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the ₹54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "₹", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
             "entities": [(16, 19, "MONEY")],
         },
     )

@@ -122,7 +122,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the $54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "$", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
             "entities": [(16, 19, "MONEY")],
         },
     )

@@ -366,6 +366,7 @@ def test_vectors_serialize():
         assert row == row_r
         assert_equal(v.data, v_r.data)
 
+
 def test_vector_is_oov():
     vocab = Vocab(vectors_name="test_vocab_is_oov")
     data = numpy.ndarray((5, 3), dtype="f")