Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)

Commit bd6353715a: Merge branch 'master' into fix/travis-tests
@@ -187,12 +187,17 @@ def debug_data(
         n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
         msg.warn(
             "{} words in training data without vectors ({:0.2f}%)".format(
-                n_missing_vectors,
-                n_missing_vectors / gold_train_data["n_words"],
+                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
             ),
         )
         msg.text(
-            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+            "10 most common words without vectors: {}".format(
+                _format_labels(
+                    gold_train_data["words_missing_vectors"].most_common(10),
+                    counts=True,
+                )
+            ),
+            show=verbose,
         )
     else:
         msg.info("No word vectors present in the model")
@@ -49,7 +49,12 @@ DEFAULT_OOV_PROB = -20
         str,
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
-    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
+    base_model=(
+        "Base model (for languages with custom tokenizers)",
+        "option",
+        "b",
+        str,
+    ),
 )
 def init_model(
     lang,
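The tuples being rewrapped in this hunk are plac-style CLI annotations of the form (description, kind, abbreviation, type), which is how spaCy v2's command-line functions declare their arguments. A minimal sketch of that pattern follows; the toy command below is an illustration, not spaCy's real init_model:

# Minimal sketch, assuming plac-style annotations as used by the spaCy v2 CLI.
# This toy init_model is hypothetical and only demonstrates the tuple format.
import plac


@plac.annotations(
    lang=("Model language", "positional", None, str),
    model_name=("Optional name for the model meta", "option", "mn", str),
    base_model=(
        "Base model (for languages with custom tokenizers)",
        "option",
        "b",
        str,
    ),
)
def init_model(lang, model_name=None, base_model=None):
    # plac maps positional args and -mn / -b options onto these parameters
    print(lang, model_name, base_model)


if __name__ == "__main__":
    plac.call(init_model)  # e.g. python init_model.py en -mn my_model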
@@ -8,7 +8,7 @@ def add_codes(err_cls):
     class ErrorsWithCodes(err_cls):
         def __getattribute__(self, code):
             msg = super().__getattribute__(code)
-            if code.startswith('__'):  # python system attributes like __class__
+            if code.startswith("__"):  # python system attributes like __class__
                 return msg
             else:
                 return "[{code}] {msg}".format(code=code, msg=msg)

@@ -116,6 +116,7 @@ class Warnings(object):
            " to check the alignment. Misaligned entities ('-') will be "
            "ignored during training.")

+
 @add_codes
 class Errors(object):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
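For context, the hunk above touches spaCy's error-code decorator: attribute access on the wrapped class returns the message with its code prefixed. A self-contained sketch of that pattern follows; the detail that add_codes returns an instance of the subclass (so lookups go through __getattribute__) is an assumption not shown in the hunk itself:

# Sketch of the error-code pattern from the hunk above.
# Assumption: add_codes returns an *instance* of the wrapper class so that
# attribute access is routed through __getattribute__.
def add_codes(err_cls):
    class ErrorsWithCodes(err_cls):
        def __getattribute__(self, code):
            msg = super().__getattribute__(code)
            if code.startswith("__"):  # python system attributes like __class__
                return msg
            else:
                return "[{code}] {msg}".format(code=code, msg=msg)

    return ErrorsWithCodes()


@add_codes
class Errors(object):
    E001 = "No component '{name}' found in pipeline. Available names: {opts}"


# The code is prepended automatically on lookup:
print(Errors.E001.format(name="ner", opts="tagger, parser"))
# -> [E001] No component 'ner' found in pipeline. Available names: tagger, parser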
@@ -9,7 +9,6 @@ from .morph_rules import MORPH_RULES
 from ..tag_map import TAG_MAP
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG
 from ...util import update_exc
@@ -47,7 +47,7 @@ kleines kommen kommt können könnt konnte könnte konnten kurz
 lang lange leicht leider lieber los

 machen macht machte mag magst man manche manchem manchen mancher manches mehr
-mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
+mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
 mögen möglich mögt morgen muss muß müssen musst müsst musste mussten

 na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter
@@ -197,7 +197,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:

     _exc[orth + "d"] = [
         {ORTH: orth, LEMMA: word, NORM: word},
-        {ORTH: "d", NORM: "'d"}
+        {ORTH: "d", NORM: "'d"},
     ]

     _exc[orth + "'d've"] = [
@@ -5,7 +5,6 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 from ..char_classes import merge_chars
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-

 _list_units = [u for u in LIST_UNITS if u != "%"]
@@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN)

 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
-    "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
+    "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
 ).match
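The TOKEN_MATCH expression above OR-joins a list of exception regexes into one case-insensitive, unicode-aware pattern and exports its bound .match method. A runnable sketch with two made-up stand-in patterns in place of the module's real _regular_exp list:

# Sketch of the TOKEN_MATCH construction; the two patterns are illustrative
# stand-ins, not the real _regular_exp entries built elsewhere in the module.
import re

_regular_exp = [
    r"\d+h\d+",   # times written like "10h30"
    r"\w+-\w+",   # simple hyphenated words
]

TOKEN_MATCH = re.compile(
    "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
).match

print(bool(TOKEN_MATCH("10h30")))    # True: matches the first alternative
print(bool(TOKEN_MATCH("bonjour")))  # False: no alternative matches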
@@ -3,7 +3,7 @@ from __future__ import unicode_literals

 STOP_WORDS = set(
     """
-એમ
+એમ
 આ
 એ
 રહી

@@ -24,7 +24,7 @@ STOP_WORDS = set(
 તેમને
 તેમના
 તેમણે
-તેમનું
+તેમનું
 તેમાં
 અને
 અહીં

@@ -33,12 +33,12 @@ STOP_WORDS = set(
 થાય
 જે
 ને
-કે
+કે
 ના
 ની
 નો
 ને
-નું
+નું
 શું
 માં
 પણ

@@ -69,12 +69,12 @@ STOP_WORDS = set(
 કોઈ
 કેમ
 કર્યો
-કર્યુ
+કર્યુ
 કરે
 સૌથી
-ત્યારબાદ
+ત્યારબાદ
 તથા
-દ્વારા
+દ્વારા
 જુઓ
 જાઓ
 જ્યારે
@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .tag_map import TAG_MAP

 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc


 class ArmenianDefaults(Language.Defaults):
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-
 STOP_WORDS = set(
     """
 նա
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
+from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ

 TAG_MAP = {

@@ -716,7 +716,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Dat",
-        "Number": "Coll",
+        # "Number": "Coll",
         "Number": "Sing",
         "Person": "1",
     },

@@ -815,7 +815,7 @@ TAG_MAP = {
         "Animacy": "Nhum",
         "Case": "Nom",
         "Definite": "Def",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Poss": "Yes",
     },

@@ -880,7 +880,7 @@ TAG_MAP = {
         POS: NOUN,
         "Animacy": "Nhum",
         "Case": "Nom",
-        "Number": "Plur",
+        # "Number": "Plur",
         "Number": "Sing",
         "Person": "2",
     },

@@ -1223,9 +1223,9 @@ TAG_MAP = {
     "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": {
         POS: PRON,
         "Case": "Nom",
-        "Number": "Sing",
+        # "Number": "Sing",
         "Number": "Plur",
-        "Person": "3",
+        # "Person": "3",
         "Person": "1",
         "PronType": "Emp",
     },
@@ -55,7 +55,7 @@ _num_words = [
     "തൊണ്ണൂറ് ",
     "നുറ് ",
     "ആയിരം ",
-    "പത്തുലക്ഷം"
+    "പത്തുലക്ഷം",
 ]
@@ -3,7 +3,6 @@ from __future__ import unicode_literals


 STOP_WORDS = set(
-
     """
 അത്
 ഇത്
@@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import add_lookups
 from ...lookups import Lookups
@@ -3,7 +3,6 @@ from __future__ import unicode_literals

 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
 from ...errors import Errors

-
 class PolishLemmatizer(Lemmatizer):
@@ -8,7 +8,9 @@ from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES

 _quotes = CONCAT_QUOTES.replace("'", "")

-_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+_prefixes = _prefixes = [
+    r"(długo|krótko|jedno|dwu|trzy|cztero)-"
+] + BASE_TOKENIZER_PREFIXES

 _infixes = (
     LIST_ELLIPSES
@@ -40,7 +40,7 @@ _num_words = [
     "miljard",
     "biljon",
     "biljard",
-    "kvadriljon"
+    "kvadriljon",
 ]
@@ -38,7 +38,6 @@ TAG_MAP = {
     "NNPC": {POS: PROPN},
     "NNC": {POS: NOUN},
     "PSP": {POS: ADP},
-
     ".": {POS: PUNCT},
     ",": {POS: PUNCT},
     "-LRB-": {POS: PUNCT},
@@ -79,7 +79,9 @@ class BaseDefaults(object):
             lookups=lookups,
         )
         vocab.lex_attr_getters[NORM] = util.add_lookups(
-            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
+            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+            BASE_NORMS,
+            vocab.lookups.get_table("lexeme_norm"),
         )
         for tag_str, exc in cls.morph_rules.items():
             for orth_str, attrs in exc.items():

@@ -974,7 +976,9 @@ class Language(object):
         serializers = OrderedDict()
         serializers["vocab"] = lambda: self.vocab.to_bytes()
         serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
-        serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items())))
+        serializers["meta.json"] = lambda: srsly.json_dumps(
+            OrderedDict(sorted(self.meta.items()))
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue
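The Language.to_bytes hunk shows the serializer-registry pattern spaCy uses: each component contributes a zero-argument callable producing its serialized form, and the results are collected and packed together. A toy sketch of that idea follows, using srsly for the JSON/msgpack helpers; ToyLanguage and its fake pipeline are illustrative assumptions, not spaCy's real implementation:

# Toy sketch of the serializer-registry pattern; not spaCy's actual Language class.
from collections import OrderedDict

import srsly


class ToyLanguage(object):
    def __init__(self):
        self.meta = {"name": "toy", "lang": "xx"}
        self.pipeline = [("upper", str.upper)]

    def to_bytes(self, exclude=tuple()):
        serializers = OrderedDict()
        serializers["meta.json"] = lambda: srsly.json_dumps(
            OrderedDict(sorted(self.meta.items()))
        )
        for name, proc in self.pipeline:
            if name in exclude:
                continue
            # real pipeline components expose their own to_bytes(); this toy
            # version just serializes the component's repr (proc=proc avoids
            # late binding in the lambda)
            serializers[name] = lambda proc=proc: repr(proc).encode("utf8")
        return srsly.msgpack_dumps(
            OrderedDict((key, getter()) for key, getter in serializers.items())
        )


assert isinstance(ToyLanguage().to_bytes(), bytes)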
@@ -112,6 +112,7 @@ def ga_tokenizer():
 def gu_tokenizer():
     return get_lang_class("gu").Defaults.create_tokenizer()

+
 @pytest.fixture(scope="session")
 def he_tokenizer():
     return get_lang_class("he").Defaults.create_tokenizer()

@@ -246,7 +247,9 @@ def yo_tokenizer():

 @pytest.fixture(scope="session")
 def zh_tokenizer_char():
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"use_jieba": False, "use_pkuseg": False}
+    )


 @pytest.fixture(scope="session")

@@ -258,7 +261,9 @@ def zh_tokenizer_jieba():
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
-    return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True})
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+    )


 @pytest.fixture(scope="session")
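The fixtures above lean on two common pytest idioms: scope="session" so an expensive tokenizer is built once per test run, and pytest.importorskip so tests that need an optional dependency (here pkuseg) are skipped rather than failing when it is absent. A small self-contained sketch of both, with a toy resource standing in for a real tokenizer:

# Sketch of the fixture idioms used in conftest.py above; the heavy_resource
# dict is a stand-in for an expensive object such as a tokenizer.
import pytest


@pytest.fixture(scope="session")
def heavy_resource():
    # built once and shared by every test in the session
    return {"loaded": True}


def test_uses_resource(heavy_resource):
    assert heavy_resource["loaded"]


def test_optional_dependency():
    pkuseg = pytest.importorskip("pkuseg")  # skips the test if pkuseg is missing
    assert pkuseg is not None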
@@ -50,7 +50,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]

     # partial whitespace in words
     words = [" ", "'", "dogs", "'", "\n\n", "run", " "]

@@ -60,7 +62,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]

     # non-standard whitespace tokens
     words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]

@@ -70,7 +74,9 @@ def test_create_from_words_and_text(vocab):
     assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
     assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
     assert doc.text == text
-    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+    assert [t.text for t in doc if not t.text.isspace()] == [
+        word for word in words if not word.isspace()
+    ]

     # mismatch between words and text
     with pytest.raises(ValueError):
@@ -181,6 +181,7 @@ def test_is_sent_start(en_tokenizer):
     doc.is_parsed = True
     assert len(list(doc.sents)) == 2

+
 def test_is_sent_end(en_tokenizer):
     doc = en_tokenizer("This is a sentence. This is another.")
     assert doc[4].is_sent_end is None

@@ -213,6 +214,7 @@ def test_token0_has_sent_start_true():
     assert doc[1].is_sent_start is None
     assert not doc.is_sentenced

+
 def test_tokenlast_has_sent_end_true():
     doc = Doc(Vocab(), words=["hello", "world"])
     assert doc[0].is_sent_end is None
@@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_de(de_tokenizer):
-    """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
+    """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
     To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
+    with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = de_tokenizer("Er lag auf seinem")
@@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_el(el_tokenizer):
-    """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
+    """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
     To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
+    with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
@@ -13,9 +13,9 @@ from ...util import get_doc


 def test_noun_chunks_is_parsed(en_tokenizer):
-    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
+    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
     To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
+    with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = en_tokenizer("This is a sentence")
@@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_es(es_tokenizer):
-    """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
+    """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
     To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
+    with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = es_tokenizer("en Oxford este verano")
@@ -62,4 +62,4 @@ def test_lex_attrs_like_number(es_tokenizer, text, match):
 @pytest.mark.parametrize("word", ["once"])
 def test_es_lex_attrs_capitals(word):
     assert like_num(word)
-    assert like_num(word.upper())
+    assert like_num(word.upper())
@@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_fr(fr_tokenizer):
-    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
+    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
     To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
+    with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = fr_tokenizer("trouver des travaux antérieurs")
@@ -3,17 +3,16 @@

 import pytest

+
 def test_gu_tokenizer_handlers_long_text(gu_tokenizer):
     text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે"""
     tokens = gu_tokenizer(text)
     assert len(tokens) == 9

+
 @pytest.mark.parametrize(
     "text,length",
-    [
-        ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6),
-        ("ખેતરની ખેડ કરવામાં આવે છે.", 5),
-    ],
+    [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)],
 )
 def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length):
     tokens = gu_tokenizer(text)
@@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_id(id_tokenizer):
-    """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
+    """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
     To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
+    with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = id_tokenizer("sebelas")
@@ -10,7 +10,16 @@ def test_ml_tokenizer_handles_long_text(ml_tokenizer):
     assert len(tokens) == 5


-@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        (
+            "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു",
+            10,
+        ),
+        ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5),
+    ],
+)
 def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length):
     tokens = ml_tokenizer(text)
     assert len(tokens) == length
@@ -5,9 +5,9 @@ import pytest


 def test_noun_chunks_is_parsed_nb(nb_tokenizer):
-    """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
+    """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
     To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
+    with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = nb_tokenizer("Smørsausen brukes bl.a. til")
@@ -7,9 +7,9 @@ from ...util import get_doc


 def test_noun_chunks_is_parsed_sv(sv_tokenizer):
-    """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
+    """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
     To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
+    with a new Vocab here and forcing is_parsed to 'False'
     to make sure the noun chunks don't run.
     """
     doc = sv_tokenizer("Studenten läste den bästa boken")
@@ -34,5 +34,15 @@ def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):

 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
-    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
+    nlp = Chinese(
+        meta={
+            "tokenizer": {
+                "config": {
+                    "use_jieba": False,
+                    "use_pkuseg": True,
+                    "pkuseg_model": "medicine",
+                }
+            }
+        }
+    )
     zh_tokenizer_serialize(nlp.tokenizer)
@@ -43,12 +43,16 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
 def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
     user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
     zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
-    updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    updated_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(user_dict) == len(updated_user_dict) - 1

     # reset user dict
     zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
-    reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    reset_user_dict = _get_pkuseg_trie_data(
+        zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+    )
     assert len(reset_user_dict) == 0
@@ -265,15 +265,15 @@ def test_matcher_regex_shape(en_vocab):


 @pytest.mark.parametrize(
-    "cmp, bad",
+    "cmp, bad",
     [
         ("==", ["a", "aaa"]),
         ("!=", ["aa"]),
         (">=", ["a"]),
         ("<=", ["aaa"]),
         (">", ["a", "aa"]),
-        ("<", ["aa", "aaa"])
-    ]
+        ("<", ["aa", "aaa"]),
+    ],
 )
 def test_matcher_compare_length(en_vocab, cmp, bad):
     matcher = Matcher(en_vocab)
@@ -106,7 +106,9 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
         ),
     ],
 )
-def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents):
+def test_sentencizer_custom_punct(
+    en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents
+):
     doc = Doc(en_vocab, words=words)
     sentencizer = Sentencizer(punct_chars=punct_chars)
     doc = sentencizer(doc)
@@ -37,7 +37,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     assert vocab1.to_bytes() == vocab1_b
     new_vocab1 = Vocab().from_bytes(vocab1_b)
     assert new_vocab1.to_bytes() == vocab1_b
-    assert len(new_vocab1.strings) == len(strings1) + 1  # adds _SP
+    assert len(new_vocab1.strings) == len(strings1) + 1  # adds _SP
     assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])

@@ -56,9 +56,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
     assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
     assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
     if strings1 == strings2:
-        assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
+        assert [s for s in vocab1_d.strings if s != "_SP"] == [
+            s for s in vocab2_d.strings if s != "_SP"
+        ]
     else:
-        assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
+        assert [s for s in vocab1_d.strings if s != "_SP"] != [
+            s for s in vocab2_d.strings if s != "_SP"
+        ]


 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@@ -76,9 +80,8 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
 def test_deserialize_vocab_seen_entries(strings, lex_attr):
     # Reported in #2153
     vocab = Vocab(strings=strings)
-    length = len(vocab)
     vocab.from_bytes(vocab.to_bytes())
-    assert len(vocab.strings) == len(strings) + 1  # adds _SP
+    assert len(vocab.strings) == len(strings) + 1  # adds _SP


 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)

@@ -130,6 +133,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
     else:
         assert list(sstore1_d) != list(sstore2_d)

+
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
 def test_pickle_vocab(strings, lex_attr):
     vocab = Vocab(strings=strings)
@@ -112,7 +112,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the ₹54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "₹", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
             "entities": [(16, 19, "MONEY")],
         },
     )

@@ -122,7 +122,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     data = (
         "I'll return the $54 amount",
         {
-            "words": ["I", "'ll", "return", "the", "$", "54", "amount",],
+            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
             "entities": [(16, 19, "MONEY")],
         },
     )
@@ -366,6 +366,7 @@ def test_vectors_serialize():
     assert row == row_r
     assert_equal(v.data, v_r.data)

+
 def test_vector_is_oov():
     vocab = Vocab(vectors_name="test_vocab_is_oov")
     data = numpy.ndarray((5, 3), dtype="f")

@@ -375,4 +376,4 @@ def test_vector_is_oov():
     vocab.set_vector("dog", data[1])
     assert vocab["cat"].is_oov is True
     assert vocab["dog"].is_oov is True
-    assert vocab["hamster"].is_oov is False
+    assert vocab["hamster"].is_oov is False
@@ -774,7 +774,7 @@ def get_words_and_spaces(words, text):
         except ValueError:
             raise ValueError(Errors.E194.format(text=text, words=words))
         if word_start > 0:
-            text_words.append(text[text_pos:text_pos+word_start])
+            text_words.append(text[text_pos : text_pos + word_start])
             text_spaces.append(False)
             text_pos += word_start
         text_words.append(word)
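The slicing being reformatted in get_words_and_spaces keeps whatever text sits between the current position and the next word (text[text_pos : text_pos + word_start]), so the collected tokens plus gaps still reconstruct the original string. A simplified, self-contained sketch of that alignment idea; it is not spaCy's actual function, which also tracks trailing spaces and raises E194 on a mismatch:

# Simplified sketch of aligning a word list against raw text, keeping the gaps.
def words_with_gaps(words, text):
    text_words = []
    text_pos = 0
    for word in words:
        word_start = text[text_pos:].find(word)
        if word_start == -1:
            raise ValueError("word not found in text: {!r}".format(word))
        if word_start > 0:
            # keep whatever sits between the previous word and this one
            text_words.append(text[text_pos : text_pos + word_start])
            text_pos += word_start
        text_words.append(word)
        text_pos += len(word)
    if text_pos < len(text):
        text_words.append(text[text_pos:])
    return text_words


print(words_with_gaps(["'", "dogs", "'", "run"], " 'dogs'\n\nrun "))
# [' ', "'", 'dogs', "'", '\n\n', 'run', ' ']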