mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge branch 'master' into fix/travis-tests
This commit is contained in:
commit
bd6353715a
|
@ -187,12 +187,17 @@ def debug_data(
|
||||||
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"{} words in training data without vectors ({:0.2f}%)".format(
|
"{} words in training data without vectors ({:0.2f}%)".format(
|
||||||
n_missing_vectors,
|
n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
|
||||||
n_missing_vectors / gold_train_data["n_words"],
|
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
msg.text(
|
msg.text(
|
||||||
"10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
|
"10 most common words without vectors: {}".format(
|
||||||
|
_format_labels(
|
||||||
|
gold_train_data["words_missing_vectors"].most_common(10),
|
||||||
|
counts=True,
|
||||||
|
)
|
||||||
|
),
|
||||||
|
show=verbose,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
msg.info("No word vectors present in the model")
|
msg.info("No word vectors present in the model")
|
||||||
|
|
|
@ -49,7 +49,12 @@ DEFAULT_OOV_PROB = -20
|
||||||
str,
|
str,
|
||||||
),
|
),
|
||||||
model_name=("Optional name for the model meta", "option", "mn", str),
|
model_name=("Optional name for the model meta", "option", "mn", str),
|
||||||
base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
|
base_model=(
|
||||||
|
"Base model (for languages with custom tokenizers)",
|
||||||
|
"option",
|
||||||
|
"b",
|
||||||
|
str,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
def init_model(
|
def init_model(
|
||||||
lang,
|
lang,
|
||||||
|
|
|
@ -8,7 +8,7 @@ def add_codes(err_cls):
|
||||||
class ErrorsWithCodes(err_cls):
|
class ErrorsWithCodes(err_cls):
|
||||||
def __getattribute__(self, code):
|
def __getattribute__(self, code):
|
||||||
msg = super().__getattribute__(code)
|
msg = super().__getattribute__(code)
|
||||||
if code.startswith('__'): # python system attributes like __class__
|
if code.startswith("__"): # python system attributes like __class__
|
||||||
return msg
|
return msg
|
||||||
else:
|
else:
|
||||||
return "[{code}] {msg}".format(code=code, msg=msg)
|
return "[{code}] {msg}".format(code=code, msg=msg)
|
||||||
|
@ -116,6 +116,7 @@ class Warnings(object):
|
||||||
" to check the alignment. Misaligned entities ('-') will be "
|
" to check the alignment. Misaligned entities ('-') will be "
|
||||||
"ignored during training.")
|
"ignored during training.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class Errors(object):
|
class Errors(object):
|
||||||
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
||||||
|
|
|
@ -9,7 +9,6 @@ from .morph_rules import MORPH_RULES
|
||||||
from ..tag_map import TAG_MAP
|
from ..tag_map import TAG_MAP
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
|
|
@ -197,7 +197,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
||||||
|
|
||||||
_exc[orth + "d"] = [
|
_exc[orth + "d"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "d", NORM: "'d"}
|
{ORTH: "d", NORM: "'d"},
|
||||||
]
|
]
|
||||||
|
|
||||||
_exc[orth + "'d've"] = [
|
_exc[orth + "'d've"] = [
|
||||||
|
|
|
@ -5,7 +5,6 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
|
||||||
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
|
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||||
from ..char_classes import merge_chars
|
from ..char_classes import merge_chars
|
||||||
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
|
||||||
|
|
||||||
|
|
||||||
_list_units = [u for u in LIST_UNITS if u != "%"]
|
_list_units = [u for u in LIST_UNITS if u != "%"]
|
||||||
|
|
|
@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN)
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
TOKEN_MATCH = re.compile(
|
TOKEN_MATCH = re.compile(
|
||||||
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
|
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
|
||||||
).match
|
).match
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
|
|
||||||
|
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
class ArmenianDefaults(Language.Defaults):
|
class ArmenianDefaults(Language.Defaults):
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
>>> from spacy.lang.hy.examples import sentences
|
>>> from spacy.lang.hy.examples import sentences
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
նա
|
նա
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
|
from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
|
||||||
from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
|
from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
|
@ -716,7 +716,7 @@ TAG_MAP = {
|
||||||
POS: NOUN,
|
POS: NOUN,
|
||||||
"Animacy": "Nhum",
|
"Animacy": "Nhum",
|
||||||
"Case": "Dat",
|
"Case": "Dat",
|
||||||
"Number": "Coll",
|
# "Number": "Coll",
|
||||||
"Number": "Sing",
|
"Number": "Sing",
|
||||||
"Person": "1",
|
"Person": "1",
|
||||||
},
|
},
|
||||||
|
@ -815,7 +815,7 @@ TAG_MAP = {
|
||||||
"Animacy": "Nhum",
|
"Animacy": "Nhum",
|
||||||
"Case": "Nom",
|
"Case": "Nom",
|
||||||
"Definite": "Def",
|
"Definite": "Def",
|
||||||
"Number": "Plur",
|
# "Number": "Plur",
|
||||||
"Number": "Sing",
|
"Number": "Sing",
|
||||||
"Poss": "Yes",
|
"Poss": "Yes",
|
||||||
},
|
},
|
||||||
|
@ -880,7 +880,7 @@ TAG_MAP = {
|
||||||
POS: NOUN,
|
POS: NOUN,
|
||||||
"Animacy": "Nhum",
|
"Animacy": "Nhum",
|
||||||
"Case": "Nom",
|
"Case": "Nom",
|
||||||
"Number": "Plur",
|
# "Number": "Plur",
|
||||||
"Number": "Sing",
|
"Number": "Sing",
|
||||||
"Person": "2",
|
"Person": "2",
|
||||||
},
|
},
|
||||||
|
@ -1223,9 +1223,9 @@ TAG_MAP = {
|
||||||
"PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": {
|
"PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": {
|
||||||
POS: PRON,
|
POS: PRON,
|
||||||
"Case": "Nom",
|
"Case": "Nom",
|
||||||
"Number": "Sing",
|
# "Number": "Sing",
|
||||||
"Number": "Plur",
|
"Number": "Plur",
|
||||||
"Person": "3",
|
# "Person": "3",
|
||||||
"Person": "1",
|
"Person": "1",
|
||||||
"PronType": "Emp",
|
"PronType": "Emp",
|
||||||
},
|
},
|
||||||
|
|
|
@ -55,7 +55,7 @@ _num_words = [
|
||||||
"തൊണ്ണൂറ് ",
|
"തൊണ്ണൂറ് ",
|
||||||
"നുറ് ",
|
"നുറ് ",
|
||||||
"ആയിരം ",
|
"ആയിരം ",
|
||||||
"പത്തുലക്ഷം"
|
"പത്തുലക്ഷം",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
||||||
"""
|
"""
|
||||||
അത്
|
അത്
|
||||||
ഇത്
|
ഇത്
|
||||||
|
|
|
@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import add_lookups
|
||||||
from ...lookups import Lookups
|
from ...lookups import Lookups
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...lemmatizer import Lemmatizer
|
from ...lemmatizer import Lemmatizer
|
||||||
from ...parts_of_speech import NAMES
|
from ...parts_of_speech import NAMES
|
||||||
from ...errors import Errors
|
|
||||||
|
|
||||||
|
|
||||||
class PolishLemmatizer(Lemmatizer):
|
class PolishLemmatizer(Lemmatizer):
|
||||||
|
|
|
@ -8,7 +8,9 @@ from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
|
||||||
_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
|
_prefixes = _prefixes = [
|
||||||
|
r"(długo|krótko|jedno|dwu|trzy|cztero)-"
|
||||||
|
] + BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
LIST_ELLIPSES
|
LIST_ELLIPSES
|
||||||
|
|
|
@ -40,7 +40,7 @@ _num_words = [
|
||||||
"miljard",
|
"miljard",
|
||||||
"biljon",
|
"biljon",
|
||||||
"biljard",
|
"biljard",
|
||||||
"kvadriljon"
|
"kvadriljon",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,6 @@ TAG_MAP = {
|
||||||
"NNPC": {POS: PROPN},
|
"NNPC": {POS: PROPN},
|
||||||
"NNC": {POS: NOUN},
|
"NNC": {POS: NOUN},
|
||||||
"PSP": {POS: ADP},
|
"PSP": {POS: ADP},
|
||||||
|
|
||||||
".": {POS: PUNCT},
|
".": {POS: PUNCT},
|
||||||
",": {POS: PUNCT},
|
",": {POS: PUNCT},
|
||||||
"-LRB-": {POS: PUNCT},
|
"-LRB-": {POS: PUNCT},
|
||||||
|
|
|
@ -79,7 +79,9 @@ class BaseDefaults(object):
|
||||||
lookups=lookups,
|
lookups=lookups,
|
||||||
)
|
)
|
||||||
vocab.lex_attr_getters[NORM] = util.add_lookups(
|
vocab.lex_attr_getters[NORM] = util.add_lookups(
|
||||||
vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
|
vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
|
||||||
|
BASE_NORMS,
|
||||||
|
vocab.lookups.get_table("lexeme_norm"),
|
||||||
)
|
)
|
||||||
for tag_str, exc in cls.morph_rules.items():
|
for tag_str, exc in cls.morph_rules.items():
|
||||||
for orth_str, attrs in exc.items():
|
for orth_str, attrs in exc.items():
|
||||||
|
@ -974,7 +976,9 @@ class Language(object):
|
||||||
serializers = OrderedDict()
|
serializers = OrderedDict()
|
||||||
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
||||||
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
||||||
serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items())))
|
serializers["meta.json"] = lambda: srsly.json_dumps(
|
||||||
|
OrderedDict(sorted(self.meta.items()))
|
||||||
|
)
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if name in exclude:
|
if name in exclude:
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -112,6 +112,7 @@ def ga_tokenizer():
|
||||||
def gu_tokenizer():
|
def gu_tokenizer():
|
||||||
return get_lang_class("gu").Defaults.create_tokenizer()
|
return get_lang_class("gu").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def he_tokenizer():
|
def he_tokenizer():
|
||||||
return get_lang_class("he").Defaults.create_tokenizer()
|
return get_lang_class("he").Defaults.create_tokenizer()
|
||||||
|
@ -246,7 +247,9 @@ def yo_tokenizer():
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def zh_tokenizer_char():
|
def zh_tokenizer_char():
|
||||||
return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})
|
return get_lang_class("zh").Defaults.create_tokenizer(
|
||||||
|
config={"use_jieba": False, "use_pkuseg": False}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
|
@ -258,7 +261,9 @@ def zh_tokenizer_jieba():
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def zh_tokenizer_pkuseg():
|
def zh_tokenizer_pkuseg():
|
||||||
pytest.importorskip("pkuseg")
|
pytest.importorskip("pkuseg")
|
||||||
return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True})
|
return get_lang_class("zh").Defaults.create_tokenizer(
|
||||||
|
config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
|
|
|
@ -50,7 +50,9 @@ def test_create_from_words_and_text(vocab):
|
||||||
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
||||||
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
|
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
|
||||||
assert doc.text == text
|
assert doc.text == text
|
||||||
assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
|
assert [t.text for t in doc if not t.text.isspace()] == [
|
||||||
|
word for word in words if not word.isspace()
|
||||||
|
]
|
||||||
|
|
||||||
# partial whitespace in words
|
# partial whitespace in words
|
||||||
words = [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
words = [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
||||||
|
@ -60,7 +62,9 @@ def test_create_from_words_and_text(vocab):
|
||||||
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
||||||
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
|
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
|
||||||
assert doc.text == text
|
assert doc.text == text
|
||||||
assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
|
assert [t.text for t in doc if not t.text.isspace()] == [
|
||||||
|
word for word in words if not word.isspace()
|
||||||
|
]
|
||||||
|
|
||||||
# non-standard whitespace tokens
|
# non-standard whitespace tokens
|
||||||
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
|
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
|
||||||
|
@ -70,7 +74,9 @@ def test_create_from_words_and_text(vocab):
|
||||||
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
|
||||||
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
|
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
|
||||||
assert doc.text == text
|
assert doc.text == text
|
||||||
assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
|
assert [t.text for t in doc if not t.text.isspace()] == [
|
||||||
|
word for word in words if not word.isspace()
|
||||||
|
]
|
||||||
|
|
||||||
# mismatch between words and text
|
# mismatch between words and text
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
|
|
|
@ -181,6 +181,7 @@ def test_is_sent_start(en_tokenizer):
|
||||||
doc.is_parsed = True
|
doc.is_parsed = True
|
||||||
assert len(list(doc.sents)) == 2
|
assert len(list(doc.sents)) == 2
|
||||||
|
|
||||||
|
|
||||||
def test_is_sent_end(en_tokenizer):
|
def test_is_sent_end(en_tokenizer):
|
||||||
doc = en_tokenizer("This is a sentence. This is another.")
|
doc = en_tokenizer("This is a sentence. This is another.")
|
||||||
assert doc[4].is_sent_end is None
|
assert doc[4].is_sent_end is None
|
||||||
|
@ -213,6 +214,7 @@ def test_token0_has_sent_start_true():
|
||||||
assert doc[1].is_sent_start is None
|
assert doc[1].is_sent_start is None
|
||||||
assert not doc.is_sentenced
|
assert not doc.is_sentenced
|
||||||
|
|
||||||
|
|
||||||
def test_tokenlast_has_sent_end_true():
|
def test_tokenlast_has_sent_end_true():
|
||||||
doc = Doc(Vocab(), words=["hello", "world"])
|
doc = Doc(Vocab(), words=["hello", "world"])
|
||||||
assert doc[0].is_sent_end is None
|
assert doc[0].is_sent_end is None
|
||||||
|
|
|
@ -3,17 +3,16 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_gu_tokenizer_handlers_long_text(gu_tokenizer):
|
def test_gu_tokenizer_handlers_long_text(gu_tokenizer):
|
||||||
text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે"""
|
text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે"""
|
||||||
tokens = gu_tokenizer(text)
|
tokens = gu_tokenizer(text)
|
||||||
assert len(tokens) == 9
|
assert len(tokens) == 9
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,length",
|
"text,length",
|
||||||
[
|
[("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)],
|
||||||
("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6),
|
|
||||||
("ખેતરની ખેડ કરવામાં આવે છે.", 5),
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length):
|
def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length):
|
||||||
tokens = gu_tokenizer(text)
|
tokens = gu_tokenizer(text)
|
||||||
|
|
|
@ -10,7 +10,16 @@ def test_ml_tokenizer_handles_long_text(ml_tokenizer):
|
||||||
assert len(tokens) == 5
|
assert len(tokens) == 5
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)])
|
@pytest.mark.parametrize(
|
||||||
|
"text,length",
|
||||||
|
[
|
||||||
|
(
|
||||||
|
"എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു",
|
||||||
|
10,
|
||||||
|
),
|
||||||
|
("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length):
|
def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length):
|
||||||
tokens = ml_tokenizer(text)
|
tokens = ml_tokenizer(text)
|
||||||
assert len(tokens) == length
|
assert len(tokens) == length
|
||||||
|
|
|
@ -34,5 +34,15 @@ def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
|
||||||
|
|
||||||
@pytest.mark.slow
|
@pytest.mark.slow
|
||||||
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||||||
nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
|
nlp = Chinese(
|
||||||
|
meta={
|
||||||
|
"tokenizer": {
|
||||||
|
"config": {
|
||||||
|
"use_jieba": False,
|
||||||
|
"use_pkuseg": True,
|
||||||
|
"pkuseg_model": "medicine",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
zh_tokenizer_serialize(nlp.tokenizer)
|
zh_tokenizer_serialize(nlp.tokenizer)
|
||||||
|
|
|
@ -43,12 +43,16 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
|
||||||
def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
|
def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
|
||||||
user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
|
user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
|
||||||
zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
|
zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
|
||||||
updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
|
updated_user_dict = _get_pkuseg_trie_data(
|
||||||
|
zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
|
||||||
|
)
|
||||||
assert len(user_dict) == len(updated_user_dict) - 1
|
assert len(user_dict) == len(updated_user_dict) - 1
|
||||||
|
|
||||||
# reset user dict
|
# reset user dict
|
||||||
zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
|
zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
|
||||||
reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
|
reset_user_dict = _get_pkuseg_trie_data(
|
||||||
|
zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
|
||||||
|
)
|
||||||
assert len(reset_user_dict) == 0
|
assert len(reset_user_dict) == 0
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -272,8 +272,8 @@ def test_matcher_regex_shape(en_vocab):
|
||||||
(">=", ["a"]),
|
(">=", ["a"]),
|
||||||
("<=", ["aaa"]),
|
("<=", ["aaa"]),
|
||||||
(">", ["a", "aa"]),
|
(">", ["a", "aa"]),
|
||||||
("<", ["aa", "aaa"])
|
("<", ["aa", "aaa"]),
|
||||||
]
|
],
|
||||||
)
|
)
|
||||||
def test_matcher_compare_length(en_vocab, cmp, bad):
|
def test_matcher_compare_length(en_vocab, cmp, bad):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
|
|
|
@ -106,7 +106,9 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents):
|
def test_sentencizer_custom_punct(
|
||||||
|
en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents
|
||||||
|
):
|
||||||
doc = Doc(en_vocab, words=words)
|
doc = Doc(en_vocab, words=words)
|
||||||
sentencizer = Sentencizer(punct_chars=punct_chars)
|
sentencizer = Sentencizer(punct_chars=punct_chars)
|
||||||
doc = sentencizer(doc)
|
doc = sentencizer(doc)
|
||||||
|
|
|
@ -37,7 +37,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
|
||||||
assert vocab1.to_bytes() == vocab1_b
|
assert vocab1.to_bytes() == vocab1_b
|
||||||
new_vocab1 = Vocab().from_bytes(vocab1_b)
|
new_vocab1 = Vocab().from_bytes(vocab1_b)
|
||||||
assert new_vocab1.to_bytes() == vocab1_b
|
assert new_vocab1.to_bytes() == vocab1_b
|
||||||
assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
|
assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
|
||||||
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
|
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
|
||||||
|
|
||||||
|
|
||||||
|
@ -56,9 +56,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
|
||||||
assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
|
assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
|
||||||
assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
|
assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
|
||||||
if strings1 == strings2:
|
if strings1 == strings2:
|
||||||
assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
|
assert [s for s in vocab1_d.strings if s != "_SP"] == [
|
||||||
|
s for s in vocab2_d.strings if s != "_SP"
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
|
assert [s for s in vocab1_d.strings if s != "_SP"] != [
|
||||||
|
s for s in vocab2_d.strings if s != "_SP"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||||
|
@ -76,9 +80,8 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
|
||||||
def test_deserialize_vocab_seen_entries(strings, lex_attr):
|
def test_deserialize_vocab_seen_entries(strings, lex_attr):
|
||||||
# Reported in #2153
|
# Reported in #2153
|
||||||
vocab = Vocab(strings=strings)
|
vocab = Vocab(strings=strings)
|
||||||
length = len(vocab)
|
|
||||||
vocab.from_bytes(vocab.to_bytes())
|
vocab.from_bytes(vocab.to_bytes())
|
||||||
assert len(vocab.strings) == len(strings) + 1 # adds _SP
|
assert len(vocab.strings) == len(strings) + 1 # adds _SP
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||||
|
@ -130,6 +133,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
|
||||||
else:
|
else:
|
||||||
assert list(sstore1_d) != list(sstore2_d)
|
assert list(sstore1_d) != list(sstore2_d)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||||
def test_pickle_vocab(strings, lex_attr):
|
def test_pickle_vocab(strings, lex_attr):
|
||||||
vocab = Vocab(strings=strings)
|
vocab = Vocab(strings=strings)
|
||||||
|
|
|
@ -112,7 +112,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
||||||
data = (
|
data = (
|
||||||
"I'll return the ₹54 amount",
|
"I'll return the ₹54 amount",
|
||||||
{
|
{
|
||||||
"words": ["I", "'ll", "return", "the", "₹", "54", "amount",],
|
"words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
|
||||||
"entities": [(16, 19, "MONEY")],
|
"entities": [(16, 19, "MONEY")],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
@ -122,7 +122,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
||||||
data = (
|
data = (
|
||||||
"I'll return the $54 amount",
|
"I'll return the $54 amount",
|
||||||
{
|
{
|
||||||
"words": ["I", "'ll", "return", "the", "$", "54", "amount",],
|
"words": ["I", "'ll", "return", "the", "$", "54", "amount"],
|
||||||
"entities": [(16, 19, "MONEY")],
|
"entities": [(16, 19, "MONEY")],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
|
@ -366,6 +366,7 @@ def test_vectors_serialize():
|
||||||
assert row == row_r
|
assert row == row_r
|
||||||
assert_equal(v.data, v_r.data)
|
assert_equal(v.data, v_r.data)
|
||||||
|
|
||||||
|
|
||||||
def test_vector_is_oov():
|
def test_vector_is_oov():
|
||||||
vocab = Vocab(vectors_name="test_vocab_is_oov")
|
vocab = Vocab(vectors_name="test_vocab_is_oov")
|
||||||
data = numpy.ndarray((5, 3), dtype="f")
|
data = numpy.ndarray((5, 3), dtype="f")
|
||||||
|
|
|
@ -774,7 +774,7 @@ def get_words_and_spaces(words, text):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise ValueError(Errors.E194.format(text=text, words=words))
|
raise ValueError(Errors.E194.format(text=text, words=words))
|
||||||
if word_start > 0:
|
if word_start > 0:
|
||||||
text_words.append(text[text_pos:text_pos+word_start])
|
text_words.append(text[text_pos : text_pos + word_start])
|
||||||
text_spaces.append(False)
|
text_spaces.append(False)
|
||||||
text_pos += word_start
|
text_pos += word_start
|
||||||
text_words.append(word)
|
text_words.append(word)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user