From 28256522c8a6311e6a20d66927ef2bd230755464 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Sun, 17 Jan 2021 08:48:43 -0300 Subject: [PATCH 01/24] Fix `spacy.util.minibatch` when the size iterator is finished (#6745) --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 95a9f087f..6cf87bcac 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -513,7 +513,7 @@ def minibatch(items, size=8): size_ = size items = iter(items) while True: - batch_size = next(size_) + batch_size = next(size_, 0) # StopIteration isn't handled in generators in Python >= 3.7. batch = list(itertools.islice(items, int(batch_size))) if len(batch) == 0: break From bc7d83d4be0742c01425529baa8aa356b7bc0c50 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 19 Jan 2021 00:38:11 +0100 Subject: [PATCH 02/24] Skip 0-length matches (#6759) Add hack to prevent matcher from returning 0-length matches. --- spacy/matcher/matcher.pyx | 3 ++- spacy/tests/matcher/test_matcher_api.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 644f7704b..a367dcc3a 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -313,7 +313,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e # We need to deduplicate, because we could otherwise arrive at the same # match through two paths, e.g. .?.? matching 'a'. Are we matching the # first .?, or the second .? -- it doesn't matter, it's just one match. - if match not in seen: + # Skip 0-length matches. (TODO: fix algorithm) + if match not in seen and matches[i].length > 0: output.append(match) seen.add(match) return output diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 236f25130..75ee255d4 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -493,3 +493,13 @@ def test_matcher_remove_zero_operator(en_vocab): assert "Rule" in matcher matcher.remove("Rule") assert "Rule" not in matcher + + +def test_matcher_no_zero_length(en_vocab): + doc = Doc(en_vocab, words=["a", "b"]) + doc[0].tag_ = "A" + doc[1].tag_ = "B" + doc.is_tagged = True + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]]) + assert len(matcher(doc)) == 0 From fdf8c77630b13758c1d49b335897084435e89e89 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 21 Jan 2021 09:59:17 +0100 Subject: [PATCH 03/24] support IS_SENT_START in PhraseMatcher (#6771) * support IS_SENT_START in PhraseMatcher * add unit test and friendlier error * use IDS.get instead --- spacy/matcher/phrasematcher.pyx | 5 ++++- spacy/tests/matcher/test_phrase_matcher.py | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 00c3357f5..c1883869e 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -8,6 +8,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter import warnings +from ..attrs import IDS from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA from ..structs cimport TokenC from ..tokens.token cimport Token @@ -58,9 +59,11 @@ cdef class PhraseMatcher: attr = attr.upper() if attr == "TEXT": attr = "ORTH" + if attr == "IS_SENT_START": + attr = "SENT_START" if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: raise ValueError(Errors.E152.format(attr=attr)) - self.attr = 
self.vocab.strings[attr] + self.attr = IDS.get(attr) def __len__(self): """Get the number of match IDs added to the matcher. diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 60aa584ef..b523ee157 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -290,3 +290,8 @@ def test_phrase_matcher_pickle(en_vocab): # clunky way to vaguely check that callback is unpickled (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1] assert isinstance(callbacks.get("TEST2"), Mock) + + +@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"]) +def test_phrase_matcher_sent_start(en_vocab, attr): + matcher = PhraseMatcher(en_vocab, attr=attr) From 5ace559201c714ab89b3092b87d791e16973f31d Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 21 Jan 2021 16:18:46 +0100 Subject: [PATCH 04/24] ensure span.text works for an empty span (#6772) --- spacy/tests/regression/test_issue6755.py | 9 +++++++++ spacy/tokens/span.pyx | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/regression/test_issue6755.py diff --git a/spacy/tests/regression/test_issue6755.py b/spacy/tests/regression/test_issue6755.py new file mode 100644 index 000000000..4c735b1ff --- /dev/null +++ b/spacy/tests/regression/test_issue6755.py @@ -0,0 +1,9 @@ +# coding: utf8 +from __future__ import unicode_literals + + +def test_issue6755(en_tokenizer): + doc = en_tokenizer("This is a magnificent sentence.") + span = doc[:0] + assert span.text_with_ws == "" + assert span.text == "" diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cf0775bae..2ac8af9e4 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -500,7 +500,7 @@ cdef class Span: def text(self): """RETURNS (unicode): The original verbatim text of the span.""" text = self.text_with_ws - if self[-1].whitespace_: + if len(self) > 0 and self[-1].whitespace_: text = text[:-1] return text From 28d06ab860414e14b99fffc6d12d8928139a892c Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Fri, 22 Jan 2021 23:08:41 +0600 Subject: [PATCH 05/24] Add tokenizer_exceptions --- spacy/lang/ky/__init__.py | 31 +++++++++++++++ spacy/lang/ky/tokenizer_exceptions.py | 55 +++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 spacy/lang/ky/__init__.py create mode 100644 spacy/lang/ky/tokenizer_exceptions.py diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py new file mode 100644 index 000000000..3655e6264 --- /dev/null +++ b/spacy/lang/ky/__init__.py @@ -0,0 +1,31 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...attrs import LANG +from ...language import Language +from ...util import update_exc + + +class TatarDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "tt" + + lex_attr_getters.update(LEX_ATTRS) + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + infixes = tuple(TOKENIZER_INFIXES) + + stop_words = STOP_WORDS + + +class Tatar(Language): + lang = "tt" + Defaults = TatarDefaults + + +__all__ = ["Tatar"] diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py new file mode 100644 index 
000000000..be5e9530c --- /dev/null +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, LEMMA, NORM + +_exc = {} + +_abbrev_exc = [ + # Weekdays abbreviations + {ORTH: "дүй", LEMMA: "дүйшөмбү"}, + {ORTH: "шей", LEMMA: "шейшемби"}, + {ORTH: "шар", LEMMA: "шаршемби"}, + {ORTH: "бей", LEMMA: "бейшемби"}, + {ORTH: "жум", LEMMA: "жума"}, + {ORTH: "ишм", LEMMA: "ишемби"}, + {ORTH: "жек", LEMMA: "жекшемби"}, + # Months abbreviations + {ORTH: "янв", LEMMA: "январь"}, + {ORTH: "фев", LEMMA: "февраль"}, + {ORTH: "мар", LEMMA: "март"}, + {ORTH: "апр", LEMMA: "апрель"}, + {ORTH: "июн", LEMMA: "июнь"}, + {ORTH: "июл", LEMMA: "июль"}, + {ORTH: "авг", LEMMA: "август"}, + {ORTH: "сен", LEMMA: "сентябрь"}, + {ORTH: "окт", LEMMA: "октябрь"}, + {ORTH: "ноя", LEMMA: "ноябрь"}, + {ORTH: "дек", LEMMA: "декабрь"}, + # Number abbreviations + {ORTH: "млрд", LEMMA: "миллиард"}, + {ORTH: "млн", LEMMA: "миллион"}, +] + +for abbr in _abbrev_exc: + for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()): + _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + +for exc_data in [ # "etc." abbreviations + {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"}, + {ORTH: "ж.б.", NORM: "жана башка"}, + {ORTH: "ж.", NORM: "жыл"}, + {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"}, + {ORTH: "б.з.", NORM: "биздин заман"}, + {ORTH: "кк.", NORM: "кылымдар"}, + {ORTH: "жж.", NORM: "жылдар"}, + {ORTH: "к.", NORM: "кылым"}, + {ORTH: "көч.", NORM: "көчөсү"}, + {ORTH: "м-н", NORM: "менен"}, + {ORTH: "б-ча", NORM: "боюнча"}, +]: + exc_data[LEMMA] = exc_data[NORM] + _exc[exc_data[ORTH]] = [exc_data] + +TOKENIZER_EXCEPTIONS = _exc From 101d265778633f5f4cbe15013ab8c5cc3c9f3789 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:25:28 +0600 Subject: [PATCH 06/24] Add stopwords --- spacy/lang/ky/stop_words.py | 47 +++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 spacy/lang/ky/stop_words.py diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py new file mode 100644 index 000000000..1f59539fe --- /dev/null +++ b/spacy/lang/ky/stop_words.py @@ -0,0 +1,47 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# Tatar stopwords are from https://github.com/aliiae/stopwords-tt + +STOP_WORDS = set( +""" +ага адам айтты айтымында айтып ал алар +алардын алган алуу алып анда андан аны +анын ар + +бар басма баш башка башкы башчысы берген +биз билдирген билдирди бир биринчи бирок +бишкек болгон болот болсо болуп боюнча +буга бул + +гана + +да дагы деген деди деп + +жана жатат жаткан жаңы же жогорку жок жол +жолу + +кабыл калган кандай карата каршы катары +келген керек кийин кол кылмыш кыргыз +күнү көп + +маалымат мамлекеттик мен менен миң +мурдагы мыйзам мындай мүмкүн + +ошол ошондой + +сүрөт сөз + +тарабынан турган тууралуу + +укук учурда + +чейин чек + +экенин эки эл эле эмес эми эч + +үч үчүн + +өз +""".split() +) From 4418ec2eeedb0889968127ac4c0d9a1a0439723b Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:31:31 +0600 Subject: [PATCH 07/24] Add punctuation --- spacy/lang/ky/punctuation.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 spacy/lang/ky/punctuation.py diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py new file mode 100644 index 000000000..9ee66a59e --- /dev/null +++ 
b/spacy/lang/ky/punctuation.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS + +_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "") +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?/()]+(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}{q}])[:<>=](?=[{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash), + r"(?<=[0-9])-(?=[0-9])", + ] +) + +TOKENIZER_INFIXES = _infixes From d53724ba1d6a22b3f25fad118dce14e3495040e5 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:35:25 +0600 Subject: [PATCH 08/24] Add lex_attrs --- spacy/lang/ky/lex_attrs.py | 51 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 spacy/lang/ky/lex_attrs.py diff --git a/spacy/lang/ky/lex_attrs.py b/spacy/lang/ky/lex_attrs.py new file mode 100644 index 000000000..af926b138 --- /dev/null +++ b/spacy/lang/ky/lex_attrs.py @@ -0,0 +1,51 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +_num_words = [ + "нөл", + "ноль", + "бир", + "эки", + "үч", + "төрт", + "беш", + "алты", + "жети", + "сегиз", + "тогуз", + "он", + "жыйырма", + "отуз", + "кырк", + "элүү", + "алтымыш", + "жетмиш", + "сексен", + "токсон", + "жүз", + "миң", + "миллион", + "миллиард", + "триллион", + "триллиард", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} From 2f385385a95f9a6ce22dc8489a95fcd58b853fc2 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:36:28 +0600 Subject: [PATCH 09/24] Remove comment --- spacy/lang/ky/stop_words.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py index 1f59539fe..eede62767 100644 --- a/spacy/lang/ky/stop_words.py +++ b/spacy/lang/ky/stop_words.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals -# Tatar stopwords are from https://github.com/aliiae/stopwords-tt - STOP_WORDS = set( """ ага адам айтты айтымында айтып ал алар From e30bbf5432c86352c9ae0e7f9b5329ac6ba39620 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:49:08 +0600 Subject: [PATCH 10/24] Add examples --- spacy/lang/ky/examples.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 spacy/lang/ky/examples.py diff --git a/spacy/lang/ky/examples.py b/spacy/lang/ky/examples.py new file mode 100644 index 000000000..f1f31e3ab --- /dev/null +++ b/spacy/lang/ky/examples.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + +""" +Example sentences to test spaCy and its language models. 
+>>> from spacy.lang.ky.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "Apple Улуу Британия стартабын $1 миллиардга сатып алууну көздөөдө.", + "Автоном автомобилдерди камсыздоо жоопкерчилиги өндүрүүчүлөргө артылды.", + "Сан-Франциско тротуар менен жүрүүчү робот-курьерлерге тыю салууну караштырууда.", + "Лондон - Улуу Британияда жайгашкан ири шаар.", + "Кайдасың?", + "Франциянын президенти ким?", + "Америка Кошмо Штаттарынын борбор калаасы кайсы шаар?", + "Барак Обама качан төрөлгөн?", +] From fe3b5b8ff596117d39a9143f8d076a601e8016db Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:53:41 +0600 Subject: [PATCH 11/24] Add kyrgyz to char_classes --- spacy/lang/char_classes.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 3fb0fb41e..d876d375a 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -207,6 +207,10 @@ _tatar_lower = r"әөүҗңһ" _tatar_upper = r"ӘӨҮҖҢҺ" _tatar = r"әөүҗңһӘӨҮҖҢҺ" +_kyrgyz_lower = r"өңү" +_kyrgyz_upper = r"ӨҢҮ" +_kyrgyz = r"өңүӨҢҮ" + _greek_lower = r"α-ωάέίόώήύ" _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ" _greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ" @@ -219,8 +223,8 @@ _macedonian_lower = r"ѓѕјљњќѐѝ" _macedonian_upper = r"ЃЅЈЉЊЌЀЍ" _macedonian = r"ѓѕјљњќѐѝЃЅЈЉЊЌЀЍ" -_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper + _macedonian_upper -_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower + _macedonian_lower +_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _kyrgyz_upper + _greek_upper + _ukrainian_upper + _macedonian_upper +_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _kyrgyz_lower + _greek_lower + _ukrainian_lower + _macedonian_lower _uncased = ( _ethiopic @@ -236,7 +240,7 @@ _uncased = ( + _cjk ) -ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _macedonian + _uncased) +ALPHA = group_chars(LATIN + _russian + _tatar + _kyrgyz + _greek + _ukrainian + _macedonian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased) From 2a2646362be11ee9122328353d46f24277a6b1b5 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 22:00:50 +0600 Subject: [PATCH 12/24] Fix language subclass --- spacy/lang/ky/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py index 3655e6264..4656cfeb9 100644 --- a/spacy/lang/ky/__init__.py +++ b/spacy/lang/ky/__init__.py @@ -11,9 +11,9 @@ from ...language import Language from ...util import update_exc -class TatarDefaults(Language.Defaults): +class KyrgyzDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: "tt" + lex_attr_getters[LANG] = lambda text: "ky" lex_attr_getters.update(LEX_ATTRS) @@ -23,9 +23,9 @@ class TatarDefaults(Language.Defaults): stop_words = STOP_WORDS -class Tatar(Language): - lang = "tt" - Defaults = TatarDefaults +class Kyrgyz(Language): + lang = "ky" + Defaults = KyrgyzDefaults -__all__ = ["Tatar"] +__all__ = ["Kyrgyz"] From 53abf759ad035ad64d4cfb1f0ae3ced1a6e00522 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sun, 24 Jan 2021 20:54:22 +0600 Subject: [PATCH 13/24] Fix punctuation --- spacy/lang/ky/punctuation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py index 9ee66a59e..22c2061ca 100644 
--- a/spacy/lang/ky/punctuation.py +++ b/spacy/lang/ky/punctuation.py @@ -16,6 +16,7 @@ _infixes = ( r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash), + r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA), r"(?<=[0-9])-(?=[0-9])", ] ) From 87168eb81f679ba17b7ddac9fb934b058c70a40c Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sun, 24 Jan 2021 20:56:16 +0600 Subject: [PATCH 14/24] Add tests --- spacy/tests/conftest.py | 5 ++ spacy/tests/lang/ky/__init__.py | 0 spacy/tests/lang/ky/test_tokenizer.py | 91 +++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 spacy/tests/lang/ky/__init__.py create mode 100644 spacy/tests/lang/ky/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 90a18925b..ad545bcfd 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -262,6 +262,11 @@ def tt_tokenizer(): return get_lang_class("tt").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ky_tokenizer(): + return get_lang_class("ky").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def uk_tokenizer(): pytest.importorskip("pymorphy2") diff --git a/spacy/tests/lang/ky/__init__.py b/spacy/tests/lang/ky/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py new file mode 100644 index 000000000..99dab2b16 --- /dev/null +++ b/spacy/tests/lang/ky/test_tokenizer.py @@ -0,0 +1,91 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +INFIX_HYPHEN_TESTS = [ + ("Бала-чака жакшыбы?", "Бала-чака жакшыбы ?".split()), + ("Кыз-келиндер кийими.", "Кыз-келиндер кийими .".split()), +] + +PUNC_INSIDE_WORDS_TESTS = [ + ( + "Пассажир саны - 2,13 млн — киши/күнүнө (2010), 783,9 млн. киши/жылына.", + "Пассажир саны - 2,13 млн — киши / күнүнө ( 2010 ) ," + " 783,9 млн. киши / жылына .".split(), + ), + ('То"кой', 'То " кой'.split()), +] + +MIXED_ORDINAL_NUMS_TESTS = [ + ("Эртең 22-январь...", "Эртең 22 - январь ...".split()) +] + +ABBREV_TESTS = [ + ("Маселе б-ча эртең келет", "Маселе б-ча эртең келет".split()), + ("Ахунбаев көч. турат.", "Ахунбаев көч. турат .".split()), + ("«3-жылы (б.з.ч.) туулган", "« 3 - жылы ( б.з.ч. ) туулган".split()), + ("Жүгөрү ж.б. дандар колдонулат", "Жүгөрү ж.б. дандар колдонулат".split()), + ("3-4 кк. курулган.", "3 - 4 кк. курулган .".split()), +] + +NAME_ABBREV_TESTS = [ + ("М.Жумаш", "М.Жумаш".split()), + ("М.жумаш", "М.жумаш".split()), + ("м.Жумаш", "м . Жумаш".split()), + ("Жумаш М.Н.", "Жумаш М.Н.".split()), + ("Жумаш.", "Жумаш .".split()), +] + +TYPOS_IN_PUNC_TESTS = [ + ("«3-жылда , туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган.", "« 3 - жылда , туулган .".split()), + ("Ал иштейт(качан?)", "Ал иштейт ( качан ? )".split()), + ("Ал (качан?)иштейт", "Ал ( качан ?) иштейт".split()), # "?)" => "?)" or "? )" +] + +LONG_TEXTS_TESTS = [ + ( + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак: ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал. Мисалы, январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка (100 чакырымдан кем эмес) барып " + "келгенге аракет кылдык.", + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак : ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал . 
Мисалы , январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка ( 100 чакырымдан кем эмес ) барып " + "келгенге аракет кылдык .".split(), + ) +] + +TESTCASES = ( + INFIX_HYPHEN_TESTS + + PUNC_INSIDE_WORDS_TESTS + + MIXED_ORDINAL_NUMS_TESTS + + ABBREV_TESTS + + NAME_ABBREV_TESTS + + LONG_TEXTS_TESTS + + TYPOS_IN_PUNC_TESTS +) + +NORM_TESTCASES = [ + ( + "ит, мышык ж.б.у.с. үй жаныбарлары.", + ["ит", ",", "мышык", "жана башка ушул сыяктуу", "үй", "жаныбарлары", "."], + ) +] + + +@pytest.mark.parametrize("text,expected_tokens", TESTCASES) +def test_ky_tokenizer_handles_testcases(ky_tokenizer, text, expected_tokens): + tokens = [token.text for token in ky_tokenizer(text) if not token.is_space] + assert expected_tokens == tokens + + +@pytest.mark.parametrize("text,norms", NORM_TESTCASES) +def test_ky_tokenizer_handles_norm_exceptions(ky_tokenizer, text, norms): + tokens = ky_tokenizer(text) + assert [token.norm_ for token in tokens] == norms From 79327197d133b106d2f524d172705842043c9f0a Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Mon, 25 Jan 2021 00:34:12 +0600 Subject: [PATCH 15/24] Add contributor agreement --- .github/contributors/jumasheff.md | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/jumasheff.md diff --git a/.github/contributors/jumasheff.md b/.github/contributors/jumasheff.md new file mode 100644 index 000000000..1ce6d2341 --- /dev/null +++ b/.github/contributors/jumasheff.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+  * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+  * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+  * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+  * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+  * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+  * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+  * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Murat Jumashev | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 25.01.2021 | +| GitHub username | jumasheff | +| Website (optional) | | From 7d0154a36e180a6ff01059d57b62d186f2fd4458 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Mon, 25 Jan 2021 00:42:04 +0600 Subject: [PATCH 16/24] Added language meta data --- website/meta/languages.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/website/meta/languages.json b/website/meta/languages.json index 4975a1a1e..02a8eb123 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -151,6 +151,12 @@ { "code": "fa", "name": "Persian", "has_examples": true }, { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, { "code": "tt", "name": "Tatar", "has_examples": true }, + { + "code": "ky", + "name": "Kyrgyz", + "example": "Адамга эң кыйыны — күн сайын адам болуу", + "has_examples": true + }, { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true }, { "code": "ga", "name": "Irish" }, From 2b19ebad59c37f97a374b8ea7eec127889ef4709 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Mon, 25 Jan 2021 00:46:45 +0600 Subject: [PATCH 17/24] Remove Kyrgyz chars fr. char_classes since Tatar ones already cover --- spacy/lang/char_classes.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index d876d375a..3fb0fb41e 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -207,10 +207,6 @@ _tatar_lower = r"әөүҗңһ" _tatar_upper = r"ӘӨҮҖҢҺ" _tatar = r"әөүҗңһӘӨҮҖҢҺ" -_kyrgyz_lower = r"өңү" -_kyrgyz_upper = r"ӨҢҮ" -_kyrgyz = r"өңүӨҢҮ" - _greek_lower = r"α-ωάέίόώήύ" _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ" _greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ" @@ -223,8 +219,8 @@ _macedonian_lower = r"ѓѕјљњќѐѝ" _macedonian_upper = r"ЃЅЈЉЊЌЀЍ" _macedonian = r"ѓѕјљњќѐѝЃЅЈЉЊЌЀЍ" -_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _kyrgyz_upper + _greek_upper + _ukrainian_upper + _macedonian_upper -_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _kyrgyz_lower + _greek_lower + _ukrainian_lower + _macedonian_lower +_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper + _macedonian_upper +_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower + _macedonian_lower _uncased = ( _ethiopic @@ -240,7 +236,7 @@ _uncased = ( + _cjk ) -ALPHA = group_chars(LATIN + _russian + _tatar + _kyrgyz + _greek + _ukrainian + _macedonian + _uncased) +ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _macedonian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased) From 1f2b0ec168289189a7b4d1e07bf364457c0744f3 Mon Sep 17 00:00:00 2001 From: jganseman <555345+jganseman@users.noreply.github.com> Date: Tue, 26 Jan 2021 10:53:39 +0100 Subject: [PATCH 18/24] proposing a more concise explanation for is_oov proposing a more concise explanation for is_oov --- website/docs/api/token.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 9f8594c96..2b5d779e3 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -459,7 +459,7 @@ The L2 norm of the token's vector representation. 
| `like_url` | bool | Does the token resemble a URL? | | `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | | `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Does the token have a word vector? | +| `is_oov` | bool | Is the token out-of-vocabulary (i.e. does it not have a word vector?) | | `is_stop` | bool | Is the token part of a "stop list"? | | `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | From c9103d60fa9aa551cb383ac43acf42e39c70af03 Mon Sep 17 00:00:00 2001 From: jganseman <555345+jganseman@users.noreply.github.com> Date: Tue, 26 Jan 2021 11:02:31 +0100 Subject: [PATCH 19/24] Create jganseman.md --- .github/contributors/jganseman.md | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/jganseman.md diff --git a/.github/contributors/jganseman.md b/.github/contributors/jganseman.md new file mode 100644 index 000000000..dc25bee1c --- /dev/null +++ b/.github/contributors/jganseman.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+  * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+  * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+  * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+  * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+  * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+  * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+  * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Joachim Ganseman |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 26/01/2021 |
+| GitHub username | jganseman |
+| Website (optional) | www.ganseman.be |

From 8bc57ec37205842af77cbd8b1d40c380d992a6a8 Mon Sep 17 00:00:00 2001
From: jganseman <555345+jganseman@users.noreply.github.com>
Date: Tue, 26 Jan 2021 11:09:16 +0100
Subject: [PATCH 20/24] also update is_oov in lexeme docs

---
 website/docs/api/lexeme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
index f7f6d654c..f36ba1871 100644
--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@@ -156,7 +156,7 @@ The L2 norm of the lexeme's vector representation.
 | `like_url` | bool | Does the lexeme resemble a URL? |
 | `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. |
 | `like_email` | bool | Does the lexeme resemble an email address? |
-| `is_oov` | bool | Does the lexeme have a word vector? |
+| `is_oov` | bool | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector?) |
 | `is_stop` | bool | Is the lexeme part of a "stop list"? |
 | `lang` | int | Language of the parent vocabulary. |
 | `lang_` | unicode | Language of the parent vocabulary. |

From 634ae609b42addf9d3bd7189086ebf646eeebd46 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 27 Jan 2021 13:08:00 +1100
Subject: [PATCH 21/24] Adjust formatting [ci skip]

---
 website/docs/usage/facts-figures.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index 1fb932889..e77f384b5 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -95,7 +95,8 @@ results. Project template:
 ### Speed comparison {#benchmarks-speed}
 
 We compare the speed of different NLP libraries, measured in words per second
-(WPS) - higher is better. The evaluation was performed on 10,000 Reddit comments.
+(WPS) - higher is better. The evaluation was performed on 10,000 Reddit
+comments.
@@ -105,7 +106,7 @@ We compare the speed of different NLP libraries, measured in words per second | spaCy | [`en_core_web_trf`](/models/en#en_core_web_trf) | 684 | 3,768 | | Stanza | `en_ewt` | 878 | 2,180 | | Flair | `pos`(`-fast`) & `ner`(`-fast`) | 323 | 1,184 | -| UDPipe | `english-ewt-ud-2.5` | 1,101 | NA | +| UDPipe | `english-ewt-ud-2.5` | 1,101 | _n/a_ |
From e3f8be9a948b7043162c9cd5aad22bd56ce446df Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 27 Jan 2021 13:29:22 +1100 Subject: [PATCH 22/24] Update language data --- spacy/lang/ky/__init__.py | 17 +++-------------- spacy/lang/ky/examples.py | 3 --- spacy/lang/ky/lex_attrs.py | 3 --- spacy/lang/ky/punctuation.py | 3 --- spacy/lang/ky/stop_words.py | 5 +---- spacy/lang/ky/tokenizer_exceptions.py | 7 +++---- 6 files changed, 7 insertions(+), 31 deletions(-) diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py index 4656cfeb9..a333db035 100644 --- a/spacy/lang/ky/__init__.py +++ b/spacy/lang/ky/__init__.py @@ -1,25 +1,14 @@ -# coding: utf8 -from __future__ import unicode_literals - from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...attrs import LANG from ...language import Language -from ...util import update_exc class KyrgyzDefaults(Language.Defaults): - lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: "ky" - - lex_attr_getters.update(LEX_ATTRS) - - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - infixes = tuple(TOKENIZER_INFIXES) - + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/ky/examples.py b/spacy/lang/ky/examples.py index f1f31e3ab..ba77ea975 100644 --- a/spacy/lang/ky/examples.py +++ b/spacy/lang/ky/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.ky.examples import sentences diff --git a/spacy/lang/ky/lex_attrs.py b/spacy/lang/ky/lex_attrs.py index af926b138..bdf993482 100644 --- a/spacy/lang/ky/lex_attrs.py +++ b/spacy/lang/ky/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py index 22c2061ca..fa9819f80 100644 --- a/spacy/lang/ky/punctuation.py +++ b/spacy/lang/ky/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS from ..char_classes import LIST_ELLIPSES, LIST_ICONS diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py index eede62767..ea40bdfa2 100644 --- a/spacy/lang/ky/stop_words.py +++ b/spacy/lang/ky/stop_words.py @@ -1,8 +1,5 @@ -# encoding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( -""" + """ ага адам айтты айтымында айтып ал алар алардын алган алуу алып анда андан аны анын ар diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py index be5e9530c..eb367aeef 100644 --- a/spacy/lang/ky/tokenizer_exceptions.py +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -1,7 +1,6 @@ -# coding: utf8 -from __future__ import unicode_literals - +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM +from ...util import update_exc _exc = {} @@ -52,4 +51,4 @@ for exc_data in [ # "etc." 
abbreviations exc_data[LEMMA] = exc_data[NORM] _exc[exc_data[ORTH]] = [exc_data] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) From 80ba9eaf7dee5a946be1723dd3e739b87f72e038 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 27 Jan 2021 21:29:02 +1100 Subject: [PATCH 23/24] Fix test --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 417b0bf1d..9209a840c 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -267,7 +267,7 @@ def tt_tokenizer(): @pytest.fixture(scope="session") def ky_tokenizer(): - return get_lang_class("ky").Defaults.create_tokenizer() + return get_lang_class("ky")().tokenizer @pytest.fixture(scope="session") From 615dba9d99df0cd25fbecd8d58b19371da59a719 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 27 Jan 2021 22:11:42 +1100 Subject: [PATCH 24/24] Fix tokenizer exceptions --- spacy/lang/ky/tokenizer_exceptions.py | 50 +++++++++++++-------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py index eb367aeef..cd51c2714 100644 --- a/spacy/lang/ky/tokenizer_exceptions.py +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -1,39 +1,39 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc _exc = {} _abbrev_exc = [ # Weekdays abbreviations - {ORTH: "дүй", LEMMA: "дүйшөмбү"}, - {ORTH: "шей", LEMMA: "шейшемби"}, - {ORTH: "шар", LEMMA: "шаршемби"}, - {ORTH: "бей", LEMMA: "бейшемби"}, - {ORTH: "жум", LEMMA: "жума"}, - {ORTH: "ишм", LEMMA: "ишемби"}, - {ORTH: "жек", LEMMA: "жекшемби"}, + {ORTH: "дүй", NORM: "дүйшөмбү"}, + {ORTH: "шей", NORM: "шейшемби"}, + {ORTH: "шар", NORM: "шаршемби"}, + {ORTH: "бей", NORM: "бейшемби"}, + {ORTH: "жум", NORM: "жума"}, + {ORTH: "ишм", NORM: "ишемби"}, + {ORTH: "жек", NORM: "жекшемби"}, # Months abbreviations - {ORTH: "янв", LEMMA: "январь"}, - {ORTH: "фев", LEMMA: "февраль"}, - {ORTH: "мар", LEMMA: "март"}, - {ORTH: "апр", LEMMA: "апрель"}, - {ORTH: "июн", LEMMA: "июнь"}, - {ORTH: "июл", LEMMA: "июль"}, - {ORTH: "авг", LEMMA: "август"}, - {ORTH: "сен", LEMMA: "сентябрь"}, - {ORTH: "окт", LEMMA: "октябрь"}, - {ORTH: "ноя", LEMMA: "ноябрь"}, - {ORTH: "дек", LEMMA: "декабрь"}, + {ORTH: "янв", NORM: "январь"}, + {ORTH: "фев", NORM: "февраль"}, + {ORTH: "мар", NORM: "март"}, + {ORTH: "апр", NORM: "апрель"}, + {ORTH: "июн", NORM: "июнь"}, + {ORTH: "июл", NORM: "июль"}, + {ORTH: "авг", NORM: "август"}, + {ORTH: "сен", NORM: "сентябрь"}, + {ORTH: "окт", NORM: "октябрь"}, + {ORTH: "ноя", NORM: "ноябрь"}, + {ORTH: "дек", NORM: "декабрь"}, # Number abbreviations - {ORTH: "млрд", LEMMA: "миллиард"}, - {ORTH: "млн", LEMMA: "миллион"}, + {ORTH: "млрд", NORM: "миллиард"}, + {ORTH: "млн", NORM: "миллион"}, ] for abbr in _abbrev_exc: for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()): - _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] - _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + _exc[orth] = [{ORTH: orth, NORM: abbr[NORM]}] + _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbr[NORM]}] for exc_data in [ # "etc." abbreviations {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"}, @@ -47,8 +47,6 @@ for exc_data in [ # "etc." 
abbreviations
     {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"},
     {ORTH: "ж.б.", NORM: "жана башка"},
     {ORTH: "ж.", NORM: "жыл"},
     {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"},
     {ORTH: "б.з.", NORM: "биздин заман"},
     {ORTH: "кк.", NORM: "кылымдар"},
     {ORTH: "жж.", NORM: "жылдар"},
     {ORTH: "к.", NORM: "кылым"},
     {ORTH: "көч.", NORM: "көчөсү"},
     {ORTH: "м-н", NORM: "менен"},
     {ORTH: "б-ча", NORM: "боюнча"},
-]:
-    exc_data[LEMMA] = exc_data[NORM]
-    _exc[exc_data[ORTH]] = [exc_data]
+]:
+    _exc[exc_data[ORTH]] = [exc_data]

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
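Taken together, the Kyrgyz patches in this series make `ky` loadable like any other spaCy language. A minimal smoke test in the spirit of the `ky_tokenizer` fixture from [PATCH 23/24]; the sample sentence is an illustrative assumption, with the expected tokenization taken from the `ABBREV_TESTS` and `NAME_ABBREV_TESTS` cases in [PATCH 14/24]:

    import spacy

    # spacy.blank builds a bare pipeline from KyrgyzDefaults, which is
    # all that the tokenizer exceptions and infix rules above need.
    nlp = spacy.blank("ky")
    doc = nlp("Жумаш М.Н. Ахунбаев көч. турат.")
    print([token.text for token in doc])
    # Expected per the tests: "М.Н." and "көч." stay single tokens,
    # e.g. ['Жумаш', 'М.Н.', 'Ахунбаев', 'көч.', 'турат', '.']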