From 28256522c8a6311e6a20d66927ef2bd230755464 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Sun, 17 Jan 2021 08:48:43 -0300 Subject: [PATCH 01/24] Fix `spacy.util.minibatch` when the size iterator is finished (#6745) --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 95a9f087f..6cf87bcac 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -513,7 +513,7 @@ def minibatch(items, size=8): size_ = size items = iter(items) while True: - batch_size = next(size_) + batch_size = next(size_, 0) # StopIteration isn't handled in generators in Python >= 3.7. batch = list(itertools.islice(items, int(batch_size))) if len(batch) == 0: break From bc7d83d4be0742c01425529baa8aa356b7bc0c50 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 19 Jan 2021 00:38:11 +0100 Subject: [PATCH 02/24] Skip 0-length matches (#6759) Add hack to prevent matcher from returning 0-length matches. --- spacy/matcher/matcher.pyx | 3 ++- spacy/tests/matcher/test_matcher_api.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 644f7704b..a367dcc3a 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -313,7 +313,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e # We need to deduplicate, because we could otherwise arrive at the same # match through two paths, e.g. .?.? matching 'a'. Are we matching the # first .?, or the second .? -- it doesn't matter, it's just one match. - if match not in seen: + # Skip 0-length matches. (TODO: fix algorithm) + if match not in seen and matches[i].length > 0: output.append(match) seen.add(match) return output diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 236f25130..75ee255d4 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -493,3 +493,13 @@ def test_matcher_remove_zero_operator(en_vocab): assert "Rule" in matcher matcher.remove("Rule") assert "Rule" not in matcher + + +def test_matcher_no_zero_length(en_vocab): + doc = Doc(en_vocab, words=["a", "b"]) + doc[0].tag_ = "A" + doc[1].tag_ = "B" + doc.is_tagged = True + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]]) + assert len(matcher(doc)) == 0 From fdf8c77630b13758c1d49b335897084435e89e89 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 21 Jan 2021 09:59:17 +0100 Subject: [PATCH 03/24] support IS_SENT_START in PhraseMatcher (#6771) * support IS_SENT_START in PhraseMatcher * add unit test and friendlier error * use IDS.get instead --- spacy/matcher/phrasematcher.pyx | 5 ++++- spacy/tests/matcher/test_phrase_matcher.py | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 00c3357f5..c1883869e 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -8,6 +8,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter import warnings +from ..attrs import IDS from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA from ..structs cimport TokenC from ..tokens.token cimport Token @@ -58,9 +59,11 @@ cdef class PhraseMatcher: attr = attr.upper() if attr == "TEXT": attr = "ORTH" + if attr == "IS_SENT_START": + attr = "SENT_START" if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: raise ValueError(Errors.E152.format(attr=attr)) - self.attr = 
self.vocab.strings[attr] + self.attr = IDS.get(attr) def __len__(self): """Get the number of match IDs added to the matcher. diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 60aa584ef..b523ee157 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -290,3 +290,8 @@ def test_phrase_matcher_pickle(en_vocab): # clunky way to vaguely check that callback is unpickled (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1] assert isinstance(callbacks.get("TEST2"), Mock) + + +@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"]) +def test_phrase_matcher_sent_start(en_vocab, attr): + matcher = PhraseMatcher(en_vocab, attr=attr) From 5ace559201c714ab89b3092b87d791e16973f31d Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 21 Jan 2021 16:18:46 +0100 Subject: [PATCH 04/24] ensure span.text works for an empty span (#6772) --- spacy/tests/regression/test_issue6755.py | 9 +++++++++ spacy/tokens/span.pyx | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/regression/test_issue6755.py diff --git a/spacy/tests/regression/test_issue6755.py b/spacy/tests/regression/test_issue6755.py new file mode 100644 index 000000000..4c735b1ff --- /dev/null +++ b/spacy/tests/regression/test_issue6755.py @@ -0,0 +1,9 @@ +# coding: utf8 +from __future__ import unicode_literals + + +def test_issue6755(en_tokenizer): + doc = en_tokenizer("This is a magnificent sentence.") + span = doc[:0] + assert span.text_with_ws == "" + assert span.text == "" diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cf0775bae..2ac8af9e4 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -500,7 +500,7 @@ cdef class Span: def text(self): """RETURNS (unicode): The original verbatim text of the span.""" text = self.text_with_ws - if self[-1].whitespace_: + if len(self) > 0 and self[-1].whitespace_: text = text[:-1] return text From 28d06ab860414e14b99fffc6d12d8928139a892c Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Fri, 22 Jan 2021 23:08:41 +0600 Subject: [PATCH 05/24] Add tokenizer_exceptions --- spacy/lang/ky/__init__.py | 31 +++++++++++++++ spacy/lang/ky/tokenizer_exceptions.py | 55 +++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 spacy/lang/ky/__init__.py create mode 100644 spacy/lang/ky/tokenizer_exceptions.py diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py new file mode 100644 index 000000000..3655e6264 --- /dev/null +++ b/spacy/lang/ky/__init__.py @@ -0,0 +1,31 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...attrs import LANG +from ...language import Language +from ...util import update_exc + + +class TatarDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "tt" + + lex_attr_getters.update(LEX_ATTRS) + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + infixes = tuple(TOKENIZER_INFIXES) + + stop_words = STOP_WORDS + + +class Tatar(Language): + lang = "tt" + Defaults = TatarDefaults + + +__all__ = ["Tatar"] diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py new file mode 100644 index 
000000000..be5e9530c --- /dev/null +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, LEMMA, NORM + +_exc = {} + +_abbrev_exc = [ + # Weekdays abbreviations + {ORTH: "дүй", LEMMA: "дүйшөмбү"}, + {ORTH: "шей", LEMMA: "шейшемби"}, + {ORTH: "шар", LEMMA: "шаршемби"}, + {ORTH: "бей", LEMMA: "бейшемби"}, + {ORTH: "жум", LEMMA: "жума"}, + {ORTH: "ишм", LEMMA: "ишемби"}, + {ORTH: "жек", LEMMA: "жекшемби"}, + # Months abbreviations + {ORTH: "янв", LEMMA: "январь"}, + {ORTH: "фев", LEMMA: "февраль"}, + {ORTH: "мар", LEMMA: "март"}, + {ORTH: "апр", LEMMA: "апрель"}, + {ORTH: "июн", LEMMA: "июнь"}, + {ORTH: "июл", LEMMA: "июль"}, + {ORTH: "авг", LEMMA: "август"}, + {ORTH: "сен", LEMMA: "сентябрь"}, + {ORTH: "окт", LEMMA: "октябрь"}, + {ORTH: "ноя", LEMMA: "ноябрь"}, + {ORTH: "дек", LEMMA: "декабрь"}, + # Number abbreviations + {ORTH: "млрд", LEMMA: "миллиард"}, + {ORTH: "млн", LEMMA: "миллион"}, +] + +for abbr in _abbrev_exc: + for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()): + _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + +for exc_data in [ # "etc." abbreviations + {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"}, + {ORTH: "ж.б.", NORM: "жана башка"}, + {ORTH: "ж.", NORM: "жыл"}, + {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"}, + {ORTH: "б.з.", NORM: "биздин заман"}, + {ORTH: "кк.", NORM: "кылымдар"}, + {ORTH: "жж.", NORM: "жылдар"}, + {ORTH: "к.", NORM: "кылым"}, + {ORTH: "көч.", NORM: "көчөсү"}, + {ORTH: "м-н", NORM: "менен"}, + {ORTH: "б-ча", NORM: "боюнча"}, +]: + exc_data[LEMMA] = exc_data[NORM] + _exc[exc_data[ORTH]] = [exc_data] + +TOKENIZER_EXCEPTIONS = _exc From 101d265778633f5f4cbe15013ab8c5cc3c9f3789 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:25:28 +0600 Subject: [PATCH 06/24] Add stopwords --- spacy/lang/ky/stop_words.py | 47 +++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 spacy/lang/ky/stop_words.py diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py new file mode 100644 index 000000000..1f59539fe --- /dev/null +++ b/spacy/lang/ky/stop_words.py @@ -0,0 +1,47 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# Tatar stopwords are from https://github.com/aliiae/stopwords-tt + +STOP_WORDS = set( +""" +ага адам айтты айтымында айтып ал алар +алардын алган алуу алып анда андан аны +анын ар + +бар басма баш башка башкы башчысы берген +биз билдирген билдирди бир биринчи бирок +бишкек болгон болот болсо болуп боюнча +буга бул + +гана + +да дагы деген деди деп + +жана жатат жаткан жаңы же жогорку жок жол +жолу + +кабыл калган кандай карата каршы катары +келген керек кийин кол кылмыш кыргыз +күнү көп + +маалымат мамлекеттик мен менен миң +мурдагы мыйзам мындай мүмкүн + +ошол ошондой + +сүрөт сөз + +тарабынан турган тууралуу + +укук учурда + +чейин чек + +экенин эки эл эле эмес эми эч + +үч үчүн + +өз +""".split() +) From 4418ec2eeedb0889968127ac4c0d9a1a0439723b Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:31:31 +0600 Subject: [PATCH 07/24] Add punctuation --- spacy/lang/ky/punctuation.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 spacy/lang/ky/punctuation.py diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py new file mode 100644 index 000000000..9ee66a59e --- /dev/null +++ 
b/spacy/lang/ky/punctuation.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS + +_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "") +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?/()]+(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}{q}])[:<>=](?=[{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash), + r"(?<=[0-9])-(?=[0-9])", + ] +) + +TOKENIZER_INFIXES = _infixes From d53724ba1d6a22b3f25fad118dce14e3495040e5 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:35:25 +0600 Subject: [PATCH 08/24] Add lex_attrs --- spacy/lang/ky/lex_attrs.py | 51 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 spacy/lang/ky/lex_attrs.py diff --git a/spacy/lang/ky/lex_attrs.py b/spacy/lang/ky/lex_attrs.py new file mode 100644 index 000000000..af926b138 --- /dev/null +++ b/spacy/lang/ky/lex_attrs.py @@ -0,0 +1,51 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +_num_words = [ + "нөл", + "ноль", + "бир", + "эки", + "үч", + "төрт", + "беш", + "алты", + "жети", + "сегиз", + "тогуз", + "он", + "жыйырма", + "отуз", + "кырк", + "элүү", + "алтымыш", + "жетмиш", + "сексен", + "токсон", + "жүз", + "миң", + "миллион", + "миллиард", + "триллион", + "триллиард", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} From 2f385385a95f9a6ce22dc8489a95fcd58b853fc2 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:36:28 +0600 Subject: [PATCH 09/24] Remove comment --- spacy/lang/ky/stop_words.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py index 1f59539fe..eede62767 100644 --- a/spacy/lang/ky/stop_words.py +++ b/spacy/lang/ky/stop_words.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals -# Tatar stopwords are from https://github.com/aliiae/stopwords-tt - STOP_WORDS = set( """ ага адам айтты айтымында айтып ал алар From e30bbf5432c86352c9ae0e7f9b5329ac6ba39620 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:49:08 +0600 Subject: [PATCH 10/24] Add examples --- spacy/lang/ky/examples.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 spacy/lang/ky/examples.py diff --git a/spacy/lang/ky/examples.py b/spacy/lang/ky/examples.py new file mode 100644 index 000000000..f1f31e3ab --- /dev/null +++ b/spacy/lang/ky/examples.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + +""" +Example sentences to test spaCy and its language models. 
+>>> from spacy.lang.ky.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "Apple Улуу Британия стартабын $1 миллиардга сатып алууну көздөөдө.", + "Автоном автомобилдерди камсыздоо жоопкерчилиги өндүрүүчүлөргө артылды.", + "Сан-Франциско тротуар менен жүрүүчү робот-курьерлерге тыю салууну караштырууда.", + "Лондон - Улуу Британияда жайгашкан ири шаар.", + "Кайдасың?", + "Франциянын президенти ким?", + "Америка Кошмо Штаттарынын борбор калаасы кайсы шаар?", + "Барак Обама качан төрөлгөн?", +] From fe3b5b8ff596117d39a9143f8d076a601e8016db Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:53:41 +0600 Subject: [PATCH 11/24] Add kyrgyz to char_classes --- spacy/lang/char_classes.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 3fb0fb41e..d876d375a 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -207,6 +207,10 @@ _tatar_lower = r"әөүҗңһ" _tatar_upper = r"ӘӨҮҖҢҺ" _tatar = r"әөүҗңһӘӨҮҖҢҺ" +_kyrgyz_lower = r"өңү" +_kyrgyz_upper = r"ӨҢҮ" +_kyrgyz = r"өңүӨҢҮ" + _greek_lower = r"α-ωάέίόώήύ" _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ" _greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ" @@ -219,8 +223,8 @@ _macedonian_lower = r"ѓѕјљњќѐѝ" _macedonian_upper = r"ЃЅЈЉЊЌЀЍ" _macedonian = r"ѓѕјљњќѐѝЃЅЈЉЊЌЀЍ" -_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper + _macedonian_upper -_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower + _macedonian_lower +_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _kyrgyz_upper + _greek_upper + _ukrainian_upper + _macedonian_upper +_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _kyrgyz_lower + _greek_lower + _ukrainian_lower + _macedonian_lower _uncased = ( _ethiopic @@ -236,7 +240,7 @@ _uncased = ( + _cjk ) -ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _macedonian + _uncased) +ALPHA = group_chars(LATIN + _russian + _tatar + _kyrgyz + _greek + _ukrainian + _macedonian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased) From 2a2646362be11ee9122328353d46f24277a6b1b5 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 22:00:50 +0600 Subject: [PATCH 12/24] Fix language subclass --- spacy/lang/ky/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py index 3655e6264..4656cfeb9 100644 --- a/spacy/lang/ky/__init__.py +++ b/spacy/lang/ky/__init__.py @@ -11,9 +11,9 @@ from ...language import Language from ...util import update_exc -class TatarDefaults(Language.Defaults): +class KyrgyzDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: "tt" + lex_attr_getters[LANG] = lambda text: "ky" lex_attr_getters.update(LEX_ATTRS) @@ -23,9 +23,9 @@ class TatarDefaults(Language.Defaults): stop_words = STOP_WORDS -class Tatar(Language): - lang = "tt" - Defaults = TatarDefaults +class Kyrgyz(Language): + lang = "ky" + Defaults = KyrgyzDefaults -__all__ = ["Tatar"] +__all__ = ["Kyrgyz"] From 53abf759ad035ad64d4cfb1f0ae3ced1a6e00522 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sun, 24 Jan 2021 20:54:22 +0600 Subject: [PATCH 13/24] Fix punctuation --- spacy/lang/ky/punctuation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py index 9ee66a59e..22c2061ca 100644 
--- a/spacy/lang/ky/punctuation.py +++ b/spacy/lang/ky/punctuation.py @@ -16,6 +16,7 @@ _infixes = ( r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash), + r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA), r"(?<=[0-9])-(?=[0-9])", ] ) From 87168eb81f679ba17b7ddac9fb934b058c70a40c Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sun, 24 Jan 2021 20:56:16 +0600 Subject: [PATCH 14/24] Add tests --- spacy/tests/conftest.py | 5 ++ spacy/tests/lang/ky/__init__.py | 0 spacy/tests/lang/ky/test_tokenizer.py | 91 +++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 spacy/tests/lang/ky/__init__.py create mode 100644 spacy/tests/lang/ky/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 90a18925b..ad545bcfd 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -262,6 +262,11 @@ def tt_tokenizer(): return get_lang_class("tt").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ky_tokenizer(): + return get_lang_class("ky").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def uk_tokenizer(): pytest.importorskip("pymorphy2") diff --git a/spacy/tests/lang/ky/__init__.py b/spacy/tests/lang/ky/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py new file mode 100644 index 000000000..99dab2b16 --- /dev/null +++ b/spacy/tests/lang/ky/test_tokenizer.py @@ -0,0 +1,91 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +INFIX_HYPHEN_TESTS = [ + ("Бала-чака жакшыбы?", "Бала-чака жакшыбы ?".split()), + ("Кыз-келиндер кийими.", "Кыз-келиндер кийими .".split()), +] + +PUNC_INSIDE_WORDS_TESTS = [ + ( + "Пассажир саны - 2,13 млн — киши/күнүнө (2010), 783,9 млн. киши/жылына.", + "Пассажир саны - 2,13 млн — киши / күнүнө ( 2010 ) ," + " 783,9 млн. киши / жылына .".split(), + ), + ('То"кой', 'То " кой'.split()), +] + +MIXED_ORDINAL_NUMS_TESTS = [ + ("Эртең 22-январь...", "Эртең 22 - январь ...".split()) +] + +ABBREV_TESTS = [ + ("Маселе б-ча эртең келет", "Маселе б-ча эртең келет".split()), + ("Ахунбаев көч. турат.", "Ахунбаев көч. турат .".split()), + ("«3-жылы (б.з.ч.) туулган", "« 3 - жылы ( б.з.ч. ) туулган".split()), + ("Жүгөрү ж.б. дандар колдонулат", "Жүгөрү ж.б. дандар колдонулат".split()), + ("3-4 кк. курулган.", "3 - 4 кк. курулган .".split()), +] + +NAME_ABBREV_TESTS = [ + ("М.Жумаш", "М.Жумаш".split()), + ("М.жумаш", "М.жумаш".split()), + ("м.Жумаш", "м . Жумаш".split()), + ("Жумаш М.Н.", "Жумаш М.Н.".split()), + ("Жумаш.", "Жумаш .".split()), +] + +TYPOS_IN_PUNC_TESTS = [ + ("«3-жылда , туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган.", "« 3 - жылда , туулган .".split()), + ("Ал иштейт(качан?)", "Ал иштейт ( качан ? )".split()), + ("Ал (качан?)иштейт", "Ал ( качан ?) иштейт".split()), # "?)" => "?)" or "? )" +] + +LONG_TEXTS_TESTS = [ + ( + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак: ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал. Мисалы, январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка (100 чакырымдан кем эмес) барып " + "келгенге аракет кылдык.", + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак : ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал . 
Мисалы , январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка ( 100 чакырымдан кем эмес ) барып " + "келгенге аракет кылдык .".split(), + ) +] + +TESTCASES = ( + INFIX_HYPHEN_TESTS + + PUNC_INSIDE_WORDS_TESTS + + MIXED_ORDINAL_NUMS_TESTS + + ABBREV_TESTS + + NAME_ABBREV_TESTS + + LONG_TEXTS_TESTS + + TYPOS_IN_PUNC_TESTS +) + +NORM_TESTCASES = [ + ( + "ит, мышык ж.б.у.с. үй жаныбарлары.", + ["ит", ",", "мышык", "жана башка ушул сыяктуу", "үй", "жаныбарлары", "."], + ) +] + + +@pytest.mark.parametrize("text,expected_tokens", TESTCASES) +def test_ky_tokenizer_handles_testcases(ky_tokenizer, text, expected_tokens): + tokens = [token.text for token in ky_tokenizer(text) if not token.is_space] + assert expected_tokens == tokens + + +@pytest.mark.parametrize("text,norms", NORM_TESTCASES) +def test_ky_tokenizer_handles_norm_exceptions(ky_tokenizer, text, norms): + tokens = ky_tokenizer(text) + assert [token.norm_ for token in tokens] == norms From 79327197d133b106d2f524d172705842043c9f0a Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Mon, 25 Jan 2021 00:34:12 +0600 Subject: [PATCH 15/24] Add contributor agreement --- .github/contributors/jumasheff.md | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/jumasheff.md diff --git a/.github/contributors/jumasheff.md b/.github/contributors/jumasheff.md new file mode 100644 index 000000000..1ce6d2341 --- /dev/null +++ b/.github/contributors/jumasheff.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+  * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+  * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+  * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+  * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+  * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+  * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+  * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Murat Jumashev | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 25.01.2021 | +| GitHub username | jumasheff | +| Website (optional) | | From 7d0154a36e180a6ff01059d57b62d186f2fd4458 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Mon, 25 Jan 2021 00:42:04 +0600 Subject: [PATCH 16/24] Added language meta data --- website/meta/languages.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/website/meta/languages.json b/website/meta/languages.json index 4975a1a1e..02a8eb123 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -151,6 +151,12 @@ { "code": "fa", "name": "Persian", "has_examples": true }, { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, { "code": "tt", "name": "Tatar", "has_examples": true }, + { + "code": "ky", + "name": "Kyrgyz", + "example": "Адамга эң кыйыны — күн сайын адам болуу", + "has_examples": true + }, { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true }, { "code": "ga", "name": "Irish" }, From 2b19ebad59c37f97a374b8ea7eec127889ef4709 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Mon, 25 Jan 2021 00:46:45 +0600 Subject: [PATCH 17/24] Remove Kyrgyz chars fr. char_classes since Tatar ones already cover --- spacy/lang/char_classes.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index d876d375a..3fb0fb41e 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -207,10 +207,6 @@ _tatar_lower = r"әөүҗңһ" _tatar_upper = r"ӘӨҮҖҢҺ" _tatar = r"әөүҗңһӘӨҮҖҢҺ" -_kyrgyz_lower = r"өңү" -_kyrgyz_upper = r"ӨҢҮ" -_kyrgyz = r"өңүӨҢҮ" - _greek_lower = r"α-ωάέίόώήύ" _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ" _greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ" @@ -223,8 +219,8 @@ _macedonian_lower = r"ѓѕјљњќѐѝ" _macedonian_upper = r"ЃЅЈЉЊЌЀЍ" _macedonian = r"ѓѕјљњќѐѝЃЅЈЉЊЌЀЍ" -_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _kyrgyz_upper + _greek_upper + _ukrainian_upper + _macedonian_upper -_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _kyrgyz_lower + _greek_lower + _ukrainian_lower + _macedonian_lower +_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper + _macedonian_upper +_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower + _macedonian_lower _uncased = ( _ethiopic @@ -240,7 +236,7 @@ _uncased = ( + _cjk ) -ALPHA = group_chars(LATIN + _russian + _tatar + _kyrgyz + _greek + _ukrainian + _macedonian + _uncased) +ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _macedonian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased) From 1f2b0ec168289189a7b4d1e07bf364457c0744f3 Mon Sep 17 00:00:00 2001 From: jganseman <555345+jganseman@users.noreply.github.com> Date: Tue, 26 Jan 2021 10:53:39 +0100 Subject: [PATCH 18/24] proposing a more concise explanation for is_oov proposing a more concise explanation for is_oov --- website/docs/api/token.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 9f8594c96..2b5d779e3 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -459,7 +459,7 @@ The L2 norm of the token's vector representation. 
| `like_url` | bool | Does the token resemble a URL? | | `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | | `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Does the token have a word vector? | +| `is_oov` | bool | Is the token out-of-vocabulary (i.e. does it not have a word vector?) | | `is_stop` | bool | Is the token part of a "stop list"? | | `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | From c9103d60fa9aa551cb383ac43acf42e39c70af03 Mon Sep 17 00:00:00 2001 From: jganseman <555345+jganseman@users.noreply.github.com> Date: Tue, 26 Jan 2021 11:02:31 +0100 Subject: [PATCH 19/24] Create jganseman.md --- .github/contributors/jganseman.md | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/jganseman.md diff --git a/.github/contributors/jganseman.md b/.github/contributors/jganseman.md new file mode 100644 index 000000000..dc25bee1c --- /dev/null +++ b/.github/contributors/jganseman.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+  * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+  * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+  * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+  * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+  * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+  * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+  * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Joachim Ganseman |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 26/01/2021 |
+| GitHub username | jganseman |
+| Website (optional) | www.ganseman.be |

From 8bc57ec37205842af77cbd8b1d40c380d992a6a8 Mon Sep 17 00:00:00 2001
From: jganseman <555345+jganseman@users.noreply.github.com>
Date: Tue, 26 Jan 2021 11:09:16 +0100
Subject: [PATCH 20/24] also update is_oov in lexeme docs

---
 website/docs/api/lexeme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
index f7f6d654c..f36ba1871 100644
--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@@ -156,7 +156,7 @@ The L2 norm of the lexeme's vector representation.
 | `like_url` | bool | Does the lexeme resemble a URL? |
 | `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. |
 | `like_email` | bool | Does the lexeme resemble an email address? |
-| `is_oov` | bool | Does the lexeme have a word vector? |
+| `is_oov` | bool | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector?) |
 | `is_stop` | bool | Is the lexeme part of a "stop list"? |
 | `lang` | int | Language of the parent vocabulary. |
 | `lang_` | unicode | Language of the parent vocabulary. |

From 634ae609b42addf9d3bd7189086ebf646eeebd46 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 27 Jan 2021 13:08:00 +1100
Subject: [PATCH 21/24] Adjust formatting [ci skip]

---
 website/docs/usage/facts-figures.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index 1fb932889..e77f384b5 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -95,7 +95,8 @@ results. Project template:
 ### Speed comparison {#benchmarks-speed}
 
 We compare the speed of different NLP libraries, measured in words per second
-(WPS) - higher is better. The evaluation was performed on 10,000 Reddit comments.
+(WPS) - higher is better. The evaluation was performed on 10,000 Reddit
+comments.
@@ -105,7 +106,7 @@ We compare the speed of different NLP libraries, measured in words per second | spaCy | [`en_core_web_trf`](/models/en#en_core_web_trf) | 684 | 3,768 | | Stanza | `en_ewt` | 878 | 2,180 | | Flair | `pos`(`-fast`) & `ner`(`-fast`) | 323 | 1,184 | -| UDPipe | `english-ewt-ud-2.5` | 1,101 | NA | +| UDPipe | `english-ewt-ud-2.5` | 1,101 | _n/a_ |
From e3f8be9a948b7043162c9cd5aad22bd56ce446df Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 27 Jan 2021 13:29:22 +1100 Subject: [PATCH 22/24] Update language data --- spacy/lang/ky/__init__.py | 17 +++-------------- spacy/lang/ky/examples.py | 3 --- spacy/lang/ky/lex_attrs.py | 3 --- spacy/lang/ky/punctuation.py | 3 --- spacy/lang/ky/stop_words.py | 5 +---- spacy/lang/ky/tokenizer_exceptions.py | 7 +++---- 6 files changed, 7 insertions(+), 31 deletions(-) diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py index 4656cfeb9..a333db035 100644 --- a/spacy/lang/ky/__init__.py +++ b/spacy/lang/ky/__init__.py @@ -1,25 +1,14 @@ -# coding: utf8 -from __future__ import unicode_literals - from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...attrs import LANG from ...language import Language -from ...util import update_exc class KyrgyzDefaults(Language.Defaults): - lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: "ky" - - lex_attr_getters.update(LEX_ATTRS) - - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - infixes = tuple(TOKENIZER_INFIXES) - + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + infixes = TOKENIZER_INFIXES + lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/ky/examples.py b/spacy/lang/ky/examples.py index f1f31e3ab..ba77ea975 100644 --- a/spacy/lang/ky/examples.py +++ b/spacy/lang/ky/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.ky.examples import sentences diff --git a/spacy/lang/ky/lex_attrs.py b/spacy/lang/ky/lex_attrs.py index af926b138..bdf993482 100644 --- a/spacy/lang/ky/lex_attrs.py +++ b/spacy/lang/ky/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py index 22c2061ca..fa9819f80 100644 --- a/spacy/lang/ky/punctuation.py +++ b/spacy/lang/ky/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS from ..char_classes import LIST_ELLIPSES, LIST_ICONS diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py index eede62767..ea40bdfa2 100644 --- a/spacy/lang/ky/stop_words.py +++ b/spacy/lang/ky/stop_words.py @@ -1,8 +1,5 @@ -# encoding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( -""" + """ ага адам айтты айтымында айтып ал алар алардын алган алуу алып анда андан аны анын ар diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py index be5e9530c..eb367aeef 100644 --- a/spacy/lang/ky/tokenizer_exceptions.py +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -1,7 +1,6 @@ -# coding: utf8 -from __future__ import unicode_literals - +from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM +from ...util import update_exc _exc = {} @@ -52,4 +51,4 @@ for exc_data in [ # "etc." 
abbreviations exc_data[LEMMA] = exc_data[NORM] _exc[exc_data[ORTH]] = [exc_data] -TOKENIZER_EXCEPTIONS = _exc +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) From 80ba9eaf7dee5a946be1723dd3e739b87f72e038 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 27 Jan 2021 21:29:02 +1100 Subject: [PATCH 23/24] Fix test --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 417b0bf1d..9209a840c 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -267,7 +267,7 @@ def tt_tokenizer(): @pytest.fixture(scope="session") def ky_tokenizer(): - return get_lang_class("ky").Defaults.create_tokenizer() + return get_lang_class("ky")().tokenizer @pytest.fixture(scope="session") From 615dba9d99df0cd25fbecd8d58b19371da59a719 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 27 Jan 2021 22:11:42 +1100 Subject: [PATCH 24/24] Fix tokenizer exceptions --- spacy/lang/ky/tokenizer_exceptions.py | 50 +++++++++++++-------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py index eb367aeef..cd51c2714 100644 --- a/spacy/lang/ky/tokenizer_exceptions.py +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -1,39 +1,39 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, NORM from ...util import update_exc _exc = {} _abbrev_exc = [ # Weekdays abbreviations - {ORTH: "дүй", LEMMA: "дүйшөмбү"}, - {ORTH: "шей", LEMMA: "шейшемби"}, - {ORTH: "шар", LEMMA: "шаршемби"}, - {ORTH: "бей", LEMMA: "бейшемби"}, - {ORTH: "жум", LEMMA: "жума"}, - {ORTH: "ишм", LEMMA: "ишемби"}, - {ORTH: "жек", LEMMA: "жекшемби"}, + {ORTH: "дүй", NORM: "дүйшөмбү"}, + {ORTH: "шей", NORM: "шейшемби"}, + {ORTH: "шар", NORM: "шаршемби"}, + {ORTH: "бей", NORM: "бейшемби"}, + {ORTH: "жум", NORM: "жума"}, + {ORTH: "ишм", NORM: "ишемби"}, + {ORTH: "жек", NORM: "жекшемби"}, # Months abbreviations - {ORTH: "янв", LEMMA: "январь"}, - {ORTH: "фев", LEMMA: "февраль"}, - {ORTH: "мар", LEMMA: "март"}, - {ORTH: "апр", LEMMA: "апрель"}, - {ORTH: "июн", LEMMA: "июнь"}, - {ORTH: "июл", LEMMA: "июль"}, - {ORTH: "авг", LEMMA: "август"}, - {ORTH: "сен", LEMMA: "сентябрь"}, - {ORTH: "окт", LEMMA: "октябрь"}, - {ORTH: "ноя", LEMMA: "ноябрь"}, - {ORTH: "дек", LEMMA: "декабрь"}, + {ORTH: "янв", NORM: "январь"}, + {ORTH: "фев", NORM: "февраль"}, + {ORTH: "мар", NORM: "март"}, + {ORTH: "апр", NORM: "апрель"}, + {ORTH: "июн", NORM: "июнь"}, + {ORTH: "июл", NORM: "июль"}, + {ORTH: "авг", NORM: "август"}, + {ORTH: "сен", NORM: "сентябрь"}, + {ORTH: "окт", NORM: "октябрь"}, + {ORTH: "ноя", NORM: "ноябрь"}, + {ORTH: "дек", NORM: "декабрь"}, # Number abbreviations - {ORTH: "млрд", LEMMA: "миллиард"}, - {ORTH: "млн", LEMMA: "миллион"}, + {ORTH: "млрд", NORM: "миллиард"}, + {ORTH: "млн", NORM: "миллион"}, ] for abbr in _abbrev_exc: for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()): - _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] - _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + _exc[orth] = [{ORTH: orth, NORM: abbr[NORM]}] + _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbr[NORM]}] for exc_data in [ # "etc." abbreviations {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"}, @@ -47,8 +47,6 @@ for exc_data in [ # "etc." 
abbreviations
     {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"},
     {ORTH: "ж.б.", NORM: "жана башка"},
     {ORTH: "ж.", NORM: "жыл"},
     {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"},
     {ORTH: "б.з.", NORM: "биздин заман"},
     {ORTH: "кк.", NORM: "кылымдар"},
     {ORTH: "жж.", NORM: "жылдар"},
     {ORTH: "к.", NORM: "кылым"},
     {ORTH: "көч.", NORM: "көчөсү"},
     {ORTH: "м-н", NORM: "менен"},
     {ORTH: "б-ча", NORM: "боюнча"},
-]:
-    exc_data[LEMMA] = exc_data[NORM]
-    _exc[exc_data[ORTH]] = [exc_data]
+]:
+    _exc[exc_data[ORTH]] = [exc_data]

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
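Taken together, the Kyrgyz patches in this series make `ky` loadable like any other spaCy language. A minimal smoke test in the spirit of the `ky_tokenizer` fixture from [PATCH 23/24]; the sample sentence is an illustrative assumption, with the expected tokenization taken from the `ABBREV_TESTS` and `NAME_ABBREV_TESTS` cases in [PATCH 14/24]:

    import spacy

    # spacy.blank builds a bare pipeline from KyrgyzDefaults, which is
    # all that the tokenizer exceptions and infix rules above need.
    nlp = spacy.blank("ky")
    doc = nlp("Жумаш М.Н. Ахунбаев көч. турат.")
    print([token.text for token in doc])
    # Expected per the tests: "М.Н." and "көч." stay single tokens,
    # e.g. ['Жумаш', 'М.Н.', 'Ахунбаев', 'көч.', 'турат', '.']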