From 2a78f4d6345084fda788a7f94beff963026b0e83 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:23:19 +0700 Subject: [PATCH 1/8] updated .gitignore file added excluding PyCharm's idea directory --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 84ced41f8..ecd8ed39f 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,7 @@ Desktop.ini # Other *.tgz + + +# JetBrains PyCharm +.idea/ \ No newline at end of file From 7b9491679ffa235ce6cc3f8d3f94b00c14d40655 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:24:20 +0700 Subject: [PATCH 2/8] added russian language support --- spacy/ru/__init__.py | 56 ++++++++++++++++++++++++++++++++ spacy/ru/language_data.py | 18 ++++++++++ spacy/ru/stop_words.py | 54 ++++++++++++++++++++++++++++++ spacy/ru/tokenizer_exceptions.py | 29 +++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 spacy/ru/__init__.py create mode 100644 spacy/ru/language_data.py create mode 100644 spacy/ru/stop_words.py create mode 100644 spacy/ru/tokenizer_exceptions.py diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py new file mode 100644 index 000000000..d8f38e199 --- /dev/null +++ b/spacy/ru/__init__.py @@ -0,0 +1,56 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from ..language import Language +from ..attrs import LANG +from ..tokens import Doc +from .language_data import * + + +class RussianTokenizer(object): + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian tokenizer requires the pymorphy2 library: " + "try to fix it with " + "pip install pymorphy2==0.8") + + _morph = MorphAnalyzer() + + def __init__(self, spacy_tokenizer, cls, nlp=None): + self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) + self._spacy_tokenizer = spacy_tokenizer + + def __call__(self, text): + words = [self._normalize(RussianTokenizer._get_word(token)) + for token in self._spacy_tokenizer(text)] + + return Doc(self.vocab, words, [False] * len(words)) + + @staticmethod + def _get_word(token): + return token.lemma_ if len(token.lemma_) > 0 else token.text + + @classmethod + def _normalize(cls, word): + return cls._morph.parse(word)[0].normal_form + + +class RussianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ru' + + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + + @classmethod + def create_tokenizer(cls, nlp=None): + tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp) + return RussianTokenizer(tokenizer, cls, nlp) + + +class Russian(Language): + lang = 'ru' + + Defaults = RussianDefaults diff --git a/spacy/ru/language_data.py b/spacy/ru/language_data.py new file mode 100644 index 000000000..75ca41b65 --- /dev/null +++ b/spacy/ru/language_data.py @@ -0,0 +1,18 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from .. 
import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) + + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) + + +__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"] \ No newline at end of file diff --git a/spacy/ru/stop_words.py b/spacy/ru/stop_words.py new file mode 100644 index 000000000..ddb28af86 --- /dev/null +++ b/spacy/ru/stop_words.py @@ -0,0 +1,54 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +а + +будем будет будете будешь буду будут будучи будь будьте бы был была были было +быть + +в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею +всея всю вся вы + +да для до + +его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею + +же + +за + +и из или им ими имъ их + +к как кем ко когда кого ком кому комья которая которого которое которой котором +которому которою которую которые который которым которыми которых кто + +меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего +моей моем моём моему моею можем может можете можешь мои мой моим моими моих +мочь мою моя мы + +на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим +нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но + +о об один одна одни одним одними одних одно одного одной одном одному одною +одну он она оне они оно от + +по при + +с сам сама сами самим самими самих само самого самом самому саму свое своё +своего своей своем своём своему своею свои свой своим своими своих свою своя +себе себя собой собою + +та так такая такие таким такими таких такого такое такой таком такому такою +такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому +тот тою ту ты + +у уже + +чего чем чём чему что чтобы + +эта эти этим этими этих это этого этой этом этому этот этою эту + +я +""".split()) \ No newline at end of file diff --git a/spacy/ru/tokenizer_exceptions.py b/spacy/ru/tokenizer_exceptions.py new file mode 100644 index 000000000..8df57a402 --- /dev/null +++ b/spacy/ru/tokenizer_exceptions.py @@ -0,0 +1,29 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TOKENIZER_EXCEPTIONS = { + "Пн.": [ + {ORTH: "Пн.", LEMMA: "Понедельник"} + ], + "Вт.": [ + {ORTH: "Вт.", LEMMA: "Вторник"} + ], + "Ср.": [ + {ORTH: "Ср.", LEMMA: "Среда"} + ], + "Чт.": [ + {ORTH: "Чт.", LEMMA: "Четверг"} + ], + "Пт.": [ + {ORTH: "Пт.", LEMMA: "Пятница"} + ], + "Сб.": [ + {ORTH: "Сб.", LEMMA: "Суббота"} + ], + "Вс.": [ + {ORTH: "Вс.", LEMMA: "Воскресенье"} + ], +} \ No newline at end of file From f81dd284eb2e8c09c55a4fc37abb3e00e278f0a8 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:28:34 +0700 Subject: [PATCH 3/8] updated spacy/__init__.py registered russian language via set_lang_class --- spacy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index f0d5ea0fc..1e5faf504 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -7,13 +7,13 @@ from .cli.info import info from .glossary import explain from .about import __version__ -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th +from . 
import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th, ru _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese, - th.Thai) + th.Thai, ru.Russian) for _lang in _languages: From 622b6d627078f5a5bc14ebb2840a64ec3db5d118 Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 13:57:29 +0700 Subject: [PATCH 4/8] updated Russian tokenizer moved the trying to import pymorph into __init__ --- spacy/ru/__init__.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py index d8f38e199..12b480a8a 100644 --- a/spacy/ru/__init__.py +++ b/spacy/ru/__init__.py @@ -8,17 +8,19 @@ from .language_data import * class RussianTokenizer(object): - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Russian tokenizer requires the pymorphy2 library: " - "try to fix it with " - "pip install pymorphy2==0.8") - - _morph = MorphAnalyzer() + _morph = None def __init__(self, spacy_tokenizer, cls, nlp=None): + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian tokenizer requires the pymorphy2 library: " + "try to fix it with " + "pip install pymorphy2==0.8") + + RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer) + self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) self._spacy_tokenizer = spacy_tokenizer @@ -36,6 +38,12 @@ class RussianTokenizer(object): def _normalize(cls, word): return cls._morph.parse(word)[0].normal_form + @classmethod + def _create_morph(cls, morph_analyzer_class): + if not cls._morph: + cls._morph = morph_analyzer_class() + return cls._morph + class RussianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) From a229b6e0ded3b1255fd77e00c197fa35c9030e5b Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 14:04:37 +0700 Subject: [PATCH 5/8] added tests for Russian language added tests of creating Russian Language instance and Russian tokenizer --- spacy/tests/conftest.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 90b947702..718a8265c 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -16,7 +16,7 @@ from ..bn import Bengali from ..he import Hebrew from ..nb import Norwegian from ..th import Thai - +from ..ru import Russian from ..tokens import Doc from ..strings import StringStore @@ -30,7 +30,7 @@ import pytest # These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, - Swedish, Hungarian, Finnish, Bengali, Norwegian] + Swedish, Hungarian, Finnish, Bengali, Norwegian, Russian] @pytest.fixture(params=LANGUAGES) @@ -53,6 +53,7 @@ def en_vocab(): def en_parser(): return English.Defaults.create_parser() + @pytest.fixture def es_tokenizer(): return Spanish.Defaults.create_tokenizer() @@ -83,11 +84,13 @@ def ja_tokenizer(): pytest.importorskip("MeCab") return Japanese.Defaults.create_tokenizer() + @pytest.fixture def japanese(): pytest.importorskip("MeCab") return Japanese() + @pytest.fixture def sv_tokenizer(): return Swedish.Defaults.create_tokenizer() @@ -102,15 +105,30 @@ def bn_tokenizer(): def he_tokenizer(): return Hebrew.Defaults.create_tokenizer() + @pytest.fixture def nb_tokenizer(): return 
Norwegian.Defaults.create_tokenizer() + @pytest.fixture def th_tokenizer(): pythainlp = pytest.importorskip("pythainlp") return Thai.Defaults.create_tokenizer() + +@pytest.fixture +def ru_tokenizer(): + pytest.importorskip("pymorphy2") + return Russian.Defaults.create_tokenizer() + + +@pytest.fixture +def russian(): + pytest.importorskip("pymorphy2") + return Russian() + + @pytest.fixture def stringstore(): return StringStore() @@ -118,7 +136,7 @@ def stringstore(): @pytest.fixture def en_entityrecognizer(): - return English.Defaults.create_entity() + return English.Defaults.create_entity() @pytest.fixture @@ -130,6 +148,7 @@ def lemmatizer(): def text_file(): return StringIO() + @pytest.fixture def text_file_b(): return BytesIO() @@ -149,11 +168,11 @@ def DE(): def pytest_addoption(parser): parser.addoption("--models", action="store_true", - help="include tests that require full models") + help="include tests that require full models") parser.addoption("--vectors", action="store_true", - help="include word vectors tests") + help="include word vectors tests") parser.addoption("--slow", action="store_true", - help="include slow tests") + help="include slow tests") def pytest_runtest_setup(item): From 6fb9d75bd2a9ed049300b4237bec23d7a09e6845 Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 15:51:03 +0700 Subject: [PATCH 6/8] fixed test with creating tokenizer --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 718a8265c..de0facf49 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -30,7 +30,7 @@ import pytest # These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, - Swedish, Hungarian, Finnish, Bengali, Norwegian, Russian] + Swedish, Hungarian, Finnish, Bengali, Norwegian] @pytest.fixture(params=LANGUAGES) From ce00405afc176bd02363a7d703c3e61ef52fb851 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2017 21:00:15 +0700 Subject: [PATCH 7/8] Create yuukos.md --- .github/contributors/yuukos.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/yuukos.md diff --git a/.github/contributors/yuukos.md b/.github/contributors/yuukos.md new file mode 100644 index 000000000..aecafeecb --- /dev/null +++ b/.github/contributors/yuukos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alexey Kim | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 13-12-2017 | +| GitHub username | yuukos | +| Website (optional) | | From 95836abee1c311bb95d291d0357f29b9f4e98e1c Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2017 21:02:19 +0700 Subject: [PATCH 8/8] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 9e210bd4c..edd1ed30d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,7 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! * Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexey Kim, [@yuukos](https://github.com/yuukos) * Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th)
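
---

A minimal usage sketch of the Russian support introduced in this series (not part of any patch above). It mirrors the `ru_tokenizer` fixture added to `spacy/tests/conftest.py` in PATCH 5/8 and assumes a spaCy 1.x checkout with these patches applied and `pymorphy2==0.8` installed; the sample sentence and printed output are illustrative only.

```python
# encoding: utf8
from __future__ import unicode_literals

# Requires pymorphy2 (RussianTokenizer raises ImportError otherwise),
# e.g. pip install pymorphy2==0.8
from spacy.ru import Russian

# Same construction path as the ru_tokenizer test fixture:
# RussianDefaults.create_tokenizer() wraps the base tokenizer in RussianTokenizer.
tokenizer = Russian.Defaults.create_tokenizer()

doc = tokenizer("Пн. мы работали")

# Each token is mapped to the pymorphy2 normal form of its spaCy lemma
# (falling back to the raw text when no lemma is set), so inflected forms
# such as "работали" and the "Пн." exception come back normalized.
print([t.text for t in doc])
```

Because normalization happens inside `RussianTokenizer.__call__`, the returned `Doc` is built from the normal forms rather than the raw inflected text, so downstream components see the lemmatized words directly.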