From da135bd823dcd57520c2032ccf536adb35e9c087 Mon Sep 17 00:00:00 2001
From: Iddo Berger <iddoberger@gmail.com>
Date: Fri, 24 Mar 2017 18:27:44 +0300
Subject: [PATCH] add hebrew tokenizer

---
 spacy/he/__init__.py             |  18 +++
 spacy/he/language_data.py        |  17 +++
 spacy/he/stop_words.py           | 226 +++++++++++++++++++++++++++++++
 spacy/tests/conftest.py          |   6 +
 spacy/tests/he/__init__.py       |   0
 spacy/tests/he/test_tokenizer.py |  17 +++
 6 files changed, 284 insertions(+)
 create mode 100644 spacy/he/__init__.py
 create mode 100644 spacy/he/language_data.py
 create mode 100644 spacy/he/stop_words.py
 create mode 100644 spacy/tests/he/__init__.py
 create mode 100644 spacy/tests/he/test_tokenizer.py

diff --git a/spacy/he/__init__.py b/spacy/he/__init__.py
new file mode 100644
index 000000000..a3e86ed73
--- /dev/null
+++ b/spacy/he/__init__.py
@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from ..language import Language
+from ..attrs import LANG
+
+from .language_data import *
+
+
+class Hebrew(Language):
+    lang = 'he'  # language identifier; matches the package directory spacy/he
+    # Per-language defaults: a subclass of the base Defaults with Hebrew-specific overrides.
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)  # copy so the base dict is not mutated
+        lex_attr_getters[LANG] = lambda text: 'he'  # LANG getter ignores its input and always yields 'he'
+        # TOKENIZER_EXCEPTIONS and STOP_WORDS arrive through the star import of .language_data.
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        stop_words = STOP_WORDS
diff --git a/spacy/he/language_data.py b/spacy/he/language_data.py
new file mode 100644
index 000000000..a4a657c33
--- /dev/null
+++ b/spacy/he/language_data.py
@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from .. import language_data as base
+from ..language_data import update_exc, strings_to_exc
+
+from .stop_words import STOP_WORDS
+
+
+STOP_WORDS = set(STOP_WORDS)  # rebinds the imported name to a new set built from it
+
+# Exceptions start from the base emoticons; update_exc is then handed the base abbreviations.
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))  # presumably merges in place - confirm
+
+# Names exposed to `from .language_data import *`.
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/he/stop_words.py b/spacy/he/stop_words.py
new file mode 100644
index 000000000..2914fa0d5
--- /dev/null
+++ b/spacy/he/stop_words.py
@@ -0,0 +1,226 @@
+# encoding: utf8
+from __future__ import unicode_literals
+# Hebrew stop words; .split() cuts on any whitespace, so multi-word lines add their individual words.
+STOP_WORDS = set("""
+אני
+את
+אתה
+אנחנו
+אתן
+אתם
+הם
+הן
+היא
+הוא
+שלי
+שלו
+שלך
+שלה
+שלנו
+שלכם
+שלכן
+שלהם
+שלהן
+לי
+לו
+לה
+לנו
+לכם
+לכן
+להם
+להן
+אותה
+אותו
+זה
+זאת
+אלה
+אלו
+תחת
+מתחת
+מעל
+בין
+עם
+עד
+נגד
+על
+אל
+מול
+של
+אצל
+כמו
+אחר
+אותו
+בלי
+לפני
+אחרי
+מאחורי
+עלי
+עליו
+עליה
+עליך
+עלינו
+עליכם
+עליכן
+עליהם
+עליהן
+כל
+כולם
+כולן
+כך
+ככה
+כזה
+זה
+זאת
+אותי
+אותה
+אותם
+אותך
+אותו
+אותן
+אותנו
+ואת
+את
+אתכם
+אתכן
+איתי
+איתו
+איתך
+איתה
+איתם
+איתן
+איתנו
+איתכם
+איתכן
+יהיה
+תהיה
+הייתי
+היתה
+היה
+להיות
+עצמי
+עצמו
+עצמה
+עצמם
+עצמן
+עצמנו
+עצמהם
+עצמהן
+מי
+מה
+איפה
+היכן
+במקום שבו
+אם
+לאן
+למקום שבו
+מקום בו
+איזה
+מהיכן
+איך
+כיצד
+באיזו מידה
+מתי
+בשעה ש
+כאשר
+כש
+למרות
+לפני
+אחרי
+מאיזו סיבה
+הסיבה שבגללה
+למה
+מדוע
+לאיזו תכלית
+כי
+יש
+אין
+אך
+מנין
+מאין
+מאיפה
+יכל
+יכלה
+יכלו
+יכול
+יכולה
+יכולים
+יכולות
+יוכלו
+יוכל
+מסוגל
+לא
+רק
+אולי
+אין
+לאו
+אי
+כלל
+נגד
+אם
+עם
+אל
+אלה
+אלו
+אף
+על
+מעל
+מתחת
+מצד
+בשביל
+לבין
+באמצע
+בתוך
+דרך
+מבעד
+באמצעות
+למעלה
+למטה
+מחוץ
+מן
+לעבר
+מכאן
+כאן
+הנה
+הרי
+פה
+שם
+אך
+ברם
+שוב
+אבל
+מבלי
+בלי
+מלבד
+רק
+בגלל
+מכיוון
+עד
+אשר
+ואילו
+למרות
+אם
+כמו
+כפי
+אז
+אחרי
+כן
+לכן
+לפיכך
+מאד
+עז
+מעט
+מעטים
+במידה
+שוב
+יותר
+מדי
+גם
+כן
+נו
+אחר
+אחרת
+אחרים
+אחרות
+אשר
+או
+""".split())
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index b6dcb905a..f049d2f91 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -11,6 +11,7 @@ from ..nl import Dutch
 from ..sv import Swedish
 from ..hu import Hungarian
 from ..fi import Finnish
+from ..he import Hebrew
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -73,6 +74,11 @@ def sv_tokenizer():
     return Swedish.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def he_tokenizer():  # Hebrew counterpart of the sv_tokenizer fixture above
+    return Hebrew.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
diff --git a/spacy/tests/he/__init__.py b/spacy/tests/he/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/he/test_tokenizer.py b/spacy/tests/he/test_tokenizer.py
new file mode 100644
index 000000000..a6c65805a
--- /dev/null
+++ b/spacy/tests/he/test_tokenizer.py
@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+# NOTE(review): despite the name, the only case below is a plain sentence without abbreviations.
+ABBREVIATION_TESTS = [
+    ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens):
+    """Check that tokenizing `text` gives `expected_tokens` once whitespace tokens are skipped."""
+    observed = [piece.text for piece in he_tokenizer(text) if not piece.is_space]
+    assert expected_tokens == observed
\ No newline at end of file