Mirror of https://github.com/explosion/spaCy.git
Commit 89fe97ec43: Merge branch 'pr/917'
@@ -56,7 +56,8 @@ def get_version(model, comp):
 def download_model(filename):
     util.print_msg("Downloading {f}".format(f=filename))
     download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m', 'pip', 'install', download_url],
+    subprocess.call([sys.executable, '-m',
+                     'pip', 'install', '--no-cache-dir', download_url],
                     env=os.environ.copy())
 
 
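For context: the change splits the pip invocation across two lines and adds --no-cache-dir, so pip re-downloads the model archive instead of reusing a possibly stale cached copy. A standalone sketch of the resulting call (the model URL below is a hypothetical placeholder, not part of the diff):

    import os
    import subprocess
    import sys

    # Hypothetical model archive URL, for illustration only.
    download_url = 'https://example.com/models/en_core_web_sm.tar.gz'
    # --no-cache-dir makes pip fetch the archive fresh each time.
    subprocess.call([sys.executable, '-m',
                     'pip', 'install', '--no-cache-dir', download_url],
                    env=os.environ.copy())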
spacy/he/__init__.py  (new file, 18 lines)
@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from ..language import Language
+from ..attrs import LANG
+
+from .language_data import *
+
+
+class Hebrew(Language):
+    lang = 'he'
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'he'
+
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        stop_words = STOP_WORDS
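A minimal usage sketch of the new language class, mirroring the tokenizer fixture and test added further down (this assumes the spaCy 1.x API of this commit; the sample sentence reads "Python is a dynamic programming language"):

    from spacy.he import Hebrew

    tokenizer = Hebrew.Defaults.create_tokenizer()
    tokens = tokenizer('פייתון היא שפת תכנות דינמית')
    print([t.text for t in tokens])
    # ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית']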
spacy/he/language_data.py  (new file, 17 lines)
@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from .. import language_data as base
+from ..language_data import update_exc, strings_to_exc
+
+from .stop_words import STOP_WORDS
+
+
+STOP_WORDS = set(STOP_WORDS)
+
+
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
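For context, the two helpers imported from ..language_data turn the shared emoticon and abbreviation string lists into tokenizer-exception entries and merge them. A simplified sketch of their behaviour, written from the spaCy 1.x internals as I understand them (the real update_exc also validates entries before merging, so treat this as an approximation):

    from spacy.attrs import ORTH

    def strings_to_exc(orths):
        # Each literal string becomes a single-token exception entry.
        return {orth: [{ORTH: orth}] for orth in orths}

    def update_exc(exc, additions):
        # Merge new entries into an existing exception table in place.
        for orth, token_attrs in additions.items():
            exc[orth] = token_attrs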
spacy/he/stop_words.py  (new file, 226 lines)
@@ -0,0 +1,226 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+STOP_WORDS = set("""
+אני
+את
+אתה
+אנחנו
+אתן
+אתם
+הם
+הן
+היא
+הוא
+שלי
+שלו
+שלך
+שלה
+שלנו
+שלכם
+שלכן
+שלהם
+שלהן
+לי
+לו
+לה
+לנו
+לכם
+לכן
+להם
+להן
+אותה
+אותו
+זה
+זאת
+אלה
+אלו
+תחת
+מתחת
+מעל
+בין
+עם
+עד
+נגר
+על
+אל
+מול
+של
+אצל
+כמו
+אחר
+אותו
+בלי
+לפני
+אחרי
+מאחורי
+עלי
+עליו
+עליה
+עליך
+עלינו
+עליכם
+לעיכן
+עליהם
+עליהן
+כל
+כולם
+כולן
+כך
+ככה
+כזה
+זה
+זות
+אותי
+אותה
+אותם
+אותך
+אותו
+אותן
+אותנו
+ואת
+את
+אתכם
+אתכן
+איתי
+איתו
+איתך
+איתה
+איתם
+איתן
+איתנו
+איתכם
+איתכן
+יהיה
+תהיה
+היתי
+היתה
+היה
+להיות
+עצמי
+עצמו
+עצמה
+עצמם
+עצמן
+עצמנו
+עצמהם
+עצמהן
+מי
+מה
+איפה
+היכן
+במקום שבו
+אם
+לאן
+למקום שבו
+מקום בו
+איזה
+מהיכן
+איך
+כיצד
+באיזו מידה
+מתי
+בשעה ש
+כאשר
+כש
+למרות
+לפני
+אחרי
+מאיזו סיבה
+הסיבה שבגללה
+למה
+מדוע
+לאיזו תכלית
+כי
+יש
+אין
+אך
+מנין
+מאין
+מאיפה
+יכל
+יכלה
+יכלו
+יכול
+יכולה
+יכולים
+יכולות
+יוכלו
+יוכל
+מסוגל
+לא
+רק
+אולי
+אין
+לאו
+אי
+כלל
+נגד
+אם
+עם
+אל
+אלה
+אלו
+אף
+על
+מעל
+מתחת
+מצד
+בשביל
+לבין
+באמצע
+בתוך
+דרך
+מבעד
+באמצעות
+למעלה
+למטה
+מחוץ
+מן
+לעבר
+מכאן
+כאן
+הנה
+הרי
+פה
+שם
+אך
+ברם
+שוב
+אבל
+מבלי
+בלי
+מלבד
+רק
+בגלל
+מכיוון
+עד
+אשר
+ואילו
+למרות
+אס
+כמו
+כפי
+אז
+אחרי
+כן
+לכן
+לפיכך
+מאד
+עז
+מעט
+מעטים
+במידה
+שוב
+יותר
+מדי
+גם
+כן
+נו
+אחר
+אחרת
+אחרים
+אחרות
+אשר
+או
+""".split())
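Because the class definition above assigns this set to Hebrew.Defaults.stop_words, membership can be checked directly. A quick sanity check (a sketch, not part of the diff; English glosses in comments):

    from spacy.he import Hebrew

    assert 'אני' in Hebrew.Defaults.stop_words        # 'I'
    assert 'פייתון' not in Hebrew.Defaults.stop_words  # 'Python'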
@@ -12,6 +12,8 @@ from ..sv import Swedish
 from ..hu import Hungarian
 from ..fi import Finnish
 from ..bn import Bengali
+from ..he import Hebrew
+
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -77,6 +79,11 @@ def sv_tokenizer():
 def bn_tokenizer():
     return Bengali.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def he_tokenizer():
+    return Hebrew.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
spacy/tests/he/__init__.py  (new file, 0 lines)
spacy/tests/he/test_tokenizer.py  (new file, 17 lines)
@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+ABBREVIATION_TESTS = [
+    ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens):
+    tokens = he_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
spacy/tests/regression/test_issue910.py  (new file, 113 lines)
@@ -0,0 +1,113 @@
+from __future__ import unicode_literals
+import json
+import os
+import random
+import contextlib
+import shutil
+import pytest
+import tempfile
+from pathlib import Path
+
+
+import pathlib
+from ...gold import GoldParse
+from ...pipeline import EntityRecognizer
+from ...en import English
+
+try:
+    unicode
+except NameError:
+    unicode = str
+
+
+@pytest.fixture
+def train_data():
+    return [
+        ["hey",[]],
+        ["howdy",[]],
+        ["hey there",[]],
+        ["hello",[]],
+        ["hi",[]],
+        ["i'm looking for a place to eat",[]],
+        ["i'm looking for a place in the north of town",[[31,36,"location"]]],
+        ["show me chinese restaurants",[[8,15,"cuisine"]]],
+        ["show me chines restaurants",[[8,14,"cuisine"]]],
+        ["yes",[]],
+        ["yep",[]],
+        ["yeah",[]],
+        ["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]],
+        ["bye",[]],["goodbye",[]],
+        ["good bye",[]],
+        ["stop",[]],
+        ["end",[]],
+        ["i am looking for an indian spot",[[20,26,"cuisine"]]],
+        ["search for restaurants",[]],
+        ["anywhere in the west",[[16,20,"location"]]],
+        ["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]],
+        ["indeed",[]],
+        ["that's right",[]],
+        ["ok",[]],
+        ["great",[]]
+    ]
+
+@pytest.fixture
+def additional_entity_types():
+    return ['cuisine', 'location']
+
+
+@contextlib.contextmanager
+def temp_save_model(model):
+    model_dir = Path(tempfile.mkdtemp())
+    # store the fine tuned model
+    with (model_dir / "config.json").open('w') as file_:
+        data = json.dumps(model.cfg)
+        if not isinstance(data, unicode):
+            data = data.decode('utf8')
+        file_.write(data)
+    model.model.dump((model_dir / 'model').as_posix())
+    yield model_dir
+    shutil.rmtree(model_dir.as_posix())
+
+
+
+@pytest.mark.xfail
+@pytest.mark.models
+def test_issue910(train_data, additional_entity_types):
+    '''Test that adding entities and resuming training works passably OK.
+    There are two issues here:
+
+    1) We have to readd labels. This isn't very nice.
+    2) There's no way to set the learning rate for the weight update, so we
+        end up out-of-scale, causing it to learn too fast.
+    '''
+    nlp = English()
+    doc = nlp(u"I am looking for a restaurant in Berlin")
+    ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
+    # Fine tune the ner model
+    for entity_type in additional_entity_types:
+        if entity_type not in nlp.entity.cfg['actions']['1']:
+            nlp.entity.add_label(entity_type)
+
+    nlp.entity.learn_rate = 0.001
+    for itn in range(4):
+        random.shuffle(train_data)
+        for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
+            nlp.tagger(doc)
+            gold = GoldParse(doc, entities=entity_offsets)
+            loss = nlp.entity.update(doc, gold)
+
+    with temp_save_model(nlp.entity) as model_dir:
+        # Load the fine tuned model
+        loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab)
+
+        for entity_type in additional_entity_types:
+            if entity_type not in loaded_ner.cfg['actions']['1']:
+                loaded_ner.add_label(entity_type)
+
+        doc = nlp(u"I am looking for a restaurant in Berlin", entity=False)
+        nlp.tagger(doc)
+        loaded_ner(doc)
+
+        ents_after_train = [(ent.label_, ent.text) for ent in doc.ents]
+        assert ents_before_train == ents_after_train
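The annotations in train_data are [start, end, label] character spans with an exclusive end offset, the format consumed by GoldParse(doc, entities=entity_offsets). A quick check against one of the examples above:

    text = "i'm looking for a place in the north of town"
    start, end, label = 31, 36, "location"
    assert text[start:end] == 'north'  # end offset is exclusive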
@@ -82,7 +82,7 @@ p
     | conjunction features out of the atomic predictors. Let's say you have
     | two atomic predictors asking, "What is the part-of-speech of the
     | previous token?", and "What is the part-of-speech of the previous
-    | previous token?". These ppredictors will introduce a number of features,
+    | previous token?". These predictors will introduce a number of features,
     | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
     | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
 
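As a toy illustration of the passage above (plain Python, not spaCy's actual feature machinery): two atomic predictor values are conjoined into a single feature string, which lets the model weight the combination separately from its parts:

    atomic_features = ['Prev-pos=NN', 'Prev-pos=VBZ']
    conjunction_feature = '&'.join(atomic_features)
    assert conjunction_feature == 'Prev-pos=NN&Prev-pos=VBZ'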