Mirror of https://github.com/explosion/spaCy.git (synced 2025-05-05 16:23:42 +03:00)

Merge branch 'pr/917'
Commit 89fe97ec43
@@ -56,7 +56,8 @@ def get_version(model, comp):
 
 
 def download_model(filename):
     util.print_msg("Downloading {f}".format(f=filename))
     download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m', 'pip', 'install', download_url],
+    subprocess.call([sys.executable, '-m',
+                     'pip', 'install', '--no-cache-dir', download_url],
                     env=os.environ.copy())
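The change above splits the pip invocation and adds `--no-cache-dir`, which forces pip to download the model archive fresh instead of reusing a possibly stale cached copy. A standalone sketch of the same call pattern, assuming a hypothetical `model_url` placeholder:

    # Standalone sketch of the call above; model_url is a hypothetical placeholder.
    import os
    import subprocess
    import sys

    model_url = 'https://example.com/en_core_web_sm-1.2.0.tar.gz'
    subprocess.call([sys.executable, '-m',
                     'pip', 'install', '--no-cache-dir', model_url],
                    env=os.environ.copy())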
18 spacy/he/__init__.py Normal file

@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from ..language import Language
+from ..attrs import LANG
+
+from .language_data import *
+
+
+class Hebrew(Language):
+    lang = 'he'
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'he'
+
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        stop_words = STOP_WORDS
17 spacy/he/language_data.py Normal file

@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from .. import language_data as base
+from ..language_data import update_exc, strings_to_exc
+
+from .stop_words import STOP_WORDS
+
+
+STOP_WORDS = set(STOP_WORDS)
+
+
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
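The module reuses the shared emoticon and abbreviation exceptions from `spacy.language_data`. A rough sketch of what the two helpers do, simplified for illustration (the real implementations live in `spacy.language_data` and use attribute IDs rather than plain string keys):

    # Simplified sketch of the helpers' intent, not spaCy's actual code.
    def strings_to_exc(strings):
        # Each exception string maps to a single token keeping its surface form.
        return {s: [{'ORTH': s}] for s in strings}

    def update_exc(exc, additions):
        # Merge additional exceptions into the base exception dict.
        exc.update(additions)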
226 spacy/he/stop_words.py Normal file

@@ -0,0 +1,226 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+STOP_WORDS = set("""
+אני
+את
+אתה
+אנחנו
+אתן
+אתם
+הם
+הן
+היא
+הוא
+שלי
+שלו
+שלך
+שלה
+שלנו
+שלכם
+שלכן
+שלהם
+שלהן
+לי
+לו
+לה
+לנו
+לכם
+לכן
+להם
+להן
+אותה
+אותו
+זה
+זאת
+אלה
+אלו
+תחת
+מתחת
+מעל
+בין
+עם
+עד
+נגר
+על
+אל
+מול
+של
+אצל
+כמו
+אחר
+אותו
+בלי
+לפני
+אחרי
+מאחורי
+עלי
+עליו
+עליה
+עליך
+עלינו
+עליכם
+לעיכן
+עליהם
+עליהן
+כל
+כולם
+כולן
+כך
+ככה
+כזה
+זה
+זות
+אותי
+אותה
+אותם
+אותך
+אותו
+אותן
+אותנו
+ואת
+את
+אתכם
+אתכן
+איתי
+איתו
+איתך
+איתה
+איתם
+איתן
+איתנו
+איתכם
+איתכן
+יהיה
+תהיה
+היתי
+היתה
+היה
+להיות
+עצמי
+עצמו
+עצמה
+עצמם
+עצמן
+עצמנו
+עצמהם
+עצמהן
+מי
+מה
+איפה
+היכן
+במקום שבו
+אם
+לאן
+למקום שבו
+מקום בו
+איזה
+מהיכן
+איך
+כיצד
+באיזו מידה
+מתי
+בשעה ש
+כאשר
+כש
+למרות
+לפני
+אחרי
+מאיזו סיבה
+הסיבה שבגללה
+למה
+מדוע
+לאיזו תכלית
+כי
+יש
+אין
+אך
+מנין
+מאין
+מאיפה
+יכל
+יכלה
+יכלו
+יכול
+יכולה
+יכולים
+יכולות
+יוכלו
+יוכל
+מסוגל
+לא
+רק
+אולי
+אין
+לאו
+אי
+כלל
+נגד
+אם
+עם
+אל
+אלה
+אלו
+אף
+על
+מעל
+מתחת
+מצד
+בשביל
+לבין
+באמצע
+בתוך
+דרך
+מבעד
+באמצעות
+למעלה
+למטה
+מחוץ
+מן
+לעבר
+מכאן
+כאן
+הנה
+הרי
+פה
+שם
+אך
+ברם
+שוב
+אבל
+מבלי
+בלי
+מלבד
+רק
+בגלל
+מכיוון
+עד
+אשר
+ואילו
+למרות
+אס
+כמו
+כפי
+אז
+אחרי
+כן
+לכן
+לפיכך
+מאד
+עז
+מעט
+מעטים
+במידה
+שוב
+יותר
+מדי
+גם
+כן
+נו
+אחר
+אחרת
+אחרים
+אחרות
+אשר
+או
+""".split())
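Note that `.split()` with no argument breaks on any whitespace, so multi-word entries in the list above, such as במקום שבו, contribute their individual words to the set rather than the phrase itself. A quick check of that behavior:

    # .split() breaks multi-word lines into individual words.
    words = set("""
    במקום שבו
    אם
    """.split())
    assert words == {'במקום', 'שבו', 'אם'}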
@@ -12,6 +12,8 @@ from ..sv import Swedish
 from ..hu import Hungarian
 from ..fi import Finnish
 from ..bn import Bengali
+from ..he import Hebrew
+
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -78,6 +80,11 @@ def bn_tokenizer():
     return Bengali.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def he_tokenizer():
+    return Hebrew.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
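With the fixture registered, Hebrew tests can request `he_tokenizer` by argument name, as the new test file below does. A hypothetical further test using it (the test name and sentence are illustrative, not part of this diff):

    # Hypothetical example of using the he_tokenizer fixture.
    def test_he_tokenizer_splits_on_whitespace(he_tokenizer):
        tokens = he_tokenizer('שלום עולם')  # 'hello world'
        assert [t.text for t in tokens] == ['שלום', 'עולם']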
0 spacy/tests/he/__init__.py Normal file

17 spacy/tests/he/test_tokenizer.py Normal file

@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+ABBREVIATION_TESTS = [
+    ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens):
+    tokens = he_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
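Further cases can be appended to `TESTCASES`; each `(text, expected_tokens)` pair becomes its own parametrized test. A hypothetical extra case, assuming the shared punctuation rules split a trailing comma off the first word:

    # Hypothetical additional case in the same (text, expected_tokens) format.
    EXTRA_TESTS = [
        ('שלום, עולם', ['שלום', ',', 'עולם']),
    ]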
113 spacy/tests/regression/test_issue910.py Normal file

@@ -0,0 +1,113 @@
+from __future__ import unicode_literals
+import json
+import os
+import random
+import contextlib
+import shutil
+import pytest
+import tempfile
+from pathlib import Path
+
+
+import pathlib
+from ...gold import GoldParse
+from ...pipeline import EntityRecognizer
+from ...en import English
+
+try:
+    unicode
+except NameError:
+    unicode = str
+
+
+@pytest.fixture
+def train_data():
+    return [
+        ["hey",[]],
+        ["howdy",[]],
+        ["hey there",[]],
+        ["hello",[]],
+        ["hi",[]],
+        ["i'm looking for a place to eat",[]],
+        ["i'm looking for a place in the north of town",[[31,36,"location"]]],
+        ["show me chinese restaurants",[[8,15,"cuisine"]]],
+        ["show me chines restaurants",[[8,14,"cuisine"]]],
+        ["yes",[]],
+        ["yep",[]],
+        ["yeah",[]],
+        ["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]],
+        ["bye",[]],["goodbye",[]],
+        ["good bye",[]],
+        ["stop",[]],
+        ["end",[]],
+        ["i am looking for an indian spot",[[20,26,"cuisine"]]],
+        ["search for restaurants",[]],
+        ["anywhere in the west",[[16,20,"location"]]],
+        ["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]],
+        ["indeed",[]],
+        ["that's right",[]],
+        ["ok",[]],
+        ["great",[]]
+    ]
+
+@pytest.fixture
+def additional_entity_types():
+    return ['cuisine', 'location']
+
+
+@contextlib.contextmanager
+def temp_save_model(model):
+    model_dir = Path(tempfile.mkdtemp())
+    # store the fine tuned model
+    with (model_dir / "config.json").open('w') as file_:
+        data = json.dumps(model.cfg)
+        if not isinstance(data, unicode):
+            data = data.decode('utf8')
+        file_.write(data)
+    model.model.dump((model_dir / 'model').as_posix())
+    yield model_dir
+    shutil.rmtree(model_dir.as_posix())
+
+
+
+@pytest.mark.xfail
+@pytest.mark.models
+def test_issue910(train_data, additional_entity_types):
+    '''Test that adding entities and resuming training works passably OK.
+    There are two issues here:
+
+    1) We have to re-add labels. This isn't very nice.
+    2) There's no way to set the learning rate for the weight update, so we
+        end up out-of-scale, causing it to learn too fast.
+    '''
+    nlp = English()
+    doc = nlp(u"I am looking for a restaurant in Berlin")
+    ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
+    # Fine tune the ner model
+    for entity_type in additional_entity_types:
+        if entity_type not in nlp.entity.cfg['actions']['1']:
+            nlp.entity.add_label(entity_type)
+
+    nlp.entity.learn_rate = 0.001
+    for itn in range(4):
+        random.shuffle(train_data)
+        for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
+            nlp.tagger(doc)
+            gold = GoldParse(doc, entities=entity_offsets)
+            loss = nlp.entity.update(doc, gold)
+
+    with temp_save_model(nlp.entity) as model_dir:
+        # Load the fine tuned model
+        loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab)
+
+    for entity_type in additional_entity_types:
+        if entity_type not in loaded_ner.cfg['actions']['1']:
+            loaded_ner.add_label(entity_type)
+
+    doc = nlp(u"I am looking for a restaurant in Berlin", entity=False)
+    nlp.tagger(doc)
+    loaded_ner(doc)
+
+    ents_after_train = [(ent.label_, ent.text) for ent in doc.ents]
+    assert ents_before_train == ents_after_train
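The `(start, end, label)` entries in `train_data` are character offsets into the raw text, which is the convention `GoldParse(doc, entities=...)` consumes here. A quick check of that convention in plain Python:

    # Entity offsets are character slices of the raw training text.
    text = "show me chinese restaurants"
    start, end, label = 8, 15, "cuisine"
    assert text[start:end] == "chinese"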
@@ -82,7 +82,7 @@ p
     | conjunction features out of the atomic predictors. Let's say you have
     | two atomic predictors asking, "What is the part-of-speech of the
     | previous token?", and "What is the part-of-speech of the previous
-    | previous token?". These ppredictors will introduce a number of features,
+    | previous token?". These predictors will introduce a number of features,
     | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
     | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
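As an illustration of the idea in the corrected passage (not code from the docs), a conjunction feature is simply the concatenation of atomic feature strings:

    # Illustrative only: conjoining two atomic predictor features.
    feature_a = 'Prev-pos=NN'
    feature_b = 'Prev-pos=VBZ'
    conjunction = feature_a + '&' + feature_b  # 'Prev-pos=NN&Prev-pos=VBZ'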