diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 446de4a37..56dbd5264 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -56,7 +56,8 @@ def get_version(model, comp): def download_model(filename): util.print_msg("Downloading {f}".format(f=filename)) download_url = about.__download_url__ + '/' + filename - subprocess.call([sys.executable, '-m', 'pip', 'install', download_url], + subprocess.call([sys.executable, '-m', + 'pip', 'install', '--no-cache-dir', download_url], env=os.environ.copy()) diff --git a/spacy/he/__init__.py b/spacy/he/__init__.py new file mode 100644 index 000000000..a3e86ed73 --- /dev/null +++ b/spacy/he/__init__.py @@ -0,0 +1,18 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from ..language import Language +from ..attrs import LANG + +from .language_data import * + + +class Hebrew(Language): + lang = 'he' + + class Defaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'he' + + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS diff --git a/spacy/he/language_data.py b/spacy/he/language_data.py new file mode 100644 index 000000000..a4a657c33 --- /dev/null +++ b/spacy/he/language_data.py @@ -0,0 +1,17 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS + + +STOP_WORDS = set(STOP_WORDS) + + +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/he/stop_words.py b/spacy/he/stop_words.py new file mode 100644 index 000000000..2914fa0d5 --- /dev/null +++ b/spacy/he/stop_words.py @@ -0,0 +1,226 @@ +# encoding: utf8 +from __future__ import unicode_literals + +STOP_WORDS = set(""" +אני +את +אתה +אנחנו +אתן +אתם +הם +הן +היא +הוא +שלי +שלו +שלך +שלה +שלנו +שלכם +שלכן +שלהם +שלהן +לי +לו +לה +לנו +לכם +לכן +להם +להן +אותה +אותו +זה +זאת +אלה +אלו +תחת +מתחת +מעל +בין +עם +עד +נגר +על +אל +מול +של +אצל +כמו +אחר +אותו +בלי +לפני +אחרי +מאחורי +עלי +עליו +עליה +עליך +עלינו +עליכם +לעיכן +עליהם +עליהן +כל +כולם +כולן +כך +ככה +כזה +זה +זות +אותי +אותה +אותם +אותך +אותו +אותן +אותנו +ואת +את +אתכם +אתכן +איתי +איתו +איתך +איתה +איתם +איתן +איתנו +איתכם +איתכן +יהיה +תהיה +היתי +היתה +היה +להיות +עצמי +עצמו +עצמה +עצמם +עצמן +עצמנו +עצמהם +עצמהן +מי +מה +איפה +היכן +במקום שבו +אם +לאן +למקום שבו +מקום בו +איזה +מהיכן +איך +כיצד +באיזו מידה +מתי +בשעה ש +כאשר +כש +למרות +לפני +אחרי +מאיזו סיבה +הסיבה שבגללה +למה +מדוע +לאיזו תכלית +כי +יש +אין +אך +מנין +מאין +מאיפה +יכל +יכלה +יכלו +יכול +יכולה +יכולים +יכולות +יוכלו +יוכל +מסוגל +לא +רק +אולי +אין +לאו +אי +כלל +נגד +אם +עם +אל +אלה +אלו +אף +על +מעל +מתחת +מצד +בשביל +לבין +באמצע +בתוך +דרך +מבעד +באמצעות +למעלה +למטה +מחוץ +מן +לעבר +מכאן +כאן +הנה +הרי +פה +שם +אך +ברם +שוב +אבל +מבלי +בלי +מלבד +רק +בגלל +מכיוון +עד +אשר +ואילו +למרות +אס +כמו +כפי +אז +אחרי +כן +לכן +לפיכך +מאד +עז +מעט +מעטים +במידה +שוב +יותר +מדי +גם +כן +נו +אחר +אחרת +אחרים +אחרות +אשר +או +""".split()) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 850eaa4c2..b2627f96f 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -12,6 +12,8 @@ from ..sv import Swedish from ..hu import Hungarian from ..fi import Finnish from ..bn import Bengali +from ..he import Hebrew + from ..tokens import Doc from ..strings import StringStore from ..lemmatizer import Lemmatizer @@ -77,6 +79,11 @@ def sv_tokenizer(): def bn_tokenizer(): return Bengali.Defaults.create_tokenizer() + +@pytest.fixture +def he_tokenizer(): + return Hebrew.Defaults.create_tokenizer() + @pytest.fixture def stringstore(): diff --git a/spacy/tests/he/__init__.py b/spacy/tests/he/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/he/test_tokenizer.py b/spacy/tests/he/test_tokenizer.py new file mode 100644 index 000000000..a6c65805a --- /dev/null +++ b/spacy/tests/he/test_tokenizer.py @@ -0,0 +1,17 @@ +# encoding: utf8 +from __future__ import unicode_literals + +import pytest + +ABBREVIATION_TESTS = [ + ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית']) +] + +TESTCASES = ABBREVIATION_TESTS + + +@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens): + tokens = he_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list \ No newline at end of file diff --git a/spacy/tests/regression/test_issue910.py b/spacy/tests/regression/test_issue910.py new file mode 100644 index 000000000..9b2b2287b --- /dev/null +++ b/spacy/tests/regression/test_issue910.py @@ -0,0 +1,113 @@ +from __future__ import unicode_literals +import json +import os +import random +import contextlib +import shutil +import pytest +import tempfile +from pathlib import Path + + +import pathlib +from ...gold import GoldParse +from ...pipeline import EntityRecognizer +from ...en import English + +try: + unicode +except NameError: + unicode = str + + +@pytest.fixture +def train_data(): + return [ + ["hey",[]], + ["howdy",[]], + ["hey there",[]], + ["hello",[]], + ["hi",[]], + ["i'm looking for a place to eat",[]], + ["i'm looking for a place in the north of town",[[31,36,"location"]]], + ["show me chinese restaurants",[[8,15,"cuisine"]]], + ["show me chines restaurants",[[8,14,"cuisine"]]], + ["yes",[]], + ["yep",[]], + ["yeah",[]], + ["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]], + ["bye",[]],["goodbye",[]], + ["good bye",[]], + ["stop",[]], + ["end",[]], + ["i am looking for an indian spot",[[20,26,"cuisine"]]], + ["search for restaurants",[]], + ["anywhere in the west",[[16,20,"location"]]], + ["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]], + ["indeed",[]], + ["that's right",[]], + ["ok",[]], + ["great",[]] + ] + +@pytest.fixture +def additional_entity_types(): + return ['cuisine', 'location'] + + +@contextlib.contextmanager +def temp_save_model(model): + model_dir = Path(tempfile.mkdtemp()) + # store the fine tuned model + with (model_dir / "config.json").open('w') as file_: + data = json.dumps(model.cfg) + if not isinstance(data, unicode): + data = data.decode('utf8') + file_.write(data) + model.model.dump((model_dir / 'model').as_posix()) + yield model_dir + shutil.rmtree(model_dir.as_posix()) + + + +@pytest.mark.xfail +@pytest.mark.models +def test_issue910(train_data, additional_entity_types): + '''Test that adding entities and resuming training works passably OK. + There are two issues here: + + 1) We have to readd labels. This isn't very nice. + 2) There's no way to set the learning rate for the weight update, so we + end up out-of-scale, causing it to learn too fast. + ''' + nlp = English() + doc = nlp(u"I am looking for a restaurant in Berlin") + ents_before_train = [(ent.label_, ent.text) for ent in doc.ents] + # Fine tune the ner model + for entity_type in additional_entity_types: + if entity_type not in nlp.entity.cfg['actions']['1']: + nlp.entity.add_label(entity_type) + + nlp.entity.learn_rate = 0.001 + for itn in range(4): + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) + nlp.tagger(doc) + gold = GoldParse(doc, entities=entity_offsets) + loss = nlp.entity.update(doc, gold) + + with temp_save_model(nlp.entity) as model_dir: + # Load the fine tuned model + loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab) + + for entity_type in additional_entity_types: + if entity_type not in loaded_ner.cfg['actions']['1']: + loaded_ner.add_label(entity_type) + + doc = nlp(u"I am looking for a restaurant in Berlin", entity=False) + nlp.tagger(doc) + loaded_ner(doc) + + ents_after_train = [(ent.label_, ent.text) for ent in doc.ents] + assert ents_before_train == ents_after_train diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index da452ac83..39f524829 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -82,7 +82,7 @@ p | conjunction features out of the atomic predictors. Let's say you have | two atomic predictors asking, "What is the part-of-speech of the | previous token?", and "What is the part-of-speech of the previous - | previous token?". These ppredictors will introduce a number of features, + | previous token?". These predictors will introduce a number of features, | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].