Mirror of https://github.com/explosion/spaCy.git (synced 2025-05-05 16:23:42 +03:00)

Merge branch 'pr/917'
Commit 89fe97ec43
@@ -56,7 +56,8 @@ def get_version(model, comp):
 
 
 def download_model(filename):
     util.print_msg("Downloading {f}".format(f=filename))
     download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m', 'pip', 'install', download_url],
+    subprocess.call([sys.executable, '-m',
+                     'pip', 'install', '--no-cache-dir', download_url],
                     env=os.environ.copy())
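The change above splits the pip invocation and adds `--no-cache-dir`, which forces pip to download the model archive fresh instead of reusing a possibly stale cached copy. A standalone sketch of the same call pattern, assuming a hypothetical `model_url` placeholder:

    # Standalone sketch of the call above; model_url is a hypothetical placeholder.
    import os
    import subprocess
    import sys

    model_url = 'https://example.com/en_core_web_sm-1.2.0.tar.gz'
    subprocess.call([sys.executable, '-m',
                     'pip', 'install', '--no-cache-dir', model_url],
                    env=os.environ.copy())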
18 spacy/he/__init__.py Normal file

@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from ..language import Language
+from ..attrs import LANG
+
+from .language_data import *
+
+
+class Hebrew(Language):
+    lang = 'he'
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'he'
+
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        stop_words = STOP_WORDS
17 spacy/he/language_data.py Normal file

@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from .. import language_data as base
+from ..language_data import update_exc, strings_to_exc
+
+from .stop_words import STOP_WORDS
+
+
+STOP_WORDS = set(STOP_WORDS)
+
+
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
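The module reuses the shared emoticon and abbreviation exceptions from `spacy.language_data`. A rough sketch of what the two helpers do, simplified for illustration (the real implementations live in `spacy.language_data` and use attribute IDs rather than plain string keys):

    # Simplified sketch of the helpers' intent, not spaCy's actual code.
    def strings_to_exc(strings):
        # Each exception string maps to a single token keeping its surface form.
        return {s: [{'ORTH': s}] for s in strings}

    def update_exc(exc, additions):
        # Merge additional exceptions into the base exception dict.
        exc.update(additions)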
226 spacy/he/stop_words.py Normal file

@@ -0,0 +1,226 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+STOP_WORDS = set("""
+אני
+את
+אתה
+אנחנו
+אתן
+אתם
+הם
+הן
+היא
+הוא
+שלי
+שלו
+שלך
+שלה
+שלנו
+שלכם
+שלכן
+שלהם
+שלהן
+לי
+לו
+לה
+לנו
+לכם
+לכן
+להם
+להן
+אותה
+אותו
+זה
+זאת
+אלה
+אלו
+תחת
+מתחת
+מעל
+בין
+עם
+עד
+נגר
+על
+אל
+מול
+של
+אצל
+כמו
+אחר
+אותו
+בלי
+לפני
+אחרי
+מאחורי
+עלי
+עליו
+עליה
+עליך
+עלינו
+עליכם
+לעיכן
+עליהם
+עליהן
+כל
+כולם
+כולן
+כך
+ככה
+כזה
+זה
+זות
+אותי
+אותה
+אותם
+אותך
+אותו
+אותן
+אותנו
+ואת
+את
+אתכם
+אתכן
+איתי
+איתו
+איתך
+איתה
+איתם
+איתן
+איתנו
+איתכם
+איתכן
+יהיה
+תהיה
+היתי
+היתה
+היה
+להיות
+עצמי
+עצמו
+עצמה
+עצמם
+עצמן
+עצמנו
+עצמהם
+עצמהן
+מי
+מה
+איפה
+היכן
+במקום שבו
+אם
+לאן
+למקום שבו
+מקום בו
+איזה
+מהיכן
+איך
+כיצד
+באיזו מידה
+מתי
+בשעה ש
+כאשר
+כש
+למרות
+לפני
+אחרי
+מאיזו סיבה
+הסיבה שבגללה
+למה
+מדוע
+לאיזו תכלית
+כי
+יש
+אין
+אך
+מנין
+מאין
+מאיפה
+יכל
+יכלה
+יכלו
+יכול
+יכולה
+יכולים
+יכולות
+יוכלו
+יוכל
+מסוגל
+לא
+רק
+אולי
+אין
+לאו
+אי
+כלל
+נגד
+אם
+עם
+אל
+אלה
+אלו
+אף
+על
+מעל
+מתחת
+מצד
+בשביל
+לבין
+באמצע
+בתוך
+דרך
+מבעד
+באמצעות
+למעלה
+למטה
+מחוץ
+מן
+לעבר
+מכאן
+כאן
+הנה
+הרי
+פה
+שם
+אך
+ברם
+שוב
+אבל
+מבלי
+בלי
+מלבד
+רק
+בגלל
+מכיוון
+עד
+אשר
+ואילו
+למרות
+אס
+כמו
+כפי
+אז
+אחרי
+כן
+לכן
+לפיכך
+מאד
+עז
+מעט
+מעטים
+במידה
+שוב
+יותר
+מדי
+גם
+כן
+נו
+אחר
+אחרת
+אחרים
+אחרות
+אשר
+או
+""".split())
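Note that `.split()` with no argument breaks on any whitespace, so multi-word entries in the list above, such as במקום שבו, contribute their individual words to the set rather than the phrase itself. A quick check of that behavior:

    # .split() breaks multi-word lines into individual words.
    words = set("""
    במקום שבו
    אם
    """.split())
    assert words == {'במקום', 'שבו', 'אם'}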
@@ -12,6 +12,8 @@ from ..sv import Swedish
 from ..hu import Hungarian
 from ..fi import Finnish
 from ..bn import Bengali
+from ..he import Hebrew
+
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -78,6 +80,11 @@ def bn_tokenizer():
     return Bengali.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def he_tokenizer():
+    return Hebrew.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
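With the fixture registered, Hebrew tests can request `he_tokenizer` by argument name, as the new test file below does. A hypothetical further test using it (the test name and sentence are illustrative, not part of this diff):

    # Hypothetical example of using the he_tokenizer fixture.
    def test_he_tokenizer_splits_on_whitespace(he_tokenizer):
        tokens = he_tokenizer('שלום עולם')  # 'hello world'
        assert [t.text for t in tokens] == ['שלום', 'עולם']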
0 spacy/tests/he/__init__.py Normal file

17 spacy/tests/he/test_tokenizer.py Normal file

@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+ABBREVIATION_TESTS = [
+    ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens):
+    tokens = he_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
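Further cases can be appended to `TESTCASES`; each `(text, expected_tokens)` pair becomes its own parametrized test. A hypothetical extra case, assuming the shared punctuation rules split a trailing comma off the first word:

    # Hypothetical additional case in the same (text, expected_tokens) format.
    EXTRA_TESTS = [
        ('שלום, עולם', ['שלום', ',', 'עולם']),
    ]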
113 spacy/tests/regression/test_issue910.py Normal file

@@ -0,0 +1,113 @@
+from __future__ import unicode_literals
+import json
+import os
+import random
+import contextlib
+import shutil
+import pytest
+import tempfile
+from pathlib import Path
+
+
+import pathlib
+from ...gold import GoldParse
+from ...pipeline import EntityRecognizer
+from ...en import English
+
+try:
+    unicode
+except NameError:
+    unicode = str
+
+
+@pytest.fixture
+def train_data():
+    return [
+        ["hey",[]],
+        ["howdy",[]],
+        ["hey there",[]],
+        ["hello",[]],
+        ["hi",[]],
+        ["i'm looking for a place to eat",[]],
+        ["i'm looking for a place in the north of town",[[31,36,"location"]]],
+        ["show me chinese restaurants",[[8,15,"cuisine"]]],
+        ["show me chines restaurants",[[8,14,"cuisine"]]],
+        ["yes",[]],
+        ["yep",[]],
+        ["yeah",[]],
+        ["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]],
+        ["bye",[]],["goodbye",[]],
+        ["good bye",[]],
+        ["stop",[]],
+        ["end",[]],
+        ["i am looking for an indian spot",[[20,26,"cuisine"]]],
+        ["search for restaurants",[]],
+        ["anywhere in the west",[[16,20,"location"]]],
+        ["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]],
+        ["indeed",[]],
+        ["that's right",[]],
+        ["ok",[]],
+        ["great",[]]
+    ]
+
+@pytest.fixture
+def additional_entity_types():
+    return ['cuisine', 'location']
+
+
+@contextlib.contextmanager
+def temp_save_model(model):
+    model_dir = Path(tempfile.mkdtemp())
+    # store the fine tuned model
+    with (model_dir / "config.json").open('w') as file_:
+        data = json.dumps(model.cfg)
+        if not isinstance(data, unicode):
+            data = data.decode('utf8')
+        file_.write(data)
+    model.model.dump((model_dir / 'model').as_posix())
+    yield model_dir
+    shutil.rmtree(model_dir.as_posix())
+
+
+
+@pytest.mark.xfail
+@pytest.mark.models
+def test_issue910(train_data, additional_entity_types):
+    '''Test that adding entities and resuming training works passably OK.
+    There are two issues here:
+
+    1) We have to re-add labels. This isn't very nice.
+    2) There's no way to set the learning rate for the weight update, so we
+        end up out-of-scale, causing it to learn too fast.
+    '''
+    nlp = English()
+    doc = nlp(u"I am looking for a restaurant in Berlin")
+    ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
+    # Fine tune the ner model
+    for entity_type in additional_entity_types:
+        if entity_type not in nlp.entity.cfg['actions']['1']:
+            nlp.entity.add_label(entity_type)
+
+    nlp.entity.learn_rate = 0.001
+    for itn in range(4):
+        random.shuffle(train_data)
+        for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
+            nlp.tagger(doc)
+            gold = GoldParse(doc, entities=entity_offsets)
+            loss = nlp.entity.update(doc, gold)
+
+    with temp_save_model(nlp.entity) as model_dir:
+        # Load the fine tuned model
+        loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab)
+
+    for entity_type in additional_entity_types:
+        if entity_type not in loaded_ner.cfg['actions']['1']:
+            loaded_ner.add_label(entity_type)
+
+    doc = nlp(u"I am looking for a restaurant in Berlin", entity=False)
+    nlp.tagger(doc)
+    loaded_ner(doc)
+
+    ents_after_train = [(ent.label_, ent.text) for ent in doc.ents]
+    assert ents_before_train == ents_after_train
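The `(start, end, label)` entries in `train_data` are character offsets into the raw text, which is the convention `GoldParse(doc, entities=...)` consumes here. A quick check of that convention in plain Python:

    # Entity offsets are character slices of the raw training text.
    text = "show me chinese restaurants"
    start, end, label = 8, 15, "cuisine"
    assert text[start:end] == "chinese"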
@@ -82,7 +82,7 @@ p
     | conjunction features out of the atomic predictors. Let's say you have
     | two atomic predictors asking, "What is the part-of-speech of the
     | previous token?", and "What is the part-of-speech of the previous
-    | previous token?". These ppredictors will introduce a number of features,
+    | previous token?". These predictors will introduce a number of features,
     | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
     | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
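As an illustration of the idea in the corrected passage (not code from the docs), a conjunction feature is simply the concatenation of atomic feature strings:

    # Illustrative only: conjoining two atomic predictor features.
    feature_a = 'Prev-pos=NN'
    feature_b = 'Prev-pos=VBZ'
    conjunction = feature_a + '&' + feature_b  # 'Prev-pos=NN&Prev-pos=VBZ'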