Mirror of https://github.com/explosion/spaCy.git
Commit 89fe97ec43: Merge branch 'pr/917'
@@ -56,7 +56,8 @@ def get_version(model, comp):
 def download_model(filename):
     util.print_msg("Downloading {f}".format(f=filename))
     download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m', 'pip', 'install', download_url],
+    subprocess.call([sys.executable, '-m',
+                     'pip', 'install', '--no-cache-dir', download_url],
                     env=os.environ.copy())
 
 
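For context: the change splits the pip invocation across two lines and adds --no-cache-dir, so pip re-downloads the model archive instead of reusing a possibly stale cached copy. A standalone sketch of the resulting call (the model URL below is a hypothetical placeholder, not part of the diff):

    import os
    import subprocess
    import sys

    # Hypothetical model archive URL, for illustration only.
    download_url = 'https://example.com/models/en_core_web_sm.tar.gz'
    # --no-cache-dir makes pip fetch the archive fresh each time.
    subprocess.call([sys.executable, '-m',
                     'pip', 'install', '--no-cache-dir', download_url],
                    env=os.environ.copy())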
spacy/he/__init__.py  (new file, 18 lines)
@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from ..language import Language
+from ..attrs import LANG
+
+from .language_data import *
+
+
+class Hebrew(Language):
+    lang = 'he'
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'he'
+
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        stop_words = STOP_WORDS
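A minimal usage sketch of the new language class, mirroring the tokenizer fixture and test added further down (this assumes the spaCy 1.x API of this commit; the sample sentence reads "Python is a dynamic programming language"):

    from spacy.he import Hebrew

    tokenizer = Hebrew.Defaults.create_tokenizer()
    tokens = tokenizer('פייתון היא שפת תכנות דינמית')
    print([t.text for t in tokens])
    # ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית']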
spacy/he/language_data.py  (new file, 17 lines)
@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from .. import language_data as base
+from ..language_data import update_exc, strings_to_exc
+
+from .stop_words import STOP_WORDS
+
+
+STOP_WORDS = set(STOP_WORDS)
+
+
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
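For context, the two helpers imported from ..language_data turn the shared emoticon and abbreviation string lists into tokenizer-exception entries and merge them. A simplified sketch of their behaviour, written from the spaCy 1.x internals as I understand them (the real update_exc also validates entries before merging, so treat this as an approximation):

    from spacy.attrs import ORTH

    def strings_to_exc(orths):
        # Each literal string becomes a single-token exception entry.
        return {orth: [{ORTH: orth}] for orth in orths}

    def update_exc(exc, additions):
        # Merge new entries into an existing exception table in place.
        for orth, token_attrs in additions.items():
            exc[orth] = token_attrs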
spacy/he/stop_words.py  (new file, 226 lines)
@@ -0,0 +1,226 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+STOP_WORDS = set("""
+אני
+את
+אתה
+אנחנו
+אתן
+אתם
+הם
+הן
+היא
+הוא
+שלי
+שלו
+שלך
+שלה
+שלנו
+שלכם
+שלכן
+שלהם
+שלהן
+לי
+לו
+לה
+לנו
+לכם
+לכן
+להם
+להן
+אותה
+אותו
+זה
+זאת
+אלה
+אלו
+תחת
+מתחת
+מעל
+בין
+עם
+עד
+נגר
+על
+אל
+מול
+של
+אצל
+כמו
+אחר
+אותו
+בלי
+לפני
+אחרי
+מאחורי
+עלי
+עליו
+עליה
+עליך
+עלינו
+עליכם
+לעיכן
+עליהם
+עליהן
+כל
+כולם
+כולן
+כך
+ככה
+כזה
+זה
+זות
+אותי
+אותה
+אותם
+אותך
+אותו
+אותן
+אותנו
+ואת
+את
+אתכם
+אתכן
+איתי
+איתו
+איתך
+איתה
+איתם
+איתן
+איתנו
+איתכם
+איתכן
+יהיה
+תהיה
+היתי
+היתה
+היה
+להיות
+עצמי
+עצמו
+עצמה
+עצמם
+עצמן
+עצמנו
+עצמהם
+עצמהן
+מי
+מה
+איפה
+היכן
+במקום שבו
+אם
+לאן
+למקום שבו
+מקום בו
+איזה
+מהיכן
+איך
+כיצד
+באיזו מידה
+מתי
+בשעה ש
+כאשר
+כש
+למרות
+לפני
+אחרי
+מאיזו סיבה
+הסיבה שבגללה
+למה
+מדוע
+לאיזו תכלית
+כי
+יש
+אין
+אך
+מנין
+מאין
+מאיפה
+יכל
+יכלה
+יכלו
+יכול
+יכולה
+יכולים
+יכולות
+יוכלו
+יוכל
+מסוגל
+לא
+רק
+אולי
+אין
+לאו
+אי
+כלל
+נגד
+אם
+עם
+אל
+אלה
+אלו
+אף
+על
+מעל
+מתחת
+מצד
+בשביל
+לבין
+באמצע
+בתוך
+דרך
+מבעד
+באמצעות
+למעלה
+למטה
+מחוץ
+מן
+לעבר
+מכאן
+כאן
+הנה
+הרי
+פה
+שם
+אך
+ברם
+שוב
+אבל
+מבלי
+בלי
+מלבד
+רק
+בגלל
+מכיוון
+עד
+אשר
+ואילו
+למרות
+אס
+כמו
+כפי
+אז
+אחרי
+כן
+לכן
+לפיכך
+מאד
+עז
+מעט
+מעטים
+במידה
+שוב
+יותר
+מדי
+גם
+כן
+נו
+אחר
+אחרת
+אחרים
+אחרות
+אשר
+או
+""".split())
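Because the class definition above assigns this set to Hebrew.Defaults.stop_words, membership can be checked directly. A quick sanity check (a sketch, not part of the diff; English glosses in comments):

    from spacy.he import Hebrew

    assert 'אני' in Hebrew.Defaults.stop_words        # 'I'
    assert 'פייתון' not in Hebrew.Defaults.stop_words  # 'Python'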
@@ -12,6 +12,8 @@ from ..sv import Swedish
 from ..hu import Hungarian
 from ..fi import Finnish
 from ..bn import Bengali
+from ..he import Hebrew
+
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -77,6 +79,11 @@ def sv_tokenizer():
 def bn_tokenizer():
     return Bengali.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def he_tokenizer():
+    return Hebrew.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
spacy/tests/he/__init__.py  (new file, 0 lines)
spacy/tests/he/test_tokenizer.py  (new file, 17 lines)
@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+ABBREVIATION_TESTS = [
+    ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens):
+    tokens = he_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
spacy/tests/regression/test_issue910.py  (new file, 113 lines)
@@ -0,0 +1,113 @@
+from __future__ import unicode_literals
+import json
+import os
+import random
+import contextlib
+import shutil
+import pytest
+import tempfile
+from pathlib import Path
+
+
+import pathlib
+from ...gold import GoldParse
+from ...pipeline import EntityRecognizer
+from ...en import English
+
+try:
+    unicode
+except NameError:
+    unicode = str
+
+
+@pytest.fixture
+def train_data():
+    return [
+        ["hey",[]],
+        ["howdy",[]],
+        ["hey there",[]],
+        ["hello",[]],
+        ["hi",[]],
+        ["i'm looking for a place to eat",[]],
+        ["i'm looking for a place in the north of town",[[31,36,"location"]]],
+        ["show me chinese restaurants",[[8,15,"cuisine"]]],
+        ["show me chines restaurants",[[8,14,"cuisine"]]],
+        ["yes",[]],
+        ["yep",[]],
+        ["yeah",[]],
+        ["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]],
+        ["bye",[]],["goodbye",[]],
+        ["good bye",[]],
+        ["stop",[]],
+        ["end",[]],
+        ["i am looking for an indian spot",[[20,26,"cuisine"]]],
+        ["search for restaurants",[]],
+        ["anywhere in the west",[[16,20,"location"]]],
+        ["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]],
+        ["indeed",[]],
+        ["that's right",[]],
+        ["ok",[]],
+        ["great",[]]
+    ]
+
+@pytest.fixture
+def additional_entity_types():
+    return ['cuisine', 'location']
+
+
+@contextlib.contextmanager
+def temp_save_model(model):
+    model_dir = Path(tempfile.mkdtemp())
+    # store the fine tuned model
+    with (model_dir / "config.json").open('w') as file_:
+        data = json.dumps(model.cfg)
+        if not isinstance(data, unicode):
+            data = data.decode('utf8')
+        file_.write(data)
+    model.model.dump((model_dir / 'model').as_posix())
+    yield model_dir
+    shutil.rmtree(model_dir.as_posix())
+
+
+
+@pytest.mark.xfail
+@pytest.mark.models
+def test_issue910(train_data, additional_entity_types):
+    '''Test that adding entities and resuming training works passably OK.
+    There are two issues here:
+
+    1) We have to readd labels. This isn't very nice.
+    2) There's no way to set the learning rate for the weight update, so we
+        end up out-of-scale, causing it to learn too fast.
+    '''
+    nlp = English()
+    doc = nlp(u"I am looking for a restaurant in Berlin")
+    ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
+    # Fine tune the ner model
+    for entity_type in additional_entity_types:
+        if entity_type not in nlp.entity.cfg['actions']['1']:
+            nlp.entity.add_label(entity_type)
+
+    nlp.entity.learn_rate = 0.001
+    for itn in range(4):
+        random.shuffle(train_data)
+        for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
+            nlp.tagger(doc)
+            gold = GoldParse(doc, entities=entity_offsets)
+            loss = nlp.entity.update(doc, gold)
+
+    with temp_save_model(nlp.entity) as model_dir:
+        # Load the fine tuned model
+        loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab)
+
+        for entity_type in additional_entity_types:
+            if entity_type not in loaded_ner.cfg['actions']['1']:
+                loaded_ner.add_label(entity_type)
+
+        doc = nlp(u"I am looking for a restaurant in Berlin", entity=False)
+        nlp.tagger(doc)
+        loaded_ner(doc)
+
+        ents_after_train = [(ent.label_, ent.text) for ent in doc.ents]
+        assert ents_before_train == ents_after_train
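The annotations in train_data are [start, end, label] character spans with an exclusive end offset, the format consumed by GoldParse(doc, entities=entity_offsets). A quick check against one of the examples above:

    text = "i'm looking for a place in the north of town"
    start, end, label = 31, 36, "location"
    assert text[start:end] == 'north'  # end offset is exclusive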
@@ -82,7 +82,7 @@ p
     | conjunction features out of the atomic predictors. Let's say you have
     | two atomic predictors asking, "What is the part-of-speech of the
     | previous token?", and "What is the part-of-speech of the previous
-    | previous token?". These ppredictors will introduce a number of features,
+    | previous token?". These predictors will introduce a number of features,
     | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
     | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
 
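As a toy illustration of the passage above (plain Python, not spaCy's actual feature machinery): two atomic predictor values are conjoined into a single feature string, which lets the model weight the combination separately from its parts:

    atomic_features = ['Prev-pos=NN', 'Prev-pos=VBZ']
    conjunction_feature = '&'.join(atomic_features)
    assert conjunction_feature == 'Prev-pos=NN&Prev-pos=VBZ'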