From a77c9fc60dcad00198731703cd33c72a2812737c Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 8 May 2017 15:49:28 +0200 Subject: [PATCH] Reorganise Hebrew language data --- spacy/he/__init__.py | 16 +++++++++------- spacy/he/stop_words.py | 1 + 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/spacy/he/__init__.py b/spacy/he/__init__.py index 9426247bd..839d174a0 100644 --- a/spacy/he/__init__.py +++ b/spacy/he/__init__.py @@ -1,10 +1,12 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function +# coding: utf8 +from __future__ import unicode_literals +from .stop_words import STOP_WORDS + +from ..language_data import BASE_EXCEPTIONS from ..language import Language from ..attrs import LANG - -from .language_data import * +from ..util import update_exc class Hebrew(Language): @@ -14,8 +16,8 @@ class Hebrew(Language): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'he' - tokenizer_exceptions = TOKENIZER_EXCEPTIONS - stop_words = STOP_WORDS + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + stop_words = set(STOP_WORDS) -EXPORT = Hebrew \ No newline at end of file +__all__ = ['Hebrew'] diff --git a/spacy/he/stop_words.py b/spacy/he/stop_words.py index 2914fa0d5..329c8847a 100644 --- a/spacy/he/stop_words.py +++ b/spacy/he/stop_words.py @@ -1,6 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals + STOP_WORDS = set(""" אני את