diff --git a/spacy/lang/en/punctuation.py b/spacy/lang/en/punctuation.py
index 67e3e80e5..5d3eb792e 100644
--- a/spacy/lang/en/punctuation.py
+++ b/spacy/lang/en/punctuation.py
@@ -1,5 +1,5 @@
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 
 _infixes = (
     LIST_ELLIPSES
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index d2af9c4b1..6197ab927 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -3,6 +3,7 @@ from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
@@ -64,6 +65,7 @@ class KoreanTokenizer(DummyTokenizer):
 
 class KoreanDefaults(Language.Defaults):
     config = Config().from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
 
diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py
index 166d0e061..cfad52261 100644
--- a/spacy/lang/ml/__init__.py
+++ b/spacy/lang/ml/__init__.py
@@ -1,8 +1,10 @@
 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 
 from ...language import Language
 
 
 class MalayalamDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py
index 74016d3e9..f0d8d8d31 100644
--- a/spacy/lang/ro/__init__.py
+++ b/spacy/lang/ro/__init__.py
@@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 
 # Lemma data note:
@@ -14,6 +15,7 @@ class RomanianDefaults(Language.Defaults):
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
+    lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
 
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index 64206f2f2..ce7db895a 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -151,6 +151,6 @@ for orth in ABBREVIATIONS:
 # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
 # should be tokenized as two separate tokens.
 for orth in ["i", "m"]:
-    _exc[orth + "."] = [{ORTH: orth, NORM: orth, NORM: orth}, {ORTH: "."}]
+    _exc[orth + "."] = [{ORTH: orth, NORM: orth}, {ORTH: "."}]
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index 70b277487..8bd0b93df 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -1,10 +1,12 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 
 
 class TurkishDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
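
A minimal sketch for sanity-checking the lex_attrs wiring, not part of the patch itself: it assumes spaCy v3's blank-pipeline API and that the Turkish lex_attrs number-word list includes "üç" ("three"). With LEX_ATTRS registered on TurkishDefaults, the language-specific like_num getter should now apply to Turkish tokens (Turkish is used here because, unlike Korean, it needs no external tokenizer dependency).

    import spacy

    # Blank pipeline: tokenizer plus language defaults, no trained components.
    nlp = spacy.blank("tr")
    doc = nlp("üç elma")  # "three apples"
    for token in doc:
        # Expected with the Turkish number-word list: "üç" True, "elma" False.
        print(token.text, token.like_num)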