mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Merge branch 'feature/language-data-config' of https://github.com/explosion/spaCy into feature/language-data-config
This commit is contained in:
commit
44a0b072e0
|
@ -1,5 +1,5 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
|
|
|
@ -3,6 +3,7 @@ from thinc.api import Config
|
|||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tag_map import TAG_MAP
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...compat import copy_reg
|
||||
|
@ -64,6 +65,7 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
|
||||
class KoreanDefaults(Language.Defaults):
|
||||
config = Config().from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
|
||||
|
||||
class MalayalamDefaults(Language.Defaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
|
||||
# Lemma data note:
|
||||
|
@ -14,6 +15,7 @@ class RomanianDefaults(Language.Defaults):
|
|||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
|
|
|
@ -151,6 +151,6 @@ for orth in ABBREVIATIONS:
|
|||
# Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
|
||||
# should be tokenized as two separate tokens.
|
||||
for orth in ["i", "m"]:
|
||||
_exc[orth + "."] = [{ORTH: orth, NORM: orth, NORM: orth}, {ORTH: "."}]
|
||||
_exc[orth + "."] = [{ORTH: orth, NORM: orth}, {ORTH: "."}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language
|
||||
|
||||
|
||||
class TurkishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user