mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-29 11:26:28 +03:00
3e8bc1272f
* add punctuation to grc

  Add support for special editorial punctuation that is common in ancient Greek texts. Ancient Greek texts, as found in digital and print form, have been largely edited by scholars. Restorations and improvements are normally marked with special characters that need to be handled properly by the tokenizer.
* add unit tests
* simplify regex
* move generic quotes to char classes
* rename unit test
* fix regex

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
23 lines
620 B
Python
23 lines
620 B
Python
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
from .stop_words import STOP_WORDS
|
|
from .lex_attrs import LEX_ATTRS
|
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
|
from ...language import Language, BaseDefaults
|
|
|
|
|
|
class AncientGreekDefaults(BaseDefaults):
    """Default language data for Ancient Greek.

    Binds the data imported from this package's sibling modules
    (``stop_words``, ``lex_attrs``, ``tokenizer_exceptions`` and
    ``punctuation``) to the attribute names declared on ``BaseDefaults``.
    """

    # Lexical data.
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS

    # Tokenizer data: exceptions plus prefix/suffix/infix punctuation rules.
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
|
|
|
|
|
|
class AncientGreek(Language):
    """``Language`` subclass for Ancient Greek, registered under ``"grc"``."""

    # Language data used by this class.
    Defaults = AncientGreekDefaults
    # Language code identifying this class.
    lang = "grc"
|
|
|
|
|
|
# Names exported by ``from <this module> import *``.
__all__ = [
    "AncientGreek",
]
|