spaCy/spacy/lang/grc/lex_attrs.py
jmyerston 993b0fab0e
Added ancient Greek language support (#8606)
* Add ancient Greek language support

Initial commit

* Contributor Agreement

* grc tokenizer test added and files formatted with black; unnecessary import removed

Co-Authored-By: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Commas in lists fixed. __init__.py added to test

* Update lex_attrs.py

* Update stop_words.py

* Update stop_words.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2021-07-15 10:27:17 +02:00

315 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from ...attrs import LIKE_NUM
_num_words = [
# CARDINALS
"εἷς",
"ἑνός",
"ἑνί",
"ἕνα",
"μία",
"μιᾶς",
"μιᾷ",
"μίαν",
"ἕν",
"δύο",
"δυοῖν",
"τρεῖς",
"τριῶν",
"τρισί",
"τρία",
"τέτταρες",
"τεττάρων",
"τέτταρσι",
"τέτταρα",
"τέτταρας",
"πέντε",
"ἕξ",
"ἑπτά",
"ὀκτώ",
"ἐννέα",
"δέκα",
"ἕνδεκα",
"δώδεκα",
"πεντεκαίδεκα",
"ἑκκαίδεκα",
"ἑπτακαίδεκα",
"ὀκτωκαίδεκα",
"ἐννεακαίδεκα",
"εἴκοσι",
"τριάκοντα",
"τετταράκοντα",
"πεντήκοντα",
"ἑξήκοντα",
"ἑβδομήκοντα",
"ὀγδοήκοντα",
"ἐνενήκοντα",
"ἑκατόν",
"διακόσιοι",
"διακοσίων",
"διακοσιᾶν",
"διακοσίους",
"διακοσίοις",
"διακόσια",
"διακόσιαι",
"διακοσίαις",
"διακοσίαισι",
"διηκόσιοι",
"διηκοσίων",
"διηκοσιέων",
"διακοσίας",
"διηκόσια",
"διηκόσιαι",
"διηκοσίας",
"τριακόσιοι",
"τριακοσίων",
"τριακοσιᾶν",
"τριακοσίους",
"τριακοσίοις",
"τριακόσια",
"τριακόσιαι",
"τριακοσίαις",
"τριακοσίαισι",
"τριακοσιέων",
"τριακοσίας",
"τριηκόσια",
"τριηκοσίας",
"τριηκόσιοι",
"τριηκοσίοισιν",
"τριηκοσίους",
"τριηκοσίων",
"τετρακόσιοι",
"τετρακοσίων",
"τετρακοσιᾶν",
"τετρακοσίους",
"τετρακοσίοις",
"τετρακόσια",
"τετρακόσιαι",
"τετρακοσίαις",
"τετρακοσίαισι",
"τετρακοσιέων",
"τετρακοσίας",
"πεντακόσιοι",
"πεντακοσίων",
"πεντακοσιᾶν",
"πεντακοσίους",
"πεντακοσίοις",
"πεντακόσια",
"πεντακόσιαι",
"πεντακοσίαις",
"πεντακοσίαισι",
"πεντακοσιέων",
"πεντακοσίας",
"ἑξακόσιοι",
"ἑξακοσίων",
"ἑξακοσιᾶν",
"ἑξακοσίους",
"ἑξακοσίοις",
"ἑξακόσια",
"ἑξακόσιαι",
"ἑξακοσίαις",
"ἑξακοσίαισι",
"ἑξακοσιέων",
"ἑξακοσίας",
"ἑπτακόσιοι",
"ἑπτακοσίων",
"ἑπτακοσιᾶν",
"ἑπτακοσίους",
"ἑπτακοσίοις",
"ἑπτακόσια",
"ἑπτακόσιαι",
"ἑπτακοσίαις",
"ἑπτακοσίαισι",
"ἑπτακοσιέων",
"ἑπτακοσίας",
"ὀκτακόσιοι",
"ὀκτακοσίων",
"ὀκτακοσιᾶν",
"ὀκτακοσίους",
"ὀκτακοσίοις",
"ὀκτακόσια",
"ὀκτακόσιαι",
"ὀκτακοσίαις",
"ὀκτακοσίαισι",
"ὀκτακοσιέων",
"ὀκτακοσίας",
"ἐνακόσιοι",
"ἐνακοσίων",
"ἐνακοσιᾶν",
"ἐνακοσίους",
"ἐνακοσίοις",
"ἐνακόσια",
"ἐνακόσιαι",
"ἐνακοσίαις",
"ἐνακοσίαισι",
"ἐνακοσιέων",
"ἐνακοσίας",
"χίλιοι",
"χιλίων",
"χιλιῶν",
"χιλίους",
"χιλίοις",
"χίλιαι",
"χιλίας",
"χιλίαις",
"χίλια",
"χίλι",
"δισχίλιοι",
"δισχιλίων",
"δισχιλιῶν",
"δισχιλίους",
"δισχιλίοις",
"δισχίλιαι",
"δισχιλίας",
"δισχιλίαις",
"δισχίλια",
"δισχίλι",
"τρισχίλιοι",
"τρισχιλίων",
"τρισχιλιῶν",
"τρισχιλίους",
"τρισχιλίοις",
"τρισχίλιαι",
"τρισχιλίας",
"τρισχιλίαις",
"τρισχίλια",
"τρισχίλι",
"μύριοι",
"μύριοί",
"μυρίων",
"μυρίοις",
"μυρίους",
"μύριαι",
"μυρίαις",
"μυρίας",
"μύρια",
"δισμύριοι",
"δισμύριοί",
"δισμυρίων",
"δισμυρίοις",
"δισμυρίους",
"δισμύριαι",
"δισμυρίαις",
"δισμυρίας",
"δισμύρια",
"δεκακισμύριοι",
"δεκακισμύριοί",
"δεκακισμυρίων",
"δεκακισμυρίοις",
"δεκακισμυρίους",
"δεκακισμύριαι",
"δεκακισμυρίαις",
"δεκακισμυρίας",
"δεκακισμύρια",
# ANCIENT GREEK NUMBERS (1-100)
"α",
"β",
"γ",
"δ",
"ε",
"ϛ",
"ζ",
"η",
"θ",
"ι",
"ια",
"ιβ",
"ιγ",
"ιδ",
"ιε",
"ιϛ",
"ιζ",
"ιη",
"ιθ",
"κ",
"κα",
"κβ",
"κγ",
"κδ",
"κε",
"κϛ",
"κζ",
"κη",
"κθ",
"λ",
"λα",
"λβ",
"λγ",
"λδ",
"λε",
"λϛ",
"λζ",
"λη",
"λθ",
"μ",
"μα",
"μβ",
"μγ",
"μδ",
"με",
"μϛ",
"μζ",
"μη",
"μθ",
"ν",
"να",
"νβ",
"νγ",
"νδ",
"νε",
"νϛ",
"νζ",
"νη",
"νθ",
"ξ",
"ξα",
"ξβ",
"ξγ",
"ξδ",
"ξε",
"ξϛ",
"ξζ",
"ξη",
"ξθ",
"ο",
"οα",
"οβ",
"ογ",
"οδ",
"οε",
"οϛ",
"οζ",
"οη",
"οθ",
"π",
"πα",
"πβ",
"πγ",
"πδ",
"πε",
"πϛ",
"πζ",
"πη",
"πθ",
"ϟ",
"ϟα",
"ϟβ",
"ϟγ",
"ϟδ",
"ϟε",
"ϟϛ",
"ϟζ",
"ϟη",
"ϟθ",
"ρ",
]
def like_num(text: str) -> bool:
    """Return whether ``text`` looks like a number.

    A token is number-like when it is written in digits (optionally with a
    leading sign, thousands/decimal separators, or as a simple ``a/b``
    fraction) or when its lowercased form is listed in ``_num_words``.

    text: The token text to check.
    RETURNS: True if the text resembles a number, False otherwise.
    """
    # Digit notation is examined on a cleaned-up copy so that the word-list
    # lookup below still sees the token exactly as it was written.
    numeric = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    numeric = numeric.replace(",", "").replace(".", "")
    if numeric.isdigit():
        return True
    if numeric.count("/") == 1:
        numerator, denominator = numeric.split("/")
        if numerator.isdigit() and denominator.isdigit():
            return True
    # Spelled-out number words and the letter-based numbers in the list.
    return text.lower() in _num_words
# Maps the LIKE_NUM attribute ID to the getter defined in this module.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}