spaCy/spacy/lang/grc/tokenizer_exceptions.py
jmyerston 993b0fab0e
Added ancient Greek language support (#8606)
* Add ancient Greek language support

Initial commit

* Contributor Agreement

* grc tokenizer test added  and files formatted with black, unnecessary import removed

Co-Authored-By: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Commas in lists fixed. `__init__.py` added to test

* Update lex_attrs.py

* Update stop_words.py

* Update stop_words.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2021-07-15 10:27:17 +02:00

116 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
# Elided (apostrophe-final) and grave-accented spellings of common
# prepositions, pronouns and particles, paired with the NORM string that
# every spelling in the group is mapped to.
_NORM_GROUPS = (
    ("από", ("᾽Απ'", "᾽ΑΠ'", "ἀφ'", "᾽Αφ", "ἀπὸ")),
    ("ἀλλά", ("᾽Αλλ'", "ἀλλ'", "ἀλλὰ")),
    ("παρά", ("παρ'", "Παρ'", "παρὰ", "παρ")),
    ("κατά", ("καθ'", "Καθ'", "κατ'", "Κατ'", "κατὰ")),
    ("επί", ("Ἐπ'", "ἐπ'", "ἐπὶ", "Εφ'", "εφ'")),
    ("διά", ("Δι'", "δι'", "διὰ")),
    ("ὑπό", ("Ὑπ'", "ὑπ'", "ὑφ'")),
    ("μετά", ("Μετ'", "μετ'", "μεθ'", "μετὰ")),
    ("με", ("Μ'", "μ'", "μέ", "μὲ")),
    ("σε", ("Σ'", "σ'", "σέ", "σὲ")),
    ("τε", ("Τ'", "τ'", "τέ", "τὲ")),
    ("δέ", ("Δ'", "δ'", "δὲ")),
)

# One single-token exception per spelling: ORTH keeps the text exactly as
# written, NORM carries the group's shared normalized form.
_exc = {
    surface: [{ORTH: surface, NORM: normalized}]
    for normalized, surfaces in _NORM_GROUPS
    for surface in surfaces
}
# Hand-written exceptions: grave-accent variants of frequent words and crasis
# forms (two words fused into one written word).
#
# Each key is the surface string; the value lists the sub-tokens it is split
# into. The ORTH pieces of an entry must concatenate back to the key exactly
# (spaCy's `update_exc` is documented to reject entries where they do not),
# so no ORTH may be empty. NORM gives the uncontracted word a piece stands
# for; a piece without NORM is already in its normal spelling.
#
# NOTE(review): the single-letter article/pronoun pieces ("ἁ", "ἅ", "ὦ", "ὥ",
# "ἡ") and their NORMs ("ὁ", "ἡ", "ἃ", "ὦ") follow the standard resolution of
# each crasis (e.g. ἁνήρ = ὁ ἀνήρ, ἁλήθεια = ἡ ἀλήθεια, ἁγώ = ἃ ἐγώ) — confirm
# against the reference corpus if a different reading is intended.
_other_exc = {
    # Grave-accented single words, normalized to the acute spelling.
    "μὲν": [{ORTH: "μὲν", NORM: "μέν"}],
    "μὴν": [{ORTH: "μὴν", NORM: "μήν"}],
    "τὴν": [{ORTH: "τὴν", NORM: "τήν"}],
    "τὸν": [{ORTH: "τὸν", NORM: "τόν"}],
    "καὶ": [{ORTH: "καὶ", NORM: "καί"}],
    # Crasis with καί.
    "καὐτός": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτός"}],
    "καὐτὸς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτὸς", NORM: "αὐτός"}],
    "κοὐ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "οὐ"}],
    "χἡ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἡ"}],
    "χοἱ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "οἱ"}],
    "χἱκετεύετε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἱκετεύετε"}],
    "κἀν": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀν", NORM: "ἐν"}],
    "κἀγὼ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γὼ", NORM: "ἐγώ"}],
    "κἀγώ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γώ", NORM: "ἐγώ"}],
    # Crasis with ἐγώ.
    "ἁγώ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γώ", NORM: "ἐγώ"}],
    "ἁγὼ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γὼ", NORM: "ἐγώ"}],
    "ἐγᾦδα": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦδα", NORM: "οἶδα"}],
    "ἐγᾦμαι": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦμαι", NORM: "οἶμαι"}],
    # More crasis with καί.
    "κἀς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀς", NORM: "ἐς"}],
    "κᾆτα": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ᾆτα", NORM: "εἶτα"}],
    "κεἰ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰ"}],
    "κεἰς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰς"}],
    "χὤτε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτε", NORM: "ὅτε"}],
    "χὤπως": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤπως", NORM: "ὅπως"}],
    "χὤτι": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτι", NORM: "ὅτι"}],
    "χὤταν": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤταν", NORM: "ὅταν"}],
    # Article or pronoun fused with a following word.
    "οὑμός": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μός", NORM: "ἐμός"}],
    "οὑμὸς": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μὸς", NORM: "ἐμός"}],
    # NORM "ἐμοί" is spelled with a Greek omicron (no Latin look-alike "o").
    "οὑμοί": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοί", NORM: "ἐμοί"}],
    "οὑμοὶ": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοὶ", NORM: "ἐμοί"}],
    "σοὔστι": [{ORTH: "σοὔ", NORM: "σοί"}, {ORTH: "στι", NORM: "ἐστι"}],
    "σοὐστί": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στί", NORM: "ἐστί"}],
    "σοὐστὶ": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στὶ", NORM: "ἐστί"}],
    "μοὖστι": [{ORTH: "μοὖ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
    "μοὔστι": [{ORTH: "μοὔ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
    "τοὔνομα": [{ORTH: "τοὔ", NORM: "τό"}, {ORTH: "νομα", NORM: "ὄνομα"}],
    "οὑν": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "ν", NORM: "ἐν"}],
    "ὦνερ": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νερ", NORM: "ἄνερ"}],
    "ὦνδρες": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
    "προὔχων": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χων", NORM: "ἔχων"}],
    "προὔχοντα": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χοντα", NORM: "ἔχοντα"}],
    # "ὥνεκα" is listed once; a dict keeps only one value per key anyway.
    "ὥνεκα": [{ORTH: "ὥ", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}],
    "θοἰμάτιον": [{ORTH: "θο", NORM: "τό"}, {ORTH: "ἰμάτιον"}],
    "τὠληθές": [{ORTH: "τὠ", NORM: "τὸ"}, {ORTH: "ληθές", NORM: "ἀληθές"}],
    "θἡμέρᾳ": [{ORTH: "θ", NORM: "τῇ"}, {ORTH: "ἡμέρᾳ"}],
    "ἅνθρωπος": [{ORTH: "ἅ", NORM: "ὁ"}, {ORTH: "νθρωπος", NORM: "ἄνθρωπος"}],
    "τἄλλα": [{ORTH: "τ", NORM: "τὰ"}, {ORTH: "ἄλλα"}],
    "τἆλλα": [{ORTH: "τἆ", NORM: "τὰ"}, {ORTH: "λλα", NORM: "ἄλλα"}],
    "ἁνήρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νήρ", NORM: "ἀνήρ"}],
    "ἁνὴρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νὴρ", NORM: "ἀνήρ"}],
    "ἅνδρες": [{ORTH: "ἅ", NORM: "οἱ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
    "ἁγαθαί": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαί", NORM: "ἀγαθαί"}],
    "ἁγαθαὶ": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαὶ", NORM: "ἀγαθαί"}],
    "ἁλήθεια": [{ORTH: "ἁ", NORM: "ἡ"}, {ORTH: "λήθεια", NORM: "ἀλήθεια"}],
    "τἀνδρός": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρός"}],
    "τἀνδρὸς": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρὸς", NORM: "ἀνδρός"}],
    "τἀνδρί": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρί"}],
    "τἀνδρὶ": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρὶ", NORM: "ἀνδρί"}],
    "αὑτός": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τός", NORM: "αὐτός"}],
    "αὑτὸς": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τὸς", NORM: "αὐτός"}],
    "ταὐτοῦ": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "αὐτοῦ"}],
}
# Fold the hand-written table and the (currently empty) `_exc_data`
# placeholder into `_exc`; later tables win on a key clash, as with the
# original back-to-back `update` calls.
_exc_data = {}
_exc.update({**_other_exc, **_exc_data})

# Public table: the language-specific entries combined with the shared
# BASE_EXCEPTIONS through `update_exc`.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)