spaCy/spacy/lang/grc/tokenizer_exceptions.py

112 lines
6.6 KiB
Python
Raw Normal View History

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
# Single-token exceptions: every spelling listed below is kept as one token
# (ORTH is the spelling itself) and is given the NORM it is grouped under.
_exc = {}

# (NORM, spellings that receive it) -- apostrophe-final, capitalised and
# grave-accented variants are all listed explicitly.
_norm_to_spellings = [
    ("από", ["᾽Απ'", "᾽ΑΠ'", "ἀφ'", "᾽Αφ", "ἀπὸ"]),
    ("ἀλλά", ["᾽Αλλ'", "ἀλλ'", "ἀλλὰ"]),
    ("παρά", ["παρ'", "Παρ'", "παρὰ", "παρ"]),
    ("κατά", ["καθ'", "Καθ'", "κατ'", "Κατ'", "κατὰ"]),
    ("επί", ["Ἐπ'", "ἐπ'", "ἐπὶ", "Εφ'", "εφ'"]),
    ("διά", ["Δι'", "δι'", "διὰ"]),
    ("ὑπό", ["Ὑπ'", "ὑπ'", "ὑφ'"]),
    ("μετά", ["Μετ'", "μετ'", "μεθ'", "μετὰ"]),
    ("με", ["Μ'", "μ'", "μέ", "μὲ"]),
    ("σε", ["Σ'", "σ'", "σέ", "σὲ"]),
    ("τε", ["Τ'", "τ'", "τέ", "τὲ"]),
    ("δέ", ["Δ'", "δ'", "δὲ"]),
]

for _norm, _spellings in _norm_to_spellings:
    for token in _spellings:
        _exc[token] = [{ORTH: token, NORM: _norm}]
# Accent normalisations and crasis splits.
#
# Each key maps to the list of tokens it is split into. The ORTH pieces of an
# entry, joined in order, must reproduce the key exactly; NORM (when present)
# is the normalised form assigned to that piece.
#
# NOTE(review): in the copy this was restored from, several one-character
# pieces had been blanked out (ORTH: "" / NORM: ""). The ORTH values below are
# recovered mechanically as "key minus the other piece". The NORM values for
# those pieces (ἃ, ὁ, ἡ, ὦ) are reconstructed from the crasis itself -- please
# confirm them against the upstream table.
_other_exc = {
    "μὲν": [{ORTH: "μὲν", NORM: "μέν"}],
    "μὴν": [{ORTH: "μὴν", NORM: "μήν"}],
    "τὴν": [{ORTH: "τὴν", NORM: "τήν"}],
    "τὸν": [{ORTH: "τὸν", NORM: "τόν"}],
    "καὶ": [{ORTH: "καὶ", NORM: "καί"}],
    "καὐτός": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτός"}],
    "καὐτὸς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτὸς", NORM: "αὐτός"}],
    "κοὐ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "οὐ"}],
    "χἡ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἡ"}],
    "χοἱ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "οἱ"}],
    "χἱκετεύετε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἱκετεύετε"}],
    "κἀν": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀν", NORM: "ἐν"}],
    "κἀγὼ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γὼ", NORM: "ἐγώ"}],
    "κἀγώ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γώ", NORM: "ἐγώ"}],
    "ἁγώ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γώ", NORM: "ἐγώ"}],
    "ἁγὼ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γὼ", NORM: "ἐγώ"}],
    "ἐγᾦδα": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦδα", NORM: "οἶδα"}],
    "ἐγᾦμαι": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦμαι", NORM: "οἶμαι"}],
    "κἀς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀς", NORM: "ἐς"}],
    "κᾆτα": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ᾆτα", NORM: "εἶτα"}],
    "κεἰ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰ"}],
    "κεἰς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰς"}],
    "χὤτε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτε", NORM: "ὅτε"}],
    "χὤπως": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤπως", NORM: "ὅπως"}],
    "χὤτι": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτι", NORM: "ὅτι"}],
    "χὤταν": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤταν", NORM: "ὅταν"}],
    "οὑμός": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μός", NORM: "ἐμός"}],
    "οὑμὸς": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μὸς", NORM: "ἐμός"}],
    "οὑμοί": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοί", NORM: "ἐμoί"}],
    "οὑμοὶ": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοὶ", NORM: "ἐμoί"}],
    "σοὔστι": [{ORTH: "σοὔ", NORM: "σοί"}, {ORTH: "στι", NORM: "ἐστι"}],
    "σοὐστί": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στί", NORM: "ἐστί"}],
    "σοὐστὶ": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στὶ", NORM: "ἐστί"}],
    "μοὖστι": [{ORTH: "μοὖ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
    "μοὔστι": [{ORTH: "μοὔ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
    "τοὔνομα": [{ORTH: "τοὔ", NORM: "τό"}, {ORTH: "νομα", NORM: "ὄνομα"}],
    "οὑν": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "ν", NORM: "ἐν"}],
    "ὦνερ": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νερ", NORM: "ἄνερ"}],
    "ὦνδρες": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
    "προὔχων": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χων", NORM: "ἔχων"}],
    "προὔχοντα": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χοντα", NORM: "ἔχοντα"}],
    # "ὥνεκα" was listed twice with identical values; one entry is enough.
    "ὥνεκα": [{ORTH: "ὥ", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}],
    "θοἰμάτιον": [{ORTH: "θο", NORM: "τό"}, {ORTH: "ἰμάτιον"}],
    "τὠληθές": [{ORTH: "τὠ", NORM: "τὸ"}, {ORTH: "ληθές", NORM: "ἀληθές"}],
    "θἡμέρᾳ": [{ORTH: "θ", NORM: "τῇ"}, {ORTH: "ἡμέρᾳ"}],
    "ἅνθρωπος": [{ORTH: "ἅ", NORM: "ὁ"}, {ORTH: "νθρωπος", NORM: "ἄνθρωπος"}],
    "τἄλλα": [{ORTH: "τ", NORM: "τὰ"}, {ORTH: "ἄλλα"}],
    "τἆλλα": [{ORTH: "τἆ", NORM: "τὰ"}, {ORTH: "λλα", NORM: "ἄλλα"}],
    "ἁνήρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νήρ", NORM: "ἀνήρ"}],
    "ἁνὴρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νὴρ", NORM: "ἀνήρ"}],
    "ἅνδρες": [{ORTH: "ἅ", NORM: "οἱ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
    "ἁγαθαί": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαί", NORM: "ἀγαθαί"}],
    "ἁγαθαὶ": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαὶ", NORM: "ἀγαθαί"}],
    "ἁλήθεια": [{ORTH: "ἁ", NORM: "ἡ"}, {ORTH: "λήθεια", NORM: "ἀλήθεια"}],
    "τἀνδρός": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρός"}],
    "τἀνδρὸς": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρὸς", NORM: "ἀνδρός"}],
    "τἀνδρί": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρί"}],
    "τἀνδρὶ": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρὶ", NORM: "ἀνδρί"}],
    "αὑτός": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τός", NORM: "αὐτός"}],
    "αὑτὸς": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τὸς", NORM: "αὐτός"}],
    "ταὐτοῦ": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "αὐτοῦ"}],
}
# Fold the _other_exc entries into _exc in place; if a key exists in both,
# the _other_exc entry replaces the earlier one.
_exc.update(_other_exc)
# Combine the language-specific table with the imported BASE_EXCEPTIONS and
# expose the result as TOKENIZER_EXCEPTIONS.
# NOTE(review): update_exc is defined outside this file; presumably it also
# validates the entries -- confirm against its implementation.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)