mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 18:07:26 +03:00 
			
		
		
		
	* Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo
		
			
				
	
	
		
			112 lines
		
	
	
		
			6.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			112 lines
		
	
	
		
			6.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from ...symbols import NORM, ORTH
 | 
						||
from ...util import update_exc
 | 
						||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
						||
 | 
						||
_exc = {}

# Single-token exceptions: each surface form (elided, aspirated, capitalised or
# grave-accented spelling) stays one token and is assigned the NORM listed for
# its group. Rows are (NORM, surface forms); order is kept so that the
# resulting dict is populated in the same sequence as before.
_NORM_VARIANTS = (
    ("από", ("᾽Απ'", "᾽ΑΠ'", "ἀφ'", "᾽Αφ", "ἀπὸ")),
    ("ἀλλά", ("᾽Αλλ'", "ἀλλ'", "ἀλλὰ")),
    ("παρά", ("παρ'", "Παρ'", "παρὰ", "παρ")),
    ("κατά", ("καθ'", "Καθ'", "κατ'", "Κατ'", "κατὰ")),
    ("επί", ("Ἐπ'", "ἐπ'", "ἐπὶ", "Εφ'", "εφ'")),
    ("διά", ("Δι'", "δι'", "διὰ")),
    ("ὑπό", ("Ὑπ'", "ὑπ'", "ὑφ'")),
    ("μετά", ("Μετ'", "μετ'", "μεθ'", "μετὰ")),
    ("με", ("Μ'", "μ'", "μέ", "μὲ")),
    ("σε", ("Σ'", "σ'", "σέ", "σὲ")),
    ("τε", ("Τ'", "τ'", "τέ", "τὲ")),
    ("δέ", ("Δ'", "δ'", "δὲ")),
)

for _norm, _variants in _NORM_VARIANTS:
    for token in _variants:
        # One-element analysis: the token is not split, only normalised.
        _exc[token] = [{ORTH: token, NORM: _norm}]
 | 
						||
 | 
						||
 | 
						||
# Exceptions that need an explicit analysis. The first few entries keep the
# word as a single token and only normalise a grave accent to an acute one;
# the rest split a contracted (crasis) form into two tokens. In every entry
# the ORTH pieces, concatenated in order, spell the dictionary key, and NORM
# (where given) is the normalised spelling of that piece.
# Keys must be unique: a repeated key in a dict literal silently replaces the
# earlier entry.
_other_exc = {
    "μὲν": [{ORTH: "μὲν", NORM: "μέν"}],
    "μὴν": [{ORTH: "μὴν", NORM: "μήν"}],
    "τὴν": [{ORTH: "τὴν", NORM: "τήν"}],
    "τὸν": [{ORTH: "τὸν", NORM: "τόν"}],
    "καὶ": [{ORTH: "καὶ", NORM: "καί"}],
    "καὐτός": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτός"}],
    "καὐτὸς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτὸς", NORM: "αὐτός"}],
    "κοὐ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "οὐ"}],
    "χἡ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἡ"}],
    "χοἱ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "οἱ"}],
    "χἱκετεύετε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἱκετεύετε"}],
    "κἀν": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀν", NORM: "ἐν"}],
    "κἀγὼ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γὼ", NORM: "ἐγώ"}],
    "κἀγώ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γώ", NORM: "ἐγώ"}],
    "ἁγώ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γώ", NORM: "ἐγώ"}],
    "ἁγὼ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γὼ", NORM: "ἐγώ"}],
    "ἐγᾦδα": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦδα", NORM: "οἶδα"}],
    "ἐγᾦμαι": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦμαι", NORM: "οἶμαι"}],
    "κἀς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀς", NORM: "ἐς"}],
    "κᾆτα": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ᾆτα", NORM: "εἶτα"}],
    "κεἰ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰ"}],
    "κεἰς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰς"}],
    "χὤτε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτε", NORM: "ὅτε"}],
    "χὤπως": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤπως", NORM: "ὅπως"}],
    "χὤτι": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτι", NORM: "ὅτι"}],
    "χὤταν": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤταν", NORM: "ὅταν"}],
    "οὑμός": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μός", NORM: "ἐμός"}],
    "οὑμὸς": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μὸς", NORM: "ἐμός"}],
    # NORM below is spelled with Greek omicron (U+03BF), not the Latin
    # look-alike "o", so it compares equal to ordinary Greek text.
    "οὑμοί": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοί", NORM: "ἐμοί"}],
    "οὑμοὶ": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοὶ", NORM: "ἐμοί"}],
    "σοὔστι": [{ORTH: "σοὔ", NORM: "σοί"}, {ORTH: "στι", NORM: "ἐστι"}],
    "σοὐστί": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στί", NORM: "ἐστί"}],
    "σοὐστὶ": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στὶ", NORM: "ἐστί"}],
    "μοὖστι": [{ORTH: "μοὖ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
    "μοὔστι": [{ORTH: "μοὔ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}],
    "τοὔνομα": [{ORTH: "τοὔ", NORM: "τό"}, {ORTH: "νομα", NORM: "ὄνομα"}],
    "οὑν": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "ν", NORM: "ἐν"}],
    "ὦνερ": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νερ", NORM: "ἄνερ"}],
    "ὦνδρες": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
    "προὔχων": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χων", NORM: "ἔχων"}],
    "προὔχοντα": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χοντα", NORM: "ἔχοντα"}],
    "ὥνεκα": [{ORTH: "ὥ", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}],
    "θοἰμάτιον": [{ORTH: "θο", NORM: "τό"}, {ORTH: "ἰμάτιον"}],
    "τὠληθές": [{ORTH: "τὠ", NORM: "τὸ"}, {ORTH: "ληθές", NORM: "ἀληθές"}],
    "θἡμέρᾳ": [{ORTH: "θ", NORM: "τῇ"}, {ORTH: "ἡμέρᾳ"}],
    "ἅνθρωπος": [{ORTH: "ἅ", NORM: "ὁ"}, {ORTH: "νθρωπος", NORM: "ἄνθρωπος"}],
    "τἄλλα": [{ORTH: "τ", NORM: "τὰ"}, {ORTH: "ἄλλα"}],
    "τἆλλα": [{ORTH: "τἆ", NORM: "τὰ"}, {ORTH: "λλα", NORM: "ἄλλα"}],
    "ἁνήρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νήρ", NORM: "ἀνήρ"}],
    "ἁνὴρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νὴρ", NORM: "ἀνήρ"}],
    "ἅνδρες": [{ORTH: "ἅ", NORM: "οἱ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}],
    "ἁγαθαί": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαί", NORM: "ἀγαθαί"}],
    "ἁγαθαὶ": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαὶ", NORM: "ἀγαθαί"}],
    "ἁλήθεια": [{ORTH: "ἁ", NORM: "ἡ"}, {ORTH: "λήθεια", NORM: "ἀλήθεια"}],
    "τἀνδρός": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρός"}],
    "τἀνδρὸς": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρὸς", NORM: "ἀνδρός"}],
    "τἀνδρί": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρί"}],
    "τἀνδρὶ": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρὶ", NORM: "ἀνδρί"}],
    "αὑτός": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τός", NORM: "αὐτός"}],
    "αὑτὸς": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τὸς", NORM: "αὐτός"}],
    "ταὐτοῦ": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "αὐτοῦ"}],
}
 | 
						||
 | 
						||
# Fold the explicit analyses from _other_exc into the single-token table; if a
# key is present in both, the _other_exc entry replaces the earlier one.
_exc.update(_other_exc)
 | 
						||
 | 
						||
# Exception table exposed by this module: the result of passing the shared
# BASE_EXCEPTIONS and the entries collected in _exc to update_exc.
# NOTE(review): presumably update_exc returns a merged copy in which the _exc
# entries take precedence over the base ones — confirm against util.update_exc.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 |