mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 18:36:36 +03:00
6ed18412d0
* Greek language optimizations * Add encoding on files containing greek words * Add encoding on files containing greek words
103 lines
2.7 KiB
Python
103 lines
2.7 KiB
Python
# coding: utf8
|
||
from __future__ import unicode_literals
|
||
|
||
|
||
# Ending-rewrite rules for Greek adjectives.
# Each entry is a two-item list [ending, replacement]. Trailing comments are
# the example words carried over from the original source.
# NOTE(review): presumably a word ending that matches the first item is
# swapped for the second to propose a lemma -- confirm against the caller.
ADJECTIVE_RULES = [
    ["οί", "ός"],  # καρδιακοί
    ["ές", "ός"],  # επιφανειακές
    ["ές", "ος"],  # καρδιακές
    ["ές", "ύς"],  # πολλές
    ["οι", "ος"],
    ["αία", "ος"],  # ωραία
    # Unaccented and accented spellings of the same ending are listed
    # separately; the example word uses the accented one.
    ["ωδη", "ες"],  # δασώδη
    ["ώδη", "ες"],
    # Comparative forms, all rewritten to the same replacement ending.
    ["ότερη", "ός"],
    ["ότερος", "ός"],
    ["ότεροι", "ός"],
    ["ότερων", "ός"],
    ["ότερες", "ός"],
]
|
||
|
||
|
||
# Ending-rewrite rules for Greek nouns.
# Each entry is a two-item list [ending, replacement]. Trailing comments are
# the example words carried over from the original source; some of them show
# the replacement side (e.g. ζάχαρη, καθηγητής) rather than a word carrying
# the matched ending.
# NOTE(review): presumably a word ending that matches the first item is
# swapped for the second to propose a lemma -- confirm against the caller.
NOUN_RULES = [
    ["ιά", "ί"],  # παιδιά
    ["ια", "ι"],  # ποτήρια
    ["ες", "α"],  # κεραμίδες
    ["ές", "ά"],
    # NOTE(review): exact duplicate of the previous rule; kept so the table's
    # length and order stay unchanged -- confirm whether it can be dropped.
    ["ές", "ά"],
    # NOTE(review): exact duplicate of the ["ες", "α"] rule above.
    ["ες", "α"],  # εσπερινές
    ["ες", "η"],  # ζάχαρη
    ["ές", "ή"],  # φυλακές
    ["ές", "ής"],  # καθηγητής
    ["α", "ο"],  # πρόβατα
    ["α", "α"],  # ζήτημα
    ["ατα", "α"],  # στόματα
    # Identity rules: the ending is replaced by itself.
    ["άτα", "άτα"],  # ντομάτα
    ["άτες", "άτα"],  # πατάτες
    ["ία", "ία"],
    ["ιά", "ιά"],
    ["οί", "ός"],  # υπουργοί
    ["ίας", "ία"],  # δικτατορίας, δυσωδείας, τρομοκρατίας
    ["άτων", "ατα"],  # δικαιωμάτων
    ["ώπων", "ωπος"],  # ανθρώπων
]
|
||
|
||
|
||
# Ending-rewrite rules for Greek verbs.
# Each entry is a two-item list [ending, replacement]; as the original
# "ζητούσα -> ζητώ" example shows, the first item is the inflected ending and
# the second is the ending of the base form. Most endings appear twice, once
# per accented/unaccented spelling of the replacement. Trailing comments are
# the example words carried over from the original source.
VERB_RULES = [
    ["εις", "ω"],
    ["εις", "ώ"],
    ["ει", "ω"],
    ["ει", "ώ"],
    ["ουμε", "ω"],
    ["ουμε", "ώ"],
    ["ούμε", "ώ"],  # θεώρησα
    ["ούνε", "ώ"],
    ["ετε", "ω"],
    ["ετε", "ώ"],
    ["ουν", "ω"],
    ["ουν", "ώ"],
    ["είς", "ώ"],
    ["εί", "ώ"],
    ["ούν", "ώ"],
    ["εσαι", "ομαι"],  # αισθάνεσαι
    ["εσαι", "όμαι"],
    ["έσαι", "ομαι"],
    ["έσαι", "όμαι"],
    ["εται", "ομαι"],
    ["εται", "όμαι"],
    ["έται", "ομαι"],
    ["έται", "όμαι"],
    ["όμαστε", "όμαι"],
    ["όμαστε", "ομαι"],
    ["έσθε", "όμαι"],
    ["εσθε", "όμαι"],
    ["άς", "ώ"],  # αγαπάς
    ["άει", "ώ"],
    ["άμε", "ώ"],
    ["άτε", "ώ"],
    ["άνε", "ώ"],
    ["άν", "ώ"],
    # NOTE(review): exact duplicate of the ["άμε", "ώ"] rule above; kept so
    # the table's length and order stay unchanged.
    ["άμε", "ώ"],
    # Both directions are listed on purpose.
    ["άω", "ώ"],  # _verbs.py could contain any of the two
    ["ώ", "άω"],
    ["όμουν", "ομαι"],  # ζαλιζόμουν
    ["όμουν", "όμαι"],
    ["όμουν", "αμαι"],  # κοιμόμουν
    # NOTE(review): exact duplicate of the previous rule. By analogy with the
    # ομαι/όμαι pair above this may have been meant as the accented "άμαι"
    # -- confirm before changing, since that would alter the rule set.
    ["όμουν", "αμαι"],
    ["ούσα", "ώ"],  # ζητούσα -> ζητώ
    ["ούσες", "ώ"],
    ["ούσε", "ώ"],
    ["ούσαμε", "ώ"],
    ["ούσατε", "ώ"],
    ["ούσαν", "ώ"],
    ["ούσανε", "ώ"],
]
|
||
|
||
|
||
# Punctuation normalisation pairs: each entry is [typographic quote, ASCII
# quote]. The curly quotes are written as escapes so they cannot be confused
# with the straight quotes that delimit the string literals.
PUNCT_RULES = [
    ["\u201c", '"'],  # left double quotation mark
    ["\u201d", '"'],  # right double quotation mark
    ["\u2018", "'"],  # left single quotation mark
    ["\u2019", "'"],  # right single quotation mark
]
|