spaCy/spacy/lang/el/lemmatizer/_lemma_rules.py
Ioannis Daras 6ed18412d0 Greek language optimizations (#2558)
* Greek language optimizations

* Add encoding on files containing greek words

* Add encoding on files containing greek words
2018-07-18 18:51:38 +02:00

103 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding: utf8
from __future__ import unicode_literals
# Ending-rewrite rules for adjectives. Each entry is a two-item list:
# [inflected ending, replacement ending]. The Greek word in a trailing
# comment is an example form for that rule.
# NOTE(review): presumably the lemmatizer swaps a matching word ending for
# the replacement -- confirm against the code that consumes these rules.
ADJECTIVE_RULES = [
    # Plural and feminine endings.
    ["οί", "ός"],  # καρδιακοί
    ["ές", "ός"],  # επιφανειακές
    ["ές", "ος"],  # καρδιακές
    ["ές", "ύς"],  # πολλές
    ["οι", "ος"],
    ["αία", "ος"],  # ωραία
    # Endings in -ωδη, listed without and with the accent mark.
    ["ωδη", "ες"],  # δασώδη
    ["ώδη", "ες"],
    # Endings in -ότερ-, all rewritten to the accented "ός".
    ["ότερη", "ός"],
    ["ότερος", "ός"],
    ["ότεροι", "ός"],
    ["ότερων", "ός"],
    ["ότερες", "ός"],
]
# Ending-rewrite rules for nouns. Each entry is a two-item list:
# [inflected ending, replacement ending]. The Greek word in a trailing
# comment is an example form for that rule.
# Every [ending, replacement] pair is listed exactly once; entries keep the
# relative order in which they were first declared.
# NOTE(review): presumably the lemmatizer swaps a matching word ending for
# the replacement -- confirm against the code that consumes these rules.
NOUN_RULES = [
    ["ιά", "ί"],  # παιδιά
    ["ια", "ι"],  # ποτήρια
    ["ες", "α"],  # κεραμίδες
    ["ές", "ά"],
    ["ες", "η"],  # ζάχαρη
    ["ές", "ή"],  # φυλακές
    ["ές", "ής"],  # καθηγητής
    ["α", "ο"],  # πρόβατα
    # Identity pairs: the replacement equals the matched ending, so the
    # word is produced unchanged.
    ["α", "α"],  # ζήτημα
    ["ατα", "α"],  # στόματα
    ["άτα", "άτα"],  # ντομάτα
    ["άτες", "άτα"],  # πατάτες
    ["ία", "ία"],
    ["ιά", "ιά"],
    ["οί", "ός"],  # υπουργοί
    ["ίας", "ία"],  # δικτατορίας, δυσωδείας, τρομοκρατίας
    ["άτων", "ατα"],  # δικαιωμάτων
    ["ώπων", "ωπος"],  # ανθρώπων
]
# Ending-rewrite rules for verbs. Each entry is a two-item list:
# [inflected ending, replacement ending], e.g. "ούσα" -> "ώ" turns
# ζητούσα into ζητώ. Most endings are paired with both an unaccented and
# an accented replacement.
# Every [ending, replacement] pair is listed exactly once.
# NOTE(review): presumably the lemmatizer swaps a matching word ending for
# the replacement -- confirm against the code that consumes these rules.
VERB_RULES = [
    # Endings rewritten to "ω" / "ώ".
    ["εις", "ω"],
    ["εις", "ώ"],
    ["ει", "ω"],
    ["ει", "ώ"],
    ["ουμε", "ω"],
    ["ουμε", "ώ"],
    ["ούμε", "ώ"],  # θεωρούμε
    ["ούνε", "ώ"],  # θεωρούνε
    ["ετε", "ω"],
    ["ετε", "ώ"],
    ["ουν", "ω"],
    ["ουν", "ώ"],
    ["είς", "ώ"],
    ["εί", "ώ"],
    ["ούν", "ώ"],
    # Endings rewritten to "ομαι" / "όμαι".
    ["εσαι", "ομαι"],  # αισθάνεσαι
    ["εσαι", "όμαι"],
    ["έσαι", "ομαι"],
    ["έσαι", "όμαι"],
    ["εται", "ομαι"],
    ["εται", "όμαι"],
    ["έται", "ομαι"],
    ["έται", "όμαι"],
    ["όμαστε", "όμαι"],
    ["όμαστε", "ομαι"],
    ["έσθε", "όμαι"],
    ["εσθε", "όμαι"],
    # Endings starting with accented "ά", rewritten to "ώ".
    ["άς", "ώ"],  # αγαπάς
    ["άει", "ώ"],
    ["άμε", "ώ"],
    ["άτε", "ώ"],
    ["άνε", "ώ"],
    ["άν", "ώ"],
    ["άω", "ώ"],  # _verbs.py could contain any of the two
    ["ώ", "άω"],
    # "όμουν" endings.
    ["όμουν", "ομαι"],  # ζαλιζόμουν
    ["όμουν", "όμαι"],
    ["όμουν", "αμαι"],
    # The accented replacement is what yields κοιμάμαι from κοιμόμουν.
    ["όμουν", "άμαι"],  # κοιμόμουν
    # "ούσ-" endings.
    ["ούσα", "ώ"],  # ζητούσα -> ζητώ
    ["ούσες", "ώ"],
    ["ούσε", "ώ"],
    ["ούσαμε", "ώ"],
    ["ούσατε", "ώ"],
    ["ούσαν", "ώ"],
    ["ούσανε", "ώ"],
]
# Punctuation rules: [typographic quotation mark, ASCII replacement].
# The marks are written as \u escapes so they survive tools that strip or
# flag look-alike Unicode characters. Every pattern must be non-empty: an
# empty string is a suffix of every token, so it would match everything.
PUNCT_RULES = [
    ["\u201c", "\""],  # left double quotation mark
    ["\u201d", "\""],  # right double quotation mark
    ["\u2018", "'"],  # left single quotation mark
    ["\u2019", "'"],  # right single quotation mark
]