spaCy/spacy/lang/el/tokenizer_exceptions.py
Ines Montani eddeb36c96
💫 Tidy up and auto-format .py files (#2983)

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain the new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on`, and what `# noqa` means — see the sketch below)
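
As a quick reference (a minimal sketch, not part of this diff): `# fmt: off` / `# fmt: on` mark a region that `black` should leave untouched, and `# noqa` tells flake8 to skip a rule on a single line:

```python
# fmt: off
_exc = {
    "a":  [1],  # manual alignment that black would otherwise collapse
    "bb": [2],
}
# fmt: on

import os, sys  # noqa: E401  (flake8: allow multiple imports on this line)
```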

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 17:03:03 +01:00

# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, NORM
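# Each entry maps an exact surface form to its token analysis: the ORTH
# values must reproduce the matched string, while LEMMA and NORM carry the
# full (unelided) form.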
_exc = {}
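# Elided prepositions, conjunctions, pronouns and verb forms: every
# apostrophized variant shares the lemma and norm of the full form.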
for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]:
_exc[token] = [{ORTH: token, LEMMA: "από", NORM: "από"}]
for token in ["Αλλ'", "αλλ'"]:
_exc[token] = [{ORTH: token, LEMMA: "αλλά", NORM: "αλλά"}]
for token in ["παρ'", "Παρ'", "ΠΑΡ'"]:
_exc[token] = [{ORTH: token, LEMMA: "παρά", NORM: "παρά"}]
for token in ["καθ'", "Καθ'"]:
_exc[token] = [{ORTH: token, LEMMA: "κάθε", NORM: "κάθε"}]
for token in ["κατ'", "Κατ'"]:
_exc[token] = [{ORTH: token, LEMMA: "κατά", NORM: "κατά"}]
for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]:
_exc[token] = [{ORTH: token, LEMMA: "είμαι", NORM: "είμαι"}]
for token in ["Επ'", "επ'", "εφ'", "Εφ'"]:
_exc[token] = [{ORTH: token, LEMMA: "επί", NORM: "επί"}]
for token in ["Δι'", "δι'"]:
_exc[token] = [{ORTH: token, LEMMA: "δια", NORM: "δια"}]
for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]:
_exc[token] = [{ORTH: token, LEMMA: "έχω", NORM: "έχω"}]
for token in ["υπ'", "Υπ'"]:
_exc[token] = [{ORTH: token, LEMMA: "υπό", NORM: "υπό"}]
for token in ["Μετ'", "ΜΕΤ'", "'μετ"]:
_exc[token] = [{ORTH: token, LEMMA: "μετά", NORM: "μετά"}]
for token in ["Μ'", "μ'"]:
_exc[token] = [{ORTH: token, LEMMA: "με", NORM: "με"}]
for token in ["Γι'", "ΓΙ'", "γι'"]:
_exc[token] = [{ORTH: token, LEMMA: "για", NORM: "για"}]
for token in ["Σ'", "σ'"]:
_exc[token] = [{ORTH: token, LEMMA: "σε", NORM: "σε"}]
for token in ["Θ'", "θ'"]:
_exc[token] = [{ORTH: token, LEMMA: "θα", NORM: "θα"}]
for token in ["Ν'", "ν'"]:
_exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}]
for token in ["Τ'", "τ'"]:
_exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}]
for token in ["'γω", "'σένα", "'μεις"]:
_exc[token] = [{ORTH: token, LEMMA: "εγώ", NORM: "εγώ"}]
for token in ["Τ'", "τ'"]:
_exc[token] = [{ORTH: token, LEMMA: "το", NORM: "το"}]
for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]:
_exc[token] = [{ORTH: token, LEMMA: "φέρνω", NORM: "φέρνω"}]
for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]:
_exc[token] = [{ORTH: token, LEMMA: "έρχομαι", NORM: "έρχομαι"}]
for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]:
_exc[token] = [{ORTH: token, LEMMA: "λέγω", NORM: "λέγω"}]
for token in ["Πάρ'", "πάρ'"]:
_exc[token] = [{ORTH: token, LEMMA: "παίρνω", NORM: "παίρνω"}]
for token in ["μέσ'", "Μέσ'", "μεσ'"]:
_exc[token] = [{ORTH: token, LEMMA: "μέσα", NORM: "μέσα"}]
for token in ["Δέσ'", "Δεσ'", "δεσ'"]:
_exc[token] = [{ORTH: token, LEMMA: "δένω", NORM: "δένω"}]
for token in ["'κανε", "Κάν'"]:
_exc[token] = [{ORTH: token, LEMMA: "κάνω", NORM: "κάνω"}]
_other_exc = {
    "κι": [{ORTH: "κι", LEMMA: "και", NORM: "και"}],
    "Παίξ'": [{ORTH: "Παίξ'", LEMMA: "παίζω", NORM: "παίζω"}],
    "Αντ'": [{ORTH: "Αντ'", LEMMA: "αντί", NORM: "αντί"}],
    "ολ'": [{ORTH: "ολ'", LEMMA: "όλος", NORM: "όλος"}],
    "ύστερ'": [{ORTH: "ύστερ'", LEMMA: "ύστερα", NORM: "ύστερα"}],
    "'πρεπε": [{ORTH: "'πρεπε", LEMMA: "πρέπει", NORM: "πρέπει"}],
    "Δύσκολ'": [{ORTH: "Δύσκολ'", LEMMA: "δύσκολος", NORM: "δύσκολος"}],
    "'θελα": [{ORTH: "'θελα", LEMMA: "θέλω", NORM: "θέλω"}],
    "'γραφα": [{ORTH: "'γραφα", LEMMA: "γράφω", NORM: "γράφω"}],
    "'παιρνα": [{ORTH: "'παιρνα", LEMMA: "παίρνω", NORM: "παίρνω"}],
    "'δειξε": [{ORTH: "'δειξε", LEMMA: "δείχνω", NORM: "δείχνω"}],
    "όμουρφ'": [{ORTH: "όμουρφ'", LEMMA: "όμορφος", NORM: "όμορφος"}],
    "κ'τσή": [{ORTH: "κ'τσή", LEMMA: "κουτσός", NORM: "κουτσός"}],
    "μηδ'": [{ORTH: "μηδ'", LEMMA: "μήδε", NORM: "μήδε"}],
    "'ξομολογήθηκε": [
        {ORTH: "'ξομολογήθηκε", LEMMA: "εξομολογούμαι", NORM: "εξομολογούμαι"}
    ],
    "'μας": [{ORTH: "'μας", LEMMA: "εμάς", NORM: "εμάς"}],
    "'ξερες": [{ORTH: "'ξερες", LEMMA: "ξέρω", NORM: "ξέρω"}],
    "έφθασ'": [{ORTH: "έφθασ'", LEMMA: "φθάνω", NORM: "φθάνω"}],
    "εξ'": [{ORTH: "εξ'", LEMMA: "εκ", NORM: "εκ"}],
    "δώσ'": [{ORTH: "δώσ'", LEMMA: "δίνω", NORM: "δίνω"}],
    "τίποτ'": [{ORTH: "τίποτ'", LEMMA: "τίποτα", NORM: "τίποτα"}],
    "Λήξ'": [{ORTH: "Λήξ'", LEMMA: "λήγω", NORM: "λήγω"}],
    "άσ'": [{ORTH: "άσ'", LEMMA: "αφήνω", NORM: "αφήνω"}],
    "Στ'": [{ORTH: "Στ'", LEMMA: "στο", NORM: "στο"}],
    "Δωσ'": [{ORTH: "Δωσ'", LEMMA: "δίνω", NORM: "δίνω"}],
    "Βάψ'": [{ORTH: "Βάψ'", LEMMA: "βάφω", NORM: "βάφω"}],
    "Αλλ'": [{ORTH: "Αλλ'", LEMMA: "αλλά", NORM: "αλλά"}],
    "Αμ'": [{ORTH: "Αμ'", LEMMA: "άμα", NORM: "άμα"}],
    "Αγόρασ'": [{ORTH: "Αγόρασ'", LEMMA: "αγοράζω", NORM: "αγοράζω"}],
    "'φύγε": [{ORTH: "'φύγε", LEMMA: "φεύγω", NORM: "φεύγω"}],
    "'φερε": [{ORTH: "'φερε", LEMMA: "φέρνω", NORM: "φέρνω"}],
    "'φαγε": [{ORTH: "'φαγε", LEMMA: "τρώω", NORM: "τρώω"}],
    "'σπαγαν": [{ORTH: "'σπαγαν", LEMMA: "σπάω", NORM: "σπάω"}],
    "'σκασε": [{ORTH: "'σκασε", LEMMA: "σκάω", NORM: "σκάω"}],
    "'σβηνε": [{ORTH: "'σβηνε", LEMMA: "σβήνω", NORM: "σβήνω"}],
    "'ριξε": [{ORTH: "'ριξε", LEMMA: "ρίχνω", NORM: "ρίχνω"}],
    "'κλεβε": [{ORTH: "'κλεβε", LEMMA: "κλέβω", NORM: "κλέβω"}],
    "'κει": [{ORTH: "'κει", LEMMA: "εκεί", NORM: "εκεί"}],
    "'βλεπε": [{ORTH: "'βλεπε", LEMMA: "βλέπω", NORM: "βλέπω"}],
    "'βγαινε": [{ORTH: "'βγαινε", LEMMA: "βγαίνω", NORM: "βγαίνω"}],
}
_exc.update(_other_exc)
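# Clock times such as "3π.μ." or "11μμ" are split into the hour and the
# period abbreviation.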
for h in range(1, 12 + 1):
    for period in ["π.μ.", "πμ"]:
        _exc["%d%s" % (h, period)] = [
            {ORTH: "%d" % h},
            {ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."},
        ]
    for period in ["μ.μ.", "μμ"]:
        _exc["%d%s" % (h, period)] = [
            {ORTH: "%d" % h},
            {ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."},
        ]
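# e.g. _exc["3πμ"] == [{ORTH: "3"}, {ORTH: "πμ", LEMMA: "π.μ.", NORM: "π.μ."}]
# Abbreviations (months, names etc.) that keep a full-form lemma and norm.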
for exc_data in [
    {ORTH: "ΑΓΡ.", LEMMA: "Αγροτικός", NORM: "Αγροτικός"},
    {ORTH: "Αγ. Γρ.", LEMMA: "Αγία Γραφή", NORM: "Αγία Γραφή"},
    {ORTH: "Αθ.", LEMMA: "Αθανάσιος", NORM: "Αθανάσιος"},
    {ORTH: "Αλεξ.", LEMMA: "Αλέξανδρος", NORM: "Αλέξανδρος"},
    {ORTH: "Απρ.", LEMMA: "Απρίλιος", NORM: "Απρίλιος"},
    {ORTH: "Αύγ.", LEMMA: "Αύγουστος", NORM: "Αύγουστος"},
    {ORTH: "Δεκ.", LEMMA: "Δεκέμβριος", NORM: "Δεκέμβριος"},
    {ORTH: "Δημ.", LEMMA: "Δήμος", NORM: "Δήμος"},
    {ORTH: "Ιαν.", LEMMA: "Ιανουάριος", NORM: "Ιανουάριος"},
    {ORTH: "Ιούλ.", LEMMA: "Ιούλιος", NORM: "Ιούλιος"},
    {ORTH: "Ιούν.", LEMMA: "Ιούνιος", NORM: "Ιούνιος"},
    {ORTH: "Ιωαν.", LEMMA: "Ιωάννης", NORM: "Ιωάννης"},
    {ORTH: "Μ. Ασία", LEMMA: "Μικρά Ασία", NORM: "Μικρά Ασία"},
    {ORTH: "Μάρτ.", LEMMA: "Μάρτιος", NORM: "Μάρτιος"},
    {ORTH: "Μάρτ'", LEMMA: "Μάρτιος", NORM: "Μάρτιος"},
    {ORTH: "Νοέμβρ.", LEMMA: "Νοέμβριος", NORM: "Νοέμβριος"},
    {ORTH: "Οκτ.", LEMMA: "Οκτώβριος", NORM: "Οκτώβριος"},
    {ORTH: "Σεπτ.", LEMMA: "Σεπτέμβριος", NORM: "Σεπτέμβριος"},
    {ORTH: "Φεβρ.", LEMMA: "Φεβρουάριος", NORM: "Φεβρουάριος"},
]:
    _exc[exc_data[ORTH]] = [exc_data]
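# Abbreviations and truncated forms kept as single tokens, with no lemma
# or norm override.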
for orth in [
    "$ΗΠΑ",
    "Α'",
    "Α.Ε.",
    "Α.Ε.Β.Ε.",
    "Α.Ε.Ι.",
    "Α.Ε.Π.",
    "Α.Μ.Α.",
    "Α.Π.Θ.",
    "Α.Τ.",
    "Α.Χ.",
    "ΑΝ.",
    "Αγ.",
    "Αλ.",
    "Αν.",
    "Αντ.",
    "Απ.",
    "Β'",
    "Β)",
    "Β.Ζ.",
    "Β.Ι.Ο.",
    "Β.Κ.",
    "Β.Μ.Α.",
    "Βασ.",
    "Γ'",
    "Γ)",
    "Γ.Γ.",
    "Γ.Δ.",
    "Γκ.",
    "Δ.Ε.Η.",
    "Δ.Ε.Σ.Ε.",
    "Δ.Ν.",
    "Δ.Ο.Υ.",
    "Δ.Σ.",
    "Δ.Υ.",
    "ΔΙ.ΚΑ.Τ.Σ.Α.",
    "Δηλ.",
    "Διον.",
    "Ε.Α.",
    "Ε.Α.Κ.",
    "Ε.Α.Π.",
    "Ε.Ε.",
    "Ε.Κ.",
    "Ε.ΚΕ.ΠΙΣ.",
    "Ε.Λ.Α.",
    "Ε.Λ.Ι.Α.",
    "Ε.Π.Σ.",
    "Ε.Π.Τ.Α.",
    "Ε.Σ.Ε.Ε.Κ.",
    "Ε.Υ.Κ.",
    "ΕΕ.",
    "ΕΚ.",
    "ΕΛ.",
    "ΕΛ.ΑΣ.",
    "Εθν.",
    "Ελ.",
    "Εμ.",
    "Επ.",
    "Ευ.",
    "Η'",
    "Η.Π.Α.",
    "ΘΕ.",
    "Θεμ.",
    "Θεοδ.",
    "Θρ.",
    "Ι.Ε.Κ.",
    "Ι.Κ.Α.",
    "Ι.Κ.Υ.",
    "Ι.Σ.Θ.",
    "Ι.Χ.",
    "ΙΖ'",
    "ΙΧ.",
    "Κ.Α.Α.",
    "Κ.Α.Ε.",
    "Κ.Β.Σ.",
    "Κ.Δ.",
    "Κ.Ε.",
    "Κ.Ε.Κ.",
    "Κ.Ι.",
    "Κ.Κ.",
    "Κ.Ι.Θ.",
"Κ.Ι.Θ.",
"Κ.ΚΕΚ.",
"Κ.Ο.",
"Κ.Π.Ρ.",
"ΚΑΤ.",
"ΚΚ.",
"Καν.",
"Καρ.",
"Κατ.",
"Κυρ.",
"Κων.",
"Λ.Α.",
"Λ.χ.",
"Λ.Χ.",
"Λεωφ.",
"Λι.",
"Μ.Δ.Ε.",
"Μ.Ε.Ο.",
"Μ.Ζ.",
"Μ.Μ.Ε.",
"Μ.Ο.",
"Μεγ.",
"Μιλτ.",
"Μιχ.",
"Ν.Δ.",
"Ν.Ε.Α.",
"Ν.Κ.",
"Ν.Ο.",
"Ν.Ο.Θ.",
"Ν.Π.Δ.Δ.",
"Ν.Υ.",
"ΝΔ.",
"Νικ.",
"Ντ'",
"Ντ.",
"Ο'",
"Ο.Α.",
"Ο.Α.Ε.Δ.",
"Ο.Δ.",
"Ο.Ε.Ε.",
"Ο.Ε.Ε.Κ.",
"Ο.Η.Ε.",
"Ο.Κ.",
"Π.Δ.",
"Π.Ε.Κ.Δ.Υ.",
"Π.Ε.Π.",
"Π.Μ.Σ.",
"ΠΟΛ.",
"Π.Χ.",
"Παρ.",
"Πλ.",
"Πρ.",
"Σ.Δ.Ο.Ε.",
"Σ.Ε.",
"Σ.Ε.Κ.",
"Σ.Π.Δ.Ω.Β.",
"Σ.Τ.",
"Σαβ.",
"Στ.",
"ΣτΕ.",
"Στρ.",
"Τ.Α.",
"Τ.Ε.Ε.",
"Τ.Ε.Ι.",
"ΤΡ.",
"Τζ.",
"Τηλ.",
"Υ.Γ.",
"ΥΓ.",
"ΥΠ.Ε.Π.Θ.",
"Φ.Α.Β.Ε.",
"Φ.Κ.",
"Φ.Σ.",
"Φ.Χ.",
"Φ.Π.Α.",
"Φιλ.",
"Χ.Α.Α.",
"ΧΡ.",
"Χ.Χ.",
"Χαρ.",
"Χιλ.",
"Χρ.",
"άγ.",
"άρθρ.",
"αι.",
"αν.",
"απ.",
"αρ.",
"αριθ.",
"αριθμ.",
"β'",
"βλ.",
"γ.γ.",
"γεν.",
"γραμμ.",
"δ.δ.",
"δ.σ.",
"δηλ.",
"δισ.",
"δολ.",
"δρχ.",
"εκ.",
"εκατ.",
"ελ.",
"θιν'",
"κ.",
"κ.ά.",
"κ.α.",
"κ.κ.",
"κ.λπ.",
"κ.ο.κ.",
"κ.τ.λ.",
"κλπ.",
"κτλ.",
"κυβ.",
"λ.χ.",
"μ.",
"μ.Χ.",
"μ.μ.",
"μιλ.",
"ντ'",
"π.Χ.",
"π.β.",
"π.δ.",
"π.μ.",
"π.χ.",
"σ.",
"σ.α.λ.",
"σ.σ.",
"σελ.",
"στρ.",
"τ'ς",
"τ.μ.",
"τετ.",
"τετρ.",
"τηλ.",
"τρισ.",
"τόν.",
"υπ.",
"χ.μ.",
"χγρ.",
"χιλ.",
"χλμ.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
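# Usage sketch (assumes the spaCy v2 API; not part of the module's data):
#
#     from spacy.lang.el import Greek
#     nlp = Greek()
#     doc = nlp("Θ' αγοράσω τίποτ' άλλο.")
#     print([(t.text, t.lemma_) for t in doc])
#     # "Θ'" and "τίποτ'" survive as single tokens, lemmatised via the
#     # entries above to "θα" and "τίποτα".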